Skip to content

Commit

Permalink
CLN: Stopped object inference in constructors for pandas objects (#58758
Browse files Browse the repository at this point in the history
)

* CLN: Stopped object inference in constructors for pandas objects

* Adjust tests
  • Loading branch information
mroeschke authored Jun 3, 2024
1 parent a5492ee commit 9f71476
Show file tree
Hide file tree
Showing 15 changed files with 48 additions and 136 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ Removal of prior version deprecations/changes
- Removed the deprecated ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep=r"\s+"`` instead (:issue:`55569`)
- Require :meth:`SparseDtype.fill_value` to be a valid value for the :meth:`SparseDtype.subtype` (:issue:`53043`)
- Stopped automatically casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`)
- Stopped performing dtype inference in :class:`Index`, :class:`Series` and :class:`DataFrame` constructors when given a pandas object (:class:`Series`, :class:`Index`, :class:`ExtensionArray`); call ``.infer_objects`` on the input to keep the previous behavior (:issue:`56012`)
- Stopped performing dtype inference when setting an :class:`Index` into a :class:`DataFrame` (:issue:`56102`)
- Stopped performing dtype inference in :meth:`Index.insert` with object-dtype index; this often affects the index/columns that result when setting new entries into an empty :class:`Series` or :class:`DataFrame` (:issue:`51363`)
- Removed the "closed" and "unit" keywords in :meth:`TimedeltaIndex.__new__` (:issue:`52628`, :issue:`55499`)
Expand Down
13 changes: 3 additions & 10 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
ContextManager,
cast,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -290,17 +289,11 @@ def box_expected(expected, box_cls, transpose: bool = True):
else:
expected = pd.array(expected, copy=False)
elif box_cls is Index:
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning)
expected = Index(expected)
expected = Index(expected)
elif box_cls is Series:
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning)
expected = Series(expected)
expected = Series(expected)
elif box_cls is DataFrame:
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning)
expected = Series(expected).to_frame()
expected = Series(expected).to_frame()
if transpose:
# for vector operations, we need a DataFrame to be a single-row,
# not a single-column, in order to operate against non-DataFrame
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ def sanitize_array(
# Avoid ending up with a NumpyExtensionArray
dtype = dtype.numpy_dtype

data_was_index = isinstance(data, ABCIndex)
infer_object = not isinstance(data, (ABCIndex, ABCSeries))

# extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray
data = extract_array(data, extract_numpy=True, extract_range=True)
Expand Down Expand Up @@ -607,7 +607,7 @@ def sanitize_array(

if dtype is None:
subarr = data
if data.dtype == object and not data_was_index:
if data.dtype == object and infer_object:
subarr = maybe_infer_to_datetimelike(data)
elif data.dtype.kind == "U" and using_pyarrow_string_dtype():
from pandas.core.arrays.string_ import StringDtype
Expand Down
16 changes: 0 additions & 16 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -728,10 +728,6 @@ def __init__(
NDFrame.__init__(self, data)
return

is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
data_dtype = getattr(data, "dtype", None)
original_dtype = dtype

# GH47215
if isinstance(index, set):
raise ValueError("index cannot be a set")
Expand Down Expand Up @@ -896,18 +892,6 @@ def __init__(

NDFrame.__init__(self, mgr)

if original_dtype is None and is_pandas_object and data_dtype == np.object_:
if self.dtypes.iloc[0] != data_dtype:
warnings.warn(
"Dtype inference on a pandas object "
"(Series, Index, ExtensionArray) is deprecated. The DataFrame "
"constructor will keep the original dtype in the future. "
"Call `infer_objects` on the result to get the old "
"behavior.",
FutureWarning,
stacklevel=2,
)

# ----------------------------------------------------------------------

def __dataframe__(
Expand Down
22 changes: 5 additions & 17 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,8 +490,6 @@ def __new__(
if not copy and isinstance(data, (ABCSeries, Index)):
refs = data._references

is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray))

# range
if isinstance(data, (range, RangeIndex)):
result = RangeIndex(start=data, copy=copy, name=name)
Expand All @@ -508,7 +506,7 @@ def __new__(
elif is_ea_or_datetimelike_dtype(data_dtype):
pass

elif isinstance(data, (np.ndarray, Index, ABCSeries)):
elif isinstance(data, (np.ndarray, ABCMultiIndex)):
if isinstance(data, ABCMultiIndex):
data = data._values

Expand All @@ -518,7 +516,9 @@ def __new__(
# they are actually ints, e.g. '0' and 0.0
# should not be coerced
data = com.asarray_tuplesafe(data, dtype=_dtype_obj)

elif isinstance(data, (ABCSeries, Index)):
# GH 56244: Avoid potential inference on object types
pass
elif is_scalar(data):
raise cls._raise_scalar_data_error(data)
elif hasattr(data, "__array__"):
Expand Down Expand Up @@ -571,19 +571,7 @@ def __new__(
klass = cls._dtype_to_subclass(arr.dtype)

arr = klass._ensure_array(arr, arr.dtype, copy=False)
result = klass._simple_new(arr, name, refs=refs)
if dtype is None and is_pandas_object and data_dtype == np.object_:
if result.dtype != data_dtype:
warnings.warn(
"Dtype inference on a pandas object "
"(Series, Index, ExtensionArray) is deprecated. The Index "
"constructor will keep the original dtype in the future. "
"Call `infer_objects` on the result to get the old "
"behavior.",
FutureWarning,
stacklevel=2,
)
return result # type: ignore[return-value]
return klass._simple_new(arr, name, refs=refs)

@classmethod
def _ensure_array(cls, data, dtype, copy: bool):
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ def ndarray_to_mgr(
) -> Manager:
# used in DataFrame.__init__
# input must be a ndarray, list, Series, Index, ExtensionArray
infer_object = not isinstance(values, (ABCSeries, Index, ExtensionArray))

if isinstance(values, ABCSeries):
if columns is None:
Expand Down Expand Up @@ -287,15 +288,14 @@ def ndarray_to_mgr(
# if we don't have a dtype specified, then try to convert objects
# on the entire block; this is to convert if we have datetimelike's
# embedded in an object type
if dtype is None and is_object_dtype(values.dtype):
if dtype is None and infer_object and is_object_dtype(values.dtype):
obj_columns = list(values)
maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
# don't convert (and copy) the objects if no type inference occurs
if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
dvals_list = [ensure_block_shape(dval, 2) for dval in maybe_datetime]
block_values = [
new_block_2d(dvals_list[n], placement=BlockPlacement(n))
for n in range(len(dvals_list))
new_block_2d(ensure_block_shape(dval, 2), placement=BlockPlacement(n))
for n, dval in enumerate(maybe_datetime)
]
else:
bp = BlockPlacement(slice(len(columns)))
Expand Down
16 changes: 0 additions & 16 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,10 +389,6 @@ def __init__(
self.name = name
return

is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
data_dtype = getattr(data, "dtype", None)
original_dtype = dtype

if isinstance(data, (ExtensionArray, np.ndarray)):
if copy is not False:
if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
Expand Down Expand Up @@ -438,7 +434,6 @@ def __init__(
data = data.astype(dtype)

refs = data._references
data = data._values
copy = False

elif isinstance(data, np.ndarray):
Expand Down Expand Up @@ -512,17 +507,6 @@ def __init__(
self.name = name
self._set_axis(0, index)

if original_dtype is None and is_pandas_object and data_dtype == np.object_:
if self.dtype != data_dtype:
warnings.warn(
"Dtype inference on a pandas object "
"(Series, Index, ExtensionArray) is deprecated. The Series "
"constructor will keep the original dtype in the future. "
"Call `infer_objects` on the result to get the old behavior.",
FutureWarning,
stacklevel=find_stack_level(),
)

def _init_dict(
self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None
):
Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,12 +228,12 @@ def test_dataframe_from_series_or_index_different_dtype(index_or_series):
assert df._mgr._has_no_reference(0)


def test_dataframe_from_series_infer_datetime():
def test_dataframe_from_series_dont_infer_datetime():
ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
df = DataFrame(ser)
assert not np.shares_memory(get_array(ser), get_array(df, 0))
assert df._mgr._has_no_reference(0)
df = DataFrame(ser)
assert df.dtypes.iloc[0] == np.dtype(object)
assert np.shares_memory(get_array(ser), get_array(df, 0))
assert not df._mgr._has_no_reference(0)


@pytest.mark.parametrize("index", [None, [0, 1, 2]])
Expand Down
17 changes: 5 additions & 12 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2702,21 +2702,14 @@ def test_frame_string_inference_block_dim(self):
df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
assert df._mgr.blocks[0].ndim == 2

def test_inference_on_pandas_objects(self):
@pytest.mark.parametrize("klass", [Series, Index])
def test_inference_on_pandas_objects(self, klass):
# GH#56012
idx = Index([Timestamp("2019-12-31")], dtype=object)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = DataFrame(idx, columns=["a"])
assert result.dtypes.iloc[0] != np.object_
result = DataFrame({"a": idx})
obj = klass([Timestamp("2019-12-31")], dtype=object)
result = DataFrame(obj, columns=["a"])
assert result.dtypes.iloc[0] == np.object_

ser = Series([Timestamp("2019-12-31")], dtype=object)

with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = DataFrame(ser, columns=["a"])
assert result.dtypes.iloc[0] != np.object_
result = DataFrame({"a": ser})
result = DataFrame({"a": obj})
assert result.dtypes.iloc[0] == np.object_

def test_dict_keys_returns_rangeindex(self):
Expand Down
16 changes: 5 additions & 11 deletions pandas/tests/indexes/base_class/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,18 +59,12 @@ def test_index_string_inference(self):
ser = Index(["a", 1])
tm.assert_index_equal(ser, expected)

def test_inference_on_pandas_objects(self):
@pytest.mark.parametrize("klass", [Series, Index])
def test_inference_on_pandas_objects(self, klass):
# GH#56012
idx = Index([pd.Timestamp("2019-12-31")], dtype=object)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Index(idx)
assert result.dtype != np.object_

ser = Series([pd.Timestamp("2019-12-31")], dtype=object)

with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Index(ser)
assert result.dtype != np.object_
obj = klass([pd.Timestamp("2019-12-31")], dtype=object)
result = Index(obj)
assert result.dtype == np.object_

def test_constructor_not_read_only(self):
# GH#57130
Expand Down
17 changes: 7 additions & 10 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,23 +104,20 @@ def test_constructor_copy(self, using_infer_string):
)
def test_constructor_from_index_dtlike(self, cast_as_obj, index):
if cast_as_obj:
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Index(index.astype(object))
else:
result = Index(index)

tm.assert_index_equal(result, index)

if isinstance(index, DatetimeIndex):
assert result.tz == index.tz
if cast_as_obj:
result = Index(index.astype(object))
assert result.dtype == np.dtype(object)
if isinstance(index, DatetimeIndex):
# GH#23524 check that Index(dti, dtype=object) does not
# incorrectly raise ValueError, and that nanoseconds are not
# dropped
index += pd.Timedelta(nanoseconds=50)
result = Index(index, dtype=object)
assert result.dtype == np.object_
assert list(result) == list(index)
else:
result = Index(index)

tm.assert_index_equal(result, index)

@pytest.mark.parametrize(
"index,has_tz",
Expand Down
5 changes: 2 additions & 3 deletions pandas/tests/series/accessors/test_dt_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,9 +256,8 @@ def test_dt_accessor_limited_display_api(self):
tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods)))

# Period
idx = period_range("20130101", periods=5, freq="D", name="xxx").astype(object)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
ser = Series(idx)
idx = period_range("20130101", periods=5, freq="D", name="xxx")
ser = Series(idx)
results = get_dir(ser)
tm.assert_almost_equal(
results, sorted(set(ok_for_period + ok_for_period_methods))
Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/series/methods/test_equals.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,15 +82,13 @@ def test_equals_matching_nas():
left = Series([np.datetime64("NaT")], dtype=object)
right = Series([np.datetime64("NaT")], dtype=object)
assert left.equals(right)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
assert Index(left).equals(Index(right))
assert Index(left).equals(Index(right))
assert left.array.equals(right.array)

left = Series([np.timedelta64("NaT")], dtype=object)
right = Series([np.timedelta64("NaT")], dtype=object)
assert left.equals(right)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
assert Index(left).equals(Index(right))
assert Index(left).equals(Index(right))
assert left.array.equals(right.array)

left = Series([np.float64("NaN")], dtype=object)
Expand Down
21 changes: 7 additions & 14 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1318,9 +1318,8 @@ def test_constructor_periodindex(self):
pi = period_range("20130101", periods=5, freq="D")
s = Series(pi)
assert s.dtype == "Period[D]"
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
expected = Series(pi.astype(object))
tm.assert_series_equal(s, expected)
expected = Series(pi.astype(object))
assert expected.dtype == object

def test_constructor_dict(self):
d = {"a": 0.0, "b": 1.0, "c": 2.0}
Expand Down Expand Up @@ -2137,20 +2136,14 @@ def test_series_string_inference_na_first(self):
result = Series([pd.NA, "b"])
tm.assert_series_equal(result, expected)

def test_inference_on_pandas_objects(self):
@pytest.mark.parametrize("klass", [Series, Index])
def test_inference_on_pandas_objects(self, klass):
# GH#56012
ser = Series([Timestamp("2019-12-31")], dtype=object)
with tm.assert_produces_warning(None):
# This doesn't do inference
result = Series(ser)
obj = klass([Timestamp("2019-12-31")], dtype=object)
# This doesn't do inference
result = Series(obj)
assert result.dtype == np.object_

idx = Index([Timestamp("2019-12-31")], dtype=object)

with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Series(idx)
assert result.dtype != np.object_


class TestSeriesConstructorIndexCoercion:
def test_series_constructor_datetimelike_index_coercion(self):
Expand Down
12 changes: 0 additions & 12 deletions pandas/tests/tseries/frequencies/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
date_range,
period_range,
)
import pandas._testing as tm
from pandas.core.arrays import (
DatetimeArray,
TimedeltaArray,
Expand Down Expand Up @@ -202,17 +201,6 @@ def test_infer_freq_custom(base_delta_code_pair, constructor):
assert frequencies.infer_freq(index) is None


@pytest.mark.parametrize(
"freq,expected", [("Q", "QE-DEC"), ("Q-NOV", "QE-NOV"), ("Q-OCT", "QE-OCT")]
)
def test_infer_freq_index(freq, expected):
rng = period_range("1959Q2", "2009Q3", freq=freq)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
rng = Index(rng.to_timestamp("D", how="e").astype(object))

assert rng.inferred_freq == expected


@pytest.mark.parametrize(
"expected,dates",
list(
Expand Down

0 comments on commit 9f71476

Please sign in to comment.