Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: Deprecate dtype inference on pandas objects #56244

Merged
merged 15 commits into from
Dec 21, 2023
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,7 @@ Other Deprecations
- Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result, explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`)
- Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`)
- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`)
- Deprecated dtype inference in :class:`Index`, :class:`Series` and :class:`DataFrame` constructors when given a pandas object as input; call ``.infer_objects`` on the input to keep the current behavior (:issue:`56012`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

on the input -> on the result?

Copy link
Member Author

@phofl phofl Dec 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

either way is totally fine

- Deprecated dtype inference when setting an :class:`Index` into a :class:`DataFrame`; cast explicitly instead (:issue:`56102`)
- Deprecated including the groups in computations when using :meth:`.DataFrameGroupBy.apply` and :meth:`.DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`)
- Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`)
Expand Down
13 changes: 10 additions & 3 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
ContextManager,
cast,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -285,11 +286,17 @@ def box_expected(expected, box_cls, transpose: bool = True):
else:
expected = pd.array(expected, copy=False)
elif box_cls is Index:
expected = Index(expected)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning)
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
expected = Index(expected)
elif box_cls is Series:
expected = Series(expected)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning)
expected = Series(expected)
elif box_cls is DataFrame:
expected = Series(expected).to_frame()
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning)
expected = Series(expected).to_frame()
if transpose:
# for vector operations, we need a DataFrame to be a single-row,
# not a single-column, in order to operate against non-DataFrame
Expand Down
15 changes: 15 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -726,6 +726,10 @@ def __init__(

manager = _get_option("mode.data_manager", silent=True)

is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
data_dtype = getattr(data, "dtype", None)
original_dtype = dtype

# GH47215
if isinstance(index, set):
raise ValueError("index cannot be a set")
Expand Down Expand Up @@ -912,6 +916,17 @@ def __init__(

NDFrame.__init__(self, mgr)

if original_dtype is None and is_pandas_object and data_dtype == np.object_:
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
if self.dtypes.iloc[0] != data_dtype:
warnings.warn(
"Dtype inference on a pandas object "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"in the DataFrame constructor" somewhere in this sentence? I think the "The Index constructor" below is a copy/paste leftover

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah sorry, adjusted

"(Series, Index, ExtensionArray) is deprecated. The Index "
"constructor will keep the original dtype in the future. "
"Call `infer_objects` on the result",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"to get the old behavior"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added

FutureWarning,
stacklevel=2,
)

# ----------------------------------------------------------------------

def __dataframe__(
Expand Down
15 changes: 14 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,8 @@ def __new__(
if not copy and isinstance(data, (ABCSeries, Index)):
refs = data._references

is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray))

# range
if isinstance(data, (range, RangeIndex)):
result = RangeIndex(start=data, copy=copy, name=name)
Expand Down Expand Up @@ -571,7 +573,18 @@ def __new__(
klass = cls._dtype_to_subclass(arr.dtype)

arr = klass._ensure_array(arr, arr.dtype, copy=False)
return klass._simple_new(arr, name, refs=refs)
result = klass._simple_new(arr, name, refs=refs)
if dtype is None and is_pandas_object and data_dtype == np.object_:
if result.dtype != data_dtype:
warnings.warn(
"Dtype inference on a pandas object "
"(Series, Index, ExtensionArray) is deprecated. The Index "
"constructor will keep the original dtype in the future. "
"Call `infer_objects` on the result",
FutureWarning,
stacklevel=2,
)
return result # type: ignore[return-value]

@classmethod
def _ensure_array(cls, data, dtype, copy: bool):
Expand Down
15 changes: 15 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,10 @@ def __init__(
self.name = name
return

is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
data_dtype = getattr(data, "dtype", None)
original_dtype = dtype

if isinstance(data, (ExtensionArray, np.ndarray)):
if copy is not False and using_copy_on_write():
if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
Expand Down Expand Up @@ -581,6 +585,17 @@ def __init__(
self.name = name
self._set_axis(0, index)

if original_dtype is None and is_pandas_object and data_dtype == np.object_:
if self.dtype != data_dtype:
warnings.warn(
"Dtype inference on a pandas object "
"(Series, Index, ExtensionArray) is deprecated. The Index "
"constructor will keep the original dtype in the future. "
"Call `infer_objects` on the result",
FutureWarning,
stacklevel=find_stack_level(),
)

def _init_dict(
self, data, index: Index | None = None, dtype: DtypeObj | None = None
):
Expand Down
13 changes: 6 additions & 7 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,19 +689,18 @@ def cat(
result = cat_safe(all_cols, sep)

out: Index | Series
if isinstance(self._orig.dtype, CategoricalDtype):
# We need to infer the new categories.
dtype = self._orig.dtype.categories.dtype
else:
dtype = self._orig.dtype
if isinstance(self._orig, ABCIndex):
# add dtype for case that result is all-NA
dtype = None
if isna(result).all():
dtype = object
dtype = object # type: ignore[assignment]

out = Index(result, dtype=dtype, name=self._orig.name)
else: # Series
if isinstance(self._orig.dtype, CategoricalDtype):
# We need to infer the new categories.
dtype = self._orig.dtype.categories.dtype # type: ignore[assignment]
else:
dtype = self._orig.dtype
res_ser = Series(
result, dtype=dtype, index=data.index, name=self._orig.name, copy=False
)
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,8 @@ def test_dataframe_from_series_or_index_different_dtype(using_copy_on_write, con

def test_dataframe_from_series_infer_datetime(using_copy_on_write):
ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object)
df = DataFrame(ser)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
df = DataFrame(ser)
assert not np.shares_memory(get_array(ser), get_array(df, 0))
if using_copy_on_write:
assert df._mgr._has_no_reference(0)
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2768,6 +2768,23 @@ def test_frame_string_inference_block_dim(self):
df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
assert df._mgr.blocks[0].ndim == 2

def test_inference_on_pandas_objects(self):
# GH#56012
# Checks DataFrame construction from object-dtype pandas containers that
# hold a single Timestamp: once from an Index and once from a Series.
# Passing the container directly is expected to emit a FutureWarning whose
# message matches "Dtype inference" and to yield a non-object column.
# Passing the same container as a dict value is asserted to keep object dtype.
idx = Index([Timestamp("2019-12-31")], dtype=object)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = DataFrame(idx, columns=["a"])
# The first (only) column's dtype was inferred, so it is no longer object.
assert result.dtypes.iloc[0] != np.object_
# Dict-of-Index input: the column dtype is asserted to stay object.
result = DataFrame({"a": idx})
assert result.dtypes.iloc[0] == np.object_

# Same two checks, starting from an object-dtype Series instead of an Index.
ser = Series([Timestamp("2019-12-31")], dtype=object)

with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = DataFrame(ser, columns=["a"])
assert result.dtypes.iloc[0] != np.object_
result = DataFrame({"a": ser})
assert result.dtypes.iloc[0] == np.object_


class TestDataFrameConstructorIndexInference:
def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/indexes/base_class/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pandas import (
Index,
MultiIndex,
Series,
)
import pandas._testing as tm

Expand Down Expand Up @@ -57,3 +58,16 @@ def test_index_string_inference(self):
with pd.option_context("future.infer_string", True):
ser = Index(["a", 1])
tm.assert_index_equal(ser, expected)

def test_inference_on_pandas_objects(self):
# GH#56012
# Checks that building an Index from an object-dtype pandas container
# (an Index, then a Series) holding a single Timestamp emits a FutureWarning
# matching "Dtype inference", and that the resulting dtype is not object.
idx = Index([pd.Timestamp("2019-12-31")], dtype=object)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Index(idx)
# The dtype was inferred from the values rather than kept as object.
assert result.dtype != np.object_

# Same check with an object-dtype Series as the constructor input.
ser = Series([pd.Timestamp("2019-12-31")], dtype=object)

with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Index(ser)
assert result.dtype != np.object_
3 changes: 2 additions & 1 deletion pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ def test_constructor_copy(self, index, using_infer_string):
)
def test_constructor_from_index_dtlike(self, cast_as_obj, index):
if cast_as_obj:
result = Index(index.astype(object))
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Index(index.astype(object))
else:
result = Index(index)

Expand Down
7 changes: 4 additions & 3 deletions pandas/tests/series/accessors/test_dt_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,9 +259,10 @@ def test_dt_accessor_limited_display_api(self):
tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods)))

# Period
ser = Series(
period_range("20130101", periods=5, freq="D", name="xxx").astype(object)
)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
ser = Series(
period_range("20130101", periods=5, freq="D", name="xxx").astype(object)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you construct the index outside the context so it is obvious where the warning comes from

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

moved

)
results = get_dir(ser)
tm.assert_almost_equal(
results, sorted(set(ok_for_period + ok_for_period_methods))
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/methods/test_between.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_between(self):
tm.assert_series_equal(result, expected)

def test_between_datetime_object_dtype(self):
ser = Series(bdate_range("1/1/2000", periods=20).astype(object))
ser = Series(bdate_range("1/1/2000", periods=20).astype(object), dtype=object)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The .astype(object) here is redundant

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah yes, changed

ser[::2] = np.nan

result = ser[ser.between(ser[3], ser[17])]
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/series/methods/test_equals.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,15 @@ def test_equals_matching_nas():
left = Series([np.datetime64("NaT")], dtype=object)
right = Series([np.datetime64("NaT")], dtype=object)
assert left.equals(right)
assert Index(left).equals(Index(right))
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
assert Index(left).equals(Index(right))
assert left.array.equals(right.array)

left = Series([np.timedelta64("NaT")], dtype=object)
right = Series([np.timedelta64("NaT")], dtype=object)
assert left.equals(right)
assert Index(left).equals(Index(right))
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
assert Index(left).equals(Index(right))
assert left.array.equals(right.array)

left = Series([np.float64("NaN")], dtype=object)
Expand Down
17 changes: 16 additions & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1316,7 +1316,8 @@ def test_constructor_periodindex(self):
pi = period_range("20130101", periods=5, freq="D")
s = Series(pi)
assert s.dtype == "Period[D]"
expected = Series(pi.astype(object))
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
expected = Series(pi.astype(object))
tm.assert_series_equal(s, expected)

def test_constructor_dict(self):
Expand Down Expand Up @@ -2137,6 +2138,20 @@ def test_series_string_inference_na_first(self):
result = Series([pd.NA, "b"])
tm.assert_series_equal(result, expected)

def test_inference_on_pandas_objects(self):
# GH#56012
# Contrasts two Series constructor inputs that both hold a single
# Timestamp with object dtype: a Series (dtype asserted to stay object)
# and an Index (FutureWarning matching "Dtype inference" expected, and the
# resulting dtype asserted not to be object).
ser = Series([Timestamp("2019-12-31")], dtype=object)
# NOTE(review): passing None presumably asserts that no warning is
# emitted - confirm against tm.assert_produces_warning.
with tm.assert_produces_warning(None):
# This doesn't do inference
result = Series(ser)
assert result.dtype == np.object_

# Object-dtype Index input: this is the path expected to warn.
idx = Index([Timestamp("2019-12-31")], dtype=object)

with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Series(idx)
assert result.dtype != np.object_


class TestSeriesConstructorIndexCoercion:
def test_series_constructor_datetimelike_index_coercion(self):
Expand Down
21 changes: 16 additions & 5 deletions pandas/tests/strings/test_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,18 @@ def test_str_cat_categorical(

with option_context("future.infer_string", infer_string):
s = Index(["a", "a", "b", "a"], dtype=dtype_caller)
s = s if box == Index else Series(s, index=s)
s = s if box == Index else Series(s, index=s, dtype=s.dtype)
t = Index(["b", "a", "b", "c"], dtype=dtype_target)

expected = Index(["ab", "aa", "bb", "ac"])
expected = Index(
["ab", "aa", "bb", "ac"], dtype=object if dtype_caller == "object" else None
)
expected = (
expected
if box == Index
else Series(expected, index=Index(s, dtype=dtype_caller))
else Series(
expected, index=Index(s, dtype=dtype_caller), dtype=expected.dtype
)
)

# Series/Index with unaligned Index -> t.values
Expand All @@ -123,12 +127,19 @@ def test_str_cat_categorical(

# Series/Index with Series having different Index
t = Series(t.values, index=t.values)
expected = Index(["aa", "aa", "bb", "bb", "aa"])
expected = Index(
["aa", "aa", "bb", "bb", "aa"],
dtype=object if dtype_caller == "object" else None,
)
dtype = object if dtype_caller == "object" else s.dtype.categories.dtype
expected = (
expected
if box == Index
else Series(expected, index=Index(expected.str[:1], dtype=dtype))
else Series(
expected,
index=Index(expected.str[:1], dtype=dtype),
dtype=expected.dtype,
)
)

result = s.str.cat(t, sep=sep)
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/tseries/frequencies/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
date_range,
period_range,
)
import pandas._testing as tm
from pandas.core.arrays import (
DatetimeArray,
TimedeltaArray,
Expand Down Expand Up @@ -206,7 +207,8 @@ def test_infer_freq_custom(base_delta_code_pair, constructor):
)
def test_infer_freq_index(freq, expected):
rng = period_range("1959Q2", "2009Q3", freq=freq)
rng = Index(rng.to_timestamp("D", how="e").astype(object))
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
rng = Index(rng.to_timestamp("D", how="e").astype(object))

assert rng.inferred_freq == expected

Expand Down