Skip to content

Commit

Permalink
DEPR: Deprecate dtype inference on pandas objects (pandas-dev#56244)
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored and cbpygit committed Jan 2, 2024
1 parent 819bcce commit 8e4f050
Show file tree
Hide file tree
Showing 16 changed files with 141 additions and 26 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,7 @@ Other Deprecations
- Deprecated behavior of :meth:`Index.insert` with an object-dtype index silently performing type inference on the result; explicitly call ``result.infer_objects(copy=False)`` for the old behavior instead (:issue:`51363`)
- Deprecated casting non-datetimelike values (mainly strings) in :meth:`Series.isin` and :meth:`Index.isin` with ``datetime64``, ``timedelta64``, and :class:`PeriodDtype` dtypes (:issue:`53111`)
- Deprecated downcasting behavior in :meth:`Series.where`, :meth:`DataFrame.where`, :meth:`Series.mask`, :meth:`DataFrame.mask`, :meth:`Series.clip`, :meth:`DataFrame.clip`; in a future version these will not infer object-dtype columns to non-object dtype, or all-round floats to integer dtype. Call ``result.infer_objects(copy=False)`` on the result for object inference, or explicitly cast floats to ints. To opt in to the future version, use ``pd.set_option("future.no_silent_downcasting", True)`` (:issue:`53656`)
- Deprecated dtype inference in the :class:`Index`, :class:`Series` and :class:`DataFrame` constructors when given a pandas object as input; call ``.infer_objects`` on the input to keep the current behavior (:issue:`56012`)
- Deprecated dtype inference when setting an :class:`Index` into a :class:`DataFrame`; cast explicitly instead (:issue:`56102`)
- Deprecated including the groups in computations when using :meth:`.DataFrameGroupBy.apply` and :meth:`.DataFrameGroupBy.resample`; pass ``include_groups=False`` to exclude the groups (:issue:`7155`)
- Deprecated indexing an :class:`Index` with a boolean indexer of length zero (:issue:`55820`)
Expand Down
13 changes: 10 additions & 3 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
ContextManager,
cast,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -285,11 +286,17 @@ def box_expected(expected, box_cls, transpose: bool = True):
else:
expected = pd.array(expected, copy=False)
elif box_cls is Index:
expected = Index(expected)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning)
expected = Index(expected)
elif box_cls is Series:
expected = Series(expected)
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning)
expected = Series(expected)
elif box_cls is DataFrame:
expected = Series(expected).to_frame()
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "Dtype inference", category=FutureWarning)
expected = Series(expected).to_frame()
if transpose:
# for vector operations, we need a DataFrame to be a single-row,
# not a single-column, in order to operate against non-DataFrame
Expand Down
16 changes: 16 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -722,6 +722,10 @@ def __init__(

manager = _get_option("mode.data_manager", silent=True)

is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
data_dtype = getattr(data, "dtype", None)
original_dtype = dtype

# GH47215
if isinstance(index, set):
raise ValueError("index cannot be a set")
Expand Down Expand Up @@ -908,6 +912,18 @@ def __init__(

NDFrame.__init__(self, mgr)

if original_dtype is None and is_pandas_object and data_dtype == np.object_:
if self.dtypes.iloc[0] != data_dtype:
warnings.warn(
"Dtype inference on a pandas object "
"(Series, Index, ExtensionArray) is deprecated. The DataFrame "
"constructor will keep the original dtype in the future. "
"Call `infer_objects` on the result to get the old "
"behavior.",
FutureWarning,
stacklevel=2,
)

# ----------------------------------------------------------------------

def __dataframe__(
Expand Down
16 changes: 15 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,8 @@ def __new__(
if not copy and isinstance(data, (ABCSeries, Index)):
refs = data._references

is_pandas_object = isinstance(data, (ABCSeries, Index, ExtensionArray))

# range
if isinstance(data, (range, RangeIndex)):
result = RangeIndex(start=data, copy=copy, name=name)
Expand Down Expand Up @@ -572,7 +574,19 @@ def __new__(
klass = cls._dtype_to_subclass(arr.dtype)

arr = klass._ensure_array(arr, arr.dtype, copy=False)
return klass._simple_new(arr, name, refs=refs)
result = klass._simple_new(arr, name, refs=refs)
if dtype is None and is_pandas_object and data_dtype == np.object_:
if result.dtype != data_dtype:
warnings.warn(
"Dtype inference on a pandas object "
"(Series, Index, ExtensionArray) is deprecated. The Index "
"constructor will keep the original dtype in the future. "
"Call `infer_objects` on the result to get the old "
"behavior.",
FutureWarning,
stacklevel=2,
)
return result # type: ignore[return-value]

@classmethod
def _ensure_array(cls, data, dtype, copy: bool):
Expand Down
15 changes: 15 additions & 0 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,10 @@ def __init__(
self.name = name
return

is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
data_dtype = getattr(data, "dtype", None)
original_dtype = dtype

if isinstance(data, (ExtensionArray, np.ndarray)):
if copy is not False and using_copy_on_write():
if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
Expand Down Expand Up @@ -581,6 +585,17 @@ def __init__(
self.name = name
self._set_axis(0, index)

if original_dtype is None and is_pandas_object and data_dtype == np.object_:
if self.dtype != data_dtype:
warnings.warn(
"Dtype inference on a pandas object "
"(Series, Index, ExtensionArray) is deprecated. The Series "
"constructor will keep the original dtype in the future. "
"Call `infer_objects` on the result to get the old behavior.",
FutureWarning,
stacklevel=find_stack_level(),
)

def _init_dict(
self, data, index: Index | None = None, dtype: DtypeObj | None = None
):
Expand Down
13 changes: 6 additions & 7 deletions pandas/core/strings/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,19 +689,18 @@ def cat(
result = cat_safe(all_cols, sep)

out: Index | Series
if isinstance(self._orig.dtype, CategoricalDtype):
# We need to infer the new categories.
dtype = self._orig.dtype.categories.dtype
else:
dtype = self._orig.dtype
if isinstance(self._orig, ABCIndex):
# add dtype for case that result is all-NA
dtype = None
if isna(result).all():
dtype = object
dtype = object # type: ignore[assignment]

out = Index(result, dtype=dtype, name=self._orig.name)
else: # Series
if isinstance(self._orig.dtype, CategoricalDtype):
# We need to infer the new categories.
dtype = self._orig.dtype.categories.dtype # type: ignore[assignment]
else:
dtype = self._orig.dtype
res_ser = Series(
result, dtype=dtype, index=data.index, name=self._orig.name, copy=False
)
Expand Down
3 changes: 2 additions & 1 deletion pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,8 @@ def test_dataframe_from_series_or_index_different_dtype(using_copy_on_write, con

def test_dataframe_from_series_infer_datetime(using_copy_on_write):
ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object)
df = DataFrame(ser)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
df = DataFrame(ser)
assert not np.shares_memory(get_array(ser), get_array(df, 0))
if using_copy_on_write:
assert df._mgr._has_no_reference(0)
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2768,6 +2768,23 @@ def test_frame_string_inference_block_dim(self):
df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
assert df._mgr.blocks[0].ndim == 2

def test_inference_on_pandas_objects(self):
# Exercises the DataFrame constructor with object-dtype pandas inputs
# (an Index and a Series, each holding a single Timestamp) and checks the
# "Dtype inference" FutureWarning together with the resulting column dtype.
# GH#56012
idx = Index([Timestamp("2019-12-31")], dtype=object)
# Object-dtype Index passed positionally: the warning is expected and the
# resulting column is asserted NOT to be object dtype (i.e. it was inferred).
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = DataFrame(idx, columns=["a"])
assert result.dtypes.iloc[0] != np.object_
# Same Index wrapped in a dict: the column is asserted to stay object dtype.
result = DataFrame({"a": idx})
assert result.dtypes.iloc[0] == np.object_

# Repeat both checks with an object-dtype Series as the input.
ser = Series([Timestamp("2019-12-31")], dtype=object)

with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = DataFrame(ser, columns=["a"])
assert result.dtypes.iloc[0] != np.object_
# Dict-wrapped Series: the column is asserted to stay object dtype.
result = DataFrame({"a": ser})
assert result.dtypes.iloc[0] == np.object_


class TestDataFrameConstructorIndexInference:
def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/indexes/base_class/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pandas import (
Index,
MultiIndex,
Series,
)
import pandas._testing as tm

Expand Down Expand Up @@ -57,3 +58,16 @@ def test_index_string_inference(self):
with pd.option_context("future.infer_string", True):
ser = Index(["a", 1])
tm.assert_index_equal(ser, expected)

def test_inference_on_pandas_objects(self):
# Exercises the Index constructor with object-dtype pandas inputs (an Index
# and a Series, each holding a single Timestamp) and checks that the
# "Dtype inference" FutureWarning is raised while the dtype is still inferred.
# GH#56012
idx = Index([pd.Timestamp("2019-12-31")], dtype=object)
# Index built from an object-dtype Index: warning expected, and the result
# is asserted NOT to be object dtype.
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Index(idx)
assert result.dtype != np.object_

ser = Series([pd.Timestamp("2019-12-31")], dtype=object)

# Index built from an object-dtype Series: same expectations as above.
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Index(ser)
assert result.dtype != np.object_
3 changes: 2 additions & 1 deletion pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ def test_constructor_copy(self, index, using_infer_string):
)
def test_constructor_from_index_dtlike(self, cast_as_obj, index):
if cast_as_obj:
result = Index(index.astype(object))
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Index(index.astype(object))
else:
result = Index(index)

Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/series/accessors/test_dt_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,9 +259,9 @@ def test_dt_accessor_limited_display_api(self):
tm.assert_almost_equal(results, sorted(set(ok_for_dt + ok_for_dt_methods)))

# Period
ser = Series(
period_range("20130101", periods=5, freq="D", name="xxx").astype(object)
)
idx = period_range("20130101", periods=5, freq="D", name="xxx").astype(object)
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
ser = Series(idx)
results = get_dir(ser)
tm.assert_almost_equal(
results, sorted(set(ok_for_period + ok_for_period_methods))
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/methods/test_between.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_between(self):
tm.assert_series_equal(result, expected)

def test_between_datetime_object_dtype(self):
ser = Series(bdate_range("1/1/2000", periods=20).astype(object))
ser = Series(bdate_range("1/1/2000", periods=20), dtype=object)
ser[::2] = np.nan

result = ser[ser.between(ser[3], ser[17])]
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/series/methods/test_equals.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,15 @@ def test_equals_matching_nas():
left = Series([np.datetime64("NaT")], dtype=object)
right = Series([np.datetime64("NaT")], dtype=object)
assert left.equals(right)
assert Index(left).equals(Index(right))
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
assert Index(left).equals(Index(right))
assert left.array.equals(right.array)

left = Series([np.timedelta64("NaT")], dtype=object)
right = Series([np.timedelta64("NaT")], dtype=object)
assert left.equals(right)
assert Index(left).equals(Index(right))
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
assert Index(left).equals(Index(right))
assert left.array.equals(right.array)

left = Series([np.float64("NaN")], dtype=object)
Expand Down
17 changes: 16 additions & 1 deletion pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1316,7 +1316,8 @@ def test_constructor_periodindex(self):
pi = period_range("20130101", periods=5, freq="D")
s = Series(pi)
assert s.dtype == "Period[D]"
expected = Series(pi.astype(object))
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
expected = Series(pi.astype(object))
tm.assert_series_equal(s, expected)

def test_constructor_dict(self):
Expand Down Expand Up @@ -2137,6 +2138,20 @@ def test_series_string_inference_na_first(self):
result = Series([pd.NA, "b"])
tm.assert_series_equal(result, expected)

def test_inference_on_pandas_objects(self):
# Exercises the Series constructor with object-dtype pandas inputs holding a
# single Timestamp: a Series input (no warning, dtype kept) versus an Index
# input ("Dtype inference" FutureWarning, dtype inferred).
# GH#56012
ser = Series([Timestamp("2019-12-31")], dtype=object)
# Passing None as the expected warning asserts that no warning is emitted.
with tm.assert_produces_warning(None):
# This doesn't do inference
result = Series(ser)
# Series -> Series: the result is asserted to stay object dtype.
assert result.dtype == np.object_

idx = Index([Timestamp("2019-12-31")], dtype=object)

# Index -> Series: warning expected, and the result is asserted NOT to be
# object dtype.
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
result = Series(idx)
assert result.dtype != np.object_


class TestSeriesConstructorIndexCoercion:
def test_series_constructor_datetimelike_index_coercion(self):
Expand Down
21 changes: 16 additions & 5 deletions pandas/tests/strings/test_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,14 +98,18 @@ def test_str_cat_categorical(

with option_context("future.infer_string", infer_string):
s = Index(["a", "a", "b", "a"], dtype=dtype_caller)
s = s if box == Index else Series(s, index=s)
s = s if box == Index else Series(s, index=s, dtype=s.dtype)
t = Index(["b", "a", "b", "c"], dtype=dtype_target)

expected = Index(["ab", "aa", "bb", "ac"])
expected = Index(
["ab", "aa", "bb", "ac"], dtype=object if dtype_caller == "object" else None
)
expected = (
expected
if box == Index
else Series(expected, index=Index(s, dtype=dtype_caller))
else Series(
expected, index=Index(s, dtype=dtype_caller), dtype=expected.dtype
)
)

# Series/Index with unaligned Index -> t.values
Expand All @@ -123,12 +127,19 @@ def test_str_cat_categorical(

# Series/Index with Series having different Index
t = Series(t.values, index=t.values)
expected = Index(["aa", "aa", "bb", "bb", "aa"])
expected = Index(
["aa", "aa", "bb", "bb", "aa"],
dtype=object if dtype_caller == "object" else None,
)
dtype = object if dtype_caller == "object" else s.dtype.categories.dtype
expected = (
expected
if box == Index
else Series(expected, index=Index(expected.str[:1], dtype=dtype))
else Series(
expected,
index=Index(expected.str[:1], dtype=dtype),
dtype=expected.dtype,
)
)

result = s.str.cat(t, sep=sep)
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/tseries/frequencies/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
date_range,
period_range,
)
import pandas._testing as tm
from pandas.core.arrays import (
DatetimeArray,
TimedeltaArray,
Expand Down Expand Up @@ -206,7 +207,8 @@ def test_infer_freq_custom(base_delta_code_pair, constructor):
)
def test_infer_freq_index(freq, expected):
rng = period_range("1959Q2", "2009Q3", freq=freq)
rng = Index(rng.to_timestamp("D", how="e").astype(object))
with tm.assert_produces_warning(FutureWarning, match="Dtype inference"):
rng = Index(rng.to_timestamp("D", how="e").astype(object))

assert rng.inferred_freq == expected

Expand Down

0 comments on commit 8e4f050

Please sign in to comment.