From e647fac2e5406399067654f981bcff6ec0b3afc8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2024 07:58:52 +0100 Subject: [PATCH 01/42] [backport 2.3.x] BUG: Fix copy semantics in ``__array__`` (#60046) (#60189) (cherry picked from commit eacf0326efb709169ebc49f040834670dfe4beb3) Co-authored-by: Joris Van den Bossche Co-authored-by: Sebastian Berg --- doc/source/whatsnew/v2.3.0.rst | 3 ++ pandas/core/arrays/arrow/array.py | 11 ++++- pandas/core/arrays/categorical.py | 33 +++++++++++---- pandas/core/arrays/datetimelike.py | 7 ++++ pandas/core/arrays/interval.py | 5 +++ pandas/core/arrays/masked.py | 12 +++++- pandas/core/arrays/numpy_.py | 3 ++ pandas/core/arrays/period.py | 15 ++++++- pandas/core/arrays/sparse/array.py | 15 +++++-- pandas/core/generic.py | 10 ++++- pandas/core/indexes/base.py | 6 ++- pandas/core/indexes/multi.py | 9 ++++ pandas/core/internals/construction.py | 12 +++--- pandas/core/series.py | 15 +++++-- pandas/tests/arrays/sparse/test_array.py | 31 ++++++++++++++ pandas/tests/arrays/test_datetimelike.py | 8 ++++ pandas/tests/base/test_conversion.py | 41 ++++++++++++++++--- pandas/tests/extension/base/interface.py | 21 ++++++++++ pandas/tests/extension/json/array.py | 10 ++++- pandas/tests/indexes/multi/test_conversion.py | 36 ++++++++++++++++ 20 files changed, 269 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index cda4754bb4acd..405c8fdc30961 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -32,6 +32,9 @@ enhancement1 Other enhancements ^^^^^^^^^^^^^^^^^^ +- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called + when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been + updated to work correctly with NumPy >= 2 (:issue:`57739`) - The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 51136961c0fb3..2587d1d53640d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -662,7 +662,16 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: """Correctly construct numpy arrays when passed to `np.asarray()`.""" - return self.to_numpy(dtype=dtype) + if copy is False: + # TODO: By using `zero_copy_only` it may be possible to implement this + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + elif copy is None: + # `to_numpy(copy=False)` has the meaning of NumPy `copy=None`. + copy = False + + return self.to_numpy(dtype=dtype, copy=copy) def __invert__(self) -> Self: # This is a bit wise op for integer types diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 366253a923f6c..3383f35bb7d55 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -577,11 +577,12 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: raise ValueError("Cannot convert float NaN to integer") elif len(self.codes) == 0 or len(self.categories) == 0: - result = np.array( - self, - dtype=dtype, - copy=copy, - ) + # For NumPy 1.x compatibility we cannot use copy=None. And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + result = np.asarray(self, dtype=dtype) + else: + result = np.array(self, dtype=dtype) else: # GH8628 (PERF): astype category codes instead of astyping array @@ -1642,6 +1643,17 @@ def __array__( """ The numpy array interface. + Users should not call this directly. Rather, it is invoked by + :func:`numpy.array` and :func:`numpy.asarray`. + + Parameters + ---------- + dtype : np.dtype or None + Specifies the the dtype for the array. + + copy : bool or None, optional + See :func:`numpy.asarray`. + Returns ------- numpy.array @@ -1659,13 +1671,18 @@ def __array__( >>> np.asarray(cat) array(['a', 'b'], dtype=object) """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + ret = take_nd(self.categories._values, self._codes) - if dtype and np.dtype(dtype) != self.categories.dtype: - return np.asarray(ret, dtype) # When we're a Categorical[ExtensionArray], like Interval, # we need to ensure __array__ gets all the way to an # ndarray. - return np.asarray(ret) + + # `take_nd` should already make a copy, so don't force again. + return np.asarray(ret, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # for binary ops, use our custom dunder methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 81e2f04f2ba2e..990116bad13d1 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -358,7 +358,14 @@ def __array__( ) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) return np.array(list(self), dtype=object) + + if copy is True: + return np.array(self._ndarray, dtype=dtype) return self._ndarray @overload diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 91db7f11bcbe0..5aac3d3b28db5 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1574,6 +1574,11 @@ def __array__( Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') """ + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + left = self._left right = self._right mask = self.isna() diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d7e816b9d3781..ba7b8e3e7398e 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -600,7 +600,17 @@ def __array__( the array interface, return my values We return an object array here to preserve our scalar values """ - return self.to_numpy(dtype=dtype) + if copy is False: + if not self._hasna: + # special case, here we can simply return the underlying data + return np.array(self._data, dtype=dtype, copy=copy) + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + if copy is None: + copy = False # The NumPy copy=False meaning is different here. + return self.to_numpy(dtype=dtype, copy=copy) _HANDLED_TYPES: tuple[type, ...] diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index aafcd82114b97..9f7238a97d808 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -150,6 +150,9 @@ def dtype(self) -> NumpyEADtype: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: + if copy is not None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.array(self._ndarray, dtype=dtype, copy=copy) return np.asarray(self._ndarray, dtype=dtype) def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index c1229e27ab51a..aad7737b8dd94 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -407,8 +407,19 @@ def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: if dtype == "i8": - return self.asi8 - elif dtype == bool: + # For NumPy 1.x compatibility we cannot use copy=None. And + # `copy=False` has the meaning of `copy=None` here: + if not copy: + return np.asarray(self.asi8, dtype=dtype) + else: + return np.array(self.asi8, dtype=dtype) + + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + if dtype == bool: return ~self._isnan # This will raise TypeError for non-object dtypes diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 82fcfa74ec7d2..13577e366d54b 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -554,11 +554,20 @@ def from_spmatrix(cls, data: spmatrix) -> Self: def __array__( self, dtype: NpDtype | None = None, copy: bool | None = None ) -> np.ndarray: - fill_value = self.fill_value - if self.sp_index.ngaps == 0: # Compat for na dtype and int values. - return self.sp_values + if copy is True: + return np.array(self.sp_values) + else: + return self.sp_values + + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + + fill_value = self.fill_value + if dtype is None: # Can NumPy represent this type? # If not, `np.result_type` will raise. We catch that diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 796357355fef4..bef2d1e1194f9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2150,9 +2150,15 @@ def __array__( self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None ) -> np.ndarray: values = self._values - arr = np.asarray(values, dtype=dtype) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.array(values, dtype=dtype, copy=copy) + if ( - astype_is_view(values.dtype, arr.dtype) + copy is not True + and astype_is_view(values.dtype, arr.dtype) and using_copy_on_write() and self._mgr.is_single_block ): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8e8eb768130fd..fc53e044a6544 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -917,7 +917,11 @@ def __array__(self, dtype=None, copy=None) -> np.ndarray: """ The array interface, return my values. """ - return np.asarray(self._data, dtype=dtype) + if copy is None: + # Note, that the if branch exists for NumPy 1.x support + return np.asarray(self._data, dtype=dtype) + + return np.array(self._data, dtype=dtype, copy=copy) def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs): if any(isinstance(other, (ABCSeries, ABCDataFrame)) for other in inputs): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 091ddbcc099be..9e002ccd3a787 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1311,6 +1311,15 @@ def copy( # type: ignore[override] def __array__(self, dtype=None, copy=None) -> np.ndarray: """the array interface, return my values""" + if copy is False: + # self.values is always a newly construct array, so raise. + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + if copy is True: + # explicit np.array call to ensure a copy is made and unique objects + # are returned, because self.values is cached + return np.array(self.values, dtype=dtype) return self.values def view(self, cls=None) -> Self: diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f3dbacc02bec9..137648ee52bf7 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -305,12 +305,12 @@ def ndarray_to_mgr( elif isinstance(values, (np.ndarray, ExtensionArray)): # drop subclass info - _copy = ( - copy_on_sanitize - if (dtype is None or astype_is_view(values.dtype, dtype)) - else False - ) - values = np.array(values, copy=_copy) + if copy_on_sanitize and (dtype is None or astype_is_view(values.dtype, dtype)): + # only force a copy now if copy=True was requested + # and a subsequent `astype` will not already result in a copy + values = np.array(values, copy=True, order="F") + else: + values = np.asarray(values) values = _ensure_2d(values) else: diff --git a/pandas/core/series.py b/pandas/core/series.py index 6fd019656d207..4e2e363885594 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -991,7 +991,7 @@ def __array__( the dtype is inferred from the data. copy : bool or None, optional - Unused. + See :func:`numpy.asarray`. Returns ------- @@ -1028,8 +1028,17 @@ def __array__( dtype='datetime64[ns]') """ values = self._values - arr = np.asarray(values, dtype=dtype) - if using_copy_on_write() and astype_is_view(values.dtype, arr.dtype): + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + arr = np.asarray(values, dtype=dtype) + else: + arr = np.array(values, dtype=dtype, copy=copy) + + if copy is True: + return arr + if using_copy_on_write() and ( + copy is False or astype_is_view(values.dtype, arr.dtype) + ): arr = arr.view() arr.flags.writeable = False return arr diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 883d6ea3959ff..1e8d36b184e48 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.sparse import IntIndex +from pandas.compat.numpy import np_version_gt2 import pandas as pd from pandas import ( @@ -478,3 +479,33 @@ def test_zero_sparse_column(): expected = pd.DataFrame({"A": SparseArray([0, 0]), "B": [1, 3]}, index=[0, 2]) tm.assert_frame_equal(result, expected) + + +def test_array_interface(arr_data, arr): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(arr) + tm.assert_numpy_array_equal(result, arr_data) + + # it always gives a copy by default + result_copy1 = np.asarray(arr) + result_copy2 = np.asarray(arr) + assert not np.may_share_memory(result_copy1, result_copy2) + + # or with explicit copy=True + result_copy1 = np.array(arr, copy=True) + result_copy2 = np.array(arr, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + # for sparse arrays, copy=False is never allowed + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + np.array(arr, copy=False) + + # except when there are actually no sparse filled values + arr2 = SparseArray(np.array([1, 2, 3])) + result_nocopy1 = np.array(arr2, copy=False) + result_nocopy2 = np.array(arr2, copy=False) + assert np.may_share_memory(result_nocopy1, result_nocopy2) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index ede81264cb415..0397913b69b26 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1148,9 +1148,17 @@ def test_array_interface(self, arr1d): result = np.asarray(arr, dtype=object) tm.assert_numpy_array_equal(result, expected) + # to int64 gives the underlying representation result = np.asarray(arr, dtype="int64") tm.assert_numpy_array_equal(result, arr.asi8) + result2 = np.asarray(arr, dtype="int64") + assert np.may_share_memory(result, result2) + + result_copy1 = np.array(arr, dtype="int64", copy=True) + result_copy2 = np.array(arr, dtype="int64", copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + # to other dtypes msg = r"float\(\) argument must be a string or a( real)? number, not 'Period'" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index d62599c56e467..e2bf19e2e736c 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -4,6 +4,7 @@ from pandas._config import using_string_dtype from pandas.compat import HAS_PYARROW +from pandas.compat.numpy import np_version_gt2 from pandas.core.dtypes.dtypes import DatetimeTZDtype @@ -297,24 +298,27 @@ def test_array_multiindex_raises(): @pytest.mark.parametrize( - "arr, expected", + "arr, expected, zero_copy", [ - (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64)), - (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object)), + (np.array([1, 2], dtype=np.int64), np.array([1, 2], dtype=np.int64), True), + (pd.Categorical(["a", "b"]), np.array(["a", "b"], dtype=object), False), ( pd.core.arrays.period_array(["2000", "2001"], freq="D"), np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), + False, ), - (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan])), + (pd.array([0, np.nan], dtype="Int64"), np.array([0, np.nan]), False), ( IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), + False, ), - (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), + (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64), False), # tz-naive datetime ( DatetimeArray._from_sequence(np.array(["2000", "2001"], dtype="M8[ns]")), np.array(["2000", "2001"], dtype="M8[ns]"), + True, ), # tz-aware stays tz`-aware ( @@ -329,6 +333,7 @@ def test_array_multiindex_raises(): Timestamp("2000-01-02", tz="US/Central"), ] ), + False, ), # Timedelta ( @@ -336,6 +341,7 @@ def test_array_multiindex_raises(): np.array([0, 3600000000000], dtype="i8").view("m8[ns]") ), np.array([0, 3600000000000], dtype="m8[ns]"), + True, ), # GH#26406 tz is preserved in Categorical[dt64tz] ( @@ -346,10 +352,11 @@ def test_array_multiindex_raises(): Timestamp("2016-01-02", tz="US/Pacific"), ] ), + False, ), ], ) -def test_to_numpy(arr, expected, index_or_series_or_array, request): +def test_to_numpy(arr, expected, zero_copy, index_or_series_or_array): box = index_or_series_or_array with tm.assert_produces_warning(None): @@ -361,6 +368,28 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): result = np.asarray(thing) tm.assert_numpy_array_equal(result, expected) + # Additionally, we check the `copy=` semantics for array/asarray + # (these are implemented by us via `__array__`). + result_cp1 = np.array(thing, copy=True) + result_cp2 = np.array(thing, copy=True) + # When called with `copy=True` NumPy/we should ensure a copy was made + assert not np.may_share_memory(result_cp1, result_cp2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + if not zero_copy: + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + # An error is always acceptable for `copy=False` + np.array(thing, copy=False) + + else: + result_nocopy1 = np.array(thing, copy=False) + result_nocopy2 = np.array(thing, copy=False) + # If copy=False was given, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + @pytest.mark.xfail( using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index 6683c87e2b8fc..79eb64b5a654f 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import is_extension_array_dtype from pandas.core.dtypes.dtypes import ExtensionDtype @@ -71,6 +73,25 @@ def test_array_interface(self, data): expected = construct_1d_object_array_from_listlike(list(data)) tm.assert_numpy_array_equal(result, expected) + def test_array_interface_copy(self, data): + result_copy1 = np.array(data, copy=True) + result_copy2 = np.array(data, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + try: + result_nocopy1 = np.array(data, copy=False) + except ValueError: + # An error is always acceptable for `copy=False` + return + + result_nocopy2 = np.array(data, copy=False) + # If copy=False was given and did not raise, these must share the same data + assert np.may_share_memory(result_nocopy1, result_nocopy2) + def test_is_extension_array_dtype(self, data): assert is_extension_array_dtype(data) assert is_extension_array_dtype(data.dtype) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 5cbd45a99ae5c..b6d72c10712f2 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -147,12 +147,20 @@ def __ne__(self, other): return NotImplemented def __array__(self, dtype=None, copy=None): + if copy is False: + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) + if dtype is None: dtype = object if dtype == object: # on py38 builds it looks like numpy is inferring to a non-1D array return construct_1d_object_array_from_listlike(list(self)) - return np.asarray(self.data, dtype=dtype) + if copy is None: + # Note: branch avoids `copy=None` for NumPy 1.x support + return np.asarray(self.data, dtype=dtype) + return np.asarray(self.data, dtype=dtype, copy=copy) @property def nbytes(self) -> int: diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index 3c2ca045d6f99..58a2dc00f937d 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + import pandas as pd from pandas import ( DataFrame, @@ -15,6 +17,40 @@ def test_to_numpy(idx): tm.assert_numpy_array_equal(result, exp) +def test_array_interface(idx): + # https://github.com/pandas-dev/pandas/pull/60046 + result = np.asarray(idx) + expected = np.empty((6,), dtype=object) + expected[:] = [ + ("foo", "one"), + ("foo", "two"), + ("bar", "one"), + ("baz", "two"), + ("qux", "one"), + ("qux", "two"), + ] + tm.assert_numpy_array_equal(result, expected) + + # it always gives a copy by default, but the values are cached, so results + # are still sharing memory + result_copy1 = np.asarray(idx) + result_copy2 = np.asarray(idx) + assert np.may_share_memory(result_copy1, result_copy2) + + # with explicit copy=True, then it is an actual copy + result_copy1 = np.array(idx, copy=True) + result_copy2 = np.array(idx, copy=True) + assert not np.may_share_memory(result_copy1, result_copy2) + + if not np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + return + + # for MultiIndex, copy=False is never allowed + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + np.array(idx, copy=False) + + def test_to_frame(): tuples = [(1, "one"), (1, "two"), (2, "one"), (2, "two")] From 6a9ef0ca4a2bac53faaeff9ebc0a17cda3535373 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2024 07:59:48 +0100 Subject: [PATCH 02/42] [backport 2.3.x] TST (string dtype): fix invalid comparison error message and update test (#60176) (#60188) (cherry picked from commit 9ec4a9150ef6dbf6da1248b7252141d48203d941) --- pandas/core/arrays/arrow/array.py | 2 +- pandas/tests/frame/test_arithmetic.py | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2587d1d53640d..8cf763265fd34 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -737,7 +737,7 @@ def _cmp_method(self, other, op): try: result[valid] = op(np_array[valid], other) except TypeError: - result = ops.invalid_comparison(np_array, other, op) + result = ops.invalid_comparison(self, other, op) result = pa.array(result, type=pa.bool_()) result = pc.if_else(valid, result, None) else: diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 6b4efc41aeffa..eb85c108ca238 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -13,7 +13,6 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW import pandas.util._test_decorators as td import pandas as pd @@ -1563,9 +1562,6 @@ def test_comparisons(self, simple_frame, float_frame, func): with pytest.raises(ValueError, match=msg): func(simple_frame, simple_frame[:2]) - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" - ) def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne): # GH 11565 df = DataFrame( @@ -1573,7 +1569,12 @@ def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne) ) f = getattr(operator, compare_operators_no_eq_ne) - msg = "'[<>]=?' not supported between instances of 'str' and 'int'" + msg = "|".join( + [ + "'[<>]=?' not supported between instances of 'str' and 'int'", + "Invalid comparison between dtype=str and int", + ] + ) with pytest.raises(TypeError, match=msg): f(df, 0) From 6f4ebe6cda2a1abeb0d26f22c72d64d4bae89273 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 4 Nov 2024 23:00:28 -0800 Subject: [PATCH 03/42] Backport PR #60134 on branch 2.3.x (TST (string dtype): remove xfails in extension tests + fix categorical/string dispatch) (#60178) Backport PR #60134: TST (string dtype): remove xfails in extension tests + fix categorical/string dispatch Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_.py | 1 - pandas/tests/extension/base/ops.py | 26 ---------------------- pandas/tests/extension/test_categorical.py | 2 -- pandas/tests/extension/test_numpy.py | 7 ------ 4 files changed, 36 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 4081e3e1a4779..f5c5cb2a45034 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -915,7 +915,6 @@ def _cmp_method(self, other, op): if not is_array_like(other): other = np.asarray(other) other = other[valid] - other = np.asarray(other) if op.__name__ in ops.ARITHMETIC_BINOPS: result = np.empty_like(self._ndarray, dtype="object") diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 547114ecfddd0..222ff42d45052 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -5,10 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.common import is_string_dtype import pandas as pd @@ -134,12 +130,6 @@ class BaseArithmeticOpsTests(BaseOpsUtil): series_array_exc: type[Exception] | None = TypeError divmod_exc: type[Exception] | None = TypeError - # TODO(infer_string) need to remove import of pyarrow - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # series & scalar if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): @@ -149,11 +139,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): ser = pd.Series(data) self.check_opname(ser, op_name, ser.iloc[0]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar if all_arithmetic_operators == "__rmod__" and is_string_dtype(data.dtype): @@ -163,22 +148,12 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): df = pd.DataFrame({"A": data}) self.check_opname(df, op_name, data[0]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op_name = all_arithmetic_operators ser = pd.Series(data) self.check_opname(ser, op_name, pd.Series([ser.iloc[0]] * len(ser))) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_divmod(self, data): ser = pd.Series(data) self._check_divmod_op(ser, divmod, 1) @@ -194,7 +169,6 @@ def test_divmod_series_array(self, data, data_for_twos): other = pd.Series(other) self._check_divmod_op(other, ops.rdivmod, ser) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_add_series_with_extension_array(self, data): # Check adding an ExtensionArray to a Series of the same dtype matches # the behavior of adding the arrays directly and then wrapping in a diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index fd291908a4f96..135ea67c924d0 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -148,7 +148,6 @@ def test_map(self, data, na_action): result = data.map(lambda x: x, na_action=na_action) tm.assert_extension_array_equal(result, data) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): # frame & scalar op_name = all_arithmetic_operators @@ -160,7 +159,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): ) super().test_arith_frame_with_scalar(data, op_name) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): op_name = all_arithmetic_operators if op_name == "__rmod__": diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index b79b0a98efde4..e38144f4c615b 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -18,8 +18,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.dtypes import NumpyEADtype import pandas as pd @@ -244,7 +242,6 @@ def test_insert_invalid(self, data, invalid_scalar): frame_scalar_exc = None series_array_exc = None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_divmod(self, data): divmod_exc = None if data.dtype.kind == "O": @@ -252,7 +249,6 @@ def test_divmod(self, data): self.divmod_exc = divmod_exc super().test_divmod(data) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_divmod_series_array(self, data): ser = pd.Series(data) exc = None @@ -261,7 +257,6 @@ def test_divmod_series_array(self, data): self.divmod_exc = exc self._check_divmod_op(ser, divmod, data) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators series_scalar_exc = None @@ -275,7 +270,6 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request) self.series_scalar_exc = series_scalar_exc super().test_arith_series_with_scalar(data, all_arithmetic_operators) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_series_with_array(self, data, all_arithmetic_operators): opname = all_arithmetic_operators series_array_exc = None @@ -284,7 +278,6 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): self.series_array_exc = series_array_exc super().test_arith_series_with_array(data, all_arithmetic_operators) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): opname = all_arithmetic_operators frame_scalar_exc = None From a53604db7364ec467915b1e08d7e3bd6a0aa270e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2024 19:51:37 +0100 Subject: [PATCH 04/42] [backport 2.3.x] TST (string dtype): avoid hardcoded object dtype for columns in datetime_frame fixture (#60192) (#60198) TST (string dtype): avoid hardcoded object dtype for columns in datetime_frame fixture (#60192) (cherry picked from commit 34387bddffacb158a60a249b08411a8a1fe44455) --- pandas/tests/frame/conftest.py | 2 +- pandas/tests/frame/indexing/test_indexing.py | 1 - pandas/tests/frame/methods/test_to_csv.py | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index e07024b2e2a09..45b5d9b4aa698 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -18,7 +18,7 @@ def datetime_frame() -> DataFrame: """ return DataFrame( np.random.default_rng(2).standard_normal((100, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100, freq="B"), ) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 04dba325f060f..4094f14c50608 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -183,7 +183,6 @@ def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_fram if bif[c].dtype != bifw[c].dtype: assert bif[c].dtype == df[c].dtype - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_getitem_boolean_casting(self, datetime_frame): # don't upcast if we don't need to df = datetime_frame.copy() diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 4a65c3929944b..aca3bb5bccd7c 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -35,7 +35,6 @@ def read_csv(self, path, **kwargs): return read_csv(path, **params) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_from_csv1(self, float_frame, datetime_frame): with tm.ensure_clean("__tmp_to_csv_from_csv1__") as path: float_frame.iloc[:5, float_frame.columns.get_loc("A")] = np.nan @@ -533,7 +532,6 @@ def test_to_csv_headers(self): assert return_value is None tm.assert_frame_equal(to_df, recons) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_to_csv_multiindex(self, float_frame, datetime_frame): frame = float_frame old_index = frame.index From 92a0c81c17466a24f22f25dd01ab6685bf857671 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Nov 2024 12:00:21 -0800 Subject: [PATCH 05/42] Backport PR #60197 on branch 2.3.x (TYP/COMPAT: don't use Literal for Series.ndim to avoid tab completion bug in IPython) (#60200) Backport PR #60197: TYP/COMPAT: don't use Literal for Series.ndim to avoid tab completion bug in IPython Co-authored-by: Joris Van den Bossche --- pandas/core/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index a67003895d288..af8f80db6a347 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -361,8 +361,11 @@ def __len__(self) -> int: # We need this defined here for mypy raise AbstractMethodError(self) + # Temporarily avoid using `-> Literal[1]:` because of an IPython (jedi) bug + # https://github.com/ipython/ipython/issues/14412 + # https://github.com/davidhalter/jedi/issues/1990 @property - def ndim(self) -> Literal[1]: + def ndim(self) -> int: """ Number of dimensions of the underlying data, by definition 1. From 70e8a3ba187b49651910538c4fe9290ffbec96c3 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Nov 2024 12:00:58 -0800 Subject: [PATCH 06/42] Backport PR #60196 on branch 2.3.x (BUG: fix inspect usage when pyarrow or jinja2 is not installed) (#60201) Backport PR #60196: BUG: fix inspect usage when pyarrow or jinja2 is not installed Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.3.0.rst | 3 ++- pandas/core/arrays/arrow/accessors.py | 2 +- pandas/core/frame.py | 5 +++++ pandas/tests/frame/test_api.py | 1 - pandas/tests/series/test_api.py | 8 -------- 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 405c8fdc30961..0751554d87dc8 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -174,7 +174,8 @@ Styler Other ^^^^^ -- +- Fixed usage of ``inspect`` when the optional dependencies ``pyarrow`` or ``jinja2`` + are not installed (:issue:`60196`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index 124f8fb6ad8bc..65f0784eaa3fd 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -46,7 +46,7 @@ def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: def _validate(self, data): dtype = data.dtype - if not isinstance(dtype, ArrowDtype): + if pa_version_under10p1 or not isinstance(dtype, ArrowDtype): # Raise AttributeError so that inspect can handle non-struct Series. raise AttributeError(self._validation_msg.format(dtype=dtype)) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1403fc2ceaaf8..ef48090f02c3f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1439,6 +1439,11 @@ def style(self) -> Styler: Please see `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. """ + # Raise AttributeError so that inspect works even if jinja2 is not installed. + has_jinja2 = import_optional_dependency("jinja2", errors="ignore") + if not has_jinja2: + raise AttributeError("The '.style' accessor requires jinja2") + from pandas.io.formats.style import Styler return Styler(self) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index f6c7bd1f49b27..6c6944f806a2a 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -387,7 +387,6 @@ def test_constructor_expanddim(self): def test_inspect_getmembers(self): # GH38740 - pytest.importorskip("jinja2") df = DataFrame() msg = "DataFrame._data is deprecated" with tm.assert_produces_warning( diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index e53cd753a4192..7e10a337cdd3a 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -4,10 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd from pandas import ( DataFrame, @@ -171,12 +167,8 @@ def test_attrs(self): result = s + 1 assert result.attrs == {"version": 1} - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_inspect_getmembers(self): # GH38782 - pytest.importorskip("jinja2") ser = Series(dtype=object) msg = "Series._data is deprecated" with tm.assert_produces_warning( From b9f1bc6a773aee3f3bde6a226c7c43ccc0f04de4 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Nov 2024 12:29:32 -0800 Subject: [PATCH 07/42] Backport PR #60195 on branch 2.3.x (BUG (string dtype): fix where() for string dtype with python storage) (#60202) Backport PR #60195: BUG (string dtype): fix where() for string dtype with python storage Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_.py | 6 ++++++ pandas/tests/frame/indexing/test_where.py | 18 ++++++------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f5c5cb2a45034..92c274453b9d1 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -757,6 +757,12 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) + def _where(self, mask: npt.NDArray[np.bool_], value) -> Self: + # the super() method NDArrayBackedExtensionArray._where uses + # np.putmask which doesn't properly handle None/pd.NA, so using the + # base class implementation that uses __setitem__ + return ExtensionArray._where(self, mask, value) + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: if isinstance(values, BaseStringArray) or ( isinstance(values, ExtensionArray) and is_string_dtype(values.dtype) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index f0d868a4cb583..40506c90f3295 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -6,8 +6,6 @@ from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW - from pandas.core.dtypes.common import is_scalar import pandas as pd @@ -985,9 +983,6 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): obj.mask(mask, null) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) @given(data=OPTIONAL_ONE_OF_ALL) def test_where_inplace_casting(data): # GH 22051 @@ -1084,19 +1079,18 @@ def test_where_producing_ea_cond_for_np_dtype(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)", strict=False -) @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) -def test_where_int_overflow(replacement, using_infer_string, request): +def test_where_int_overflow(replacement, using_infer_string): # GH 31687 df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) if using_infer_string and replacement not in (None, "snake"): - request.node.add_marker( - pytest.mark.xfail(reason="Can't set non-string into string column") - ) + with pytest.raises( + TypeError, match="Cannot set non-string value|Scalar must be NA or str" + ): + df.where(pd.notnull(df), replacement) + return result = df.where(pd.notnull(df), replacement) expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) From a82cf8e3626d15f06cbef4463ac746750d2229c6 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 5 Nov 2024 19:38:14 -0800 Subject: [PATCH 08/42] Backport PR #60206 on branch 2.3.x (STY: Fix lint error in test_where.py) (#60208) Backport PR #60206: STY: Fix lint error in test_where.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/frame/indexing/test_where.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 40506c90f3295..0517485888b38 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import is_scalar import pandas as pd From a83184f826d905357ea4feebfc26f1e82fbcd439 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 6 Nov 2024 19:36:22 +0100 Subject: [PATCH 09/42] [backport 2.3.x] TST (string dtype): update all tests in tests/frame/indexing (#60193) (#60212) * TST (string dtype): update all tests in tests/frame/indexing (#60193) (cherry picked from commit bec2dbca274a4f983790d069279a4b3aec184f49) * update for 2.3.x --- pandas/tests/frame/indexing/test_coercion.py | 24 +++++++++++++----- pandas/tests/frame/indexing/test_indexing.py | 18 +++++--------- pandas/tests/frame/indexing/test_insert.py | 6 ++--- pandas/tests/frame/indexing/test_setitem.py | 26 +++++++++----------- pandas/tests/frame/indexing/test_where.py | 18 +++++++++----- pandas/tests/frame/indexing/test_xs.py | 5 +--- 6 files changed, 51 insertions(+), 46 deletions(-) diff --git a/pandas/tests/frame/indexing/test_coercion.py b/pandas/tests/frame/indexing/test_coercion.py index 9d20821ae8bc6..f7f7b2c7c872a 100644 --- a/pandas/tests/frame/indexing/test_coercion.py +++ b/pandas/tests/frame/indexing/test_coercion.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -99,28 +97,42 @@ def test_6942(indexer_al): assert df.iloc[0, 0] == t2 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_26395(indexer_al): # .at case fixed by GH#45121 (best guess) df = DataFrame(index=["A", "B", "C"]) df["D"] = 0 indexer_al(df)["C", "D"] = 2 - expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64) + expected = DataFrame( + {"D": [0, 0, 2]}, + index=["A", "B", "C"], + columns=pd.Index(["D"], dtype=object), + dtype=np.int64, + ) tm.assert_frame_equal(df, expected) with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype" ): indexer_al(df)["C", "D"] = 44.5 - expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64) + expected = DataFrame( + {"D": [0, 0, 44.5]}, + index=["A", "B", "C"], + columns=pd.Index(["D"], dtype=object), + dtype=np.float64, + ) tm.assert_frame_equal(df, expected) with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype" ): indexer_al(df)["C", "D"] = "hello" - expected = DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object) + expected = DataFrame( + {"D": [0, 0, "hello"]}, + index=["A", "B", "C"], + columns=pd.Index(["D"], dtype=object), + dtype=object, + ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 4094f14c50608..1721fe2c0eb8b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -12,7 +12,6 @@ from pandas._config import using_string_dtype from pandas._libs import iNaT -from pandas.compat import HAS_PYARROW from pandas.errors import ( InvalidIndexError, PerformanceWarning, @@ -518,18 +517,17 @@ def test_setitem_ambig(self, using_infer_string): else: assert dm[2].dtype == np.object_ - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - def test_setitem_None(self, float_frame, using_infer_string): + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + def test_setitem_None(self, float_frame): # GH #766 float_frame[None] = float_frame["A"] - key = None if not using_infer_string else np.nan tm.assert_series_equal( float_frame.iloc[:, -1], float_frame["A"], check_names=False ) tm.assert_series_equal( - float_frame.loc[:, key], float_frame["A"], check_names=False + float_frame.loc[:, None], float_frame["A"], check_names=False ) - tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False) + tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False) def test_loc_setitem_boolean_mask_allfalse(self): # GH 9596 @@ -1191,7 +1189,6 @@ def test_setitem_with_unaligned_tz_aware_datetime_column(self): df.loc[[0, 1, 2], "dates"] = column[[1, 0, 2]] tm.assert_series_equal(df["dates"], column) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_loc_setitem_datetimelike_with_inference(self): # GH 7592 # assignment of timedeltas with NaT @@ -1210,13 +1207,10 @@ def test_loc_setitem_datetimelike_with_inference(self): result = df.dtypes expected = Series( [np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2, - index=list("ABCDEFGH"), + index=Index(list("ABCDEFGH"), dtype=object), ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)" - ) def test_getitem_boolean_indexing_mixed(self): df = DataFrame( { @@ -1258,7 +1252,7 @@ def test_getitem_boolean_indexing_mixed(self): tm.assert_frame_equal(df2, expected) df["foo"] = "test" - msg = "not supported between instances|unorderable types" + msg = "not supported between instances|unorderable types|Invalid comparison" with pytest.raises(TypeError, match=msg): df[df > 0.3] = 1 diff --git a/pandas/tests/frame/indexing/test_insert.py b/pandas/tests/frame/indexing/test_insert.py index 82b75459f08d0..4cf297b4c037d 100644 --- a/pandas/tests/frame/indexing/test_insert.py +++ b/pandas/tests/frame/indexing/test_insert.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import PerformanceWarning from pandas import ( @@ -62,7 +60,6 @@ def test_insert_column_bug_4032(self): expected = DataFrame([[1.3, 1, 1.1], [2.3, 2, 2.2]], columns=["c", "a", "b"]) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_insert_with_columns_dups(self): # GH#14291 df = DataFrame() @@ -70,7 +67,8 @@ def test_insert_with_columns_dups(self): df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True) df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True) exp = DataFrame( - [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"] + [["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], + columns=Index(["A", "A", "A"], dtype=object), ) tm.assert_frame_equal(df, exp) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index bce3cb5dacabe..190218a82d231 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td from pandas.core.dtypes.base import _registry as ea_registry @@ -148,13 +146,16 @@ def test_setitem_different_dtype(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_empty_columns(self): # GH 13522 df = DataFrame(index=["A", "B", "C"]) df["X"] = df.index df["X"] = ["x", "y", "z"] - exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"]) + exp = DataFrame( + data={"X": ["x", "y", "z"]}, + index=["A", "B", "C"], + columns=Index(["X"], dtype=object), + ) tm.assert_frame_equal(df, exp) def test_setitem_dt64_index_empty_columns(self): @@ -164,14 +165,15 @@ def test_setitem_dt64_index_empty_columns(self): df["A"] = rng assert df["A"].dtype == np.dtype("M8[ns]") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_timestamp_empty_columns(self): # GH#19843 df = DataFrame(index=range(3)) df["now"] = Timestamp("20130101", tz="UTC").as_unit("ns") expected = DataFrame( - [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"] + [[Timestamp("20130101", tz="UTC")]] * 3, + index=range(3), + columns=Index(["now"], dtype=object), ) tm.assert_frame_equal(df, expected) @@ -204,14 +206,13 @@ def test_setitem_with_unaligned_sparse_value(self): expected = Series(SparseArray([1, 0, 0]), name="new_column") tm.assert_series_equal(df["new_column"], expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_period_preserves_dtype(self): # GH: 26861 data = [Period("2003-12", "D")] result = DataFrame([]) result["a"] = data - expected = DataFrame({"a": data}) + expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object)) tm.assert_frame_equal(result, expected) @@ -677,11 +678,10 @@ def test_setitem_iloc_two_dimensional_generator(self): expected = DataFrame({"a": [1, 2, 3], "b": [4, 1, 1]}) tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_dtypes_bytes_type_to_object(self): # GH 20734 index = Series(name="id", dtype="S24") - df = DataFrame(index=index) + df = DataFrame(index=index, columns=Index([], dtype="str")) df["a"] = Series(name="a", index=index, dtype=np.uint32) df["b"] = Series(name="b", index=index, dtype="S64") df["c"] = Series(name="c", index=index, dtype="S64") @@ -712,7 +712,6 @@ def test_setitem_ea_dtype_rhs_series(self): # TODO(ArrayManager) set column with 2d column array, see #44788 @td.skip_array_manager_not_yet_implemented - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_npmatrix_2d(self): # GH#42376 # for use-case df["x"] = sparse.random((10, 10)).mean(axis=1) @@ -721,7 +720,7 @@ def test_setitem_npmatrix_2d(self): ) a = np.ones((10, 1)) - df = DataFrame(index=np.arange(10)) + df = DataFrame(index=np.arange(10), columns=Index([], dtype="str")) df["np-array"] = a # Instantiation of `np.matrix` gives PendingDeprecationWarning @@ -936,12 +935,11 @@ def test_setitem_with_expansion_categorical_dtype(self): ser.name = "E" tm.assert_series_equal(result2.sort_index(), ser.sort_index()) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_scalars_no_index(self): # GH#16823 / GH#17894 df = DataFrame() df["foo"] = 1 - expected = DataFrame(columns=["foo"]).astype(np.int64) + expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64) tm.assert_frame_equal(df, expected) def test_setitem_newcol_tuple_key(self, float_frame): diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 0517485888b38..861147f5b58dd 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -46,7 +46,6 @@ def is_ok(s): class TestDataFrameIndexingWhere: - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_get(self, where_frame, float_string_frame): def _check_get(df, cond, check_dtypes=True): other1 = _safe_add(df) @@ -64,7 +63,10 @@ def _check_get(df, cond, check_dtypes=True): # check getting df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -98,7 +100,6 @@ def test_where_upcasting(self): tm.assert_series_equal(result, expected) @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_alignment(self, where_frame, float_string_frame): # aligning def _check_align(df, cond, other, check_dtypes=True): @@ -130,7 +131,10 @@ def _check_align(df, cond, other, check_dtypes=True): df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return @@ -174,7 +178,6 @@ def test_where_invalid(self): df.mask(0) @pytest.mark.filterwarnings("ignore:Downcasting object dtype arrays:FutureWarning") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_where_set(self, where_frame, float_string_frame, mixed_int_frame): # where inplace @@ -196,7 +199,10 @@ def _check_set(df, cond, check_dtypes=True): df = where_frame if df is float_string_frame: - msg = "'>' not supported between instances of 'str' and 'int'" + msg = ( + "'>' not supported between instances of 'str' and 'int'" + "|Invalid comparison" + ) with pytest.raises(TypeError, match=msg): df > 0 return diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index 4ca435fa5acc5..2aa27d1d6a548 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SettingWithCopyError from pandas import ( @@ -79,10 +77,9 @@ def test_xs( else: assert (expected == 5).all() - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_xs_corner(self): # pathological mixed-type reordering case - df = DataFrame(index=[0]) + df = DataFrame(index=[0], columns=Index([], dtype="str")) df["A"] = 1.0 df["B"] = "foo" df["C"] = 2.0 From 678266cade4e4f7baa36bd327aa3ca9c8bea061a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:04:42 -0800 Subject: [PATCH 10/42] Backport PR #60215 on branch 2.3.x (BUG (string dtype): fix escaping of newline/tab characters in the repr) (#60220) Backport PR #60215: BUG (string dtype): fix escaping of newline/tab characters in the repr Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_.py | 11 +++++++++++ pandas/tests/frame/test_repr.py | 3 --- pandas/tests/series/test_formats.py | 14 +++++++------- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 92c274453b9d1..4801b70a27dd4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial import operator from typing import ( TYPE_CHECKING, @@ -64,6 +65,8 @@ from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna +from pandas.io.formats import printing + if TYPE_CHECKING: import pyarrow @@ -387,6 +390,14 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self: raise ValueError return cls._from_sequence(scalars, dtype=dtype) + def _formatter(self, boxed: bool = False): + formatter = partial( + printing.pprint_thing, + escape_chars=("\t", "\r", "\n"), + quote_strings=not boxed, + ) + return formatter + def _str_map( self, f, diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index f7700af6beea0..6184e791cab5d 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( NA, Categorical, @@ -176,7 +174,6 @@ def test_repr_mixed_big(self): repr(biggie) - @pytest.mark.xfail(using_string_dtype(), reason="/r in") def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py index 77e77a9337d63..4f93e7424bfd5 100644 --- a/pandas/tests/series/test_formats.py +++ b/pandas/tests/series/test_formats.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -144,11 +142,13 @@ def test_tidy_repr_name_0(self, arg): rep_str = repr(ser) assert "Name: 0" in rep_str - @pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string): investigate failure" - ) - def test_newline(self): - ser = Series(["a\n\r\tb"], name="a\n\r\td", index=["a\n\r\tf"]) + def test_newline(self, any_string_dtype): + ser = Series( + ["a\n\r\tb"], + name="a\n\r\td", + index=Index(["a\n\r\tf"], dtype=any_string_dtype), + dtype=any_string_dtype, + ) assert "\t" not in repr(ser) assert "\r" not in repr(ser) assert "a\n" not in repr(ser) From b5d061542bd651ae1f59c3dcd49452cd7517c258 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:05:50 -0800 Subject: [PATCH 11/42] Backport PR #60222 on branch 2.3.x (ENH (string dtype): accept string_view in addition to string/large_string for ArrowStringArray input) (#60223) Backport PR #60222: ENH (string dtype): accept string_view in addition to string/large_string for ArrowStringArray input Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_arrow.py | 7 +++++++ pandas/tests/arrays/string_/test_string_arrow.py | 14 ++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e7dd4f9dc5718..b6e98d8fdc7e5 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -18,6 +18,7 @@ from pandas.compat import ( pa_version_under10p1, pa_version_under13p0, + pa_version_under16p0, ) from pandas.util._exceptions import find_stack_level @@ -65,6 +66,10 @@ def _chk_pyarrow_available() -> None: raise ImportError(msg) +def _is_string_view(typ): + return not pa_version_under16p0 and pa.types.is_string_view(typ) + + # TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from # ObjectStringArrayMixin because we want to have the object-dtype based methods as # fallback for the ones that pyarrow doesn't yet support @@ -122,11 +127,13 @@ def __init__(self, values) -> None: _chk_pyarrow_available() if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( pa.types.is_string(values.type) + or _is_string_view(values.type) or ( pa.types.is_dictionary(values.type) and ( pa.types.is_string(values.type.value_type) or pa.types.is_large_string(values.type.value_type) + or _is_string_view(values.type.value_type) ) ) ): diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 2f3840e92b62a..aa87f5fc0f49a 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -99,6 +99,20 @@ def test_constructor_valid_string_type_value_dictionary(string_type, chunked): assert pa.types.is_large_string(arr._pa_array.type) +@pytest.mark.parametrize("chunked", [True, False]) +def test_constructor_valid_string_view(chunked): + # requires pyarrow>=18 for casting string_view to string + pa = pytest.importorskip("pyarrow", minversion="18") + + arr = pa.array(["1", "2", "3"], pa.string_view()) + if chunked: + arr = pa.chunked_array(arr) + + arr = ArrowStringArray(arr) + # dictionary type get converted to dense large string array + assert pa.types.is_large_string(arr._pa_array.type) + + def test_constructor_from_list(): # GH#27673 pytest.importorskip("pyarrow") From f8c7acc1c01545498b54c98683288377e5210fc3 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 7 Nov 2024 06:31:04 -0800 Subject: [PATCH 12/42] Backport PR #60191 on branch 2.3.x (TST: add extra test case for np.array(obj, copy=False) read-only behaviour) (#60226) Backport PR #60191: TST: add extra test case for np.array(obj, copy=False) read-only behaviour Co-authored-by: Joris Van den Bossche --- pandas/core/generic.py | 6 +++++ pandas/tests/copy_view/test_array.py | 37 ++++++++++++++++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bef2d1e1194f9..e55a54112ee72 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2149,6 +2149,12 @@ def empty(self) -> bool_t: def __array__( self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None ) -> np.ndarray: + if copy is False and not self._mgr.is_single_block and not self.empty: + # check this manually, otherwise ._values will already return a copy + # and np.array(values, copy=False) will not raise an error + raise ValueError( + "Unable to avoid copy while creating an array as requested." + ) values = self._values if copy is None: # Note: branch avoids `copy=None` for NumPy 1.x support diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py index 9a3f83e0293f5..06d9424450011 100644 --- a/pandas/tests/copy_view/test_array.py +++ b/pandas/tests/copy_view/test_array.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_gt2 + from pandas import ( DataFrame, Series, @@ -15,8 +17,12 @@ @pytest.mark.parametrize( "method", - [lambda ser: ser.values, lambda ser: np.asarray(ser)], - ids=["values", "asarray"], + [ + lambda ser: ser.values, + lambda ser: np.asarray(ser), + lambda ser: np.array(ser, copy=False), + ], + ids=["values", "asarray", "array"], ) def test_series_values(using_copy_on_write, method): ser = Series([1, 2, 3], name="name") @@ -45,8 +51,12 @@ def test_series_values(using_copy_on_write, method): @pytest.mark.parametrize( "method", - [lambda df: df.values, lambda df: np.asarray(df)], - ids=["values", "asarray"], + [ + lambda df: df.values, + lambda df: np.asarray(df), + lambda ser: np.array(ser, copy=False), + ], + ids=["values", "asarray", "array"], ) def test_dataframe_values(using_copy_on_write, using_array_manager, method): df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) @@ -100,7 +110,7 @@ def test_series_to_numpy(using_copy_on_write): arr[0] = 0 assert ser.iloc[0] == 0 - # specify copy=False gives a writeable array + # specify copy=True gives a writeable array ser = Series([1, 2, 3], name="name") arr = ser.to_numpy(copy=True) assert not np.shares_memory(arr, get_array(ser, "name")) @@ -174,6 +184,23 @@ def test_dataframe_multiple_numpy_dtypes(): assert not np.shares_memory(arr, get_array(df, "a")) assert arr.flags.writeable is True + if np_version_gt2: + # copy=False semantics are only supported in NumPy>=2. + + with pytest.raises(ValueError, match="Unable to avoid copy while creating"): + arr = np.array(df, copy=False) + + arr = np.array(df, copy=True) + assert arr.flags.writeable is True + + +def test_dataframe_single_block_copy_true(): + # the copy=False/None cases are tested above in test_dataframe_values + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + arr = np.array(df, copy=True) + assert not np.shares_memory(arr, get_array(df, "a")) + assert arr.flags.writeable is True + def test_values_is_ea(using_copy_on_write): df = DataFrame({"a": date_range("2012-01-01", periods=3)}) From 168e3533d916d7a3505cc702687d9744e5488fe4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 7 Nov 2024 15:31:36 +0100 Subject: [PATCH 13/42] [backport 2.3.x] TST (string dtype): avoid explicit object dtype Index in fixture data (#60217) (#60225) (cherry picked from commit 4b04a2f0043ad04b5546750a8947dfeef68cdb75) --- pandas/_testing/__init__.py | 2 ++ pandas/conftest.py | 10 +++++----- pandas/tests/frame/methods/test_align.py | 3 --- pandas/tests/frame/test_reductions.py | 1 - pandas/tests/series/indexing/test_setitem.py | 1 - pandas/tests/series/methods/test_reindex.py | 2 +- pandas/tests/series/methods/test_to_csv.py | 3 --- 7 files changed, 8 insertions(+), 14 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 3aa7c64831efe..2d066b581f1c6 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -519,6 +519,8 @@ def shares_memory(left, right) -> bool: if isinstance(left, MultiIndex): return shares_memory(left._codes, right) if isinstance(left, (Index, Series)): + if isinstance(right, (Index, Series)): + return shares_memory(left._values, right._values) return shares_memory(left._values, right) if isinstance(left, NDArrayBackedExtensionArray): diff --git a/pandas/conftest.py b/pandas/conftest.py index b0818b11ab037..1567708d04b20 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -548,7 +548,7 @@ def multiindex_year_month_day_dataframe_random_data(): """ tdf = DataFrame( np.random.default_rng(2).standard_normal((100, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=100, freq="B"), ) ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() @@ -743,7 +743,7 @@ def string_series() -> Series: """ return Series( np.arange(30, dtype=np.float64) * 1.1, - index=Index([f"i_{i}" for i in range(30)], dtype=object), + index=Index([f"i_{i}" for i in range(30)]), name="series", ) @@ -754,7 +754,7 @@ def object_series() -> Series: Fixture for Series of dtype object with Index of unique strings """ data = [f"foo_{i}" for i in range(30)] - index = Index([f"bar_{i}" for i in range(30)], dtype=object) + index = Index([f"bar_{i}" for i in range(30)]) return Series(data, index=index, name="objects", dtype=object) @@ -846,8 +846,8 @@ def int_frame() -> DataFrame: """ return DataFrame( np.ones((30, 4), dtype=np.int64), - index=Index([f"foo_{i}" for i in range(30)], dtype=object), - columns=Index(list("ABCD"), dtype=object), + index=Index([f"foo_{i}" for i in range(30)]), + columns=Index(list("ABCD")), ) diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 15a97a99caa5a..5a9c47866dae8 100644 --- a/pandas/tests/frame/methods/test_align.py +++ b/pandas/tests/frame/methods/test_align.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -157,7 +155,6 @@ def test_align_series_condition(self): expected = DataFrame({"a": [0, 2, 0], "b": [0, 5, 0]}) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_align_int(self, int_frame): # test other non-float types other = DataFrame(index=range(5), columns=["A", "B", "C"]) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 824d53c8d5d13..bee95e8295746 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1058,7 +1058,6 @@ def test_sum_bools(self): # ---------------------------------------------------------------------- # Index of max / min - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmin(self, float_frame, int_frame, skipna, axis): diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 0b9b1fc080cfe..c28d3c9fedbd5 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -574,7 +574,6 @@ def test_setitem_with_expansion_type_promotion(self): expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) tm.assert_series_equal(ser, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_not_contained(self, string_series): # set item that's not contained ser = string_series.copy() diff --git a/pandas/tests/series/methods/test_reindex.py b/pandas/tests/series/methods/test_reindex.py index 0923a2d42ce10..ecfbecf12bdd3 100644 --- a/pandas/tests/series/methods/test_reindex.py +++ b/pandas/tests/series/methods/test_reindex.py @@ -25,7 +25,7 @@ def test_reindex(datetime_series, string_series): identity = string_series.reindex(string_series.index) - assert np.may_share_memory(string_series.index, identity.index) + assert tm.shares_memory(string_series.index, identity.index) assert identity.index.is_(string_series.index) assert identity.index.identical(string_series.index) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index efb249fdedf3d..ba75c7786ef72 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import Series import pandas._testing as tm @@ -26,7 +24,6 @@ def read_csv(self, path, **kwargs): return out - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_from_csv(self, datetime_series, string_series): # freq doesn't round-trip datetime_series.index = datetime_series.index._with_freq(None) From 64f9907f0630cf7c18e88abad99f36a4fbf7bc3d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2024 08:33:24 +0100 Subject: [PATCH 14/42] [backport 2.3.x] BUG (string dtype): fix qualifier in memory usage info (#60221) (#60231) (cherry picked from commit 0937c95777d44462d67fd5b299d4563984e78332) --- pandas/core/indexes/base.py | 4 ++- pandas/core/indexes/multi.py | 9 ++++--- pandas/tests/frame/methods/test_info.py | 34 +++++++++++++++++------- pandas/tests/series/methods/test_info.py | 30 +++++++++++++-------- 4 files changed, 52 insertions(+), 25 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fc53e044a6544..5da327a82c02b 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5326,7 +5326,9 @@ def _is_memory_usage_qualified(self) -> bool: """ Return a boolean if we need a qualified .info display. """ - return is_object_dtype(self.dtype) + return is_object_dtype(self.dtype) or ( + is_string_dtype(self.dtype) and self.dtype.storage == "python" # type: ignore[union-attr] + ) def __contains__(self, key: Any) -> bool: """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9e002ccd3a787..7cb28214c7289 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -65,6 +65,7 @@ is_list_like, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -1344,10 +1345,12 @@ def dtype(self) -> np.dtype: def _is_memory_usage_qualified(self) -> bool: """return a boolean if we need a qualified .info display""" - def f(level) -> bool: - return "mixed" in level or "string" in level or "unicode" in level + def f(dtype) -> bool: + return is_object_dtype(dtype) or ( + is_string_dtype(dtype) and dtype.storage == "python" + ) - return any(f(level) for level in self._inferred_type_levels) + return any(f(level.dtype) for level in self.levels) # Cannot determine type of "memory_usage" @doc(Index.memory_usage) # type: ignore[has-type] diff --git a/pandas/tests/frame/methods/test_info.py b/pandas/tests/frame/methods/test_info.py index 475632667a87a..f0ae00fa6febb 100644 --- a/pandas/tests/frame/methods/test_info.py +++ b/pandas/tests/frame/methods/test_info.py @@ -10,6 +10,7 @@ from pandas._config import using_string_dtype from pandas.compat import ( + HAS_PYARROW, IS64, PYPY, ) @@ -435,18 +436,25 @@ def test_usage_via_getsizeof(): assert abs(diff) < 100 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_info_memory_usage_qualified(): +def test_info_memory_usage_qualified(using_infer_string): buf = StringIO() df = DataFrame(1, columns=list("ab"), index=[1, 2, 3]) df.info(buf=buf) assert "+" not in buf.getvalue() buf = StringIO() - df = DataFrame(1, columns=list("ab"), index=list("ABC")) + df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype=object)) df.info(buf=buf) assert "+" in buf.getvalue() + buf = StringIO() + df = DataFrame(1, columns=list("ab"), index=Index(list("ABC"), dtype="str")) + df.info(buf=buf) + if using_infer_string and HAS_PYARROW: + assert "+" not in buf.getvalue() + else: + assert "+" in buf.getvalue() + buf = StringIO() df = DataFrame( 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) @@ -459,7 +467,10 @@ def test_info_memory_usage_qualified(): 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]]) ) df.info(buf=buf) - assert "+" in buf.getvalue() + if using_infer_string and HAS_PYARROW: + assert "+" not in buf.getvalue() + else: + assert "+" in buf.getvalue() def test_info_memory_usage_bug_on_multiindex(): @@ -496,16 +507,15 @@ def test_info_categorical(): df.info(buf=buf) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.xfail(not IS64, reason="GH 36579: fail on 32-bit system") -def test_info_int_columns(): +def test_info_int_columns(using_infer_string): # GH#37245 df = DataFrame({1: [1, 2], 2: [2, 3]}, index=["A", "B"]) buf = StringIO() df.info(show_counts=True, buf=buf) result = buf.getvalue() expected = textwrap.dedent( - """\ + f"""\ Index: 2 entries, A to B Data columns (total 2 columns): @@ -514,19 +524,23 @@ def test_info_int_columns(): 0 1 2 non-null int64 1 2 2 non-null int64 dtypes: int64(2) - memory usage: 48.0+ bytes + memory usage: {'50.0' if using_infer_string and HAS_PYARROW else '48.0+'} bytes """ ) assert result == expected @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_memory_usage_empty_no_warning(): +def test_memory_usage_empty_no_warning(using_infer_string): # GH#50066 df = DataFrame(index=["a", "b"]) with tm.assert_produces_warning(None): result = df.memory_usage() - expected = Series(16 if IS64 else 8, index=["Index"]) + if using_infer_string and HAS_PYARROW: + value = 18 + else: + value = 16 if IS64 else 8 + expected = Series(value, index=["Index"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_info.py b/pandas/tests/series/methods/test_info.py index 8fac40fe5fb25..7defad8a463f3 100644 --- a/pandas/tests/series/methods/test_info.py +++ b/pandas/tests/series/methods/test_info.py @@ -7,10 +7,14 @@ from pandas._config import using_string_dtype -from pandas.compat import PYPY +from pandas.compat import ( + HAS_PYARROW, + PYPY, +) from pandas import ( CategoricalIndex, + Index, MultiIndex, Series, date_range, @@ -41,7 +45,9 @@ def test_info_categorical(): @pytest.mark.parametrize("verbose", [True, False]) -def test_info_series(lexsorted_two_level_string_multiindex, verbose): +def test_info_series( + lexsorted_two_level_string_multiindex, verbose, using_infer_string +): index = lexsorted_two_level_string_multiindex ser = Series(range(len(index)), index=index, name="sth") buf = StringIO() @@ -63,10 +69,11 @@ def test_info_series(lexsorted_two_level_string_multiindex, verbose): 10 non-null int64 """ ) + qualifier = "" if using_infer_string and HAS_PYARROW else "+" expected += textwrap.dedent( f"""\ dtypes: int64(1) - memory usage: {ser.memory_usage()}.0+ bytes + memory usage: {ser.memory_usage()}.0{qualifier} bytes """ ) assert result == expected @@ -142,20 +149,21 @@ def test_info_memory_usage_deep_pypy(): assert s_object.memory_usage(deep=True) == s_object.memory_usage() -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( - "series, plus", + "index, plus", [ - (Series(1, index=[1, 2, 3]), False), - (Series(1, index=list("ABC")), True), - (Series(1, index=MultiIndex.from_product([range(3), range(3)])), False), + ([1, 2, 3], False), + (Index(list("ABC"), dtype="str"), not (using_string_dtype() and HAS_PYARROW)), + (Index(list("ABC"), dtype=object), True), + (MultiIndex.from_product([range(3), range(3)]), False), ( - Series(1, index=MultiIndex.from_product([range(3), ["foo", "bar"]])), - True, + MultiIndex.from_product([range(3), ["foo", "bar"]]), + not (using_string_dtype() and HAS_PYARROW), ), ], ) -def test_info_memory_usage_qualified(series, plus): +def test_info_memory_usage_qualified(index, plus): + series = Series(1, index=index) buf = StringIO() series.info(buf=buf) if plus: From db68cd5ba1c3102e4e6b560f46420b6e495b4dcd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2024 09:24:35 +0100 Subject: [PATCH 15/42] [backport 2.3.x] ERR (string dtype): harmonize setitem error message for python and pyarrow storage (#60219) (#60232) (cherry picked from commit 692ea6f9d4b05187a05f0811d3241211855d6efb) --- pandas/core/arrays/arrow/array.py | 4 ++-- pandas/core/arrays/masked.py | 2 +- pandas/core/arrays/string_.py | 12 +++++++++--- pandas/core/arrays/string_arrow.py | 15 ++++++++++++--- pandas/tests/arrays/masked/test_indexing.py | 2 +- pandas/tests/arrays/string_/test_string.py | 17 ++++------------- pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/frame/indexing/test_where.py | 4 ++-- pandas/tests/indexing/test_indexing.py | 4 +--- pandas/tests/indexing/test_loc.py | 2 +- pandas/tests/series/indexing/test_setitem.py | 4 ++-- 11 files changed, 36 insertions(+), 32 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 8cf763265fd34..e8ce1f4526f89 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1134,7 +1134,7 @@ def fillna( try: fill_value = self._box_pa(value, pa_type=self._pa_array.type) except pa.ArrowTypeError as err: - msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'" raise TypeError(msg) from err try: @@ -2126,7 +2126,7 @@ def _maybe_convert_setitem_value(self, value): try: value = self._box_pa(value, self._pa_array.type) except pa.ArrowTypeError as err: - msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" + msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'" raise TypeError(msg) from err return value diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index ba7b8e3e7398e..0e839dc7a80bb 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -302,7 +302,7 @@ def _validate_setitem_value(self, value): # Note: without the "str" here, the f-string rendering raises in # py38 builds. - raise TypeError(f"Invalid value '{str(value)}' for dtype {self.dtype}") + raise TypeError(f"Invalid value '{value!s}' for dtype '{self.dtype}'") def __setitem__(self, key, value) -> None: key = check_array_indexer(self, key) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 4801b70a27dd4..aae9f98032eff 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -654,7 +654,8 @@ def _validate_scalar(self, value): return self.dtype.na_value elif not isinstance(value, str): raise TypeError( - f"Cannot set non-string value '{value}' into a string array." + f"Invalid value '{value}' for dtype '{self.dtype}'. Value should be a " + f"string or missing value, got '{type(value).__name__}' instead." ) return value @@ -743,7 +744,9 @@ def __setitem__(self, key, value) -> None: value = self.dtype.na_value elif not isinstance(value, str): raise TypeError( - f"Cannot set non-string value '{value}' into a StringArray." + f"Invalid value '{value}' for dtype '{self.dtype}'. Value should " + f"be a string or missing value, got '{type(value).__name__}' " + "instead." ) else: if not is_array_like(value): @@ -753,7 +756,10 @@ def __setitem__(self, key, value) -> None: # compatible, compatibility with arrow backed strings value = np.asarray(value) if len(value) and not lib.is_string_array(value, skipna=True): - raise TypeError("Must provide strings.") + raise TypeError( + "Invalid value for dtype 'str'. Value should be a " + "string or missing value (or array of those)." + ) mask = isna(value) if mask.any(): diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index b6e98d8fdc7e5..c15e50f698a3d 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -215,7 +215,10 @@ def insert(self, loc: int, item) -> ArrowStringArray: if self.dtype.na_value is np.nan and item is np.nan: item = libmissing.NA if not isinstance(item, str) and item is not libmissing.NA: - raise TypeError("Scalar must be NA or str") + raise TypeError( + f"Invalid value '{item}' for dtype 'str'. Value should be a " + f"string or missing value, got '{type(item).__name__}' instead." + ) return super().insert(loc, item) def _convert_bool_result(self, values, na=lib.no_default, method_name=None): @@ -249,13 +252,19 @@ def _maybe_convert_setitem_value(self, value): if isna(value): value = None elif not isinstance(value, str): - raise TypeError("Scalar must be NA or str") + raise TypeError( + f"Invalid value '{value}' for dtype 'str'. Value should be a " + f"string or missing value, got '{type(value).__name__}' instead." + ) else: value = np.array(value, dtype=object, copy=True) value[isna(value)] = None for v in value: if not (v is None or isinstance(v, str)): - raise TypeError("Must provide strings") + raise TypeError( + "Invalid value for dtype 'str'. Value should be a " + "string or missing value (or array of those)." + ) return super()._maybe_convert_setitem_value(value) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: diff --git a/pandas/tests/arrays/masked/test_indexing.py b/pandas/tests/arrays/masked/test_indexing.py index 28ee451a7ddd7..753d562c87ffa 100644 --- a/pandas/tests/arrays/masked/test_indexing.py +++ b/pandas/tests/arrays/masked/test_indexing.py @@ -8,7 +8,7 @@ class TestSetitemValidation: def _check_setitem_invalid(self, arr, invalid): - msg = f"Invalid value '{str(invalid)}' for dtype {arr.dtype}" + msg = f"Invalid value '{invalid!s}' for dtype '{arr.dtype}'" msg = re.escape(msg) with pytest.raises(TypeError, match=msg): arr[0] = invalid diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e511ba62d5d09..14c02723191a8 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -108,14 +108,11 @@ def test_none_to_nan(cls, dtype): def test_setitem_validates(cls, dtype): arr = cls._from_sequence(["a", "b"], dtype=dtype) - if dtype.storage == "python": - msg = "Cannot set non-string value '10' into a StringArray." - else: - msg = "Scalar must be NA or str" + msg = "Invalid value '10' for dtype 'str" with pytest.raises(TypeError, match=msg): arr[0] = 10 - msg = "Must provide strings" + msg = "Invalid value for dtype 'str" with pytest.raises(TypeError, match=msg): arr[:] = np.array([1, 2]) @@ -510,10 +507,7 @@ def test_fillna_args(dtype): expected = pd.array(["a", "b"], dtype=dtype) tm.assert_extension_array_equal(res, expected) - if dtype.storage == "pyarrow": - msg = "Invalid value '1' for dtype str" - else: - msg = "Cannot set non-string value '1' into a StringArray." + msg = "Invalid value '1' for dtype 'str" with pytest.raises(TypeError, match=msg): arr.fillna(value=1) @@ -754,10 +748,7 @@ def test_setitem_scalar_with_mask_validation(dtype): # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], dtype=dtype) - if dtype.storage == "python": - msg = "Cannot set non-string value" - else: - msg = "Scalar must be NA or str" + msg = "Invalid value '1' for dtype 'str" with pytest.raises(TypeError, match=msg): ser[mask] = 1 diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 1721fe2c0eb8b..c0ab51a484cdf 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1340,7 +1340,7 @@ def test_setting_mismatched_na_into_nullable_fails( r"timedelta64\[ns\] cannot be converted to (Floating|Integer)Dtype", r"datetime64\[ns\] cannot be converted to (Floating|Integer)Dtype", "'values' contains non-numeric NA", - r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}", + r"Invalid value '.*' for dtype '(U?Int|Float)\d{1,2}'", ] ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 861147f5b58dd..5fd3796d0255a 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -976,7 +976,7 @@ def test_where_nullable_invalid_na(frame_or_series, any_numeric_ea_dtype): mask = np.array([True, True, False], ndmin=obj.ndim).T - msg = r"Invalid value '.*' for dtype (U?Int|Float)\d{1,2}" + msg = r"Invalid value '.*' for dtype '(U?Int|Float)\d{1,2}'" for null in tm.NP_NAT_OBJECTS + [pd.NaT]: # NaT is an NA value that we should *not* cast to pd.NA dtype @@ -1091,7 +1091,7 @@ def test_where_int_overflow(replacement, using_infer_string): df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) if using_infer_string and replacement not in (None, "snake"): with pytest.raises( - TypeError, match="Cannot set non-string value|Scalar must be NA or str" + TypeError, match=f"Invalid value '{replacement}' for dtype 'str'" ): df.where(pd.notnull(df), replacement) return diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 0ff33ba88b16f..07275302dcf9f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -728,9 +728,7 @@ def run_tests(df, rhs, right_loc, right_iloc): right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0] right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"] if using_infer_string: - with pytest.raises( - TypeError, match="Must provide strings|Scalar must be NA or str" - ): + with pytest.raises(TypeError, match="Invalid value"): with tm.assert_produces_warning( FutureWarning, match="incompatible dtype" ): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index bdc6d9aff6f4e..dc4f159cfd3c3 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1292,7 +1292,7 @@ def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string # assigning with loc/iloc attempts to set the values inplace, which # in this case is successful if using_infer_string: - with pytest.raises(TypeError, match="Must provide strings"): + with pytest.raises(TypeError, match="Invalid value"): result.loc[result.index, "A"] = [float(x) for x in col_data] else: result.loc[result.index, "A"] = [float(x) for x in col_data] diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index c28d3c9fedbd5..d3ecbfe8f6cc7 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -888,7 +888,7 @@ def test_index_where(self, obj, key, expected, warn, val, using_infer_string): mask[key] = True if using_infer_string and obj.dtype == object: - with pytest.raises(TypeError, match="Scalar must"): + with pytest.raises(TypeError, match="Invalid value"): Index(obj).where(~mask, val) else: res = Index(obj).where(~mask, val) @@ -901,7 +901,7 @@ def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): mask[key] = True if using_infer_string and obj.dtype == object: - with pytest.raises(TypeError, match="Scalar must"): + with pytest.raises(TypeError, match="Invalid value"): Index(obj).putmask(mask, val) else: res = Index(obj).putmask(mask, val) From cacd4bbf24b325603b81d1339505c4deccec7701 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2024 12:58:17 +0100 Subject: [PATCH 16/42] [backport 2.3.x] TST (string dtype): resolve xfails in pandas/tests/series (#60233) (#60240) (cherry picked from commit 3f7bc81ae6839803ecc0da073fe83e9194759550) --- .../series/accessors/test_dt_accessor.py | 4 -- pandas/tests/series/indexing/test_indexing.py | 21 +++++++--- pandas/tests/series/indexing/test_setitem.py | 38 +++++++++---------- pandas/tests/series/indexing/test_where.py | 17 ++++----- pandas/tests/series/methods/test_replace.py | 34 +++++++++++------ pandas/tests/series/methods/test_unstack.py | 5 +-- pandas/tests/series/test_logical_ops.py | 1 + 7 files changed, 64 insertions(+), 56 deletions(-) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 18ee81581bdc3..a06a3a0d40675 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -11,8 +11,6 @@ import pytest import pytz -from pandas._config import using_string_dtype - from pandas._libs.tslibs.timezones import maybe_get_tz from pandas.errors import SettingWithCopyError @@ -571,7 +569,6 @@ def test_strftime(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_strftime_dt64_days(self): ser = Series(date_range("20130101", periods=5)) ser.iloc[0] = pd.NaT @@ -586,7 +583,6 @@ def test_strftime_dt64_days(self): expected = Index( ["2015/03/01", "2015/03/02", "2015/03/03", "2015/03/04", "2015/03/05"], - dtype=np.object_, ) # dtype may be S10 or U10 depending on python version tm.assert_index_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index a26e541732d36..9ab7dff64b182 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas import ( @@ -270,18 +268,29 @@ def test_slice(string_series, object_series, using_copy_on_write, warn_copy_on_w assert (string_series[10:20] == 0).all() -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_timedelta_assignment(): # GH 8209 s = Series([], dtype=object) s.loc["B"] = timedelta(1) - tm.assert_series_equal(s, Series(Timedelta("1 days"), index=["B"])) + expected = Series( + Timedelta("1 days"), dtype="timedelta64[ns]", index=Index(["B"], dtype=object) + ) + tm.assert_series_equal(s, expected) s = s.reindex(s.index.insert(0, "A")) - tm.assert_series_equal(s, Series([np.nan, Timedelta("1 days")], index=["A", "B"])) + expected = Series( + [np.nan, Timedelta("1 days")], + dtype="timedelta64[ns]", + index=Index(["A", "B"], dtype=object), + ) + tm.assert_series_equal(s, expected) s.loc["A"] = timedelta(1) - expected = Series(Timedelta("1 days"), index=["A", "B"]) + expected = Series( + Timedelta("1 days"), + dtype="timedelta64[ns]", + index=Index(["A", "B"], dtype=object), + ) tm.assert_series_equal(s, expected) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index d3ecbfe8f6cc7..d95ee99489076 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -8,9 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import ( np_version_gt2, np_version_gte1p24, @@ -37,6 +34,7 @@ concat, date_range, interval_range, + isna, period_range, timedelta_range, ) @@ -564,14 +562,16 @@ def test_append_timedelta_does_not_cast(self, td, using_infer_string, request): tm.assert_series_equal(ser, expected) assert isinstance(ser["td"], Timedelta) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_setitem_with_expansion_type_promotion(self): # GH#12599 ser = Series(dtype=object) ser["a"] = Timestamp("2016-01-01") ser["b"] = 3.0 ser["c"] = "foo" - expected = Series([Timestamp("2016-01-01"), 3.0, "foo"], index=["a", "b", "c"]) + expected = Series( + [Timestamp("2016-01-01"), 3.0, "foo"], + index=Index(["a", "b", "c"], dtype=object), + ) tm.assert_series_equal(ser, expected) def test_setitem_not_contained(self, string_series): @@ -850,11 +850,6 @@ def test_mask_key(self, obj, key, expected, warn, val, indexer_sli): indexer_sli(obj)[mask] = val tm.assert_series_equal(obj, expected) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) def test_series_where(self, obj, key, expected, warn, val, is_inplace): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True @@ -870,6 +865,11 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): obj = obj.copy() arr = obj._values + if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): + with pytest.raises(TypeError, match="Invalid value"): + obj.where(~mask, val) + return + res = obj.where(~mask, val) if val is NA and res.dtype == object: @@ -882,29 +882,27 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): self._check_inplace(is_inplace, orig, arr, obj) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - def test_index_where(self, obj, key, expected, warn, val, using_infer_string): + def test_index_where(self, obj, key, expected, warn, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if using_infer_string and obj.dtype == object: + if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): with pytest.raises(TypeError, match="Invalid value"): - Index(obj).where(~mask, val) + Index(obj, dtype=obj.dtype).where(~mask, val) else: - res = Index(obj).where(~mask, val) + res = Index(obj, dtype=obj.dtype).where(~mask, val) expected_idx = Index(expected, dtype=expected.dtype) tm.assert_index_equal(res, expected_idx) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - def test_index_putmask(self, obj, key, expected, warn, val, using_infer_string): + def test_index_putmask(self, obj, key, expected, warn, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if using_infer_string and obj.dtype == object: + if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): with pytest.raises(TypeError, match="Invalid value"): - Index(obj).putmask(mask, val) + Index(obj, dtype=obj.dtype).putmask(mask, val) else: - res = Index(obj).putmask(mask, val) + res = Index(obj, dtype=obj.dtype).putmask(mask, val) tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 013386202c966..0fa2f63e5fb36 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import is_integer import pandas as pd @@ -232,7 +230,6 @@ def test_where_ndframe_align(): tm.assert_series_equal(out, expected) -@pytest.mark.xfail(using_string_dtype(), reason="can't set ints into string") def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment @@ -242,7 +239,7 @@ def test_where_setitem_invalid(): "different length than the value" ) # slice - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[0:3] = list(range(27)) @@ -252,18 +249,18 @@ def test_where_setitem_invalid(): tm.assert_series_equal(s.astype(np.int64), expected) # slice with step - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[0:4:2] = list(range(27)) - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) s[0:4:2] = list(range(2)) expected = Series([0, "b", 1, "d", "e", "f"]) tm.assert_series_equal(s, expected) # neg slices - s = Series(list("abcdef")) + s = Series(list("abcdef"), dtype=object) with pytest.raises(ValueError, match=msg("slice")): s[:-1] = list(range(27)) @@ -273,18 +270,18 @@ def test_where_setitem_invalid(): tm.assert_series_equal(s, expected) # list - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("list-like")): s[[0, 1, 2]] = list(range(27)) - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) with pytest.raises(ValueError, match=msg("list-like")): s[[0, 1, 2]] = list(range(2)) # scalar - s = Series(list("abc")) + s = Series(list("abc"), dtype=object) s[0] = list(range(10)) expected = Series([list(range(10)), "b", "c"]) tm.assert_series_equal(s, expected) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index c59dbc4ed95d7..79a66526a0004 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -391,7 +391,6 @@ def test_replace_mixed_types_with_string(self): expected = pd.Series([1, np.nan, 3, np.nan, 4, 5]) tm.assert_series_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "categorical, numeric", [ @@ -399,11 +398,15 @@ def test_replace_mixed_types_with_string(self): (pd.Categorical(["A", "B"], categories=["A", "B"]), [1, 2]), ], ) - def test_replace_categorical(self, categorical, numeric): + def test_replace_categorical(self, categorical, numeric, using_infer_string): # GH 24971, GH#23305 ser = pd.Series(categorical) msg = "Downcasting behavior in `replace`" msg = "with CategoricalDtype is deprecated" + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + ser.replace({"A": 1, "B": 2}) + return with tm.assert_produces_warning(FutureWarning, match=msg): result = ser.replace({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") @@ -731,17 +734,25 @@ def test_replace_nullable_numeric(self): with pytest.raises(TypeError, match="Invalid value"): ints.replace(1, 9.5) - @pytest.mark.xfail(using_string_dtype(), reason="can't fill 1 in string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_series(self, regex): # GH-48644 - series = pd.Series(["0"]) + series = pd.Series(["0"], dtype=object) expected = pd.Series([1]) msg = "Downcasting behavior in `replace`" with tm.assert_produces_warning(FutureWarning, match=msg): result = series.replace(to_replace="0", value=1, regex=regex) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("regex", [False, True]) + def test_replace_regex_dtype_series_string(self, regex, using_infer_string): + if not using_infer_string: + # then this is object dtype which is already tested above + return + series = pd.Series(["0"], dtype="str") + with pytest.raises(TypeError, match="Invalid value"): + series.replace(to_replace="0", value=1, regex=regex) + def test_replace_different_int_types(self, any_int_numpy_dtype): # GH#45311 labs = pd.Series([1, 1, 1, 0, 0, 2, 2, 2], dtype=any_int_numpy_dtype) @@ -761,20 +772,19 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - def test_replace_change_dtype_series(self, using_infer_string): + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + def test_replace_change_dtype_series(self): # GH#25797 - df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) - warn = FutureWarning if using_infer_string else None - with tm.assert_produces_warning(warn, match="Downcasting"): - df["Test"] = df["Test"].replace([True], [np.nan]) - expected = pd.DataFrame.from_dict({"Test": ["0.5", np.nan, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object) + df["Test"] = df["Test"].replace([True], [np.nan]) + expected = pd.DataFrame({"Test": ["0.5", np.nan, "0.6"]}, dtype=object) tm.assert_frame_equal(df, expected) - df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", None, "0.6"]}, dtype=object) df["Test"] = df["Test"].replace([None], [np.nan]) tm.assert_frame_equal(df, expected) - df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) + df = pd.DataFrame({"Test": ["0.5", None, "0.6"]}, dtype=object) df["Test"] = df["Test"].fillna(np.nan) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index 8569e0f49716a..11995260dd0be 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -136,11 +134,10 @@ def test_unstack_mixed_type_name_in_multiindex( tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_unstack_multi_index_categorical_values(): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) mi = df.stack(future_stack=True).index.rename(["major", "minor"]) diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 26bdfcbc6ec56..8d7adc1c1aae6 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -431,6 +431,7 @@ def test_logical_ops_label_based(self, using_infer_string): for e in [Series(["z"])]: if using_infer_string: # TODO(infer_string) should this behave differently? + # -> https://github.com/pandas-dev/pandas/issues/60234 with pytest.raises( TypeError, match="not supported for dtype|unsupported operand type" ): From 9465bf12b3643f3144944e7ecdbf2aa21cdfbaa5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2024 19:09:01 +0100 Subject: [PATCH 17/42] [2.3.x] CI: skip lxml encode test on Windows (#60238) CI: skip lxml encode test on Windows --- pandas/tests/io/test_html.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 826c0a1ca7cf9..b12098d4904c1 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1381,6 +1381,7 @@ def test_displayed_only_with_many_elements(self, displayed_only, flavor_read_htm expected = DataFrame({"A": [1, 4], "B": [2, 5]}) tm.assert_frame_equal(result, expected) + @td.skip_if_windows() @pytest.mark.filterwarnings( "ignore:You provided Unicode markup but also provided a value for " "from_encoding.*:UserWarning" From 45275859b1d59b4193f5c319410d13dbe2be6233 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 8 Nov 2024 10:44:41 -0800 Subject: [PATCH 18/42] Backport PR #60241 on branch 2.3.x (TST (string dtype): resolve xfail in arrow interface tests) (#60248) Backport PR #60241: TST (string dtype): resolve xfail in arrow interface tests Co-authored-by: Joris Van den Bossche --- pandas/tests/frame/test_arrow_interface.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py index dc163268f64b9..b36b6b5ffe0cc 100644 --- a/pandas/tests/frame/test_arrow_interface.py +++ b/pandas/tests/frame/test_arrow_interface.py @@ -2,8 +2,6 @@ import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -11,9 +9,8 @@ pa = pytest.importorskip("pyarrow") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="14.0") -def test_dataframe_arrow_interface(): +def test_dataframe_arrow_interface(using_infer_string): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) capsule = df.__arrow_c_stream__() @@ -25,7 +22,8 @@ def test_dataframe_arrow_interface(): ) table = pa.table(df) - expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + string_type = pa.large_string() if using_infer_string else pa.string() + expected = pa.table({"a": [1, 2, 3], "b": pa.array(["a", "b", "c"], string_type)}) assert table.equals(expected) schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) @@ -34,13 +32,13 @@ def test_dataframe_arrow_interface(): assert table.equals(expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @td.skip_if_no("pyarrow", min_version="15.0") -def test_dataframe_to_arrow(): +def test_dataframe_to_arrow(using_infer_string): df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) table = pa.RecordBatchReader.from_stream(df).read_all() - expected = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]}) + string_type = pa.large_string() if using_infer_string else pa.string() + expected = pa.table({"a": [1, 2, 3], "b": pa.array(["a", "b", "c"], string_type)}) assert table.equals(expected) schema = pa.schema([("a", pa.int8()), ("b", pa.string())]) From 2c89984d9f7f2aa077d68fd11c12bf2636a3c16a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 9 Nov 2024 00:34:34 +0100 Subject: [PATCH 19/42] [backport 2.3.x] BUG (string dtype): correctly enable idxmin/max for python-storage strings (#60242) (#60256) BUG (string dtype): correctly enable idxmin/max for python-storage strings (#60242) (cherry picked from commit 754d09163ae08f2b87daa41f2263556dbb809616) --- pandas/core/arrays/string_.py | 2 +- pandas/tests/frame/test_reductions.py | 6 ------ pandas/tests/reductions/test_reductions.py | 8 -------- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index aae9f98032eff..7e8726f96f90a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -846,7 +846,7 @@ def _reduce( else: return nanops.nanall(self._ndarray, skipna=skipna) - if name in ["min", "max", "sum"]: + if name in ["min", "max", "argmin", "argmax", "sum"]: result = getattr(self, name)(skipna=skipna, axis=axis, **kwargs) if keepdims: return self._from_sequence([result], dtype=self.dtype) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index bee95e8295746..84d56864b3219 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import ( IS64, is_platform_windows, @@ -1091,7 +1089,6 @@ def test_idxmin_empty(self, index, skipna, axis): expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmin_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) @@ -1108,8 +1105,6 @@ def test_idxmin_axis_2(self, float_frame): with pytest.raises(ValueError, match=msg): frame.idxmin(axis=2) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("axis", [0, 1]) def test_idxmax(self, float_frame, int_frame, skipna, axis): frame = float_frame @@ -1142,7 +1137,6 @@ def test_idxmax_empty(self, index, skipna, axis): expected = Series(dtype=index.dtype) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_idxmax_numeric_only(self, numeric_only): df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")}) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index ed2b01b09bb71..496b00a0547b7 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -7,10 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - import pandas as pd from pandas import ( Categorical, @@ -1246,10 +1242,6 @@ def test_idxminmax_object_dtype(self, using_infer_string): with pytest.raises(TypeError, match=msg): ser3.idxmin(skipna=False) - # TODO(infer_string) implement argmin/max for python string dtype - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_idxminmax_object_frame(self): # GH#4279 df = DataFrame([["zimm", 2.5], ["biff", 1.0], ["bid", 12.0]]) From 08086572f2b23c9dba1e881b4ecd16a55392ee04 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 9 Nov 2024 19:26:07 +0100 Subject: [PATCH 20/42] [backport 2.3.x] TST (string dtype): fix groupby xfails with using_infer_string + update error message (#59430) (#60246) * TST (string dtype): fix groupby xfails with using_infer_string + update error message (#59430) Co-authored-by: Joris Van den Bossche (cherry picked from commit e5dd89d4d74d8e2a06256023717880788f2b10ed) * fix test --------- Co-authored-by: jbrockmendel --- pandas/core/arrays/arrow/array.py | 14 +++++ pandas/core/arrays/base.py | 14 +++++ pandas/core/groupby/groupby.py | 4 +- pandas/tests/frame/test_stack_unstack.py | 4 +- pandas/tests/groupby/aggregate/test_cython.py | 4 +- pandas/tests/groupby/methods/test_quantile.py | 9 ++- pandas/tests/groupby/test_groupby.py | 56 ++++++++++++++----- pandas/tests/groupby/test_groupby_subclass.py | 2 +- pandas/tests/groupby/test_numeric_only.py | 20 +++++-- pandas/tests/groupby/test_raises.py | 50 +++++++++++++++-- .../tests/groupby/transform/test_transform.py | 1 + pandas/tests/resample/test_resample_api.py | 20 ++++++- pandas/tests/reshape/merge/test_join.py | 4 +- pandas/tests/reshape/test_pivot.py | 8 ++- 14 files changed, 167 insertions(+), 43 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e8ce1f4526f89..13e10c8d3a738 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2285,6 +2285,20 @@ def _groupby_op( **kwargs, ): if isinstance(self.dtype, StringDtype): + if how in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + ]: + raise TypeError( + f"dtype '{self.dtype}' does not support operation '{how}'" + ) return super()._groupby_op( how=how, has_dropped_na=has_dropped_na, diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index abfe2369b0d0d..62ca2a45fb941 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2369,6 +2369,20 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray + if op.how in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + ]: + raise TypeError( + f"dtype '{self.dtype}' does not support operation '{how}'" + ) if op.how not in ["any", "all"]: # Fail early to avoid conversion to object op._get_cython_function(op.kind, op.how, np.dtype(object), False) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 296a601288f9d..c8e2ccc7bdaeb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4394,9 +4394,9 @@ def quantile( starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups) def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: - if is_object_dtype(vals.dtype): + if isinstance(vals.dtype, StringDtype) or is_object_dtype(vals.dtype): raise TypeError( - "'quantile' cannot be performed against 'object' dtypes!" + f"dtype '{vals.dtype}' does not support operation 'quantile'" ) inference: DtypeObj | None = None diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 8bb5eb2d5c57a..af84ee021252f 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -2083,7 +2083,7 @@ def test_unstack_period_frame(self): @pytest.mark.filterwarnings( "ignore:The previous implementation of stack is deprecated" ) - def test_stack_multiple_bug(self, future_stack): + def test_stack_multiple_bug(self, future_stack, using_infer_string): # bug when some uniques are not present in the data GH#3170 id_col = ([1] * 3) + ([2] * 3) name = (["a"] * 3) + (["b"] * 3) @@ -2095,6 +2095,8 @@ def test_stack_multiple_bug(self, future_stack): multi.columns.name = "Params" unst = multi.unstack("ID") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index fa8a6cb4120b2..2990fb5949242 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -166,14 +166,14 @@ def test_cython_agg_return_dict(): def test_cython_fail_agg(): dr = bdate_range("1/1/2000", periods=50) - ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr) + ts = Series(["A", "B", "C", "D", "E"] * 10, dtype=object, index=dr) grouped = ts.groupby(lambda x: x.month) summed = grouped.sum() msg = "using SeriesGroupBy.sum" with tm.assert_produces_warning(FutureWarning, match=msg): # GH#53425 - expected = grouped.agg(np.sum) + expected = grouped.agg(np.sum).astype(object) tm.assert_series_equal(summed, expected) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index d3bc815402ade..4269b41a0871b 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -174,7 +174,8 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) - with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"): + msg = "dtype 'object' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): df.groupby("key").quantile() @@ -253,7 +254,6 @@ def test_groupby_quantile_nullable_array(values, q): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): @@ -263,9 +263,8 @@ def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only): expected = df.groupby("a")[["b"]].quantile(q) tm.assert_frame_equal(result, expected) else: - with pytest.raises( - TypeError, match="'quantile' cannot be performed against 'object' dtypes!" - ): + msg = "dtype '.*' does not support operation 'quantile'" + with pytest.raises(TypeError, match=msg): df.groupby("a").quantile(q, numeric_only=numeric_only) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 13269ea9c0920..3e2d15ede3648 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -640,7 +640,7 @@ def test_frame_multi_key_function_list(): tm.assert_frame_equal(agged, expected) -def test_frame_multi_key_function_list_partial_failure(): +def test_frame_multi_key_function_list_partial_failure(using_infer_string): data = DataFrame( { "A": [ @@ -691,6 +691,8 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = ["mean", "std"] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -981,9 +983,11 @@ def test_groupby_multi_corner(df): tm.assert_frame_equal(agged, expected) -def test_raises_on_nuisance(df): +def test_raises_on_nuisance(df, using_infer_string): grouped = df.groupby("A") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1026,7 +1030,7 @@ def test_keep_nuisance_agg(df, agg_function): ["sum", "mean", "prod", "std", "var", "sem", "median"], ) @pytest.mark.parametrize("numeric_only", [True, False]) -def test_omit_nuisance_agg(df, agg_function, numeric_only): +def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string): # GH 38774, GH 38815 grouped = df.groupby("A") @@ -1034,7 +1038,10 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): if agg_function in no_drop_nuisance and not numeric_only: # Added numeric_only as part of GH#46560; these do not drop nuisance # columns when numeric_only is False - if agg_function in ("std", "sem"): + if using_infer_string: + msg = f"dtype 'str' does not support operation '{agg_function}'" + klass = TypeError + elif agg_function in ("std", "sem"): klass = ValueError msg = "could not convert string to float: 'one'" else: @@ -1055,16 +1062,24 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): tm.assert_frame_equal(result, expected) -def test_raise_on_nuisance_python_single(df): +def test_raise_on_nuisance_python_single(df, using_infer_string): # GH 38815 grouped = df.groupby("A") - with pytest.raises(ValueError, match="could not convert"): + + err = ValueError + msg = "could not convert" + if using_infer_string: + err = TypeError + msg = "dtype 'str' does not support operation 'skew'" + with pytest.raises(err, match=msg): grouped.skew() -def test_raise_on_nuisance_python_multiple(three_group): +def test_raise_on_nuisance_python_multiple(three_group, using_infer_string): grouped = three_group.groupby(["A", "B"]) msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1102,12 +1117,16 @@ def test_nonsense_func(): df.groupby(lambda x: x + "foo") -def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data): +def test_wrap_aggregated_output_multindex( + multiindex_dataframe_random_data, using_infer_string +): df = multiindex_dataframe_random_data.T df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") @@ -1299,8 +1318,10 @@ def test_groupby_with_hier_columns(): def test_grouping_ndarray(df): grouped = df.groupby(df["A"].values) + grouped2 = df.groupby(df["A"].rename(None)) + result = grouped.sum() - expected = df.groupby(df["A"].rename(None)).sum() + expected = grouped2.sum() tm.assert_frame_equal(result, expected) @@ -1793,8 +1814,8 @@ def test_no_dummy_key_names(df): result = df.groupby(df["A"].values).sum() assert result.index.name is None - result = df.groupby([df["A"].values, df["B"].values]).sum() - assert result.index.names == (None, None) + result2 = df.groupby([df["A"].values, df["B"].values]).sum() + assert result2.index.names == (None, None) def test_groupby_sort_multiindex_series(): @@ -2099,6 +2120,7 @@ def get_categorical_invalid_expected(): is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype) is_dt64 = df.dtypes.iloc[0].kind == "M" is_cat = isinstance(values, Categorical) + is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype) if ( isinstance(values, Categorical) @@ -2123,13 +2145,15 @@ def get_categorical_invalid_expected(): if op in ["prod", "sum", "skew"]: # ops that require more than just ordered-ness - if is_dt64 or is_cat or is_per: + if is_dt64 or is_cat or is_per or (is_str and op != "sum"): # GH#41291 # datetime64 -> prod and sum are invalid if is_dt64: msg = "datetime64 type does not support" elif is_per: msg = "Period type does not support" + elif is_str: + msg = f"dtype 'str' does not support operation '{op}'" else: msg = "category type does not support" if op == "skew": @@ -3083,7 +3107,7 @@ def test_obj_with_exclusions_duplicate_columns(): def test_groupby_numeric_only_std_no_result(numeric_only): # GH 51080 dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}] - df = DataFrame(dicts_non_numeric) + df = DataFrame(dicts_non_numeric, dtype=object) dfgb = df.groupby("a", as_index=False, sort=False) if numeric_only: @@ -3142,10 +3166,14 @@ def test_grouping_with_categorical_interval_columns(): def test_groupby_sum_on_nan_should_return_nan(bug_var): # GH 24196 df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]}) + if isinstance(bug_var, str): + df = df.astype(object) dfgb = df.groupby(lambda x: x) result = dfgb.sum(min_count=1) - expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"]) + expected_df = DataFrame( + [bug_var, bug_var, bug_var, None], columns=["A"], dtype=df["A"].dtype + ) tm.assert_frame_equal(result, expected_df) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 1a2acb658ee26..b5523592c3c5c 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -109,7 +109,7 @@ def test_groupby_resample_preserves_subclass(obj): df = obj( { - "Buyer": "Carl Carl Carl Carl Joe Carl".split(), + "Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object), "Quantity": [18, 3, 5, 1, 9, 3], "Date": [ datetime(2013, 9, 1, 13, 0), diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 3b7614347d181..3c1ed20ddcb16 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -29,7 +29,8 @@ def df(self): "group": [1, 1, 2], "int": [1, 2, 3], "float": [4.0, 5.0, 6.0], - "string": list("abc"), + "string": Series(["a", "b", "c"], dtype="str"), + "object": Series(["a", "b", "c"], dtype=object), "category_string": Series(list("abc")).astype("category"), "category_int": [7, 8, 9], "datetime": date_range("20130101", periods=3), @@ -41,6 +42,7 @@ def df(self): "int", "float", "string", + "object", "category_string", "category_int", "datetime", @@ -113,6 +115,7 @@ def test_first_last(self, df, method): "int", "float", "string", + "object", "category_string", "category_int", "datetime", @@ -160,7 +163,9 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): # object dtypes for transformations are not implemented in Cython and # have no Python fallback - exception = NotImplementedError if method.startswith("cum") else TypeError + exception = ( + (NotImplementedError, TypeError) if method.startswith("cum") else TypeError + ) if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"): # The methods default to numeric_only=False and raise TypeError @@ -171,6 +176,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): re.escape(f"agg function failed [how->{method},dtype->object]"), # cumsum/cummin/cummax/cumprod "function is not implemented for this dtype", + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -181,7 +187,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), - re.escape(f"agg function failed [how->{method},dtype->str]"), + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -199,7 +205,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), re.escape(f"agg function failed [how->{method},dtype->string]"), - re.escape(f"agg function failed [how->{method},dtype->str]"), + f"dtype 'str' does not support operation '{method}'", ] ) with pytest.raises(exception, match=msg): @@ -384,7 +390,9 @@ def test_numeric_only(kernel, has_arg, numeric_only, keys): re.escape(f"agg function failed [how->{kernel},dtype->object]"), ] ) - if kernel == "idxmin": + if kernel == "quantile": + msg = "dtype 'object' does not support operation 'quantile'" + elif kernel == "idxmin": msg = "'<' not supported between instances of 'type' and 'type'" elif kernel == "idxmax": msg = "'>' not supported between instances of 'type' and 'type'" @@ -458,7 +466,7 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request): # that succeed should not be allowed to fail (without deprecation, at least) if groupby_func in fails_on_numeric_object and dtype is object: if groupby_func == "quantile": - msg = "cannot be performed against 'object' dtypes" + msg = "dtype 'object' does not support operation 'quantile'" else: msg = "is not supported for object dtype" warn = FutureWarning if groupby_func == "fillna" else None diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 4ebb26b0289ec..5457f5ba050c6 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( Categorical, DataFrame, @@ -119,10 +117,9 @@ def _call_and_check(klass, msg, how, gb, groupby_func, args, warn_msg=""): gb.transform(groupby_func, *args) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("how", ["method", "agg", "transform"]) def test_groupby_raises_string( - how, by, groupby_series, groupby_func, df_with_string_col + how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string ): df = df_with_string_col args = get_groupby_method_args(groupby_func, df) @@ -182,7 +179,7 @@ def test_groupby_raises_string( TypeError, re.escape("agg function failed [how->prod,dtype->object]"), ), - "quantile": (TypeError, "cannot be performed against 'object' dtypes!"), + "quantile": (TypeError, "dtype 'object' does not support operation 'quantile'"), "rank": (None, ""), "sem": (ValueError, "could not convert string to float"), "shift": (None, ""), @@ -196,6 +193,37 @@ def test_groupby_raises_string( ), }[groupby_func] + if using_infer_string: + if groupby_func in [ + "prod", + "mean", + "median", + "cumsum", + "cumprod", + "std", + "sem", + "var", + "skew", + "quantile", + ]: + msg = f"dtype 'str' does not support operation '{groupby_func}'" + if groupby_func in ["sem", "std", "skew"]: + # The object-dtype raises ValueError when trying to convert to numeric. + klass = TypeError + elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow": + # This doesn't go through EA._groupby_op so the message isn't controlled + # there. + msg = "operation 'truediv' not supported for dtype 'str' with dtype 'str'" + elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow": + # This doesn't go through EA._groupby_op so the message isn't controlled + # there. + msg = "operation 'sub' not supported for dtype 'str' with dtype 'str'" + + elif groupby_func in ["cummin", "cummax"]: + msg = msg.replace("object", "str") + elif groupby_func == "corrwith": + msg = "Cannot perform reduction 'mean' with string dtype" + if groupby_func == "fillna": kind = "Series" if groupby_series else "DataFrame" warn_msg = f"{kind}GroupBy.fillna is deprecated" @@ -222,7 +250,12 @@ def func(x): @pytest.mark.parametrize("how", ["agg", "transform"]) @pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean]) def test_groupby_raises_string_np( - how, by, groupby_series, groupby_func_np, df_with_string_col + how, + by, + groupby_series, + groupby_func_np, + df_with_string_col, + using_infer_string, ): # GH#50749 df = df_with_string_col @@ -239,6 +272,11 @@ def test_groupby_raises_string_np( ), }[groupby_func_np] + if using_infer_string: + if groupby_func_np is np.mean: + klass = TypeError + msg = "dtype 'str' does not support operation 'mean'" + if groupby_series: warn_msg = "using SeriesGroupBy.[sum|mean]" else: diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 2aada753e27f4..a516af7e15943 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -905,6 +905,7 @@ def test_cython_transform_frame_column( ".* is not supported for object dtype", "is not implemented for this dtype", ".* is not supported for str dtype", + "dtype 'str' does not support operation '.*'", ] ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index af4cf5d4ebae5..74d06117cbb4a 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -188,7 +188,7 @@ def test_api_compat_before_use(attr): getattr(rs, attr) -def tests_raises_on_nuisance(test_frame): +def tests_raises_on_nuisance(test_frame, using_infer_string): df = test_frame df["D"] = "foo" r = df.resample("h") @@ -198,6 +198,8 @@ def tests_raises_on_nuisance(test_frame): expected = r[["A", "B", "C"]].mean() msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): r.mean() result = r.mean(numeric_only=True) @@ -932,7 +934,9 @@ def test_end_and_end_day_origin( ("sem", lib.no_default, "could not convert string to float"), ], ) -def test_frame_downsample_method(method, numeric_only, expected_data): +def test_frame_downsample_method( + method, numeric_only, expected_data, using_infer_string +): # GH#46442 test if `numeric_only` behave as expected for DataFrameGroupBy index = date_range("2018-01-01", periods=2, freq="D") @@ -949,6 +953,11 @@ def test_frame_downsample_method(method, numeric_only, expected_data): if method in ("var", "mean", "median", "prod"): klass = TypeError msg = re.escape(f"agg function failed [how->{method},dtype->") + if using_infer_string: + msg = f"dtype 'str' does not support operation '{method}'" + elif method in ["sum", "std", "sem"] and using_infer_string: + klass = TypeError + msg = f"dtype 'str' does not support operation '{method}'" else: klass = ValueError msg = expected_data @@ -983,7 +992,9 @@ def test_frame_downsample_method(method, numeric_only, expected_data): ("last", lib.no_default, ["cat_2"]), ], ) -def test_series_downsample_method(method, numeric_only, expected_data): +def test_series_downsample_method( + method, numeric_only, expected_data, using_infer_string +): # GH#46442 test if `numeric_only` behave as expected for SeriesGroupBy index = date_range("2018-01-01", periods=2, freq="D") @@ -999,8 +1010,11 @@ def test_series_downsample_method(method, numeric_only, expected_data): func(**kwargs) elif method == "prod": msg = re.escape("agg function failed [how->prod,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'prod'" with pytest.raises(TypeError, match=msg): func(**kwargs) + else: result = func(**kwargs) expected = Series(expected_data, index=expected_index) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index 9188521c71158..4b79860437f72 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -625,7 +625,7 @@ def test_join_non_unique_period_index(self): ) tm.assert_frame_equal(result, expected) - def test_mixed_type_join_with_suffix(self): + def test_mixed_type_join_with_suffix(self, using_infer_string): # GH #916 df = DataFrame( np.random.default_rng(2).standard_normal((20, 6)), @@ -636,6 +636,8 @@ def test_mixed_type_join_with_suffix(self): grouped = df.groupby("id") msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): grouped.mean() mn = grouped.mean(numeric_only=True) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index d0858a0ea5558..75268ccee1d8c 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -948,12 +948,14 @@ def test_margins(self, data): for value_col in table.columns.levels[0]: self._check_output(table[value_col], value_col, data) - def test_no_col(self, data): + def test_no_col(self, data, using_infer_string): # no col # to help with a buglet data.columns = [k * 2 for k in data.columns] msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") table = data.drop(columns="CC").pivot_table( @@ -1003,7 +1005,7 @@ def test_no_col(self, data): ], ) def test_margin_with_only_columns_defined( - self, columns, aggfunc, values, expected_columns + self, columns, aggfunc, values, expected_columns, using_infer_string ): # GH 31016 df = DataFrame( @@ -1027,6 +1029,8 @@ def test_margin_with_only_columns_defined( ) if aggfunc != "sum": msg = re.escape("agg function failed [how->mean,dtype->") + if using_infer_string: + msg = "dtype 'str' does not support operation 'mean'" with pytest.raises(TypeError, match=msg): df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc) if "B" not in columns: From 2054463668ad130c3dfed1e789ed49dd49774409 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 Nov 2024 18:40:09 +0100 Subject: [PATCH 21/42] [backport 2.3.x] ENH (string dtype): convert string_view columns to future string dtype instead of object dtype in Parquet/Feather IO (#60235) (#60291) (cherry picked from commit f307a0a3615d93c2177f6581133bdb541e12a93c) --- pandas/compat/__init__.py | 2 ++ pandas/compat/pyarrow.py | 2 ++ pandas/io/_util.py | 9 +++++++-- pandas/tests/io/test_feather.py | 20 ++++++++++++++++++++ 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 38fb0188df5ff..5e82853109015 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -33,6 +33,7 @@ pa_version_under14p1, pa_version_under16p0, pa_version_under17p0, + pa_version_under18p0, ) if TYPE_CHECKING: @@ -191,6 +192,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under14p1", "pa_version_under16p0", "pa_version_under17p0", + "pa_version_under18p0", "HAS_PYARROW", "IS64", "ISMUSL", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 7fa197c4a9824..f579b8a45d386 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -17,6 +17,7 @@ pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") pa_version_under17p0 = _palv < Version("17.0.0") + pa_version_under18p0 = _palv < Version("18.0.0") HAS_PYARROW = True except ImportError: pa_version_under10p1 = True @@ -28,4 +29,5 @@ pa_version_under15p0 = True pa_version_under16p0 = True pa_version_under17p0 = True + pa_version_under18p0 = False HAS_PYARROW = False diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 50a97f1059b5c..f3e6dba1391be 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -4,6 +4,7 @@ import numpy as np +from pandas.compat import pa_version_under18p0 from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -32,7 +33,11 @@ def _arrow_dtype_mapping() -> dict: def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return { + mapping = { pa.string(): pd.StringDtype(na_value=np.nan), pa.large_string(): pd.StringDtype(na_value=np.nan), - }.get + } + if not pa_version_under18p0: + mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan) + + return mapping.get diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 3b4484e44e155..58a5f78ce3258 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -2,6 +2,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under18p0 + import pandas as pd import pandas._testing as tm @@ -250,3 +252,21 @@ def test_string_inference(self, tmp_path): data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan) ) tm.assert_frame_equal(result, expected) + + @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0") + def test_string_inference_string_view_type(self, tmp_path): + # GH#54798 + import pyarrow as pa + from pyarrow import feather + + path = tmp_path / "string_view.parquet" + table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())}) + feather.write_feather(table, path) + + with pd.option_context("future.infer_string", True): + result = read_feather(path) + + expected = pd.DataFrame( + data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) From 54b47df88199741dac449fd0d2060e7dfdf8dd7e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2024 18:22:39 +0100 Subject: [PATCH 22/42] [backport 2.3.x] BUG (string dtype): replace with non-string to fall back to object dtype (#60285) (#60292) * BUG (string dtype): replace with non-string to fall back to object dtype (#60285) Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> (cherry picked from commit 938832ba325c6efc1710e002c0d3d4d9b3a6c8ba) * updates for 2.3 * fix inplace modification for 2.3.x branch with python storage --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/arrays/string_.py | 43 ++++++++++++-------- pandas/core/dtypes/cast.py | 7 ++++ pandas/core/internals/blocks.py | 40 ++++++++++++++---- pandas/tests/frame/methods/test_replace.py | 9 ---- pandas/tests/series/indexing/test_setitem.py | 18 +++----- pandas/tests/series/methods/test_replace.py | 16 +++----- 7 files changed, 76 insertions(+), 59 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 0751554d87dc8..3e699e1a27b55 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -107,10 +107,10 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) +- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) -- Interval ^^^^^^^^ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 7e8726f96f90a..e163a9df8ee10 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -726,20 +726,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]: return arr, self.dtype.na_value - def __setitem__(self, key, value) -> None: - value = extract_array(value, extract_numpy=True) - if isinstance(value, type(self)): - # extract_array doesn't extract NumpyExtensionArray subclasses - value = value._ndarray - - key = check_array_indexer(self, key) - scalar_key = lib.is_scalar(key) - scalar_value = lib.is_scalar(value) - if scalar_key and not scalar_value: - raise ValueError("setting an array element with a sequence.") - - # validate new items - if scalar_value: + def _maybe_convert_setitem_value(self, value): + """Maybe convert value to be pyarrow compatible.""" + if lib.is_scalar(value): if isna(value): value = self.dtype.na_value elif not isinstance(value, str): @@ -749,8 +738,11 @@ def __setitem__(self, key, value) -> None: "instead." ) else: + value = extract_array(value, extract_numpy=True) if not is_array_like(value): value = np.asarray(value, dtype=object) + elif isinstance(value.dtype, type(self.dtype)): + return value else: # cast categories and friends to arrays to see if values are # compatible, compatibility with arrow backed strings @@ -760,11 +752,26 @@ def __setitem__(self, key, value) -> None: "Invalid value for dtype 'str'. Value should be a " "string or missing value (or array of those)." ) + return value - mask = isna(value) - if mask.any(): - value = value.copy() - value[isna(value)] = self.dtype.na_value + def __setitem__(self, key, value) -> None: + value = self._maybe_convert_setitem_value(value) + + key = check_array_indexer(self, key) + scalar_key = lib.is_scalar(key) + scalar_value = lib.is_scalar(value) + if scalar_key and not scalar_value: + raise ValueError("setting an array element with a sequence.") + + if not scalar_value: + if value.dtype == self.dtype: + value = value._ndarray + else: + value = np.asarray(value) + mask = isna(value) + if mask.any(): + value = value.copy() + value[isna(value)] = self.dtype.na_value super().__setitem__(key, value) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7a92b7306beea..1bc944935756e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1754,6 +1754,13 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: except (ValueError, TypeError): return False + if dtype == "string": + try: + arr._maybe_convert_setitem_value(element) # type: ignore[union-attr] + return True + except (ValueError, TypeError): + return False + # This is technically incorrect, but maintains the behavior of # ExtensionBlock._can_hold_element return True diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 917a65348b7a3..7ee1361912c05 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -84,6 +84,7 @@ ABCNumpyExtensionArray, ABCSeries, ) +from pandas.core.dtypes.inference import is_re from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, @@ -115,6 +116,7 @@ PeriodArray, TimedeltaArray, ) +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.computation import expressions @@ -476,7 +478,9 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]: # Up/Down-casting @final - def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: + def coerce_to_target_dtype( + self, other, warn_on_upcast: bool = False, using_cow: bool = False + ) -> Block: """ coerce the current block to a dtype compat for other we will return a block, possibly object, and not raise @@ -528,7 +532,14 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block: f"{self.values.dtype}. Please report a bug at " "https://github.com/pandas-dev/pandas/issues." ) - return self.astype(new_dtype, copy=False) + copy = False + if ( + not using_cow + and isinstance(self.dtype, StringDtype) + and self.dtype.storage == "python" + ): + copy = True + return self.astype(new_dtype, copy=copy, using_cow=using_cow) @final def _maybe_downcast( @@ -879,7 +890,7 @@ def replace( else: return [self] if inplace else [self.copy()] - elif self._can_hold_element(value): + elif self._can_hold_element(value) or (self.dtype == "string" and is_re(value)): # TODO(CoW): Maybe split here as well into columns where mask has True # and rest? blk = self._maybe_copy(using_cow, inplace) @@ -926,12 +937,13 @@ def replace( if value is None or value is NA: blk = self.astype(np.dtype(object)) else: - blk = self.coerce_to_target_dtype(value) + blk = self.coerce_to_target_dtype(value, using_cow=using_cow) return blk.replace( to_replace=to_replace, value=value, inplace=True, mask=mask, + using_cow=using_cow, ) else: @@ -980,16 +992,26 @@ def _replace_regex( ------- List[Block] """ - if not self._can_hold_element(to_replace): + if not is_re(to_replace) and not self._can_hold_element(to_replace): # i.e. only if self.is_object is True, but could in principle include a # String ExtensionBlock if using_cow: return [self.copy(deep=False)] return [self] if inplace else [self.copy()] - rx = re.compile(to_replace) + if is_re(to_replace) and self.dtype not in [object, "string"]: + # only object or string dtype can hold strings, and a regex object + # will only match strings + return [self.copy(deep=False)] - block = self._maybe_copy(using_cow, inplace) + if not ( + self._can_hold_element(value) or (self.dtype == "string" and is_re(value)) + ): + block = self.astype(np.dtype(object)) + else: + block = self._maybe_copy(using_cow, inplace) + + rx = re.compile(to_replace) replace_regex(block.values, rx, value, mask) @@ -1048,7 +1070,9 @@ def replace_list( # Exclude anything that we know we won't contain pairs = [ - (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) + (x, y) + for x, y in zip(src_list, dest_list) + if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x))) ] if not len(pairs): if using_cow: diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index ccee7ca24bd3d..8df9893e73766 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -297,7 +297,6 @@ def test_regex_replace_dict_nested_non_first_character( expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) @@ -556,7 +555,6 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) @@ -932,7 +930,6 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -961,7 +958,6 @@ def test_replace_limit(self): # TODO pass - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_dict_no_regex(self): answer = Series( { @@ -985,7 +981,6 @@ def test_replace_dict_no_regex(self): result = answer.replace(weights) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_series_no_regex(self): answer = Series( { @@ -1104,7 +1099,6 @@ def test_replace_swapping_bug(self, using_infer_string): expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_period(self): d = { "fname": { @@ -1141,7 +1135,6 @@ def test_replace_period(self): result = df.replace(d) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") def test_replace_datetime(self): d = { "fname": { @@ -1367,7 +1360,6 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize( "replacer", [ @@ -1644,7 +1636,6 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @pytest.mark.parametrize("regex", [False, True]) def test_replace_regex_dtype_frame(self, regex): # GH-48644 diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index d95ee99489076..a1263e2d30853 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -886,24 +886,16 @@ def test_index_where(self, obj, key, expected, warn, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): - with pytest.raises(TypeError, match="Invalid value"): - Index(obj, dtype=obj.dtype).where(~mask, val) - else: - res = Index(obj, dtype=obj.dtype).where(~mask, val) - expected_idx = Index(expected, dtype=expected.dtype) - tm.assert_index_equal(res, expected_idx) + res = Index(obj, dtype=obj.dtype).where(~mask, val) + expected_idx = Index(expected, dtype=expected.dtype) + tm.assert_index_equal(res, expected_idx) def test_index_putmask(self, obj, key, expected, warn, val): mask = np.zeros(obj.shape, dtype=bool) mask[key] = True - if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): - with pytest.raises(TypeError, match="Invalid value"): - Index(obj, dtype=obj.dtype).putmask(mask, val) - else: - res = Index(obj, dtype=obj.dtype).putmask(mask, val) - tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) + res = Index(obj, dtype=obj.dtype).putmask(mask, val) + tm.assert_index_equal(res, Index(expected, dtype=expected.dtype)) @pytest.mark.parametrize( diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 79a66526a0004..0b0cf57a70c3f 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -403,10 +403,6 @@ def test_replace_categorical(self, categorical, numeric, using_infer_string): ser = pd.Series(categorical) msg = "Downcasting behavior in `replace`" msg = "with CategoricalDtype is deprecated" - if using_infer_string: - with pytest.raises(TypeError, match="Invalid value"): - ser.replace({"A": 1, "B": 2}) - return with tm.assert_produces_warning(FutureWarning, match=msg): result = ser.replace({"A": 1, "B": 2}) expected = pd.Series(numeric).astype("category") @@ -745,13 +741,13 @@ def test_replace_regex_dtype_series(self, regex): tm.assert_series_equal(result, expected) @pytest.mark.parametrize("regex", [False, True]) - def test_replace_regex_dtype_series_string(self, regex, using_infer_string): - if not using_infer_string: - # then this is object dtype which is already tested above - return + def test_replace_regex_dtype_series_string(self, regex): series = pd.Series(["0"], dtype="str") - with pytest.raises(TypeError, match="Invalid value"): - series.replace(to_replace="0", value=1, regex=regex) + expected = pd.Series([1], dtype="int64") + msg = "Downcasting behavior in `replace`" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = series.replace(to_replace="0", value=1, regex=regex) + tm.assert_series_equal(result, expected) def test_replace_different_int_types(self, any_int_numpy_dtype): # GH#45311 From c875a53fe39b0215c8b9593bd5bd7a8d60c9683c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2024 21:19:26 +0100 Subject: [PATCH 23/42] [backport 2.3.x] BUG (string dtype): let fillna with invalid value upcast to object dtype (#60296) (#60316) BUG (string dtype): let fillna with invalid value upcast to object dtype (#60296) * BUG (string dtype): let fillna with invalid value upcast to object dtype * fix fillna limit case + update tests for no longer raising (cherry picked from commit 34c39e9078ea8af12871a92bdcea2058553c9869) --- pandas/core/internals/blocks.py | 6 +++--- pandas/tests/frame/indexing/test_where.py | 8 +------- pandas/tests/series/indexing/test_setitem.py | 6 ------ 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7ee1361912c05..6ae591a5d4ac8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1710,7 +1710,7 @@ def fillna( return nbs if limit is not None: - mask[mask.cumsum(self.ndim - 1) > limit] = False + mask[mask.cumsum(self.values.ndim - 1) > limit] = False if inplace: nbs = self.putmask( @@ -2136,7 +2136,7 @@ def where( res_values = arr._where(cond, other).T except (ValueError, TypeError): if self.ndim == 1 or self.shape[0] == 1: - if isinstance(self.dtype, IntervalDtype): + if isinstance(self.dtype, (IntervalDtype, StringDtype)): # TestSetitemFloatIntervalWithIntIntervalValues blk = self.coerce_to_target_dtype(orig_other) nbs = blk.where(orig_other, orig_cond, using_cow=using_cow) @@ -2338,7 +2338,7 @@ def fillna( using_cow: bool = False, already_warned=None, ) -> list[Block]: - if isinstance(self.dtype, IntervalDtype): + if isinstance(self.dtype, (IntervalDtype, StringDtype)): # Block.fillna handles coercion (test_fillna_interval) return super().fillna( value=value, diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 5fd3796d0255a..356257bbfec98 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -1086,15 +1086,9 @@ def test_where_producing_ea_cond_for_np_dtype(): @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) -def test_where_int_overflow(replacement, using_infer_string): +def test_where_int_overflow(replacement): # GH 31687 df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) - if using_infer_string and replacement not in (None, "snake"): - with pytest.raises( - TypeError, match=f"Invalid value '{replacement}' for dtype 'str'" - ): - df.where(pd.notnull(df), replacement) - return result = df.where(pd.notnull(df), replacement) expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index a1263e2d30853..85558e85494eb 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -34,7 +34,6 @@ concat, date_range, interval_range, - isna, period_range, timedelta_range, ) @@ -865,11 +864,6 @@ def test_series_where(self, obj, key, expected, warn, val, is_inplace): obj = obj.copy() arr = obj._values - if obj.dtype == "string" and not (isinstance(val, str) or isna(val)): - with pytest.raises(TypeError, match="Invalid value"): - obj.where(~mask, val) - return - res = obj.where(~mask, val) if val is NA and res.dtype == object: From 9537b2081187313d6a37169418df2bed468f2b9b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2024 21:19:51 +0100 Subject: [PATCH 24/42] [backport 2.3.x] TST (string dtype): resolve all easy xfails in pandas/tests/groupby (#60314) (#60317) TST (string dtype): resolve all easy xfails in pandas/tests/groupby (#60314) (cherry picked from commit c4a20261c337d68dc470fb6fd6a5505e2c7348d0) --- pandas/tests/groupby/aggregate/test_aggregate.py | 8 ++------ pandas/tests/groupby/aggregate/test_cython.py | 7 +++---- pandas/tests/groupby/aggregate/test_other.py | 6 ++---- pandas/tests/groupby/methods/test_quantile.py | 5 +---- pandas/tests/groupby/methods/test_size.py | 2 ++ pandas/tests/groupby/test_categorical.py | 9 +++++---- pandas/tests/groupby/test_groupby.py | 9 +++------ pandas/tests/groupby/test_groupby_dropna.py | 5 +---- pandas/tests/groupby/test_grouping.py | 10 ++++------ pandas/tests/groupby/test_pipe.py | 6 +----- pandas/tests/groupby/test_reductions.py | 7 ++----- pandas/tests/groupby/test_timegrouper.py | 2 ++ pandas/tests/groupby/transform/test_transform.py | 7 ++----- 13 files changed, 30 insertions(+), 53 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b267347aaf030..f02a828fe8d17 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError from pandas.core.dtypes.common import is_integer_dtype @@ -335,12 +333,11 @@ def aggfun_1(ser): assert len(result) == 0 -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wrap_agg_out(three_group): grouped = three_group.groupby(["A", "B"]) def func(ser): - if ser.dtype == object: + if ser.dtype in (object, "string"): raise TypeError("Test error message") return ser.sum() @@ -1101,7 +1098,6 @@ def test_lambda_named_agg(func): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_aggregate_mixed_types(): # GH 16916 df = DataFrame( @@ -1113,7 +1109,7 @@ def test_aggregate_mixed_types(): expected = DataFrame( expected_data, index=Index([2, "group 1"], dtype="object", name="grouping"), - columns=Index(["X", "Y", "Z"], dtype="object"), + columns=Index(["X", "Y", "Z"]), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 2990fb5949242..0d04af3801dbe 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -95,7 +93,6 @@ def test_cython_agg_boolean(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_cython_agg_nothing_to_agg(): frame = DataFrame( {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25} @@ -111,7 +108,9 @@ def test_cython_agg_nothing_to_agg(): result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True) expected = DataFrame( - [], index=frame["a"].sort_values().drop_duplicates(), columns=[] + [], + index=frame["a"].sort_values().drop_duplicates(), + columns=Index([], dtype="str"), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 5904b2f48359e..213704f31aca5 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import SpecificationError import pandas as pd @@ -308,7 +306,6 @@ def test_series_agg_multikey(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_series_agg_multi_pure_python(): data = DataFrame( { @@ -358,7 +355,8 @@ def test_series_agg_multi_pure_python(): ) def bad(x): - assert len(x.values.base) > 0 + if isinstance(x.values, np.ndarray): + assert len(x.values.base) > 0 return "foo" result = data.groupby(["A", "B"]).agg(bad) diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py index 4269b41a0871b..3943590b069ad 100644 --- a/pandas/tests/groupby/methods/test_quantile.py +++ b/pandas/tests/groupby/methods/test_quantile.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -170,11 +168,10 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_quantile_raises(): df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]) - msg = "dtype 'object' does not support operation 'quantile'" + msg = "dtype '(object|str)' does not support operation 'quantile'" with pytest.raises(TypeError, match=msg): df.groupby("key").quantile() diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index fb834ee2a8799..271802c447024 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -108,6 +108,8 @@ def test_size_series_masked_type_returns_Int64(dtype): tm.assert_series_equal(result, expected) +# TODO(infer_string) in case the column is object dtype, it should preserve that dtype +# for the result's index @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_size_strings(any_string_dtype): # GH#55627 diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index cded7a71458fa..447df952fd0e5 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Categorical, @@ -340,8 +338,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_observed(observed): +def test_observed(request, using_infer_string, observed): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) @@ -349,6 +346,10 @@ def test_observed(observed): # gh-8138 (back-compat) # gh-8869 + if using_infer_string and not observed: + # TODO(infer_string) this fails with filling the string column with 0 + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) + cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3e2d15ede3648..9b362164c6149 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1617,7 +1617,6 @@ def test_groupby_two_group_keys_all_nan(): assert result == {} -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_2d_malformed(): d = DataFrame(index=range(2)) d["group"] = ["g1", "g2"] @@ -1626,7 +1625,7 @@ def test_groupby_2d_malformed(): d["label"] = ["l1", "l2"] tmp = d.groupby(["group"]).mean(numeric_only=True) res_values = np.array([[0.0, 1.0], [0.0, 1.0]]) - tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"])) + tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object)) tm.assert_numpy_array_equal(tmp.values, res_values) @@ -2711,7 +2710,6 @@ def test_groupby_all_nan_groups_drop(): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("numeric_only", [True, False]) def test_groupby_empty_multi_column(as_index, numeric_only): # GH 15106 & GH 41998 @@ -2720,7 +2718,7 @@ def test_groupby_empty_multi_column(as_index, numeric_only): result = gb.sum(numeric_only=numeric_only) if as_index: index = MultiIndex([[], []], [[], []], names=["A", "B"]) - columns = ["C"] if not numeric_only else [] + columns = ["C"] if not numeric_only else Index([], dtype="str") else: index = RangeIndex(0) columns = ["A", "B", "C"] if not numeric_only else ["A", "B"] @@ -2728,7 +2726,6 @@ def test_groupby_empty_multi_column(as_index, numeric_only): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_aggregation_non_numeric_dtype(): # GH #43108 df = DataFrame( @@ -2739,7 +2736,7 @@ def test_groupby_aggregation_non_numeric_dtype(): { "v": [[1, 1], [10, 20]], }, - index=Index(["M", "W"], dtype="object", name="MW"), + index=Index(["M", "W"], name="MW"), ) gb = df.groupby(by=["MW"]) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 7e65e56abc4c9..2a9b61aa7ebf5 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under10p1 from pandas.core.dtypes.missing import na_value_for_dtype @@ -99,7 +97,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups( tm.assert_frame_equal(grouped, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "dropna, idx, outputs", [ @@ -126,7 +123,7 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs): df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"]) grouped = df.groupby("a", dropna=dropna).sum() - expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a")) + expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a")) tm.assert_frame_equal(grouped, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 7c0a4b78a123d..9a0e67dea532b 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -9,8 +9,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( CategoricalIndex, @@ -844,7 +842,6 @@ def test_groupby_empty(self): expected = ["name"] assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_level_index_value_all_na(self): # issue 20519 df = DataFrame( @@ -854,7 +851,7 @@ def test_groupby_level_index_value_all_na(self): expected = DataFrame( data=[], index=MultiIndex( - levels=[Index(["x"], dtype="object"), Index([], dtype="float64")], + levels=[Index(["x"], dtype="str"), Index([], dtype="float64")], codes=[[], []], names=["A", "B"], ), @@ -989,12 +986,13 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_single_column(self): df = DataFrame({"a": list("abssbab")}) tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]]) # GH 13530 - exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[]) + exp = DataFrame( + index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str") + ) tm.assert_frame_equal(df.groupby("a").count(), exp) tm.assert_frame_equal(df.groupby("a").sum(), exp) diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py index 1044c83e3e56b..ee59a93695bcf 100644 --- a/pandas/tests/groupby/test_pipe.py +++ b/pandas/tests/groupby/test_pipe.py @@ -1,7 +1,4 @@ import numpy as np -import pytest - -from pandas._config import using_string_dtype import pandas as pd from pandas import ( @@ -11,7 +8,6 @@ import pandas._testing as tm -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_pipe(): # Test the pipe method of DataFrameGroupBy. # Issue #17871 @@ -39,7 +35,7 @@ def square(srs): # NDFrame.pipe methods result = df.groupby("A").pipe(f).pipe(square) - index = Index(["bar", "foo"], dtype="object", name="A") + index = Index(["bar", "foo"], name="A") expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index) tm.assert_series_equal(expected, result) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 8e1bbcb43e3f3..599b0aabf85d5 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import iNaT from pandas.core.dtypes.common import pandas_dtype @@ -457,8 +455,7 @@ def test_max_min_non_numeric(): assert "ss" in result -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_max_min_object_multiple_columns(using_array_manager): +def test_max_min_object_multiple_columns(using_array_manager, using_infer_string): # GH#41111 case where the aggregation is valid for some columns but not # others; we split object blocks column-wise, consistent with # DataFrame._reduce @@ -472,7 +469,7 @@ def test_max_min_object_multiple_columns(using_array_manager): ) df._consolidate_inplace() # should already be consolidate, but double-check if not using_array_manager: - assert len(df._mgr.blocks) == 2 + assert len(df._mgr.blocks) == 3 if using_infer_string else 2 gb = df.groupby("A") diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 92dfe146bbb54..3bae719e01b73 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -75,6 +75,8 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper): class TestGroupBy: + # TODO(infer_string) resample sum introduces 0's + # https://github.com/pandas-dev/pandas/issues/60229 @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_with_timegrouper(self): # GH 4161 diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index a516af7e15943..18ce6e93de402 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -2,8 +2,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.core.dtypes.common import ensure_platform_int @@ -1229,20 +1227,19 @@ def test_groupby_transform_with_datetimes(func, values): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_groupby_transform_dtype(): # GH 22243 df = DataFrame({"a": [1], "val": [1.35]}) result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}")) - expected1 = Series(["+1.35"], name="val", dtype="object") + expected1 = Series(["+1.35"], name="val") tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}")) tm.assert_series_equal(result, expected1) result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})")) - expected2 = Series(["+(1.35)"], name="val", dtype="object") + expected2 = Series(["+(1.35)"], name="val") tm.assert_series_equal(result, expected2) df["val"] = df["val"].astype(object) From aa8adfa6b7451cbc016743596312099403cb4fd4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 14 Nov 2024 23:16:11 +0100 Subject: [PATCH 25/42] [backport 2.3.x] String dtype: enable in SQL IO + resolve all xfails (#60255) (#60315) (cherry picked from commit ba4d1cfdda14bf521ff91d6ad432b21095c417fd) Co-authored-by: Will Ayd --- pandas/_libs/lib.pyx | 8 +++++++- pandas/core/dtypes/cast.py | 2 ++ pandas/core/internals/construction.py | 5 +++-- pandas/io/sql.py | 21 +++++++++++++++++++-- pandas/tests/io/test_sql.py | 23 +++++++++++++---------- 5 files changed, 44 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c23f907aecfab..bc039917aef87 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2741,7 +2741,13 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_string_dtype() and is_string_array(objects, skipna=True): + if convert_to_nullable_dtype and is_string_array(objects, skipna=True): + from pandas.core.arrays.string_ import StringDtype + + dtype = StringDtype() + return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) + + elif using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(na_value=np.nan) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1bc944935756e..a1ef8a3e27e9c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1163,6 +1163,7 @@ def convert_dtypes( def maybe_infer_to_datetimelike( value: npt.NDArray[np.object_], + convert_to_nullable_dtype: bool = False, ) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray: """ we might have a array (or single object) that is datetime like, @@ -1200,6 +1201,7 @@ def maybe_infer_to_datetimelike( # numpy would have done it for us. convert_numeric=False, convert_non_numeric=True, + convert_to_nullable_dtype=convert_to_nullable_dtype, dtype_if_all_nat=np.dtype("M8[ns]"), ) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 137648ee52bf7..64fac5fcfcdc2 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -1042,8 +1042,9 @@ def convert(arr): if dtype is None: if arr.dtype == np.dtype("O"): # i.e. maybe_convert_objects didn't convert - arr = maybe_infer_to_datetimelike(arr) - if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): + convert_to_nullable_dtype = dtype_backend != "numpy" + arr = maybe_infer_to_datetimelike(arr, convert_to_nullable_dtype) + if convert_to_nullable_dtype and arr.dtype == np.dtype("O"): new_dtype = StringDtype() arr_cls = new_dtype.construct_array_type() arr = arr_cls._from_sequence(arr, dtype=new_dtype) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 03ef1792f1fb8..07c95806d7326 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -46,6 +46,8 @@ from pandas.core.dtypes.common import ( is_dict_like, is_list_like, + is_object_dtype, + is_string_dtype, ) from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -59,6 +61,7 @@ Series, ) from pandas.core.arrays import ArrowExtensionArray +from pandas.core.arrays.string_ import StringDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.common import maybe_make_list @@ -1331,7 +1334,12 @@ def _harmonize_columns( elif dtype_backend == "numpy" and col_type is float: # floats support NA, can always convert! self.frame[col_name] = df_col.astype(col_type, copy=False) - + elif ( + using_string_dtype() + and is_string_dtype(col_type) + and is_object_dtype(self.frame[col_name]) + ): + self.frame[col_name] = df_col.astype(col_type, copy=False) elif dtype_backend == "numpy" and len(df_col) == df_col.count(): # No NA values, can convert ints and bools if col_type is np.dtype("int64") or col_type is bool: @@ -1418,6 +1426,7 @@ def _get_dtype(self, sqltype): DateTime, Float, Integer, + String, ) if isinstance(sqltype, Float): @@ -1437,6 +1446,10 @@ def _get_dtype(self, sqltype): return date elif isinstance(sqltype, Boolean): return bool + elif isinstance(sqltype, String): + if using_string_dtype(): + return StringDtype(na_value=np.nan) + return object @@ -2218,7 +2231,7 @@ def read_table( elif using_string_dtype(): from pandas.io._util import arrow_string_types_mapper - arrow_string_types_mapper() + mapping = arrow_string_types_mapper() else: mapping = None @@ -2299,6 +2312,10 @@ def read_query( from pandas.io._util import _arrow_dtype_mapping mapping = _arrow_dtype_mapping().get + elif using_string_dtype(): + from pandas.io._util import arrow_string_types_mapper + + mapping = arrow_string_types_mapper() else: mapping = None diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 514eaceaccbe6..29efe7a457ff8 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -63,7 +63,7 @@ pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ), - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), + pytest.mark.single_cpu, ] @@ -685,6 +685,7 @@ def postgresql_psycopg2_conn(postgresql_psycopg2_engine): @pytest.fixture def postgresql_adbc_conn(): + pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_postgresql") from adbc_driver_postgresql import dbapi @@ -817,6 +818,7 @@ def sqlite_conn_types(sqlite_engine_types): @pytest.fixture def sqlite_adbc_conn(): + pytest.importorskip("pyarrow") pytest.importorskip("adbc_driver_sqlite") from adbc_driver_sqlite import dbapi @@ -986,13 +988,13 @@ def test_dataframe_to_sql(conn, test_frame1, request): @pytest.mark.parametrize("conn", all_connectable) def test_dataframe_to_sql_empty(conn, test_frame1, request): - if conn == "postgresql_adbc_conn": + if conn == "postgresql_adbc_conn" and not using_string_dtype(): request.node.add_marker( pytest.mark.xfail( - reason="postgres ADBC driver cannot insert index with null type", - strict=True, + reason="postgres ADBC driver < 1.2 cannot insert index with null type", ) ) + # GH 51086 if conn is sqlite_engine conn = request.getfixturevalue(conn) empty_df = test_frame1.iloc[:0] @@ -3571,7 +3573,8 @@ def test_read_sql_dtype_backend( result = getattr(pd, func)( f"Select * from {table}", conn, dtype_backend=dtype_backend ) - expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -3621,7 +3624,7 @@ def test_read_sql_dtype_backend_table( with pd.option_context("mode.string_storage", string_storage): result = getattr(pd, func)(table, conn, dtype_backend=dtype_backend) - expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) + expected = dtype_backend_expected(string_storage, dtype_backend, conn_name) tm.assert_frame_equal(result, expected) if "adbc" in conn_name: @@ -4150,7 +4153,7 @@ def tquery(query, con=None): def test_xsqlite_basic(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) assert sql.to_sql(frame, name="test_table", con=sqlite_buildin, index=False) == 10 @@ -4177,7 +4180,7 @@ def test_xsqlite_basic(sqlite_buildin): def test_xsqlite_write_row_by_row(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) frame.iloc[0, 0] = np.nan @@ -4200,7 +4203,7 @@ def test_xsqlite_write_row_by_row(sqlite_buildin): def test_xsqlite_execute(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) create_sql = sql.get_schema(frame, "test") @@ -4221,7 +4224,7 @@ def test_xsqlite_execute(sqlite_buildin): def test_xsqlite_schema(sqlite_buildin): frame = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=10, freq="B"), ) create_sql = sql.get_schema(frame, "test") From fe1f4f9c5b8ccbab10d12a9d20e7060a003e59f4 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 15 Nov 2024 13:44:44 -0500 Subject: [PATCH 26/42] Backport PR #60318 on branch 2.3.x (TST (string dtype): resolve all xfails in JSON IO tests) (#60327) Backport PR #60318: TST (string dtype): resolve all xfails in JSON IO tests (cherry picked from commit 9bc88c79e6fd146a44970309bacc90490fdec590) --- pandas/tests/io/json/test_json_table_schema.py | 8 +------- pandas/tests/io/json/test_pandas.py | 14 ++++---------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 1e47b3bc38737..1c7320aa7a083 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, @@ -26,10 +24,6 @@ set_default_names, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - @pytest.fixture def df_schema(): @@ -126,7 +120,7 @@ def test_multiindex(self, df_schema, using_infer_string): expected["fields"][0] = { "name": "level_0", "type": "any", - "extDtype": "string", + "extDtype": "str", } expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "str"} assert result == expected diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index a8608434be5ee..10f1e7df648f0 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -118,7 +118,7 @@ def datetime_frame(self): # since that doesn't round-trip, see GH#33711 df = DataFrame( np.random.default_rng(2).standard_normal((30, 4)), - columns=Index(list("ABCD"), dtype=object), + columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=30, freq="B"), ) df.index = df.index._with_freq(None) @@ -203,7 +203,6 @@ def test_roundtrip_simple(self, orient, convert_axes, dtype, float_frame): assert_json_roundtrip_equal(result, expected, orient) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype", [False, np.int64]) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_intframe(self, orient, convert_axes, dtype, int_frame): @@ -281,7 +280,6 @@ def test_roundtrip_empty(self, orient, convert_axes): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("convert_axes", [True, False]) def test_roundtrip_timestamp(self, orient, convert_axes, datetime_frame): # TODO: improve coverage with date_format parameter @@ -709,7 +707,6 @@ def test_series_roundtrip_simple(self, orient, string_series, using_infer_string tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("dtype", [False, None]) def test_series_roundtrip_object(self, orient, dtype, object_series): data = StringIO(object_series.to_json(orient=orient)) @@ -721,6 +718,9 @@ def test_series_roundtrip_object(self, orient, dtype, object_series): if orient != "split": expected.name = None + if using_string_dtype(): + expected = expected.astype("str") + tm.assert_series_equal(result, expected) def test_series_roundtrip_empty(self, orient): @@ -814,7 +814,6 @@ def test_path(self, float_frame, int_frame, datetime_frame): df.to_json(path) read_json(path) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_axis_dates(self, datetime_series, datetime_frame): # frame json = StringIO(datetime_frame.to_json()) @@ -827,7 +826,6 @@ def test_axis_dates(self, datetime_series, datetime_frame): tm.assert_series_equal(result, datetime_series, check_names=False) assert result.name is None - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_convert_dates(self, datetime_series, datetime_frame): # frame df = datetime_frame @@ -898,7 +896,6 @@ def test_convert_dates_infer(self, infer_word): result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "date,date_unit", [ @@ -959,7 +956,6 @@ def test_date_format_series_raises(self, datetime_series): with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) def test_date_unit(self, unit, datetime_frame): df = datetime_frame @@ -1065,7 +1061,6 @@ def test_round_trip_exception(self, datapath): res = res.fillna(np.nan, downcast=False) tm.assert_frame_equal(res, df) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.network @pytest.mark.single_cpu @pytest.mark.parametrize( @@ -1474,7 +1469,6 @@ def test_data_frame_size_after_to_json(self): assert size_before == size_after - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "index", [None, [1, 2], [1.0, 2.0], ["a", "b"], ["1", "2"], ["1.", "2."]] ) From e37ffb3067c9032787dfb66836e11900e571ca32 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 15 Nov 2024 10:46:35 -0800 Subject: [PATCH 27/42] Backport PR #60320 on branch 2.3.x (TST (string dtype): resolve xfails in common IO tests) (#60325) Backport PR #60320: TST (string dtype): resolve xfails in common IO tests Co-authored-by: Joris Van den Bossche --- pandas/tests/io/test_clipboard.py | 13 ++++++------ pandas/tests/io/test_common.py | 33 +++++++++++++---------------- pandas/tests/io/test_compression.py | 15 ++++++------- pandas/tests/io/test_gcs.py | 5 ++--- 4 files changed, 29 insertions(+), 37 deletions(-) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 3a52ff5acc0b3..a16c63e8d3d65 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( PyperclipException, PyperclipWindowsException, @@ -26,10 +24,6 @@ init_qt_clipboard, ) -pytestmark = pytest.mark.xfail( - using_string_dtype(), reason="TODO(infer_string)", strict=False -) - def build_kwargs(sep, excel): kwargs = {} @@ -351,7 +345,7 @@ def test_raw_roundtrip(self, data): @pytest.mark.parametrize("engine", ["c", "python"]) def test_read_clipboard_dtype_backend( - self, clipboard, string_storage, dtype_backend, engine + self, clipboard, string_storage, dtype_backend, engine, using_infer_string ): # GH#50502 if dtype_backend == "pyarrow": @@ -396,6 +390,11 @@ def test_read_clipboard_dtype_backend( ) expected["g"] = ArrowExtensionArray(pa.array([None, None])) + if using_infer_string: + expected.columns = expected.columns.astype( + pd.StringDtype(string_storage, na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) def test_invalid_dtype_backend(self): diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index d38f716cf6a98..d1e42b297f143 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -154,7 +154,6 @@ def test_bytesiowrapper_returns_correct_bytes(self): assert result == data.encode("utf-8") # Test that pyarrow can handle a file opened with get_handle - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_handle_pyarrow_compat(self): pa_csv = pytest.importorskip("pyarrow.csv") @@ -169,6 +168,8 @@ def test_get_handle_pyarrow_compat(self): s = StringIO(data) with icom.get_handle(s, "rb", is_text=False) as handles: df = pa_csv.read_csv(handles.handle).to_pandas() + # TODO will have to update this when pyarrow' to_pandas() is fixed + expected = expected.astype("object") tm.assert_frame_equal(df, expected) assert not s.closed @@ -352,7 +353,6 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), ], ) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_write_fspath_all(self, writer_name, writer_kwargs, module): if writer_name in ["to_latex"]: # uses Styler implementation pytest.importorskip("jinja2") @@ -379,7 +379,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): expected = f_path.read() assert result == expected - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support") def test_write_fspath_hdf5(self): # Same test as write_fspath_all, except HDF5 files aren't # necessarily byte-for-byte identical for a given dataframe, so we'll @@ -450,14 +450,13 @@ def test_unknown_engine(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path) with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_binary_mode(self): """ 'encoding' shouldn't be passed to 'open' in binary mode. @@ -467,8 +466,8 @@ def test_binary_mode(self): with tm.ensure_clean() as path: df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) df.to_csv(path, mode="w+b") tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) @@ -485,8 +484,8 @@ def test_warning_missing_utf_bom(self, encoding, compression_): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with tm.assert_produces_warning(UnicodeWarning): @@ -516,15 +515,14 @@ def test_is_fsspec_url(): assert icom.is_fsspec_url("RFC-3986+compliant.spec://something") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", [None, "utf-8"]) @pytest.mark.parametrize("format", ["csv", "json"]) def test_codecs_encoding(encoding, format): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with codecs.open(path, mode="w", encoding=encoding) as handle: @@ -537,13 +535,12 @@ def test_codecs_encoding(encoding, format): tm.assert_frame_equal(expected, df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_codecs_get_writer_reader(): # GH39247 expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with tm.ensure_clean() as path: with open(path, "wb") as handle: @@ -568,8 +565,8 @@ def test_explicit_encoding(io_class, mode, msg): # wrong mode is requested expected = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) with io_class() as buffer: with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 25504c7b88fdb..af89f0916355e 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -12,8 +12,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import is_platform_windows import pandas as pd @@ -139,7 +137,6 @@ def test_compression_warning(compression_only): df.to_csv(handles.handle, compression=compression_only) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_compression_binary(compression_only): """ Binary file handles support compression. @@ -148,8 +145,8 @@ def test_compression_binary(compression_only): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) # with a file @@ -180,8 +177,8 @@ def test_gzip_reproducibility_file_name(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} @@ -203,8 +200,8 @@ def test_gzip_reproducibility_file_object(): """ df = pd.DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=pd.Index(list("ABCD"), dtype=object), - index=pd.Index([f"i-{i}" for i in range(30)], dtype=object), + columns=pd.Index(list("ABCD")), + index=pd.Index([f"i-{i}" for i in range(30)]), ) compression_options = {"method": "gzip", "mtime": 1} diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 81f951b3958b0..c7671bfb513aa 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -147,7 +147,6 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("encoding", ["utf-8", "cp1251"]) def test_to_csv_compression_encoding_gcs( gcs_buffer, compression_only, encoding, compression_to_extension @@ -160,8 +159,8 @@ def test_to_csv_compression_encoding_gcs( """ df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) # reference of compressed and encoded file From 4f13697fb71085a1f6b4ffe12a270ff9d423609a Mon Sep 17 00:00:00 2001 From: William Ayd Date: Fri, 15 Nov 2024 17:10:56 -0500 Subject: [PATCH 28/42] Backport PR #60312 on branch 2.3.x (TST (string dtype): resolve xfails in pandas/tests/apply + raise TypeError for ArrowArray accumulate) (#60328) * Backport PR #60312 on branch 2.3.x (TST (string dtype): resolve xfails in pandas/tests/apply + raise TypeError for ArrowArray accumulate) (cherry picked from commit fba5f08f048215a6e0a578f8bad7b7f2c9ee8eef) * 2.3 test function compat --------- Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/arrow/array.py | 6 +++++- pandas/tests/apply/test_invalid_arg.py | 30 +++++++++----------------- pandas/tests/apply/test_str.py | 13 ++++++----- pandas/tests/extension/test_arrow.py | 2 +- 4 files changed, 24 insertions(+), 27 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 13e10c8d3a738..0c1e1d0c63c85 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1633,7 +1633,11 @@ def _accumulate( else: data_to_accum = data_to_accum.cast(pa.int64()) - result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + try: + result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) + except pa.ArrowNotImplementedError as err: + msg = f"operation '{name}' not supported for dtype '{self.dtype}'" + raise TypeError(msg) from err if convert_to_int: result = result.cast(pa_dtype) diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py index 8963265b0a800..68f3fe36546a0 100644 --- a/pandas/tests/apply/test_invalid_arg.py +++ b/pandas/tests/apply/test_invalid_arg.py @@ -218,18 +218,12 @@ def transform(row): def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string): # GH 21224 if using_infer_string: - if df.dtypes.iloc[0].storage == "pyarrow": - import pyarrow as pa - - # TODO(infer_string) - # should raise a proper TypeError instead of propagating the pyarrow error - - expected = (expected, pa.lib.ArrowNotImplementedError) - else: - expected = (expected, NotImplementedError) + expected = (expected, NotImplementedError) msg = ( - "can't multiply sequence by non-int of type 'str'|has no kernel|cannot perform" + "can't multiply sequence by non-int of type 'str'" + "|cannot perform cumprod with type str" # NotImplementedError python backend + "|operation 'cumprod' not supported for dtype 'str'" # TypeError pyarrow ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): @@ -259,16 +253,12 @@ def test_agg_cython_table_raises_series(series, func, expected, using_infer_stri if func == "median" or func is np.nanmedian or func is np.median: msg = r"Cannot convert \['a' 'b' 'c'\] to numeric" - if using_infer_string: - if series.dtype.storage == "pyarrow": - import pyarrow as pa - - # TODO(infer_string) - # should raise a proper TypeError instead of propagating the pyarrow error - expected = (expected, pa.lib.ArrowNotImplementedError) - else: - expected = (expected, NotImplementedError) - msg = msg + "|does not support|has no kernel|Cannot perform|cannot perform" + if using_infer_string and func in ("cumprod", np.cumprod, np.nancumprod): + expected = (expected, NotImplementedError) + + msg = ( + msg + "|does not support|has no kernel|Cannot perform|cannot perform|operation" + ) warn = None if isinstance(func, str) else FutureWarning with pytest.raises(expected, match=msg): diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py index 8956aed5e9ceb..f916567c6b883 100644 --- a/pandas/tests/apply/test_str.py +++ b/pandas/tests/apply/test_str.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.core.dtypes.common import is_number from pandas import ( @@ -88,7 +86,6 @@ def test_apply_np_transformer(float_frame, op, how): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -147,7 +144,6 @@ def test_agg_cython_table_series(series, func, expected): assert result == expected -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "series, func, expected", chain( @@ -170,10 +166,17 @@ def test_agg_cython_table_series(series, func, expected): ), ), ) -def test_agg_cython_table_transform_series(series, func, expected): +def test_agg_cython_table_transform_series(request, series, func, expected): # GH21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if series.dtype == "string" and func in ("cumsum", np.cumsum, np.nancumsum): + request.applymarker( + pytest.mark.xfail( + raises=(TypeError, NotImplementedError), + reason="TODO(infer_string) cumsum not yet implemented for string", + ) + ) warn = None if isinstance(func, str) else FutureWarning with tm.assert_produces_warning(warn, match="is currently using Series.*"): result = series.agg(func) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0ce7a66e0e00c..03ab7c7f1dad8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -436,7 +436,7 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques request.applymarker( pytest.mark.xfail( reason=f"{all_numeric_accumulations} not implemented for {pa_type}", - raises=NotImplementedError, + raises=TypeError, ) ) From 38565aa88ea27617f71ea754b213088772e3fad7 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sat, 16 Nov 2024 04:34:42 -0500 Subject: [PATCH 29/42] Backport PR #60324: REF: centralize pyarrow Table to pandas conversions and types_mapper handling (#60332) (cherry picked from commit 12d6f602eea98275553ac456f90201151b1f9bf8) Co-authored-by: Joris Van den Bossche --- pandas/io/_util.py | 49 ++++++++++++++++++++++- pandas/io/feather_format.py | 17 +------- pandas/io/json/_json.py | 15 +------ pandas/io/orc.py | 21 +--------- pandas/io/parquet.py | 34 ++++++++-------- pandas/io/parsers/arrow_parser_wrapper.py | 33 ++++++--------- pandas/io/sql.py | 41 ++++--------------- pandas/tests/io/test_sql.py | 4 +- 8 files changed, 92 insertions(+), 122 deletions(-) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index f3e6dba1391be..9373888e28d28 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -1,14 +1,27 @@ from __future__ import annotations -from typing import Callable +from typing import ( + TYPE_CHECKING, + Literal, +) import numpy as np +from pandas._config import using_string_dtype + +from pandas._libs import lib from pandas.compat import pa_version_under18p0 from pandas.compat._optional import import_optional_dependency import pandas as pd +if TYPE_CHECKING: + from collections.abc import Callable + + import pyarrow + + from pandas._typing import DtypeBackend + def _arrow_dtype_mapping() -> dict: pa = import_optional_dependency("pyarrow") @@ -30,7 +43,7 @@ def _arrow_dtype_mapping() -> dict: } -def arrow_string_types_mapper() -> Callable: +def _arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") mapping = { @@ -41,3 +54,35 @@ def arrow_string_types_mapper() -> Callable: mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan) return mapping.get + + +def arrow_table_to_pandas( + table: pyarrow.Table, + dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default, + null_to_int64: bool = False, + to_pandas_kwargs: dict | None = None, +) -> pd.DataFrame: + if to_pandas_kwargs is None: + to_pandas_kwargs = {} + + pa = import_optional_dependency("pyarrow") + + types_mapper: type[pd.ArrowDtype] | None | Callable + if dtype_backend == "numpy_nullable": + mapping = _arrow_dtype_mapping() + if null_to_int64: + # Modify the default mapping to also map null to Int64 + # (to match other engines - only for CSV parser) + mapping[pa.null()] = pd.Int64Dtype() + types_mapper = mapping.get + elif dtype_backend == "pyarrow": + types_mapper = pd.ArrowDtype + elif using_string_dtype(): + types_mapper = _arrow_string_types_mapper() + elif dtype_backend is lib.no_default or dtype_backend == "numpy": + types_mapper = None + else: + raise NotImplementedError + + df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) + return df diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 68c73483add3f..1bdb732cb10de 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -13,11 +13,10 @@ from pandas.util._decorators import doc from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.api import DataFrame from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import get_handle if TYPE_CHECKING: @@ -128,16 +127,4 @@ def read_feather( pa_table = feather.read_table( handles.handle, columns=columns, use_threads=bool(use_threads) ) - - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get) - - elif dtype_backend == "pyarrow": - return pa_table.to_pandas(types_mapper=pd.ArrowDtype) - - elif using_string_dtype(): - return pa_table.to_pandas(types_mapper=arrow_string_types_mapper()) - else: - raise NotImplementedError + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 9414f45215029..c0499ce750cf0 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -40,7 +40,6 @@ from pandas.core.dtypes.dtypes import PeriodDtype from pandas import ( - ArrowDtype, DataFrame, Index, MultiIndex, @@ -52,6 +51,7 @@ from pandas.core.reshape.concat import concat from pandas.core.shared_docs import _shared_docs +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, dedup_names, @@ -997,18 +997,7 @@ def read(self) -> DataFrame | Series: if self.engine == "pyarrow": pyarrow_json = import_optional_dependency("pyarrow.json") pa_table = pyarrow_json.read_json(self.data) - - mapping: type[ArrowDtype] | None | Callable - if self.dtype_backend == "pyarrow": - mapping = ArrowDtype - elif self.dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - else: - mapping = None - - return pa_table.to_pandas(types_mapper=mapping) + return arrow_table_to_pandas(pa_table, dtype_backend=self.dtype_backend) elif self.engine == "ujson": if self.lines: if self.chunksize: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 5706336b71697..d7f473a929568 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -9,16 +9,13 @@ Literal, ) -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas.core.indexes.api import default_index -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( get_handle, is_fsspec_url, @@ -117,21 +114,7 @@ def read_orc( pa_table = orc.read_table( source=source, columns=columns, filesystem=filesystem, **kwargs ) - if dtype_backend is not lib.no_default: - if dtype_backend == "pyarrow": - df = pa_table.to_pandas(types_mapper=pd.ArrowDtype) - else: - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - df = pa_table.to_pandas(types_mapper=mapping.get) - return df - else: - if using_string_dtype(): - types_mapper = arrow_string_types_mapper() - else: - types_mapper = None - return pa_table.to_pandas(types_mapper=types_mapper) + return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) def to_orc( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index cc33c87dfc55a..01e320cdb1b72 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -10,9 +10,11 @@ Literal, ) import warnings -from warnings import catch_warnings +from warnings import ( + catch_warnings, + filterwarnings, +) -from pandas._config import using_string_dtype from pandas._config.config import _get_option from pandas._libs import lib @@ -22,14 +24,13 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import check_dtype_backend -import pandas as pd from pandas import ( DataFrame, get_option, ) from pandas.core.shared_docs import _shared_docs -from pandas.io._util import arrow_string_types_mapper +from pandas.io._util import arrow_table_to_pandas from pandas.io.common import ( IOHandles, get_handle, @@ -250,20 +251,10 @@ def read( kwargs["use_pandas_metadata"] = True to_pandas_kwargs = {} - if dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping() - to_pandas_kwargs["types_mapper"] = mapping.get - elif dtype_backend == "pyarrow": - to_pandas_kwargs["types_mapper"] = pd.ArrowDtype # type: ignore[assignment] - elif using_string_dtype(): - to_pandas_kwargs["types_mapper"] = arrow_string_types_mapper() manager = _get_option("mode.data_manager", silent=True) if manager == "array": - to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment] - + to_pandas_kwargs["split_blocks"] = True path_or_handle, handles, filesystem = _get_path_or_handle( path, filesystem, @@ -278,7 +269,18 @@ def read( filters=filters, **kwargs, ) - result = pa_table.to_pandas(**to_pandas_kwargs) + + with catch_warnings(): + filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + result = arrow_table_to_pandas( + pa_table, + dtype_backend=dtype_backend, + to_pandas_kwargs=to_pandas_kwargs, + ) if manager == "array": result = result._as_manager("array", copy=False) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index c774638fd73f7..a7f01e6322755 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -3,8 +3,6 @@ from typing import TYPE_CHECKING import warnings -from pandas._config import using_string_dtype - from pandas._libs import lib from pandas.compat._optional import import_optional_dependency from pandas.errors import ( @@ -16,18 +14,14 @@ from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.inference import is_integer -import pandas as pd -from pandas import DataFrame - -from pandas.io._util import ( - _arrow_dtype_mapping, - arrow_string_types_mapper, -) +from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase if TYPE_CHECKING: from pandas._typing import ReadBuffer + from pandas import DataFrame + class ArrowParserWrapper(ParserBase): """ @@ -287,17 +281,14 @@ def read(self) -> DataFrame: table = table.cast(new_schema) - if dtype_backend == "pyarrow": - frame = table.to_pandas(types_mapper=pd.ArrowDtype) - elif dtype_backend == "numpy_nullable": - # Modify the default mapping to also - # map null to Int64 (to match other engines) - dtype_mapping = _arrow_dtype_mapping() - dtype_mapping[pa.null()] = pd.Int64Dtype() - frame = table.to_pandas(types_mapper=dtype_mapping.get) - elif using_string_dtype(): - frame = table.to_pandas(types_mapper=arrow_string_types_mapper()) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "make_block is deprecated", + DeprecationWarning, + ) + frame = arrow_table_to_pandas( + table, dtype_backend=dtype_backend, null_to_int64=True + ) - else: - frame = table.to_pandas() return self._finalize_pandas_output(frame) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 07c95806d7326..7027702a696fe 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -49,10 +49,7 @@ is_object_dtype, is_string_dtype, ) -from pandas.core.dtypes.dtypes import ( - ArrowDtype, - DatetimeTZDtype, -) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas import get_option @@ -68,6 +65,8 @@ from pandas.core.internals.construction import convert_object_array from pandas.core.tools.datetimes import to_datetime +from pandas.io._util import arrow_table_to_pandas + if TYPE_CHECKING: from collections.abc import ( Iterator, @@ -2221,23 +2220,10 @@ def read_table( else: stmt = f"SELECT {select_list} FROM {table_name}" - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - elif using_string_dtype(): - from pandas.io._util import arrow_string_types_mapper - - mapping = arrow_string_types_mapper() - else: - mapping = None - with self.con.cursor() as cur: cur.execute(stmt) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, @@ -2305,23 +2291,10 @@ def read_query( if chunksize: raise NotImplementedError("'chunksize' is not implemented for ADBC drivers") - mapping: type[ArrowDtype] | None | Callable - if dtype_backend == "pyarrow": - mapping = ArrowDtype - elif dtype_backend == "numpy_nullable": - from pandas.io._util import _arrow_dtype_mapping - - mapping = _arrow_dtype_mapping().get - elif using_string_dtype(): - from pandas.io._util import arrow_string_types_mapper - - mapping = arrow_string_types_mapper() - else: - mapping = None - with self.con.cursor() as cur: cur.execute(sql) - df = cur.fetch_arrow_table().to_pandas(types_mapper=mapping) + pa_table = cur.fetch_arrow_table() + df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend) return _wrap_result_adbc( df, diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 29efe7a457ff8..3676721c5e6b7 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -959,12 +959,12 @@ def sqlite_buildin_types(sqlite_buildin, types_data): adbc_connectable_iris = [ pytest.param("postgresql_adbc_iris", marks=pytest.mark.db), - pytest.param("sqlite_adbc_iris", marks=pytest.mark.db), + "sqlite_adbc_iris", ] adbc_connectable_types = [ pytest.param("postgresql_adbc_types", marks=pytest.mark.db), - pytest.param("sqlite_adbc_types", marks=pytest.mark.db), + "sqlite_adbc_types", ] From 7958d6c73402ebf995c337fba7cfeadb18ec79e9 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 17 Nov 2024 00:09:02 -0800 Subject: [PATCH 30/42] Backport PR #56013 on branch 2.3.x (BUG: get_indexer rountripping through string dtype) (#60339) Backport PR #56013: BUG: get_indexer rountripping through string dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.3.0.rst | 2 +- pandas/core/indexes/base.py | 11 ++++++++++- pandas/tests/indexes/object/test_indexing.py | 9 +++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 3e699e1a27b55..473d67acf6e74 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -119,7 +119,7 @@ Interval Indexing ^^^^^^^^ -- +- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`) - Missing diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5da327a82c02b..4896fb0ad1cd2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6695,7 +6695,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index: """ Analogue to maybe_cast_indexer for get_indexer instead of get_loc. """ - return ensure_index(target) + target_index = ensure_index(target) + if ( + not hasattr(target, "dtype") + and self.dtype == object + and target_index.dtype == "string" + ): + # If we started with a list-like, avoid inference to string dtype if self + # is object dtype (coercing to string dtype will alter the missing values) + target_index = Index(target, dtype=self.dtype) + return target_index @final def _validate_indexer( diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index d3df349027c00..42ef7e7a96f5e 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -62,6 +62,15 @@ def test_get_indexer_with_NA_values( expected = np.array([0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_infer_string_missing_values(self): + # ensure the passed list is not cast to string but to object so that + # the None value is matched in the index + # https://github.com/pandas-dev/pandas/issues/55834 + idx = Index(["a", "b", None], dtype="object") + result = idx.get_indexer([None, "x"]) + expected = np.array([2, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + class TestGetIndexerNonUnique: def test_get_indexer_non_unique_nas(self, nulls_fixture): From 2cf68693d8f9666b33d64702e5bb731a0734b12e Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 17 Nov 2024 04:16:53 -0800 Subject: [PATCH 31/42] Backport PR #60333 on branch 2.3.x (BUG (string dtype): fix handling of string dtype in interchange protocol) (#60347) Backport PR #60333: BUG (string dtype): fix handling of string dtype in interchange protocol Co-authored-by: William Ayd --- pandas/core/interchange/from_dataframe.py | 12 ++++++++---- pandas/tests/interchange/test_impl.py | 9 ++------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 4162ebc33f0d6..53f18883ea3ad 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -6,6 +6,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas.compat._optional import import_optional_dependency from pandas.errors import SettingWithCopyError @@ -124,8 +126,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame: ------- pd.DataFrame """ - # We need a dict of columns here, with each column being a NumPy array (at - # least for now, deal with non-NumPy dtypes later). columns: dict[str, Any] = {} buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): @@ -324,8 +324,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: # Add to our list of strings str_list[i] = string - # Convert the string list to a NumPy array - return np.asarray(str_list, dtype="object"), buffers + if using_string_dtype(): + res = pd.Series(str_list, dtype="str") + else: + res = np.asarray(str_list, dtype="object") # type: ignore[assignment] + + return res, buffers # type: ignore[return-value] def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray: diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index ef94c4c7aff2c..c32b31c297c5d 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import iNaT from pandas.compat import ( is_ci_environment, @@ -412,7 +410,6 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_empty_string_column(): # https://github.com/pandas-dev/pandas/issues/56703 df = pd.DataFrame({"a": []}, dtype=str) @@ -421,13 +418,12 @@ def test_empty_string_column(): tm.assert_frame_equal(df, result) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_large_string(): # GH#56702 pytest.importorskip("pyarrow") df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]") result = pd.api.interchange.from_dataframe(df.__dataframe__()) - expected = pd.DataFrame({"a": ["x"]}, dtype="object") + expected = pd.DataFrame({"a": ["x"]}, dtype="str") tm.assert_frame_equal(result, expected) @@ -438,7 +434,6 @@ def test_non_str_names(): assert names == ["0"] -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_non_str_names_w_duplicates(): # https://github.com/pandas-dev/pandas/issues/56701 df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]}) @@ -449,7 +444,7 @@ def test_non_str_names_w_duplicates(): "Expected a Series, got a DataFrame. This likely happened because you " "called __dataframe__ on a DataFrame which, after converting column " r"names to string, resulted in duplicated names: Index\(\['0', '0'\], " - r"dtype='object'\). Please rename these columns before using the " + r"dtype='(str|object)'\). Please rename these columns before using the " "interchange protocol." ), ): From 0bcd25088b38f3c97417e0c3672b4cb5ba7291e9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 17 Nov 2024 17:04:18 +0100 Subject: [PATCH 32/42] [backport 2.3.x] CI: update fastparquet xfails for new release (#60337) (#60344) --- pandas/tests/io/test_parquet.py | 37 +++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index e43aae6a2e9e7..87f9b0108402c 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1223,11 +1223,17 @@ def test_duplicate_columns(self, fp): msg = "Cannot create parquet dataset with duplicate column names" self.check_error_on_write(df, fp, ValueError, msg) - @pytest.mark.xfail( - Version(np.__version__) >= Version("2.0.0"), - reason="fastparquet uses np.float_ in numpy2", - ) - def test_bool_with_none(self, fp): + def test_bool_with_none(self, fp, request): + import fastparquet + + if Version(fastparquet.__version__) < Version("2024.11.0") and Version( + np.__version__ + ) >= Version("2.0.0"): + request.applymarker( + pytest.mark.xfail( + reason=("fastparquet uses np.float_ in numpy2"), + ) + ) df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") # Fastparquet bug in 0.7.1 makes it so that this dtype becomes @@ -1342,12 +1348,21 @@ def test_empty_dataframe(self, fp): expected = df.copy() check_round_trip(df, fp, expected=expected) - @pytest.mark.xfail( - _HAVE_FASTPARQUET and Version(fastparquet.__version__) > Version("2022.12"), - reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929", - ) - @pytest.mark.skipif(using_copy_on_write(), reason="fastparquet writes into Index") - def test_timezone_aware_index(self, fp, timezone_aware_date_list): + def test_timezone_aware_index(self, fp, timezone_aware_date_list, request): + import fastparquet + + if Version(fastparquet.__version__) > Version("2022.12") and Version( + fastparquet.__version__ + ) < Version("2024.11.0"): + request.applymarker( + pytest.mark.xfail( + reason=( + "fastparquet bug, see " + "https://github.com/dask/fastparquet/issues/929" + ), + ) + ) + idx = 5 * [timezone_aware_date_list] df = pd.DataFrame(index=idx, data={"index_as_col": idx}) From 112c2e910d9c9f96ad3c0c5c66bc3928fc687a9e Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 18 Nov 2024 18:33:33 -0500 Subject: [PATCH 33/42] =?UTF-8?q?Backport=20PR=20#60321:=20TST=20(string?= =?UTF-8?q?=20dtype):=20resolve=20all=20xfails=20in=20IO=20pars=E2=80=A6?= =?UTF-8?q?=20(#60330)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Backport PR #60321: TST (string dtype): resolve all xfails in IO parser tests (cherry picked from commit ee3c18f51b393893ed6e31214c7be2f9427ce0c9) * BUG: Avoid RangeIndex conversion in read_csv if dtype is specified (#59316) Co-authored-by: Joris Van den Bossche Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/io/parsers/base_parser.py | 36 +++++++++++++------ .../tests/io/parser/common/test_chunksize.py | 13 +++---- .../io/parser/common/test_file_buffer_url.py | 7 ++-- pandas/tests/io/parser/common/test_index.py | 10 +++--- .../io/parser/dtypes/test_dtypes_basic.py | 22 +++++++++--- pandas/tests/io/parser/test_c_parser_only.py | 13 +++---- pandas/tests/io/parser/test_converters.py | 5 +-- pandas/tests/io/parser/test_index_col.py | 5 +-- pandas/tests/io/parser/test_mangle_dupes.py | 10 +++--- pandas/tests/io/parser/test_na_values.py | 25 +++++++------ pandas/tests/io/parser/test_parse_dates.py | 11 ++---- pandas/tests/io/parser/test_upcast.py | 3 -- 12 files changed, 89 insertions(+), 71 deletions(-) diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 09f0f2af8e5c6..40e3ea6450647 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -464,7 +464,11 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arrays = [] converters = self._clean_mapping(self.converters) - for i, arr in enumerate(index): + if self.index_names is not None: + names: Iterable = self.index_names + else: + names = itertools.cycle([None]) + for i, (arr, name) in enumerate(zip(index, names)): if try_parse_dates and self._should_parse_dates(i): arr = self._date_conv( arr, @@ -504,12 +508,17 @@ def _agg_index(self, index, try_parse_dates: bool = True) -> Index: arr, _ = self._infer_types( arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool ) - arrays.append(arr) - - names = self.index_names - index = ensure_index_from_sequences(arrays, names) + if cast_type is not None: + # Don't perform RangeIndex inference + idx = Index(arr, name=name, dtype=cast_type) + else: + idx = ensure_index_from_sequences([arr], [name]) + arrays.append(idx) - return index + if len(arrays) == 1: + return arrays[0] + else: + return MultiIndex.from_arrays(arrays) @final def _convert_to_ndarrays( @@ -1084,12 +1093,11 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): dtype_dict: defaultdict[Hashable, Any] if not is_dict_like(dtype): # if dtype == None, default will be object. - default_dtype = dtype or object - dtype_dict = defaultdict(lambda: default_dtype) + dtype_dict = defaultdict(lambda: dtype) else: dtype = cast(dict, dtype) dtype_dict = defaultdict( - lambda: object, + lambda: None, {columns[k] if is_integer(k) else k: v for k, v in dtype.items()}, ) @@ -1106,8 +1114,14 @@ def _get_empty_meta(self, columns, dtype: DtypeArg | None = None): if (index_col is None or index_col is False) or index_names is None: index = default_index(0) else: - data = [Series([], dtype=dtype_dict[name]) for name in index_names] - index = ensure_index_from_sequences(data, names=index_names) + # TODO: We could return default_index(0) if dtype_dict[name] is None + data = [ + Index([], name=name, dtype=dtype_dict[name]) for name in index_names + ] + if len(data) == 1: + index = data[0] + else: + index = MultiIndex.from_arrays(data) index_col.sort() for i, n in enumerate(index_col): diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index 7b70601addcad..5226476ef6eac 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import parsers as libparsers from pandas.errors import DtypeWarning @@ -230,8 +228,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -def test_warn_if_chunks_have_mismatched_type(all_parsers): +def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string): warning_type = None parser = all_parsers size = 10000 @@ -259,8 +256,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers): "Specify dtype option on import or set low_memory=False.", buf, ) - - assert df.a.dtype == object + if parser.engine == "c" and parser.low_memory: + assert df.a.dtype == object + elif using_infer_string: + assert df.a.dtype == "str" + else: + assert df.a.dtype == object @pytest.mark.parametrize("iterator", [True, False]) diff --git a/pandas/tests/io/parser/common/test_file_buffer_url.py b/pandas/tests/io/parser/common/test_file_buffer_url.py index c13b77f365496..d573b47bb3279 100644 --- a/pandas/tests/io/parser/common/test_file_buffer_url.py +++ b/pandas/tests/io/parser/common/test_file_buffer_url.py @@ -14,8 +14,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( EmptyDataError, ParserError, @@ -69,14 +67,13 @@ def test_local_file(all_parsers, csv_dir_path): pytest.skip("Failing on: " + " ".join(platform.uname())) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.index are different def test_path_path_lib(all_parsers): parser = all_parsers df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), - columns=Index(list("ABCD"), dtype=object), - index=Index([f"i-{i}" for i in range(30)], dtype=object), + columns=Index(list("ABCD")), + index=Index([f"i-{i}" for i in range(30)]), ) result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0)) tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 0121af53f1aa4..cdd65223a9c9f 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -8,8 +8,6 @@ import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -87,9 +85,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("index_col", [[0, 1], [1, 0]]) -def test_multi_index_no_level_names(all_parsers, index_col): +def test_multi_index_no_level_names( + request, all_parsers, index_col, using_infer_string +): + if using_infer_string and all_parsers.engine == "pyarrow": + # result should have string columns instead of object dtype + request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) data = """index1,index2,A,B,C,D foo,one,2,3,4,5 foo,two,7,8,9,10 diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 787941c5d0376..d28c43c45647a 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserWarning import pandas as pd @@ -24,6 +22,8 @@ "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @@ -54,7 +54,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_per_column(all_parsers): parser = all_parsers @@ -68,7 +67,6 @@ def test_dtype_per_column(all_parsers): [[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"] ) expected["one"] = expected["one"].astype(np.float64) - expected["two"] = expected["two"].astype(object) result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str}) tm.assert_frame_equal(result, expected) @@ -598,6 +596,7 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): tm.assert_frame_equal(result, expected) +@xfail_pyarrow def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL @@ -608,7 +607,7 @@ def test_accurate_parsing_of_large_integers(all_parsers): AMZN,20230301181139587,2023552585717889759,2023552585717263360 MSFT,20230301181139587,2023552585717889863,2023552585717263361 NVDA,20230301181139587,2023552585717889827,2023552585717263361""" - orders = pd.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) + orders = all_parsers.read_csv(StringIO(data), dtype={"ID_DEAL": pd.Int64Dtype()}) assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263358, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263359, "ID_DEAL"]) == 1 assert len(orders.loc[orders["ID_DEAL"] == 2023552585717263360, "ID_DEAL"]) == 2 @@ -630,3 +629,16 @@ def test_dtypes_with_usecols(all_parsers): values = ["1", "4"] expected = DataFrame({"a": pd.Series(values, dtype=object), "c": [3, 6]}) tm.assert_frame_equal(result, expected) + + +def test_index_col_with_dtype_no_rangeindex(all_parsers): + data = StringIO("345.5,519.5,0\n519.5,726.5,1") + result = all_parsers.read_csv( + data, + header=None, + names=["start", "stop", "bin_id"], + dtype={"start": np.float32, "stop": np.float32, "bin_id": np.uint32}, + index_col="bin_id", + ).index + expected = pd.Index([0, 1], dtype=np.uint32, name="bin_id") + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 1501479510e17..5b72f76440349 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -17,8 +17,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import ( ParserError, @@ -185,8 +183,7 @@ def error(val: float, actual_val: Decimal) -> Decimal: assert max(precise_errors) <= max(normal_errors) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_usecols_dtypes(c_parser_only): +def test_usecols_dtypes(c_parser_only, using_infer_string): parser = c_parser_only data = """\ 1,2,3 @@ -211,8 +208,12 @@ def test_usecols_dtypes(c_parser_only): dtype={"b": int, "c": float}, ) - assert (result.dtypes == [object, int, float]).all() - assert (result2.dtypes == [object, float]).all() + if using_infer_string: + assert (result.dtypes == ["string", int, float]).all() + assert (result2.dtypes == ["string", float]).all() + else: + assert (result.dtypes == [object, int, float]).all() + assert (result2.dtypes == [object, float]).all() def test_disable_bool_parsing(c_parser_only): diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index a3c6dc8fd0898..1848e1e571fc1 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( DataFrame, @@ -186,7 +184,6 @@ def convert_score(x): tm.assert_frame_equal(results[0], results[1]) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("conv_f", [lambda x: x, str]) def test_converter_index_col_bug(all_parsers, conv_f): # see gh-1835 , GH#40589 @@ -205,7 +202,7 @@ def test_converter_index_col_bug(all_parsers, conv_f): StringIO(data), sep=";", index_col="A", converters={"A": conv_f} ) - xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object")) + xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A")) tm.assert_frame_equal(rs, xp) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 6dbfed2b6ae83..9224b743b8917 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( DataFrame, Index, @@ -344,7 +342,6 @@ def test_infer_types_boolean_sum(all_parsers): tm.assert_frame_equal(result, expected, check_index_type=False) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)]) def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): # GH#9435 @@ -355,7 +352,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request): pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine") ) result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype}) - expected = DataFrame({"b": [2]}, index=Index([val], name="a")) + expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype)) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 32a8d3b81f470..80c32d3a6262e 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -7,9 +7,10 @@ import pytest -from pandas._config import using_string_dtype - -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") @@ -120,7 +121,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected): parser.read_csv(StringIO(data), names=names) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # AssertionError: DataFrame.columns are different def test_mangled_unnamed_placeholders(all_parsers): # xref gh-13017 @@ -132,7 +132,7 @@ def test_mangled_unnamed_placeholders(all_parsers): # This test recursively updates `df`. for i in range(3): - expected = DataFrame() + expected = DataFrame(columns=Index([], dtype="str")) for j in range(i + 1): col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1) diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 5f9823f7225f9..dd168aaa45808 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.parsers import STR_NA_VALUES from pandas import ( @@ -260,7 +258,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "kwargs,expected", [ @@ -306,7 +303,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): ), ], ) -def test_na_values_keep_default(all_parsers, kwargs, expected, request): +def test_na_values_keep_default( + all_parsers, kwargs, expected, request, using_infer_string +): data = """\ A,B,C a,1,one @@ -324,8 +323,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request): with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), **kwargs) return - mark = pytest.mark.xfail() - request.applymarker(mark) + if not using_infer_string or "na_values" in kwargs: + mark = pytest.mark.xfail() + request.applymarker(mark) result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) @@ -435,8 +435,6 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) -@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", [ @@ -444,14 +442,21 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), ], ) -def test_na_values_na_filter_override(all_parsers, na_filter, row_data): +def test_na_values_na_filter_override( + request, all_parsers, na_filter, row_data, using_infer_string +): + parser = all_parsers + if parser.engine == "pyarrow": + # mismatched dtypes in both cases, FutureWarning in the True case + if not (using_infer_string and na_filter): + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.applymarker(mark) data = """\ A,B 1,A nan,B 3,C """ - parser = all_parsers result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) expected = DataFrame(row_data, columns=["A", "B"]) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index be2015fca27d1..616fcb81cf055 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -16,8 +16,6 @@ import pytest import pytz -from pandas._config import using_string_dtype - from pandas._libs.tslibs import parsing import pandas as pd @@ -1799,7 +1797,6 @@ def test_parse_timezone(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @skip_pyarrow # pandas.errors.ParserError: CSV parse error @pytest.mark.parametrize( "date_string", @@ -1807,7 +1804,7 @@ def test_parse_timezone(all_parsers): ) def test_invalid_parse_delimited_date(all_parsers, date_string): parser = all_parsers - expected = DataFrame({0: [date_string]}, dtype="object") + expected = DataFrame({0: [date_string]}, dtype="str") result = parser.read_csv( StringIO(date_string), header=None, @@ -2054,7 +2051,6 @@ def test_parse_dates_and_keep_original_column(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_dayfirst_warnings(): # GH 12585 @@ -2087,7 +2083,7 @@ def test_dayfirst_warnings(): # first in DD/MM/YYYY, second in MM/DD/YYYY input = "date\n31/12/2014\n03/30/2011" - expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") + expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date") # A. use dayfirst=True res5 = read_csv( @@ -2204,7 +2200,6 @@ def test_parse_dates_and_string_dtype(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_parse_dot_separated_dates(all_parsers): # https://github.com/pandas-dev/pandas/issues/2586 parser = all_parsers @@ -2214,7 +2209,7 @@ def test_parse_dot_separated_dates(all_parsers): if parser.engine == "pyarrow": expected_index = Index( ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"], - dtype="object", + dtype="str", name="a", ) warn = None diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index 01e576ba40f26..bc4c4c2e24e9c 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.parsers import ( _maybe_upcast, na_values, @@ -86,7 +84,6 @@ def test_maybe_upcaste_all_nan(): tm.assert_extension_array_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) def test_maybe_upcast_object(val, string_storage): # GH#36712 From 3bcbf0c02c2ba79e577b5e8a3252a1816895e239 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 25 Nov 2024 15:59:09 -0800 Subject: [PATCH 34/42] Backport PR #60416 on branch 2.3.x (TST: Avoid hashing np.timedelta64 without unit) (#60418) Backport PR #60416: TST: Avoid hashing np.timedelta64 without unit --- pandas/tests/test_algos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 97d6415e0de05..d1e69cfa2b4ee 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1283,7 +1283,7 @@ def test_value_counts_nat(self): result_dt = algos.value_counts(dt) tm.assert_series_equal(result_dt, exp_dt) - exp_td = Series({np.timedelta64(10000): 1}, name="count") + exp_td = Series([1], index=[np.timedelta64(10000)], name="count") with tm.assert_produces_warning(FutureWarning, match=msg): result_td = algos.value_counts(td) tm.assert_series_equal(result_td, exp_td) From 2b37c980553064dd6d0df7f7d3d5335444c1c9c4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 2 Dec 2024 20:01:43 +0100 Subject: [PATCH 35/42] [backport 2.3.x] String dtype: use ObjectEngine for indexing for now correctness over performance (#60329) (#60453) String dtype: use ObjectEngine for indexing for now correctness over performance (#60329) (cherry picked from commit 98f7e4deeff26a5ef993ee27104387a1a6e0d3d3) --- pandas/_libs/index.pyi | 3 + pandas/_libs/index.pyx | 26 +++++ pandas/core/indexes/base.py | 3 +- pandas/tests/indexes/string/test_indexing.py | 104 +++++++++++++++++-- 4 files changed, 124 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 75db47bf3160e..9c3791a642768 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -68,6 +68,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ... class MaskedUInt8Engine(MaskedIndexEngine): ... class MaskedBoolEngine(MaskedUInt8Engine): ... +class StringObjectEngine(ObjectEngine): + def __init__(self, values: object, na_value) -> None: ... + class BaseMultiIndexCodesEngine: levels: list[np.ndarray] offsets: np.ndarray # ndarray[uint64_t, ndim=1] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index ee6a11ddab004..365cc7c3cecfc 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -532,6 +532,32 @@ cdef class ObjectEngine(IndexEngine): return loc +cdef class StringObjectEngine(ObjectEngine): + + cdef: + object na_value + bint uses_na + + def __init__(self, ndarray values, na_value): + super().__init__(values) + self.na_value = na_value + self.uses_na = na_value is C_NA + + cdef bint _checknull(self, object val): + if self.uses_na: + return val is C_NA + else: + return util.is_nan(val) + + cdef _check_type(self, object val): + if isinstance(val, str): + return val + elif self._checknull(val): + return self.na_value + else: + raise KeyError(val) + + cdef class DatetimeEngine(Int64Engine): cdef: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4896fb0ad1cd2..ad39907e7400e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -884,6 +884,8 @@ def _engine( # error: Item "ExtensionArray" of "Union[ExtensionArray, # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] + elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): + return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr] # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" @@ -6133,7 +6135,6 @@ def _should_fallback_to_positional(self) -> bool: def get_indexer_non_unique( self, target ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - target = ensure_index(target) target = self._maybe_cast_listlike_indexer(target) if not self._should_compare(target) and not self._should_partial_index(target): diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index 755b7109a5a04..d1a278af337b7 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -6,6 +6,51 @@ import pandas._testing as tm +def _isnan(val): + try: + return val is not pd.NA and np.isnan(val) + except TypeError: + return False + + +class TestGetLoc: + def test_get_loc(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + assert index.get_loc("b") == 1 + + def test_get_loc_raises(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="d"): + index.get_loc("d") + + def test_get_loc_invalid_value(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="1"): + index.get_loc(1) + + def test_get_loc_non_unique(self, any_string_dtype): + index = Index(["a", "b", "a"], dtype=any_string_dtype) + result = index.get_loc("a") + expected = np.array([True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + + def test_get_loc_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) + if any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture)) + ): + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + else: + assert index.get_loc(nulls_fixture) == 2 + + class TestGetIndexer: @pytest.mark.parametrize( "method,expected", @@ -41,23 +86,60 @@ def test_get_indexer_strings_raises(self, any_string_dtype): ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] ) + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): + # NaT and Decimal("NaN") from null_fixture are not supported for string dtype + index = Index(["a", "b", null], dtype=any_string_dtype) + result = index.get_indexer(["a", null, "c"]) + if using_infer_string: + expected = np.array([0, 2, -1], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected = np.array([0, -1, -1], dtype=np.intp) + else: + expected = np.array([0, 2, -1], dtype=np.intp) -class TestGetIndexerNonUnique: - @pytest.mark.xfail(reason="TODO(infer_string)", strict=False) - def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): - index = Index(["a", "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + tm.assert_numpy_array_equal(result, expected) - expected_indexer = np.array([2], dtype=np.intp) - expected_missing = np.array([], dtype=np.intp) + +class TestGetIndexerNonUnique: + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_non_unique_nas( + self, any_string_dtype, null, using_infer_string + ): + index = Index(["a", "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected_indexer = np.array([0, -1], dtype=np.intp) + expected_missing = np.array([1], dtype=np.intp) + else: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", None, "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) - - expected_indexer = np.array([1, 3], dtype=np.intp) + index = Index(["a", null, "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + pass + else: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) From 4c2d6b4db2ca76bd5da9d04378eafed6dfaed24b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:30:28 -0800 Subject: [PATCH 36/42] Backport PR #60461 on branch 2.3.x (PERF: improve construct_1d_object_array_from_listlike) (#60483) Backport PR #60461: PERF: improve construct_1d_object_array_from_listlike Co-authored-by: Joris Van den Bossche --- pandas/core/dtypes/cast.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index a1ef8a3e27e9c..f6fcd887f4528 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -87,8 +87,8 @@ if TYPE_CHECKING: from collections.abc import ( + Collection, Sequence, - Sized, ) from pandas._typing import ( @@ -1586,7 +1586,7 @@ def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj): return _maybe_unbox_datetimelike(value, dtype) -def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: +def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray: """ Transform any list-like object in a 1-dimensional numpy array of object dtype. @@ -1604,11 +1604,9 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: ------- 1-dimensional numpy array of dtype object """ - # numpy will try to interpret nested lists as further dimensions, hence - # making a 1D array that contains list-likes is a bit tricky: - result = np.empty(len(values), dtype="object") - result[:] = values - return result + # numpy will try to interpret nested lists as further dimensions in np.array(), + # hence explicitly making a 1D array using np.fromiter + return np.fromiter(values, dtype="object", count=len(values)) def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray: From 7f7e3055ecd85686d5c35f9efb98f394bf9a54d8 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:13:14 -0800 Subject: [PATCH 37/42] Backport PR #60544 on branch 2.3.x (CI/TST: Use tm.external_error_raised for test_from_arrow_respecting_given_dtype_unsafe) (#60545) Backport PR #60544: CI/TST: Use tm.external_error_raised for test_from_arrow_respecting_given_dtype_unsafe Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 03ab7c7f1dad8..470ca0673c60e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1637,7 +1637,7 @@ def test_from_arrow_respecting_given_dtype(): def test_from_arrow_respecting_given_dtype_unsafe(): array = pa.array([1.5, 2.5], type=pa.float64()) - with pytest.raises(pa.ArrowInvalid, match="Float value 1.5 was truncated"): + with tm.external_error_raised(pa.ArrowInvalid): array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get) From 9052c9eb2a16324e7c210991e4e96c64ae5e48f3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 11 Dec 2024 16:03:08 -0800 Subject: [PATCH 38/42] CI: Ignore prompting in test-arm when apt-get installing (#60547) --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 50ff7a81ae103..2c52d7aee4e28 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -15,7 +15,6 @@ jobs: - checkout - run: .circleci/setup_env.sh - run: | - sudo apt-get update && sudo apt-get install -y libegl1 libopengl0 PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH \ LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD \ ci/run_tests.sh From 0c6959d6cfdd5c5672f20a8d64a9cdc601387bd9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 13 Dec 2024 14:19:44 +0100 Subject: [PATCH 39/42] [2.3.x] CI: update fastparquet xfails (#60559) CI: update fastparquet xfails --- pandas/tests/io/test_fsspec.py | 6 +++++- pandas/tests/io/test_gcs.py | 3 --- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 5ed64e3eb0958..cf59e3e4c4934 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -5,6 +5,8 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW + from pandas import ( DataFrame, date_range, @@ -168,7 +170,9 @@ def test_excel_options(fsspectest): assert fsspectest.test[0] == "read" -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") +@pytest.mark.xfail( + using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string) fastparquet" +) def test_to_parquet_new_file(cleared_fs, df1): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index c7671bfb513aa..9fc0f6eb47766 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -7,8 +7,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat.pyarrow import pa_version_under17p0 from pandas import ( @@ -196,7 +194,6 @@ def test_to_csv_compression_encoding_gcs( tm.assert_frame_equal(df, read_df) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" pytest.importorskip("fastparquet") From ffe079172156a3d4213b2c4a64d929bc1e7b6e69 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 13 Dec 2024 14:20:10 +0100 Subject: [PATCH 40/42] [2.3.x] COMPAT: fix construct_1d_object_array_from_listlike for older numpy (#60558) COMPAT: fix construct_1d_object_array_from_listlike for older numpy --- pandas/core/dtypes/cast.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f6fcd887f4528..d4263f7488a14 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1606,7 +1606,10 @@ def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray: """ # numpy will try to interpret nested lists as further dimensions in np.array(), # hence explicitly making a 1D array using np.fromiter - return np.fromiter(values, dtype="object", count=len(values)) + result = np.empty(len(values), dtype="object") + for i, obj in enumerate(values): + result[i] = obj + return result def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray: From 6d9a2b4b5b2395b3438a057ce75f0857cf7a5109 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:56:15 -0800 Subject: [PATCH 41/42] Backport PR #60553: TST: filter possible RuntimeWarning in tests (#60555) * Backport PR #60553: TST: filter possible RuntimeWarning in tests * add more ignores --- pandas/tests/extension/test_interval.py | 25 +++++++++++++++++++ pandas/tests/frame/methods/test_to_numpy.py | 4 +++ pandas/tests/frame/test_constructors.py | 3 +++ pandas/tests/groupby/test_categorical.py | 3 +++ pandas/tests/groupby/test_groupby.py | 1 + pandas/tests/indexes/interval/test_astype.py | 6 +++++ pandas/tests/indexes/interval/test_formats.py | 3 +++ .../tests/indexes/interval/test_indexing.py | 3 +++ pandas/tests/indexes/test_setops.py | 1 + pandas/tests/io/excel/test_writers.py | 3 +++ pandas/tests/reshape/test_cut.py | 1 + 11 files changed, 53 insertions(+) diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 98dd1c5cb615f..6292e6051aa90 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -90,6 +90,31 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object(self, data): + super().test_hash_pandas_object(data) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_hash_pandas_object_works(self, data, as_frame): + super().test_hash_pandas_object_works(data, as_frame) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_str(self, data): + super().test_astype_str(data) + # TODO: either belongs in tests.arrays.interval or move into base tests. def test_fillna_non_scalar_raises(data_missing): diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index bdb9b2c055061..0731750aed0cf 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas.util._test_decorators as td @@ -41,6 +42,9 @@ def test_to_numpy_copy(self, using_copy_on_write): else: assert df.to_numpy(copy=False, na_value=np.nan).base is arr + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_numpy_mixed_dtype_to_str(self): # https://github.com/pandas-dev/pandas/issues/35455 df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]]) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index fd770b368c9da..f16068e0b6538 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2450,6 +2450,9 @@ def test_construct_with_two_categoricalindex_series(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_constructor_series_nonexact_categoricalindex(self): # GH 42424 ser = Series(range(100)) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 447df952fd0e5..cba02ae869889 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -67,6 +67,7 @@ def f(a): } +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_apply_use_categorical_name(df): cats = qcut(df.C, 4) @@ -338,6 +339,7 @@ def test_apply(ordered): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_observed(request, using_infer_string, observed): # multiple groupers, don't re-expand the output space # of the grouper @@ -1556,6 +1558,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( assert (res.loc[unobserved_cats] == expected).all().all() +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_series_groupby_categorical_aggregation_getitem(): # GH 8870 d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]} diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9b362164c6149..b5588898d4580 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3118,6 +3118,7 @@ def test_groupby_numeric_only_std_no_result(numeric_only): dfgb.std(numeric_only=numeric_only) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_grouping_with_categorical_interval_columns(): # GH#34164 df = DataFrame({"x": [0.1, 0.2, 0.3, -0.4, 0.5], "w": ["a", "b", "a", "c", "a"]}) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 59c555b9644a1..dde5f38074efb 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -186,6 +186,12 @@ def test_subtype_datetimelike(self, index, subtype): with pytest.raises(TypeError, match=msg): index.astype(dtype) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) + def test_astype_category(self, index): + super().test_astype_category(index) + class TestDatetimelikeSubtype(AstypeTests): """Tests specific to IntervalIndex with datetime-like subtype""" diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index f858ae137ca4e..73bbfc91028b3 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -59,6 +59,9 @@ def test_repr_floats(self): expected = "(329.973, 345.137] 1\n(345.137, 360.191] 2\ndtype: int64" assert result == expected + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) @pytest.mark.parametrize( "tuples, closed, expected_data", [ diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index fd03047b2c127..b5be7e0713cdf 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -341,6 +341,9 @@ def test_get_indexer_categorical(self, target, ordered): expected = index.get_indexer(target) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_get_indexer_categorical_with_nans(self): # GH#41934 nans in both index and in target ii = IntervalIndex.from_breaks(range(5)) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 3845744dc0717..f6a865ccbb3a0 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -519,6 +519,7 @@ def test_intersection_difference_match_empty(self, index, sort): tm.assert_index_equal(inter, diff, exact=True) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 57091b268a9db..f133423bc6a85 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -755,6 +755,9 @@ def test_excel_date_datetime_format(self, ext, path): # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) + @pytest.mark.filterwarnings( + "ignore:invalid value encountered in cast:RuntimeWarning" + ) def test_to_excel_interval_no_labels(self, path, using_infer_string): # see gh-19242 # diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 0811c69859c0d..cab2302b3d877 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -727,6 +727,7 @@ def test_cut_with_duplicated_index_lowest_included(): tm.assert_series_equal(result, expected) +@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning") def test_cut_with_nonexact_categorical_indices(): # GH 42424 From 3362822db00076222d3875a86b5f5625799d8862 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 17 Dec 2024 08:40:01 +0100 Subject: [PATCH 42/42] String dtype (2.3.x): avoid downcasting object to string in fillna/where/interpolate (#60183) --- pandas/_libs/lib.pyi | 3 ++ pandas/_libs/lib.pyx | 7 +++- pandas/core/internals/blocks.py | 38 ++++++++++++++++++--- pandas/tests/frame/methods/test_fillna.py | 21 +++--------- pandas/tests/frame/methods/test_replace.py | 37 +++----------------- pandas/tests/indexing/test_coercion.py | 6 +++- pandas/tests/series/methods/test_replace.py | 3 -- 7 files changed, 57 insertions(+), 58 deletions(-) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index b9fd970e68f5b..71a4d3ae2575f 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -86,6 +86,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: Literal[False] = ..., + convert_string: Literal[False] = ..., convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @@ -97,6 +98,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: bool = ..., + convert_string: bool = ..., convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @@ -108,6 +110,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: bool = ..., + convert_string: bool = ..., convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bc039917aef87..f72d6a5dad877 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_numeric=True, # NB: different default! bint convert_to_nullable_dtype=False, bint convert_non_numeric=False, + bint convert_string=True, object dtype_if_all_nat=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2747,7 +2748,11 @@ def maybe_convert_objects(ndarray[object] objects, dtype = StringDtype() return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif using_string_dtype() and is_string_array(objects, skipna=True): + elif ( + convert_string + and using_string_dtype() + and is_string_array(objects, skipna=True) + ): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(na_value=np.nan) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6ae591a5d4ac8..5be83aa38011b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -563,7 +563,12 @@ def _maybe_downcast( return blocks nbs = extend_blocks( - [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] + [ + blk.convert( + using_cow=using_cow, copy=not using_cow, convert_string=False + ) + for blk in blocks + ] ) if caller == "fillna": if len(nbs) != len(blocks) or not all( @@ -636,6 +641,7 @@ def convert( *, copy: bool = True, using_cow: bool = False, + convert_string: bool = True, ) -> list[Block]: """ Attempt to coerce any object types to better types. Return a copy @@ -648,7 +654,10 @@ def convert( if self.ndim != 1 and self.shape[0] != 1: blocks = self.split_and_operate( - Block.convert, copy=copy, using_cow=using_cow + Block.convert, + copy=copy, + using_cow=using_cow, + convert_string=convert_string, ) if all(blk.dtype.kind == "O" for blk in blocks): # Avoid fragmenting the block if convert is a no-op @@ -666,6 +675,7 @@ def convert( res_values = lib.maybe_convert_objects( values, # type: ignore[arg-type] convert_non_numeric=True, + convert_string=convert_string, ) refs = None if ( @@ -851,6 +861,7 @@ def replace( mask: npt.NDArray[np.bool_] | None = None, using_cow: bool = False, already_warned=None, + convert_string=None, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -915,7 +926,11 @@ def replace( if get_option("future.no_silent_downcasting") is True: blocks = [blk] else: - blocks = blk.convert(copy=False, using_cow=using_cow) + blocks = blk.convert( + copy=False, + using_cow=using_cow, + convert_string=convert_string or self.dtype != _dtype_obj, + ) if len(blocks) > 1 or blocks[0].dtype != blk.dtype: warnings.warn( # GH#54710 @@ -944,6 +959,7 @@ def replace( inplace=True, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) else: @@ -958,6 +974,7 @@ def replace( inplace=True, mask=mask[i : i + 1], using_cow=using_cow, + convert_string=convert_string, ) ) return blocks @@ -970,6 +987,7 @@ def _replace_regex( inplace: bool = False, mask=None, using_cow: bool = False, + convert_string: bool = True, already_warned=None, ) -> list[Block]: """ @@ -1029,7 +1047,9 @@ def _replace_regex( ) already_warned.warned_already = True - nbs = block.convert(copy=False, using_cow=using_cow) + nbs = block.convert( + copy=False, using_cow=using_cow, convert_string=convert_string + ) opt = get_option("future.no_silent_downcasting") if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: warnings.warn( @@ -1068,6 +1088,8 @@ def replace_list( values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk] + convert_string = self.dtype != _dtype_obj + # Exclude anything that we know we won't contain pairs = [ (x, y) @@ -1152,6 +1174,7 @@ def replace_list( inplace=inplace, regex=regex, using_cow=using_cow, + convert_string=convert_string, ) if using_cow and i != src_len: @@ -1174,7 +1197,9 @@ def replace_list( nbs = [] for res_blk in result: converted = res_blk.convert( - copy=True and not using_cow, using_cow=using_cow + copy=True and not using_cow, + using_cow=using_cow, + convert_string=convert_string, ) if len(converted) > 1 or converted[0].dtype != res_blk.dtype: warnings.warn( @@ -1204,6 +1229,7 @@ def _replace_coerce( inplace: bool = True, regex: bool = False, using_cow: bool = False, + convert_string: bool = True, ) -> list[Block]: """ Replace value corresponding to the given boolean array with another @@ -1233,6 +1259,7 @@ def _replace_coerce( inplace=inplace, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) else: if value is None: @@ -1256,6 +1283,7 @@ def _replace_coerce( inplace=inplace, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) # --------------------------------------------------------------------- diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index e2baa2567f5b4..9844122dc4b2d 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -132,21 +132,14 @@ def test_fillna_different_dtype(self, using_infer_string): [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna({2: "foo"}) - else: - result = df.fillna({2: "foo"}) + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) + expected[2] = expected[2].astype("object") tm.assert_frame_equal(result, expected) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - return_value = df.fillna({2: "foo"}, inplace=True) - else: - return_value = df.fillna({2: "foo"}, inplace=True) + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -385,12 +378,8 @@ def test_fillna_dtype_conversion(self, using_infer_string): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna("nan") - else: - result = df.fillna("nan") - expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + result = df.fillna("nan") + expected = DataFrame("nan", index=range(3), columns=["A", "B"], dtype=object) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 8df9893e73766..2ee878893ce70 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -281,20 +281,12 @@ def test_regex_replace_dict_nested(self, mix_abc): tm.assert_frame_equal(res3, expec) tm.assert_frame_equal(res4, expec) - def test_regex_replace_dict_nested_non_first_character( - self, any_string_dtype, using_infer_string - ): + def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): # GH 25259 dtype = any_string_dtype df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) - if using_infer_string and any_string_dtype == "object": - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.replace({"a": "."}, regex=True) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) - - else: - result = df.replace({"a": "."}, regex=True) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) def test_regex_replace_dict_nested_gh4115(self): @@ -429,31 +421,12 @@ def test_replace_regex_metachar(self, metachar): ], ) def test_regex_replace_string_types( - self, - data, - to_replace, - expected, - frame_or_series, - any_string_dtype, - using_infer_string, - request, + self, data, to_replace, expected, frame_or_series, any_string_dtype ): # GH-41333, GH-35977 dtype = any_string_dtype obj = frame_or_series(data, dtype=dtype) - if using_infer_string and any_string_dtype == "object": - if len(to_replace) > 1 and isinstance(obj, DataFrame): - request.node.add_marker( - pytest.mark.xfail( - reason="object input array that gets downcasted raises on " - "second pass" - ) - ) - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = obj.replace(to_replace, regex=True) - dtype = "str" - else: - result = obj.replace(to_replace, regex=True) + result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index ac3bfe3a13a44..4e1697eabf734 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -831,7 +831,7 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer - def test_replace_series(self, how, to_key, from_key, replacer): + def test_replace_series(self, how, to_key, from_key, replacer, using_infer_string): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") obj = obj.astype(from_key) @@ -856,6 +856,10 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") + if using_infer_string and exp.dtype == "string" and obj.dtype == object: + # with infer_string, we disable the deprecated downcasting behavior + exp = exp.astype(object) + msg = "Downcasting behavior in `replace`" warn = FutureWarning if ( diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 0b0cf57a70c3f..0c2e0fdc2616f 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd import pandas._testing as tm from pandas.core.arrays import IntervalArray @@ -768,7 +766,6 @@ def test_replace_value_none_dtype_numeric(self, val): expected = pd.Series([1, None], dtype=object) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_replace_change_dtype_series(self): # GH#25797 df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object)