From 9c7b5677efe10bef70ed34f207f7ef996503afff Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 26 Oct 2023 17:38:40 +0200 Subject: [PATCH 01/32] Backport PR #54922 on branch 2.1.x (REGR: Restore _constructor_from_mgr to pass manager object to constructor) (#55704) Backport PR #54922: REGR: Restore _constructor_from_mgr to pass manager object to constructor --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/frame.py | 19 +++++------ pandas/core/series.py | 20 +++++------ pandas/tests/frame/test_arithmetic.py | 3 ++ pandas/tests/frame/test_subclass.py | 34 ++++++++++++++++++- pandas/tests/groupby/test_groupby_subclass.py | 4 +++ pandas/tests/reshape/concat/test_concat.py | 3 ++ pandas/tests/reshape/merge/test_merge.py | 3 ++ .../tests/reshape/merge/test_merge_ordered.py | 3 ++ pandas/tests/series/methods/test_to_frame.py | 5 +++ pandas/tests/series/test_subclass.py | 4 +++ 11 files changed, 77 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 1d2d89e7eadd1..a59cadde422b9 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -29,6 +29,7 @@ Fixed regressions - Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`) - Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`) - Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite when using ``detect_types`` (:issue:`55554`) +- Fixed regression in construction of certain DataFrame or Series subclasses (:issue:`54922`) .. --------------------------------------------------------------------------- .. 
_whatsnew_212.bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1614ecf1d7037..c109070ce461d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -639,14 +639,12 @@ def _constructor(self) -> Callable[..., DataFrame]: return DataFrame def _constructor_from_mgr(self, mgr, axes): - df = self._from_mgr(mgr, axes=axes) - - if type(self) is DataFrame: - # fastpath avoiding constructor call - return df + if self._constructor is DataFrame: + # we are pandas.DataFrame (or a subclass that doesn't override _constructor) + return self._from_mgr(mgr, axes=axes) else: assert axes is mgr.axes - return self._constructor(df, copy=False) + return self._constructor(mgr) _constructor_sliced: Callable[..., Series] = Series @@ -654,13 +652,12 @@ def _sliced_from_mgr(self, mgr, axes) -> Series: return Series._from_mgr(mgr, axes) def _constructor_sliced_from_mgr(self, mgr, axes): - ser = self._sliced_from_mgr(mgr, axes=axes) - ser._name = None # caller is responsible for setting real name - if type(self) is DataFrame: - # fastpath avoiding constructor call + if self._constructor_sliced is Series: + ser = self._sliced_from_mgr(mgr, axes) + ser._name = None # caller is responsible for setting real name return ser assert axes is mgr.axes - return self._constructor_sliced(ser, copy=False) + return self._constructor_sliced(mgr) # ---------------------------------------------------------------------- # Constructors diff --git a/pandas/core/series.py b/pandas/core/series.py index 1d83643722ece..8ad049e173507 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -579,14 +579,14 @@ def _constructor(self) -> Callable[..., Series]: return Series def _constructor_from_mgr(self, mgr, axes): - ser = self._from_mgr(mgr, axes=axes) - ser._name = None # caller is responsible for setting real name - if type(self) is Series: - # fastpath avoiding constructor call + if self._constructor is Series: + # we are pandas.Series (or a subclass that doesn't override _constructor) + ser = self._from_mgr(mgr, axes=axes) + ser._name = None # caller is responsible for setting real name return ser else: assert axes is mgr.axes - return self._constructor(ser, copy=False) + return self._constructor(mgr) @property def _constructor_expanddim(self) -> Callable[..., DataFrame]: @@ -610,12 +610,12 @@ def _expanddim_from_mgr(self, mgr, axes) -> DataFrame: return DataFrame._from_mgr(mgr, axes=mgr.axes) def _constructor_expanddim_from_mgr(self, mgr, axes): - df = self._expanddim_from_mgr(mgr, axes) - if type(self) is Series: - # fastpath avoiding constructor - return df + from pandas.core.frame import DataFrame + + if self._constructor_expanddim is DataFrame: + return self._expanddim_from_mgr(mgr, axes) assert axes is mgr.axes - return self._constructor_expanddim(df, copy=False) + return self._constructor_expanddim(mgr) # types @property diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 670d4d4f554a6..e5a8feb7a89d3 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -2078,6 +2078,9 @@ def test_frame_sub_nullable_int(any_int_ea_dtype): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) def test_frame_op_subclass_nonclass_constructor(): # GH#43201 subclass._constructor is a function, not the subclass itself diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 
37ca52eba6451..8e657f197ca1e 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -10,6 +10,10 @@ ) import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) + @pytest.fixture() def gpd_style_subclass_df(): @@ -734,8 +738,36 @@ def test_replace_list_method(self): # https://github.com/pandas-dev/pandas/pull/46018 df = tm.SubclassedDataFrame({"A": [0, 1, 2]}) msg = "The 'method' keyword in SubclassedDataFrame.replace is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + with tm.assert_produces_warning( + FutureWarning, match=msg, raise_on_extra_warnings=False + ): result = df.replace([1, 2], method="ffill") expected = tm.SubclassedDataFrame({"A": [0, 0, 0]}) assert isinstance(result, tm.SubclassedDataFrame) tm.assert_frame_equal(result, expected) + + +class MySubclassWithMetadata(DataFrame): + _metadata = ["my_metadata"] + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + my_metadata = kwargs.pop("my_metadata", None) + if args and isinstance(args[0], MySubclassWithMetadata): + my_metadata = args[0].my_metadata # type: ignore[has-type] + self.my_metadata = my_metadata + + @property + def _constructor(self): + return MySubclassWithMetadata + + +def test_constructor_with_metadata(): + # https://github.com/pandas-dev/pandas/pull/54922 + # https://github.com/pandas-dev/pandas/issues/55120 + df = MySubclassWithMetadata( + np.random.default_rng(2).random((5, 3)), columns=["A", "B", "C"] + ) + subset = df[["A", "B"]] + assert isinstance(subset, MySubclassWithMetadata) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 773c1e60e97af..678211ea4a053 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -11,6 +11,10 @@ import pandas._testing as tm from pandas.tests.groupby import get_groupby_method_args +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) + @pytest.mark.parametrize( "obj", diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 5dde863f246d1..7863aa55152f1 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -591,6 +591,9 @@ def test_duplicate_keys_same_frame(): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) @pytest.mark.parametrize( "obj", [ diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 304ec37b1e453..6868f7344c153 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -689,6 +689,9 @@ def test_merge_nan_right2(self): )[["i1", "i2", "i1_", "i3"]] tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) def test_merge_type(self, df, df2): class NotADataFrame(DataFrame): @property diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 1d0d4e3eb554b..cfb4e92fb45cd 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -70,6 +70,9 @@ def test_multigroup(self, 
left, right): result = merge_ordered(left, right, on="key", left_by="group") assert result["group"].notna().all() + @pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) def test_merge_type(self, left, right): class NotADataFrame(DataFrame): @property diff --git a/pandas/tests/series/methods/test_to_frame.py b/pandas/tests/series/methods/test_to_frame.py index 01e547aa34b47..0eadf696b34cc 100644 --- a/pandas/tests/series/methods/test_to_frame.py +++ b/pandas/tests/series/methods/test_to_frame.py @@ -1,3 +1,5 @@ +import pytest + from pandas import ( DataFrame, Index, @@ -40,6 +42,9 @@ def test_to_frame(self, datetime_series): ) tm.assert_frame_equal(rs, xp) + @pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" + ) def test_to_frame_expanddim(self): # GH#9762 diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index a3550c6de6780..c2d5afcf884b1 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -4,6 +4,10 @@ import pandas as pd import pandas._testing as tm +pytestmark = pytest.mark.filterwarnings( + "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning" +) + class TestSeriesSubclassing: @pytest.mark.parametrize( From 16a3b01647c5dca09c6767e880bf5754e01a9a76 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 26 Oct 2023 18:05:45 +0200 Subject: [PATCH 02/32] Backport PR #55691 on branch 2.1.x (REGR: fix read_parquet with column of large strings (avoid overflow from concat)) (#55706) Backport PR #55691: REGR: fix read_parquet with column of large strings (avoid overflow from concat) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/string_.py | 12 ++++++++++-- pandas/tests/io/test_parquet.py | 12 ++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index a59cadde422b9..d2e10b4a42e68 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -28,6 +28,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` where if the option ``compute.use_numba`` was set to True, groupby methods not supported by the numba engine would raise a ``TypeError`` (:issue:`55520`) - Fixed performance regression with wide DataFrames, typically involving methods where all columns were accessed individually (:issue:`55256`, :issue:`55245`) - Fixed regression in :func:`merge_asof` raising ``TypeError`` for ``by`` with datetime and timedelta dtypes (:issue:`55453`) +- Fixed regression in :func:`read_parquet` when reading a file with a string column consisting of more than 2 GB of string data and using the ``"string"`` dtype (:issue:`55606`) - Fixed regression in :meth:`DataFrame.to_sql` not roundtripping datetime columns correctly for sqlite when using ``detect_types`` (:issue:`55554`) - Fixed regression in construction of certain DataFrame or Series subclasses (:issue:`54922`) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 58c638e7f59b5..a41214ae17044 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -223,11 +223,19 @@ def __from_arrow__( # pyarrow.ChunkedArray chunks = array.chunks + results = [] + for arr in chunks: + # convert chunk by chunk to numpy and 
concatenate then, to avoid + # overflow for large string data when concatenating the pyarrow arrays + arr = arr.to_numpy(zero_copy_only=False) + arr = ensure_string_array(arr, na_value=libmissing.NA) + results.append(arr) + if len(chunks) == 0: arr = np.array([], dtype=object) else: - arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False) - arr = ensure_string_array(arr, na_value=libmissing.NA) + arr = np.concatenate(results) + # Bypass validation inside StringArray constructor, see GH#47781 new_string_array = StringArray.__new__(StringArray) NDArrayBacked.__init__( diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 41ae4042e9ae0..1d68f12270b55 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1144,6 +1144,18 @@ def test_infer_string_large_string_type(self, tmp_path, pa): ) tm.assert_frame_equal(result, expected) + # NOTE: this test is not run by default, because it requires a lot of memory (>5GB) + # @pytest.mark.slow + # def test_string_column_above_2GB(self, tmp_path, pa): + # # https://github.com/pandas-dev/pandas/issues/55606 + # # above 2GB of string data + # v1 = b"x" * 100000000 + # v2 = b"x" * 147483646 + # df = pd.DataFrame({"strings": [v1] * 20 + [v2] + ["x"] * 20}, dtype="string") + # df.to_parquet(tmp_path / "test.parquet") + # result = read_parquet(tmp_path / "test.parquet") + # assert result["strings"].dtype == "string" + class TestParquetFastParquet(Base): def test_basic(self, fp, df_full): From a60ad39b4a9febdea9a59d602dad44b1538b0ea5 Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Thu, 26 Oct 2023 12:08:11 -0400 Subject: [PATCH 03/32] RLS: 2.1.2 From d11fa8d3db49d02e36e090d02b1be7eedc8eff56 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 27 Oct 2023 17:04:29 +0200 Subject: [PATCH 04/32] Backport PR #55722 on branch 2.1.x (DOC: Add whatsnew for 2.1.3) (#55723) Backport PR #55722: DOC: Add whatsnew for 2.1.3 Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.1.1.rst | 2 +- doc/source/whatsnew/v2.1.2.rst | 2 ++ doc/source/whatsnew/v2.1.3.rst | 41 ++++++++++++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v2.1.3.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index baab20845a2a2..eef65366a2762 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 2.1 .. toctree:: :maxdepth: 2 + v2.1.3 v2.1.2 v2.1.1 v2.1.0 diff --git a/doc/source/whatsnew/v2.1.1.rst b/doc/source/whatsnew/v2.1.1.rst index c52f7db8ec3ec..6101333ae760c 100644 --- a/doc/source/whatsnew/v2.1.1.rst +++ b/doc/source/whatsnew/v2.1.1.rst @@ -52,4 +52,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.1.0..v2.1.1|HEAD +.. contributors:: v2.1.0..v2.1.1 diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index d2e10b4a42e68..ade7f767815b1 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -66,3 +66,5 @@ Other Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.1.1..v2.1.2 diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst new file mode 100644 index 0000000000000..1359123ef153e --- /dev/null +++ b/doc/source/whatsnew/v2.1.3.rst @@ -0,0 +1,41 @@ +.. 
_whatsnew_213: + +What's new in 2.1.3 (November ??, 2023) +--------------------------------------- + +These are the changes in pandas 2.1.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.bug_fixes: + +Bug fixes +~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.other: + +Other +~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_213.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v2.1.2..v2.1.3|HEAD From c5da5ea15a17a2c0e95495619555b77488e2811b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 27 Oct 2023 18:43:34 +0200 Subject: [PATCH 05/32] Backport PR #55707 on branch 2.1.x (REF: Avoid np.can_cast for scalar inference for NEP 50) (#55716) Backport PR #55707: REF: Avoid np.can_cast for scalar inference for NEP 50 Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- ci/run_tests.sh | 3 ++- pandas/core/dtypes/cast.py | 25 +++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 48ef21686a26f..6a70ea1df3e71 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -10,7 +10,8 @@ echo PYTHONHASHSEED=$PYTHONHASHSEED COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +# TODO: Support NEP 50 and remove NPY_PROMOTION_STATE +PYTEST_CMD="NPY_PROMOTION_STATE=legacy MESONPY_EDITABLE_VERBOSE=1 PYTHONDEVMODE=1 PYTHONWARNDEFAULTENCODING=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8ed57e9bf5532..9a9a8ed22f282 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -700,7 +700,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.integer): - if not np.can_cast(fill_value, dtype): + if not np_can_cast_scalar(fill_value, dtype): # type: ignore[arg-type] # upcast to prevent overflow mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) @@ -1892,4 +1892,25 @@ def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool: """ if not len(rng): return True - return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype) + return np_can_cast_scalar(rng.start, dtype) and np_can_cast_scalar(rng.stop, dtype) + + +def np_can_cast_scalar(element: Scalar, dtype: np.dtype) -> bool: + """ + np.can_cast pandas-equivalent for pre 2-0 behavior that allowed scalar + inference + + Parameters + ---------- + element : Scalar + dtype : np.dtype + + Returns + ------- + bool + """ + try: + np_can_hold_element(dtype, element) + return True + except (LossySetitemError, NotImplementedError): + return False From 08f2bd4c5da3adae7ea1a82a22fbe7b0494a3d59 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" 
<39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 29 Oct 2023 02:43:00 +0100 Subject: [PATCH 06/32] Backport PR #55747 on branch 2.1.x (DOC: Fix release date for 2.1.2) (#55749) Backport PR #55747: DOC: Fix release date for 2.1.2 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index ade7f767815b1..a13a273b01257 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -1,6 +1,6 @@ .. _whatsnew_212: -What's new in 2.1.2 (October 25, 2023) +What's new in 2.1.2 (October 26, 2023) --------------------------------------- These are the changes in pandas 2.1.2. See :ref:`release` for a full changelog From bf9168410cb81cdc87928f637fba6638002cb33a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 1 Nov 2023 23:52:02 +0100 Subject: [PATCH 07/32] Backport PR #55726 on branch 2.1.x (fix segfault with tz_localize_to_utc) (#55796) Backport PR #55726: fix segfault with tz_localize_to_utc Co-authored-by: William Ayd --- pandas/_libs/tslibs/tzconversion.pyx | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index e17d264333264..af27acf16b771 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -332,13 +332,16 @@ timedelta-like} # Shift the delta_idx by if the UTC offset of # the target tz is greater than 0 and we're moving forward # or vice versa - first_delta = info.deltas[0] - if (shift_forward or shift_delta > 0) and first_delta > 0: - delta_idx_offset = 1 - elif (shift_backward or shift_delta < 0) and first_delta < 0: - delta_idx_offset = 1 - else: - delta_idx_offset = 0 + # TODO: delta_idx_offset and info.deltas are needed for zoneinfo timezones, + # but are not applicable for all timezones. 
Setting the former to 0 and + # length checking the latter avoids UB, but this could use a larger refactor + delta_idx_offset = 0 + if len(info.deltas): + first_delta = info.deltas[0] + if (shift_forward or shift_delta > 0) and first_delta > 0: + delta_idx_offset = 1 + elif (shift_backward or shift_delta < 0) and first_delta < 0: + delta_idx_offset = 1 for i in range(n): val = vals[i] From 8a616b901f1f2a3f8fdcbc8a6e4754330abafba1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 6 Nov 2023 23:05:51 +0100 Subject: [PATCH 08/32] Backport PR #55817 on branch 2.1.x (COMPAT: Numpy int64 Windows default for Numpy 2.0) (#55851) Backport PR #55817: COMPAT: Numpy int64 Windows default for Numpy 2.0 Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/compat/numpy/__init__.py | 3 ++- pandas/tests/extension/base/reduce.py | 4 +-- pandas/tests/extension/test_arrow.py | 2 +- pandas/tests/extension/test_masked.py | 29 +++++++++++++++++----- pandas/tests/frame/methods/test_reindex.py | 3 ++- pandas/tests/frame/test_reductions.py | 14 +++++++---- 6 files changed, 39 insertions(+), 16 deletions(-) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 51c9892b64a08..0552301b59400 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -11,6 +11,7 @@ np_version_gte1p24 = _nlv >= Version("1.24") np_version_gte1p24p3 = _nlv >= Version("1.24.3") np_version_gte1p25 = _nlv >= Version("1.25") +np_version_gt2 = _nlv >= Version("2.0.0.dev0") is_numpy_dev = _nlv.dev is not None _min_numpy_ver = "1.22.4" @@ -26,7 +27,7 @@ np_long: type np_ulong: type -if _nlv >= Version("2.0.0.dev0"): +if np_version_gt2: try: with warnings.catch_warnings(): warnings.filterwarnings( diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 43b2df4290eed..91d4855dcefe5 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -39,7 +39,7 @@ def check_reduce(self, s, op_name, skipna): expected = exp_op(skipna=skipna) tm.assert_almost_equal(result, expected) - def _get_expected_reduction_dtype(self, arr, op_name: str): + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): # Find the expected dtype when the given reduction is done on a DataFrame # column with this array. The default assumes float64-like behavior, # i.e. retains the dtype. @@ -58,7 +58,7 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): kwargs = {"ddof": 1} if op_name in ["var", "std"] else {} - cmp_dtype = self._get_expected_reduction_dtype(arr, op_name) + cmp_dtype = self._get_expected_reduction_dtype(arr, op_name, skipna) # The DataFrame method just calls arr._reduce with keepdims=True, # so this first check is perfunctory. 
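For context on the np_version_gt2 gating added in this patch: NumPy 2.0 makes the default integer 64-bit on 64-bit Windows, so reductions over small integer dtypes no longer upcast to a 32-bit result there, which is why the expected Int32/UInt32 dtypes in these tests become Int64/UInt64 once NumPy >= 2 is in use. A minimal illustrative sketch, not part of the diff (the printed dtype depends on platform, bitness and NumPy version):

    import numpy as np

    arr = np.array([1, 2, 3], dtype=np.int8)
    # Summing a small integer dtype upcasts to NumPy's default integer:
    # int32 on Windows with NumPy < 2 (and on 32-bit builds), int64 otherwise.
    print(arr.sum().dtype)
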
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fef4fbea2e485..61474aa94d1c8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -543,7 +543,7 @@ def test_reduce_series_boolean( return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna) - def _get_expected_reduction_dtype(self, arr, op_name: str): + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if op_name in ["max", "min"]: cmp_dtype = arr.dtype elif arr.dtype.name == "decimal128(7, 3)[pyarrow]": diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index c4195be8ea121..588a2fb58d9be 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -20,6 +20,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas as pd import pandas._testing as tm @@ -40,7 +41,7 @@ ) from pandas.tests.extension import base -is_windows_or_32bit = is_platform_windows() or not IS64 +is_windows_or_32bit = (is_platform_windows() and not np_version_gt2) or not IS64 pytestmark = [ pytest.mark.filterwarnings( @@ -325,7 +326,7 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = pd.NA tm.assert_almost_equal(result, expected) - def _get_expected_reduction_dtype(self, arr, op_name: str): + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if tm.is_float_dtype(arr.dtype): cmp_dtype = arr.dtype.name elif op_name in ["mean", "median", "var", "std", "skew"]: @@ -335,16 +336,32 @@ def _get_expected_reduction_dtype(self, arr, op_name: str): elif arr.dtype in ["Int64", "UInt64"]: cmp_dtype = arr.dtype.name elif tm.is_signed_integer_dtype(arr.dtype): - cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" + # TODO: Why does Window Numpy 2.0 dtype depend on skipna? 
+ cmp_dtype = ( + "Int32" + if (is_platform_windows() and (not np_version_gt2 or not skipna)) + or not IS64 + else "Int64" + ) elif tm.is_unsigned_integer_dtype(arr.dtype): - cmp_dtype = "UInt32" if is_windows_or_32bit else "UInt64" + cmp_dtype = ( + "UInt32" + if (is_platform_windows() and (not np_version_gt2 or not skipna)) + or not IS64 + else "UInt64" + ) elif arr.dtype.kind == "b": if op_name in ["mean", "median", "var", "std", "skew"]: cmp_dtype = "Float64" elif op_name in ["min", "max"]: cmp_dtype = "boolean" elif op_name in ["sum", "prod"]: - cmp_dtype = "Int32" if is_windows_or_32bit else "Int64" + cmp_dtype = ( + "Int32" + if (is_platform_windows() and (not np_version_gt2 or not skipna)) + or not IS64 + else "Int64" + ) else: raise TypeError("not supposed to reach this") else: @@ -360,7 +377,7 @@ def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool): # overwrite to ensure pd.NA is tested instead of np.nan # https://github.com/pandas-dev/pandas/issues/30958 length = 64 - if not IS64 or is_platform_windows(): + if is_windows_or_32bit: # Item "ExtensionDtype" of "Union[dtype[Any], ExtensionDtype]" has # no attribute "itemsize" if not ser.dtype.itemsize == 8: # type: ignore[union-attr] diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index 0858e33a989b7..678fec835aa82 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -12,6 +12,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas.util._test_decorators as td import pandas as pd @@ -131,7 +132,7 @@ class TestDataFrameSelectReindex: # test_indexing @pytest.mark.xfail( - not IS64 or is_platform_windows(), + not IS64 or (is_platform_windows() and not np_version_gt2), reason="Passes int32 values to DatetimeArray in make_na_array on " "windows, 32bit linux builds", ) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 61d7c125ee5e7..bec1fcd1e7462 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -10,6 +10,7 @@ IS64, is_platform_windows, ) +from pandas.compat.numpy import np_version_gt2 import pandas.util._test_decorators as td import pandas as pd @@ -32,6 +33,7 @@ nanops, ) +is_windows_np2_or_is32 = (is_platform_windows() and not np_version_gt2) or not IS64 is_windows_or_is32 = is_platform_windows() or not IS64 @@ -1766,13 +1768,13 @@ def test_df_empty_min_count_1(self, opname, dtype, exp_dtype): @pytest.mark.parametrize( "opname, dtype, exp_value, exp_dtype", [ - ("sum", "Int8", 0, ("Int32" if is_windows_or_is32 else "Int64")), - ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), - ("prod", "Int8", 1, ("Int32" if is_windows_or_is32 else "Int64")), + ("sum", "Int8", 0, ("Int32" if is_windows_np2_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")), + ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")), ("sum", "Int64", 0, "Int64"), ("prod", "Int64", 1, "Int64"), - ("sum", "UInt8", 0, ("UInt32" if is_windows_or_is32 else "UInt64")), - ("prod", "UInt8", 1, ("UInt32" if is_windows_or_is32 else "UInt64")), + ("sum", "UInt8", 0, ("UInt32" if is_windows_np2_or_is32 else "UInt64")), + ("prod", "UInt8", 1, ("UInt32" if is_windows_np2_or_is32 else "UInt64")), ("sum", "UInt64", 0, "UInt64"), ("prod", "UInt64", 1, "UInt64"), ("sum", "Float32", 0, "Float32"), @@ -1787,6 +1789,8 @@ def 
test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype expected = Series([exp_value, exp_value], dtype=exp_dtype) tm.assert_series_equal(result, expected) + # TODO: why does min_count=1 impact the resulting Windows dtype + # differently than min_count=0? @pytest.mark.parametrize( "opname, dtype, exp_dtype", [ From 0bf70c7cf6ff0cc3fd18f44f450bef7e527f984c Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 6 Nov 2023 17:31:11 -0500 Subject: [PATCH 09/32] Backport PR #55761 on branch 2.1.x (BUG: DatetimeIndex.diff raising TypeError) (#55819) * Backport PR #55473: TST: sort index with sliced MultiIndex * BUG: DatetimeIndex.diff raising TypeError (#55761) * BUG: DatetimeIndex.diff raising TypeError * add test and whatsnew * typing * use Index (cherry picked from commit f04da3c1c9f5fd81ce7a04e55a965e4021de2ae9) * fix whatsnew --- doc/source/whatsnew/v2.1.3.rst | 2 +- pandas/core/indexes/base.py | 5 +++-- pandas/tests/indexes/test_datetimelike.py | 8 ++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 1359123ef153e..3b1cd1c152baa 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -21,7 +21,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b4ef5380d2b30..85b68c1bc2ec7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -7024,7 +7024,8 @@ def infer_objects(self, copy: bool = True) -> Index: result._references.add_index_reference(result) return result - def diff(self, periods: int = 1): + @final + def diff(self, periods: int = 1) -> Index: """ Computes the difference between consecutive values in the Index object. 
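The hunk below holds the actual fix: returning a plain Index instead of routing the result through self._constructor. For a DatetimeIndex the consecutive differences are timedeltas, and feeding them back into the DatetimeIndex constructor is what raised the TypeError reported in GH 55080. A small usage sketch mirroring the new test, not part of the diff:

    import pandas as pd

    dti = pd.to_datetime([10, 20, 30], unit="s").as_unit("s")
    # With the fix this returns TimedeltaIndex([NaT, '0 days 00:00:10', '0 days 00:00:10']);
    # before it, DatetimeIndex.diff raised TypeError.
    print(dti.diff(1))
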
@@ -7050,7 +7051,7 @@ def diff(self, periods: int = 1): Index([nan, 10.0, 10.0, 10.0, 10.0], dtype='float64') """ - return self._constructor(self.to_series().diff(periods)) + return Index(self.to_series().diff(periods)) def round(self, decimals: int = 0): """ diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 71cc7f29c62bc..5ad2e9b2f717e 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -159,3 +159,11 @@ def test_where_cast_str(self, simple_index): result = index.where(mask, ["foo"]) tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"]) + def test_diff(self, unit): + # GH 55080 + dti = pd.to_datetime([10, 20, 30], unit=unit).as_unit(unit) + result = dti.diff(1) + expected = pd.TimedeltaIndex([pd.NaT, 10, 10], unit=unit).as_unit(unit) + tm.assert_index_equal(result, expected) From 7702f416948134d268274dd33f720151e19f3f76 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 Nov 2023 05:46:11 -1000 Subject: [PATCH 10/32] Backport PR #55853: DEPS: Use ipython run_cell instead of run_code; remove pytest-asyncio (#55858) --- .github/workflows/unit-tests.yml | 6 +++--- .github/workflows/wheels.yml | 2 +- ci/deps/actions-310.yaml | 1 - ci/deps/actions-311-downstream_compat.yaml | 1 - ci/deps/actions-311-numpydev.yaml | 1 - ci/deps/actions-311-pyarrownightly.yaml | 1 - ci/deps/actions-311.yaml | 1 - ci/deps/actions-39-minimum_versions.yaml | 1 - ci/deps/actions-39.yaml | 1 - ci/deps/actions-pypy-39.yaml | 1 - ci/deps/circle-310-arm64.yaml | 1 - ci/meta.yaml | 1 - environment.yml | 1 - pandas/tests/arrays/categorical/test_warnings.py | 7 ++----- pandas/tests/frame/test_api.py | 8 ++------ pandas/tests/indexes/test_base.py | 6 ++---- pandas/tests/resample/test_resampler_grouper.py | 6 ++---- pandas/util/_test_decorators.py | 12 +----------- pyproject.toml | 6 ++---- requirements-dev.txt | 1 - scripts/tests/data/deps_expected_random.yaml | 1 - scripts/tests/data/deps_minimum.toml | 6 ++---- scripts/tests/data/deps_unmodified_random.yaml | 1 - 23 files changed, 17 insertions(+), 56 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 8ff7d290ea9d1..14abcdbfdb310 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -236,7 +236,7 @@ jobs: . ~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install numpy --config-settings=setup-args="-Dallow-noblas=true" - python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir export PANDAS_CI=1 @@ -274,7 +274,7 @@ jobs: /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev . 
~/virtualenvs/pandas-dev/bin/activate python -m pip install --no-cache-dir -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.2.1 - python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 hypothesis>=6.46.1 python -m pip install --no-cache-dir --no-build-isolation -e . python -m pip list --no-cache-dir @@ -347,7 +347,7 @@ jobs: python -m pip install --upgrade pip setuptools wheel meson[ninja]==1.2.1 meson-python==0.13.1 python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip install python-dateutil pytz tzdata cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov python -m pip install -ve . --no-build-isolation --no-index --no-deps python -m pip list diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 6759f7b82eacf..95b48ee4b7426 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -168,7 +168,7 @@ jobs: shell: pwsh run: | $TST_CMD = @" - python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; + python -m pip install hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0; python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[\"`\"--no-strict-data-files`\"\", \"`\"-m not clipboard and not single_cpu and not slow and not network and not db`\"\"])`'; "@ diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index f8fa04a60a378..82f41811bf56e 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index 8bf5ea51c4bf3..8422aba5239f9 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -15,7 +15,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml index 9a6c05a138f9b..695bd483fc572 100644 --- a/ci/deps/actions-311-numpydev.yaml +++ b/ci/deps/actions-311-numpydev.yaml @@ -18,7 +18,6 @@ dependencies: # causes an InternalError within pytest - pytest-xdist>=2.2.0, <3 - hypothesis>=6.46.1 - - pytest-asyncio>=0.17.0 # pandas dependencies - python-dateutil diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index d7e5523dd2545..b5df68968ea46 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -15,7 +15,6 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - hypothesis>=6.46.1 - - pytest-asyncio>=0.17.0 # required dependencies - python-dateutil diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 7f8cca9e65fe1..cb1771c485297 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - 
pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index 5d34940aa27e1..e4db8cb0a540b 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -16,7 +16,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index f5ea116e51dd4..745781298b86a 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index a489690776be3..ce02ea2b9a090 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -16,7 +16,6 @@ dependencies: # test dependencies - pytest>=7.3.2 - pytest-cov - - pytest-asyncio>=0.17.0 - pytest-xdist>=2.2.0 - hypothesis>=6.46.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index a71523e8c3579..08a422c6d538a 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -14,7 +14,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - pytest-localserver>=0.7.1 - boto3 diff --git a/ci/meta.yaml b/ci/meta.yaml index 77f536a18a204..5b343f60b3dce 100644 --- a/ci/meta.yaml +++ b/ci/meta.yaml @@ -64,7 +64,6 @@ test: requires: - pip - pytest >=7.3.2 - - pytest-asyncio >=0.17.0 - pytest-xdist >=2.2.0 - pytest-cov - hypothesis >=6.46.1 diff --git a/environment.yml b/environment.yml index 98c39378d1ae1..3f6f0de68748b 100644 --- a/environment.yml +++ b/environment.yml @@ -16,7 +16,6 @@ dependencies: - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - coverage # required dependencies diff --git a/pandas/tests/arrays/categorical/test_warnings.py b/pandas/tests/arrays/categorical/test_warnings.py index c0623faccd325..68c59706a6c3b 100644 --- a/pandas/tests/arrays/categorical/test_warnings.py +++ b/pandas/tests/arrays/categorical/test_warnings.py @@ -1,19 +1,16 @@ import pytest -from pandas.util._test_decorators import async_mark - import pandas._testing as tm class TestCategoricalWarnings: - @async_mark() - async def test_tab_complete_warning(self, ip): + def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; c = pd.Categorical([])" - await ip.run_code(code) + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 8fc78629beb0a..aa7aa8964a059 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -7,8 +7,6 @@ from pandas._config.config import option_context -from pandas.util._test_decorators import async_mark - import pandas as pd from pandas import ( DataFrame, @@ -288,8 +286,7 @@ def _check_f(base, f): f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(d.copy(), f) - @async_mark() - async def test_tab_complete_warning(self, ip, frame_or_series): + def test_tab_complete_warning(self, ip, frame_or_series): # GH 16409 pytest.importorskip("IPython", minversion="6.0.0") from 
IPython.core.completer import provisionalcompleter @@ -299,8 +296,7 @@ async def test_tab_complete_warning(self, ip, frame_or_series): else: code = "from pandas import Series; obj = Series(dtype=object)" - await ip.run_code(code) - + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 with tm.assert_produces_warning(None, raise_on_extra_warnings=False): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 8fd1e296fb79a..da4b44227bef3 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -10,7 +10,6 @@ from pandas.compat import IS64 from pandas.errors import InvalidIndexError -from pandas.util._test_decorators import async_mark from pandas.core.dtypes.common import ( is_any_real_numeric_dtype, @@ -1218,14 +1217,13 @@ def test_cached_properties_not_settable(self): with pytest.raises(AttributeError, match="Can't set attribute"): index.is_unique = False - @async_mark() - async def test_tab_complete_warning(self, ip): + def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; idx = pd.Index([1, 2])" - await ip.run_code(code) + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 6abdde0f5ccff..b5288c793dafa 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -4,7 +4,6 @@ import pytest from pandas.compat import is_platform_windows -from pandas.util._test_decorators import async_mark import pandas as pd from pandas import ( @@ -26,8 +25,7 @@ def test_frame(): ) -@async_mark() -async def test_tab_complete_ipython6_warning(ip): +def test_tab_complete_ipython6_warning(ip): from IPython.core.completer import provisionalcompleter code = dedent( @@ -37,7 +35,7 @@ async def test_tab_complete_ipython6_warning(ip): rs = s.resample("D") """ ) - await ip.run_code(code) + ip.run_cell(code) # GH 31324 newer jedi version raises Deprecation warning; # appears resolved 2021-02-02 diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 03011a1ffe622..93c46274fdc1b 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -38,11 +38,11 @@ def test_foo(): if TYPE_CHECKING: from pandas._typing import F + from pandas.compat import ( IS64, is_platform_windows, ) -from pandas.compat._optional import import_optional_dependency from pandas.core.computation.expressions import ( NUMEXPR_INSTALLED, @@ -214,16 +214,6 @@ def documented_fixture(fixture): return documented_fixture -def async_mark(): - try: - import_optional_dependency("pytest_asyncio") - async_mark = pytest.mark.asyncio - except ImportError: - async_mark = pytest.mark.skip(reason="Missing dependency pytest-asyncio") - - return async_mark - - def mark_array_manager_not_yet_implemented(request) -> None: mark = pytest.mark.xfail(reason="Not yet implemented for ArrayManager") request.node.add_marker(mark) diff --git a/pyproject.toml b/pyproject.toml index e9e3e2f67e2f1..d7634894b4835 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,7 @@ repository = 'https://github.com/pandas-dev/pandas' matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] -test = 
['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0'] +test = ['hypothesis>=6.46.1', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] performance = ['bottleneck>=1.3.4', 'numba>=0.55.2', 'numexpr>=2.8.0'] computation = ['scipy>=1.8.1', 'xarray>=2022.03.0'] fss = ['fsspec>=2022.05.0'] @@ -109,7 +109,6 @@ all = ['beautifulsoup4>=4.11.1', 'pyreadstat>=1.1.5', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', - 'pytest-asyncio>=0.17.0', 'pyxlsb>=1.0.9', 'qtpy>=2.2.0', 'scipy>=1.8.1', @@ -151,7 +150,7 @@ setup = ['--vsenv'] # For Windows skip = "cp36-* cp37-* cp38-* pp* *_i686 *_ppc64le *_s390x *-musllinux_aarch64" build-verbosity = "3" environment = {LDFLAGS="-Wl,--strip-all"} -test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17" +test-requires = "hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0" test-command = """ PANDAS_CI='1' python -c 'import pandas as pd; \ pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db", "-n 2", "--no-strict-data-files"]); \ @@ -514,7 +513,6 @@ markers = [ "arm_slow: mark a test as slow for arm64 architecture", "arraymanager: mark a test to run with ArrayManager enabled", ] -asyncio_mode = "strict" [tool.mypy] # Import discovery diff --git a/requirements-dev.txt b/requirements-dev.txt index 855eaccfd4f5f..af4f82b068d31 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,7 +9,6 @@ meson-python==0.13.1 pytest>=7.3.2 pytest-cov pytest-xdist>=2.2.0 -pytest-asyncio>=0.17.0 coverage python-dateutil numpy<2 diff --git a/scripts/tests/data/deps_expected_random.yaml b/scripts/tests/data/deps_expected_random.yaml index c70025f8f019d..bebb25b9e1bb8 100644 --- a/scripts/tests/data/deps_expected_random.yaml +++ b/scripts/tests/data/deps_expected_random.yaml @@ -14,7 +14,6 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - psutil - - pytest-asyncio>=0.17.0 - boto3 # required dependencies diff --git a/scripts/tests/data/deps_minimum.toml b/scripts/tests/data/deps_minimum.toml index b43815a982139..5fb51856e841e 100644 --- a/scripts/tests/data/deps_minimum.toml +++ b/scripts/tests/data/deps_minimum.toml @@ -55,7 +55,7 @@ repository = 'https://github.com/pandas-dev/pandas' matplotlib = "pandas:plotting._matplotlib" [project.optional-dependencies] -test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', 'pytest-asyncio>=0.17.0'] +test = ['hypothesis>=6.34.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0'] performance = ['bottleneck>=1.3.2', 'numba>=0.53.1', 'numexpr>=2.7.1'] timezone = ['tzdata>=2022.1'] computation = ['scipy>=1.7.1', 'xarray>=0.21.0'] @@ -102,7 +102,6 @@ all = ['beautifulsoup4>=5.9.3', 'pyreadstat>=1.1.2', 'pytest>=7.3.2', 'pytest-xdist>=2.2.0', - 'pytest-asyncio>=0.17.0', 'pyxlsb>=1.0.8', 'qtpy>=2.2.0', 'scipy>=1.7.1', @@ -141,7 +140,7 @@ parentdir_prefix = "pandas-" [tool.cibuildwheel] skip = "cp36-* cp37-* pp37-* *-manylinux_i686 *_ppc64le *_s390x *-musllinux*" build-verbosity = "3" -test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17" +test-requires = "hypothesis>=6.34.2 pytest>=7.3.2 pytest-xdist>=2.2.0" test-command = "python {project}/ci/test_wheels.py" [tool.cibuildwheel.macos] @@ -385,7 +384,6 @@ markers = [ "arm_slow: mark a test as slow for arm64 architecture", "arraymanager: mark a test to run with ArrayManager enabled", ] -asyncio_mode = "strict" [tool.mypy] # Import discovery diff --git a/scripts/tests/data/deps_unmodified_random.yaml b/scripts/tests/data/deps_unmodified_random.yaml index 
503eb3c7c7734..e54482282fcc8 100644 --- a/scripts/tests/data/deps_unmodified_random.yaml +++ b/scripts/tests/data/deps_unmodified_random.yaml @@ -14,7 +14,6 @@ dependencies: - pytest-cov - pytest-xdist>=2.2.0 - psutil - - pytest-asyncio>=0.17 - boto3 # required dependencies From c9854d9821b2b7e73f88a3c2d6055c8afcf5dff7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 Nov 2023 10:07:46 -1000 Subject: [PATCH 11/32] TST: Make read_csv tests pyarrow 13 compatable on 2.1.x (#55855) * TST: Make read_csv tests pyarrow 13 compatable on 2.1.x * Skip on windows for ARROW_TIMEZONE_DATABASE --- pandas/tests/interchange/test_impl.py | 16 +++++++++++++++- .../tests/io/parser/common/test_common_basic.py | 8 ++++++-- pandas/tests/io/parser/test_parse_dates.py | 8 +++++++- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 8a25a2c1889f3..61d3edcbb2964 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -4,6 +4,10 @@ import pytest from pandas._libs.tslibs import iNaT +from pandas.compat import ( + is_ci_environment, + is_platform_windows, +) import pandas.util._test_decorators as td import pandas as pd @@ -309,11 +313,21 @@ def test_datetimetzdtype(tz, unit): tm.assert_frame_equal(df, from_dataframe(df.__dataframe__())) -def test_interchange_from_non_pandas_tz_aware(): +def test_interchange_from_non_pandas_tz_aware(request): # GH 54239, 54287 pa = pytest.importorskip("pyarrow", "11.0.0") import pyarrow.compute as pc + if is_platform_windows() and is_ci_environment(): + mark = pytest.mark.xfail( + raises=pa.ArrowInvalid, + reason=( + "TODO: Set ARROW_TIMEZONE_DATABASE environment variable " + "on CI to path to the tzdata for pyarrow." 
+ ), + ) + request.node.add_marker(mark) + arr = pa.array([datetime(2020, 1, 1), None, datetime(2020, 1, 2)]) arr = pc.assume_timezone(arr, "Asia/Kathmandu") table = pa.table({"arr": arr}) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 00a26a755756f..442aa5ef87b10 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -86,7 +86,9 @@ def test_read_csv_local(all_parsers, csv1): fname = prefix + str(os.path.abspath(csv1)) result = parser.read_csv(fname, index_col=0, parse_dates=True) - + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -177,7 +179,9 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): def test_read_csv_dataframe(all_parsers, csv1): parser = all_parsers result = parser.read_csv(csv1, index_col=0, parse_dates=True) - + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 9f7840588f89e..c6afff56de16b 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -979,12 +979,15 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): ) -def test_parse_tz_aware(all_parsers, request): +def test_parse_tz_aware(all_parsers): # See gh-1693 parser = all_parsers data = "Date,x\n2012-06-13T01:39:00Z,0.5" result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result.index = result.index.as_unit("ns") expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) @@ -2231,6 +2234,9 @@ def test_parse_dates_arrow_engine(all_parsers): 2000-01-01 00:00:01,1""" result = parser.read_csv(StringIO(data), parse_dates=["a"]) + # TODO: make unit check more specific + if parser.engine == "pyarrow": + result["a"] = result["a"].dt.as_unit("ns") expected = DataFrame( { "a": [ From 569f904d45aeb5d0abb925e9875140afd61e3da4 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 8 Nov 2023 01:08:46 +0100 Subject: [PATCH 12/32] Backport PR #55227 on branch 2.1.x (BUG: Interchange object data buffer has the wrong dtype / from_dataframe incorrect ) (#55863) Backport PR #55227: BUG: Interchange object data buffer has the wrong dtype / from_dataframe incorrect Co-authored-by: Marco Edward Gorelli Co-authored-by: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> --- pandas/core/interchange/from_dataframe.py | 16 ++++++++-------- pandas/tests/interchange/test_impl.py | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/pandas/core/interchange/from_dataframe.py b/pandas/core/interchange/from_dataframe.py index 214fbf9f36435..d45ae37890ba7 100644 --- a/pandas/core/interchange/from_dataframe.py +++ b/pandas/core/interchange/from_dataframe.py @@ -266,10 +266,9 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]: assert buffers["offsets"], "String buffers must contain offsets" # Retrieve the data buffer containing the UTF-8 code units - data_buff, protocol_data_dtype = 
buffers["data"] + data_buff, _ = buffers["data"] # We're going to reinterpret the buffer as uint8, so make sure we can do it safely - assert protocol_data_dtype[1] == 8 - assert protocol_data_dtype[2] in ( + assert col.dtype[2] in ( ArrowCTypes.STRING, ArrowCTypes.LARGE_STRING, ) # format_str == utf-8 @@ -377,15 +376,16 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any """ buffers = col.get_buffers() - _, _, format_str, _ = col.dtype - dbuf, dtype = buffers["data"] + _, col_bit_width, format_str, _ = col.dtype + dbuf, _ = buffers["data"] # Consider dtype being `uint` to get number of units passed since the 01.01.1970 + data = buffer_to_ndarray( dbuf, ( - DtypeKind.UINT, - dtype[1], - getattr(ArrowCTypes, f"UINT{dtype[1]}"), + DtypeKind.INT, + col_bit_width, + getattr(ArrowCTypes, f"INT{col_bit_width}"), Endianness.NATIVE, ), offset=col.offset, diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 61d3edcbb2964..97a388569e261 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -18,6 +18,7 @@ DtypeKind, ) from pandas.core.interchange.from_dataframe import from_dataframe +from pandas.core.interchange.utils import ArrowCTypes @pytest.fixture @@ -340,3 +341,24 @@ def test_interchange_from_non_pandas_tz_aware(request): dtype="datetime64[us, Asia/Kathmandu]", ) tm.assert_frame_equal(expected, result) + + +def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: + # https://github.com/pandas-dev/pandas/issues/54781 + df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__() + interchange = df.__dataframe__() + column = interchange.get_column_by_name("a") + buffers = column.get_buffers() + buffers_data = buffers["data"] + buffer_dtype = buffers_data[1] + buffer_dtype = ( + DtypeKind.UINT, + 8, + ArrowCTypes.UINT8, + buffer_dtype[3], + ) + buffers["data"] = (buffers_data[0], buffer_dtype) + column.get_buffers = lambda: buffers + interchange.get_column_by_name = lambda _: column + monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange) + pd.api.interchange.from_dataframe(df) From de8af3c3f4fe9552769b10e458678ff12937bcd0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 9 Nov 2023 18:02:19 +0100 Subject: [PATCH 13/32] Backport PR #55427 on branch 2.1.x (DOC: Remove outdated docs about NumPy's broadcasting) (#55896) Backport PR #55427: DOC: Remove outdated docs about NumPy's broadcasting Co-authored-by: cobalt <61329810+RedGuy12@users.noreply.github.com> --- doc/source/user_guide/basics.rst | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 2e299da5e5794..d3c9d83b943ce 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -408,20 +408,6 @@ raise a ValueError: pd.Series(['foo', 'bar', 'baz']) == pd.Series(['foo']) -Note that this is different from the NumPy behavior where a comparison can -be broadcast: - -.. ipython:: python - - np.array([1, 2, 3]) == np.array([2]) - -or it can return False if broadcasting can not be done: - -.. 
ipython:: python - :okwarning: - - np.array([1, 2, 3]) == np.array([1, 2]) - Combining overlapping data sets ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 41586668d290d83910c9adcbade49a911025016b Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 9 Nov 2023 21:23:57 +0100 Subject: [PATCH 14/32] Backport PR #55764 on branch 2.1.x (REGR: fix return class in _constructor_from_mgr for simple subclasses) (#55892) Backport PR #55764: REGR: fix return class in _constructor_from_mgr for simple subclasses Co-authored-by: Isaac Virshup --- doc/source/whatsnew/v2.1.3.rst | 2 +- pandas/core/frame.py | 2 +- pandas/core/series.py | 2 +- pandas/tests/frame/test_subclass.py | 41 +++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 3b1cd1c152baa..264cd440907fd 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -13,7 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed infinite recursion from operations that return a new object on some DataFrame subclasses (:issue:`55763`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c109070ce461d..605cf49856e16 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -641,7 +641,7 @@ def _constructor(self) -> Callable[..., DataFrame]: def _constructor_from_mgr(self, mgr, axes): if self._constructor is DataFrame: # we are pandas.DataFrame (or a subclass that doesn't override _constructor) - return self._from_mgr(mgr, axes=axes) + return DataFrame._from_mgr(mgr, axes=axes) else: assert axes is mgr.axes return self._constructor(mgr) diff --git a/pandas/core/series.py b/pandas/core/series.py index 8ad049e173507..7b22d89bfe22d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -581,7 +581,7 @@ def _constructor(self) -> Callable[..., Series]: def _constructor_from_mgr(self, mgr, axes): if self._constructor is Series: # we are pandas.Series (or a subclass that doesn't override _constructor) - ser = self._from_mgr(mgr, axes=axes) + ser = Series._from_mgr(mgr, axes=axes) ser._name = None # caller is responsible for setting real name return ser else: diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 8e657f197ca1e..ef78ae62cb4d6 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -771,3 +771,44 @@ def test_constructor_with_metadata(): ) subset = df[["A", "B"]] assert isinstance(subset, MySubclassWithMetadata) + + +class SimpleDataFrameSubClass(DataFrame): + """A subclass of DataFrame that does not define a constructor.""" + + +class SimpleSeriesSubClass(Series): + """A subclass of Series that does not define a constructor.""" + + +class TestSubclassWithoutConstructor: + def test_copy_df(self): + expected = DataFrame({"a": [1, 2, 3]}) + result = SimpleDataFrameSubClass(expected).copy() + + assert ( + type(result) is DataFrame + ) # assert_frame_equal only checks isinstance(lhs, type(rhs)) + tm.assert_frame_equal(result, expected) + + def test_copy_series(self): + expected = Series([1, 2, 3]) + result = SimpleSeriesSubClass(expected).copy() + + tm.assert_series_equal(result, expected) + + def test_series_to_frame(self): + orig = Series([1, 2, 3]) + expected = orig.to_frame() + result = SimpleSeriesSubClass(orig).to_frame() + + assert ( + 
type(result) is DataFrame + ) # assert_frame_equal only checks isinstance(lhs, type(rhs)) + tm.assert_frame_equal(result, expected) + + def test_groupby(self): + df = SimpleDataFrameSubClass(DataFrame({"a": [1, 2, 3]})) + + for _, v in df.groupby("a"): + assert type(v) is DataFrame From 1691a51ce7d6028ec48ef4f41709f8edce346ab9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Nov 2023 00:24:32 +0100 Subject: [PATCH 15/32] Backport PR #55894 on 2.1.x: Parquet/Feather IO: disable PyExtensionType autoload (#55900) Parquet/Feather IO: disable PyExtensionType autoload (#55894) * Parquet/Feather IO: disable PyExtensionType autoload * don't install hotfix for pyarrow >= 14.0.1 * move patching to extension type definitions * expand error message * fix compat for pyarrow not installed * add whatsnew (cherry picked from commit 851fea0ea38985cd7d5e0a3b7a7a7539b5883307) --- doc/source/whatsnew/v2.1.3.rst | 3 +- pandas/compat/__init__.py | 2 + pandas/compat/pyarrow.py | 2 + pandas/core/arrays/arrow/extension_types.py | 60 +++++++++++++++++++++ pandas/io/feather_format.py | 3 ++ 5 files changed, 69 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 264cd440907fd..531559c259b44 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -22,7 +22,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DatetimeIndex.diff` raising ``TypeError`` (:issue:`55080`) -- +- Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) +- Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 `__ (:issue:`55894`) .. --------------------------------------------------------------------------- .. _whatsnew_213.other: diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 684e9dccdc0f9..6896945344b24 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -31,6 +31,7 @@ pa_version_under11p0, pa_version_under13p0, pa_version_under14p0, + pa_version_under14p1, ) if TYPE_CHECKING: @@ -188,6 +189,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under11p0", "pa_version_under13p0", "pa_version_under14p0", + "pa_version_under14p1", "IS64", "ISMUSL", "PY310", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 12f58be109d98..96501b47e07f6 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -16,6 +16,7 @@ pa_version_under12p0 = _palv < Version("12.0.0") pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") + pa_version_under14p1 = _palv < Version("14.0.1") except ImportError: pa_version_under7p0 = True pa_version_under8p0 = True @@ -25,3 +26,4 @@ pa_version_under12p0 = True pa_version_under13p0 = True pa_version_under14p0 = True + pa_version_under14p1 = True diff --git a/pandas/core/arrays/arrow/extension_types.py b/pandas/core/arrays/arrow/extension_types.py index 3249c1c829546..36d536bf86847 100644 --- a/pandas/core/arrays/arrow/extension_types.py +++ b/pandas/core/arrays/arrow/extension_types.py @@ -5,6 +5,8 @@ import pyarrow +from pandas.compat import pa_version_under14p1 + from pandas.core.dtypes.dtypes import ( IntervalDtype, PeriodDtype, @@ -112,3 +114,61 @@ def to_pandas_dtype(self): # register the type with a dummy instance _interval_type = ArrowIntervalType(pyarrow.int64(), "left") pyarrow.register_extension_type(_interval_type) + + +_ERROR_MSG = """\ +Disallowed deserialization of 'arrow.py_extension_type': 
+storage_type = {storage_type} +serialized = {serialized} +pickle disassembly:\n{pickle_disassembly} + +Reading of untrusted Parquet or Feather files with a PyExtensionType column +allows arbitrary code execution. +If you trust this file, you can enable reading the extension type by one of: + +- upgrading to pyarrow >= 14.0.1, and call `pa.PyExtensionType.set_auto_load(True)` +- install pyarrow-hotfix (`pip install pyarrow-hotfix`) and disable it by running + `import pyarrow_hotfix; pyarrow_hotfix.uninstall()` + +We strongly recommend updating your Parquet/Feather files to use extension types +derived from `pyarrow.ExtensionType` instead, and register this type explicitly. +""" + + +def patch_pyarrow(): + # starting from pyarrow 14.0.1, it has its own mechanism + if not pa_version_under14p1: + return + + # if https://github.com/pitrou/pyarrow-hotfix was installed and enabled + if getattr(pyarrow, "_hotfix_installed", False): + return + + class ForbiddenExtensionType(pyarrow.ExtensionType): + def __arrow_ext_serialize__(self): + return b"" + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + import io + import pickletools + + out = io.StringIO() + pickletools.dis(serialized, out) + raise RuntimeError( + _ERROR_MSG.format( + storage_type=storage_type, + serialized=serialized, + pickle_disassembly=out.getvalue(), + ) + ) + + pyarrow.unregister_extension_type("arrow.py_extension_type") + pyarrow.register_extension_type( + ForbiddenExtensionType(pyarrow.null(), "arrow.py_extension_type") + ) + + pyarrow._hotfix_installed = True + + +patch_pyarrow() diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index c463f6e4d2759..b018b5720d126 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -117,6 +117,9 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather + # import utils to register the pyarrow extension types + import pandas.core.arrays.arrow.extension_types # pyright: ignore[reportUnusedImport] # noqa: F401,E501 + check_dtype_backend(dtype_backend) with get_handle( From 92ce245e5d84e05a2943966e328b4e6b632bc8e2 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 10 Nov 2023 14:54:05 +0100 Subject: [PATCH 16/32] Backport PR #55907 on branch 2.1.x (DOC: Add release date for 2.1.3) (#55913) Backport PR #55907: DOC: Add release date for 2.1.3 Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/v2.1.3.rst | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v2.1.3.rst b/doc/source/whatsnew/v2.1.3.rst index 531559c259b44..af626895a9e0e 100644 --- a/doc/source/whatsnew/v2.1.3.rst +++ b/doc/source/whatsnew/v2.1.3.rst @@ -1,6 +1,6 @@ .. _whatsnew_213: -What's new in 2.1.3 (November ??, 2023) +What's new in 2.1.3 (November 10, 2023) --------------------------------------- These are the changes in pandas 2.1.3. See :ref:`release` for a full changelog @@ -14,7 +14,6 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed infinite recursion from operations that return a new object on some DataFrame subclasses (:issue:`55763`) -- .. --------------------------------------------------------------------------- .. 
_whatsnew_213.bug_fixes: @@ -25,14 +24,6 @@ Bug fixes - Bug in :meth:`Index.isin` raising for Arrow backed string and ``None`` value (:issue:`55821`) - Fix :func:`read_parquet` and :func:`read_feather` for `CVE-2023-47248 `__ (:issue:`55894`) -.. --------------------------------------------------------------------------- -.. _whatsnew_213.other: - -Other -~~~~~ -- -- - .. --------------------------------------------------------------------------- .. _whatsnew_213.contributors: From d9665ca37076d4dd6c18a22e0b3a767fdc097691 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Fri, 10 Nov 2023 16:41:43 +0100 Subject: [PATCH 17/32] Backport PR #55911 on branch 2.1.x (DOC: convert outdated example of NumPy's broadcasting to literal code block) (#55912) Backport PR #55911: DOC: convert outdated example of NumPy's broadcasting to literal code block Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v0.17.0.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index abbda2ffc9be2..5a668e5fabd9c 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -726,10 +726,10 @@ be broadcast: or it can return False if broadcasting can not be done: -.. ipython:: python - :okwarning: +.. code-block:: ipython - np.array([1, 2, 3]) == np.array([1, 2]) + In [11]: np.array([1, 2, 3]) == np.array([1, 2]) + Out[11]: False Changes to boolean comparisons vs. None ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 2a953cf80b77e4348bf50ed724f8abc0d814d9dd Mon Sep 17 00:00:00 2001 From: Pandas Development Team Date: Fri, 10 Nov 2023 18:14:03 +0100 Subject: [PATCH 18/32] RLS: 2.1.3 From 9bc52cb2f24c9373cfc6928408066689c138993d Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 13 Nov 2023 18:08:11 +0100 Subject: [PATCH 19/32] Backport PR #55924 on branch 2.1.x (DOC: add whatsnew for 2.1.4) (#55937) Backport PR #55924: DOC: add whatsnew for 2.1.4 Co-authored-by: Luke Manley --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.1.4.rst | 41 ++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 doc/source/whatsnew/v2.1.4.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index eef65366a2762..3af7a06428f0c 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,7 @@ Version 2.1 .. toctree:: :maxdepth: 2 + v2.1.4 v2.1.3 v2.1.2 v2.1.1 diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst new file mode 100644 index 0000000000000..04bbb0f806cbd --- /dev/null +++ b/doc/source/whatsnew/v2.1.4.rst @@ -0,0 +1,41 @@ +.. _whatsnew_214: + +What's new in 2.1.4 (December ??, 2023) +--------------------------------------- + +These are the changes in pandas 2.1.4. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_214.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_214.bug_fixes: + +Bug fixes +~~~~~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_214.other: + +Other +~~~~~ +- +- + +.. --------------------------------------------------------------------------- +.. 
_whatsnew_214.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v2.1.3..v2.1.4|HEAD From 486c723191119b69259ff8c9cb23e223c8a96491 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 13 Nov 2023 22:01:59 +0100 Subject: [PATCH 20/32] Backport PR #55920 on branch 2.1.x (Fix off-by-one in aggregations) (#55942) Backport PR #55920: Fix off-by-one in aggregations Co-authored-by: William Ayd --- pandas/core/indexers/objects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 694a420ad2494..b516227cf8b06 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -102,7 +102,7 @@ def get_window_bounds( closed: str | None = None, step: int | None = None, ) -> tuple[np.ndarray, np.ndarray]: - if center: + if center or self.window_size == 0: offset = (self.window_size - 1) // 2 else: offset = 0 From 19bbe062b3ae7aa23831e45bb59aaedca3959e67 Mon Sep 17 00:00:00 2001 From: Daniel Isaac Date: Tue, 14 Nov 2023 21:10:07 +0530 Subject: [PATCH 21/32] Fix for 55753 BUG: No kwargs in df.apply(raw=True) [ Backport to 2.1.x ] (#55930) * Fix for 55753 BUG: No kwargs in df.apply(raw=True) [ Backport to 2.1.x ] * Fix for 55753 BUG: No kwargs in df.apply(raw=True) [ Backport to 2.1.x ] * review comment - updated whatsnew --- doc/source/whatsnew/v2.1.4.rst | 2 +- pandas/core/apply.py | 4 +++- pandas/tests/apply/test_frame_apply.py | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 04bbb0f806cbd..d61714d9473c6 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -21,7 +21,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55753`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/apply.py b/pandas/core/apply.py index e5683359c2fb9..43bc26f6eb637 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -923,7 +923,9 @@ def wrapper(*args, **kwargs): return wrapper - result = np.apply_along_axis(wrap_function(self.func), self.axis, self.values) + result = np.apply_along_axis( + wrap_function(self.func), self.axis, self.values, *self.args, **self.kwargs + ) # TODO: mixed type case if result.ndim == 2: diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3a3f73a68374b..92612861e551a 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -38,8 +38,9 @@ def test_apply(float_frame): @pytest.mark.parametrize("axis", [0, 1]) -def test_apply_args(float_frame, axis): - result = float_frame.apply(lambda x, y: x + y, axis, args=(1,)) +@pytest.mark.parametrize("raw", [True, False]) +def test_apply_args(float_frame, axis, raw): + result = float_frame.apply(lambda x, y: x + y, axis, args=(1,), raw=raw) expected = float_frame + 1 tm.assert_frame_equal(result, expected) From c3761492557438e8a0b770f7124719803250acad Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 20 Nov 2023 00:05:15 +0100 Subject: [PATCH 22/32] Backport PR #56063 on branch 2.1.x (CI: Fix pyarrow nightly ci failure) (#56069) Backport PR #56063: CI: Fix pyarrow nightly ci failure Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/compat/pyarrow.py | 2 ++ pandas/tests/io/test_parquet.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 96501b47e07f6..be3a038d472ab 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -17,6 +17,7 @@ pa_version_under13p0 = _palv < Version("13.0.0") pa_version_under14p0 = _palv < Version("14.0.0") pa_version_under14p1 = _palv < Version("14.0.1") + pa_version_under15p0 = _palv < Version("15.0.0") except ImportError: pa_version_under7p0 = True pa_version_under8p0 = True @@ -27,3 +28,4 @@ pa_version_under13p0 = True pa_version_under14p0 = True pa_version_under14p1 = True + pa_version_under15p0 = True diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 1d68f12270b55..699f8952e17b9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -19,6 +19,7 @@ pa_version_under8p0, pa_version_under11p0, pa_version_under13p0, + pa_version_under15p0, ) import pandas as pd @@ -755,7 +756,10 @@ def test_unsupported_float16(self, pa): # Not able to write float 16 column using pyarrow. 
data = np.arange(2, 10, dtype=np.float16) df = pd.DataFrame(data=data, columns=["fp16"]) - self.check_external_error_on_write(df, pa, pyarrow.ArrowException) + if pa_version_under15p0: + self.check_external_error_on_write(df, pa, pyarrow.ArrowException) + else: + check_round_trip(df, pa) @pytest.mark.xfail( is_platform_windows(), @@ -764,6 +768,7 @@ def test_unsupported_float16(self, pa): "dtypes are passed to_parquet function in windows" ), ) + @pytest.mark.skipif(not pa_version_under15p0, reason="float16 works on 15") @pytest.mark.parametrize("path_type", [str, pathlib.Path]) def test_unsupported_float16_cleanup(self, pa, path_type): # #44847, #44914 From 3db1b22fe58f70ebfc1d6a3597d3a14c220f78bc Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 21 Nov 2023 00:12:53 +0100 Subject: [PATCH 23/32] Backport PR #56085 on branch 2.1.x (CI: Ignore EncodingWarnings from numpy) (#56086) Backport PR #56085: CI: Ignore EncodingWarnings from numpy Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index d7634894b4835..b3b991010f9da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -480,6 +480,8 @@ filterwarnings = [ "error:::pandas", "error::ResourceWarning", "error::pytest.PytestUnraisableExceptionWarning", + # TODO(PY311-minimum): Specify EncodingWarning + "ignore:'encoding' argument not specified.::numpy", "ignore:.*ssl.SSLSocket:pytest.PytestUnraisableExceptionWarning", "ignore:.*ssl.SSLSocket:ResourceWarning", # GH 44844: Can remove once minimum matplotlib version >= 3.7 From f4260b4e4a6bc18759cea65cb0f37432e0c2c79a Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 21 Nov 2023 05:53:42 +0100 Subject: [PATCH 24/32] Backport PR #56093 on branch 2.1.x (CI: Catch EncodingWarnings only in pandas) (#56094) Backport PR #56093: CI: Catch EncodingWarnings only in pandas Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b3b991010f9da..77b49d4474334 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -481,7 +481,9 @@ filterwarnings = [ "error::ResourceWarning", "error::pytest.PytestUnraisableExceptionWarning", # TODO(PY311-minimum): Specify EncodingWarning - "ignore:'encoding' argument not specified.::numpy", + # Ignore 3rd party EncodingWarning but raise on pandas' + "ignore:.*encoding.* argument not specified", + "error:.*encoding.* argument not specified::pandas", "ignore:.*ssl.SSLSocket:pytest.PytestUnraisableExceptionWarning", "ignore:.*ssl.SSLSocket:ResourceWarning", # GH 44844: Can remove once minimum matplotlib version >= 3.7 From a8703c0292fa5daec00ec20caa3c8bea499a3824 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 21 Nov 2023 23:49:49 +0100 Subject: [PATCH 25/32] Backport PR #55639 on branch 2.1.x (BUG: setitem casting object Index to arrow strings) (#56100) BUG: setitem casting object Index to arrow strings (#55639) (cherry picked from commit 639bd6695940be8737db5c47b71b7ec6c4c88fb7) --- doc/source/whatsnew/v2.1.4.rst | 2 ++ pandas/core/construction.py | 12 ++++++++++++ pandas/core/indexes/base.py | 10 +++++++++- pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/frame/indexing/test_setitem.py | 9 +++++++++ 5 files 
changed, 33 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index d61714d9473c6..39cd6b39309cf 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -22,6 +22,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55753`) +- Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 5903187769f08..39f0ddf172e33 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -49,6 +49,7 @@ from pandas.core.dtypes.common import ( is_list_like, is_object_dtype, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import NumpyEADtype @@ -548,6 +549,10 @@ def sanitize_array( # Avoid ending up with a NumpyExtensionArray dtype = dtype.numpy_dtype + object_index = False + if isinstance(data, ABCIndex) and data.dtype == object and dtype is None: + object_index = True + # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray data = extract_array(data, extract_numpy=True, extract_range=True) @@ -601,6 +606,13 @@ def sanitize_array( subarr = data if data.dtype == object: subarr = maybe_infer_to_datetimelike(data) + if ( + object_index + and using_pyarrow_string_dtype() + and is_string_dtype(subarr) + ): + # Avoid inference when string option is set + subarr = data elif data.dtype.kind == "U" and using_pyarrow_string_dtype(): from pandas.core.arrays.string_ import StringDtype diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 85b68c1bc2ec7..aaea5a58723a1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -23,6 +23,7 @@ from pandas._config import ( get_option, using_copy_on_write, + using_pyarrow_string_dtype, ) from pandas._libs import ( @@ -6948,7 +6949,14 @@ def insert(self, loc: int, item) -> Index: loc = loc if loc >= 0 else loc - 1 new_values[loc] = item - return Index._with_infer(new_values, name=self.name) + idx = Index._with_infer(new_values, name=self.name) + if ( + using_pyarrow_string_dtype() + and is_string_dtype(idx.dtype) + and new_values.dtype == object + ): + idx = idx.astype(new_values.dtype) + return idx def drop( self, diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index de8df15a9d747..51b0a0a13d90b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1913,7 +1913,7 @@ def test_add_new_column_infer_string(): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + columns=Index(["x", "y"], dtype=object), ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index ccc1249088f9a..fc2e817b1600e 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -753,6 +753,15 @@ def test_setitem_frame_overwrite_with_ea_dtype(self, 
any_numeric_ea_dtype): ) tm.assert_frame_equal(df, expected) + def test_setitem_string_option_object_index(self): + # GH#55638 + pytest.importorskip("pyarrow") + df = DataFrame({"a": [1, 2]}) + with pd.option_context("future.infer_string", True): + df["b"] = Index(["a", "b"], dtype=object) + expected = DataFrame({"a": [1, 2], "b": Series(["a", "b"], dtype=object)}) + tm.assert_frame_equal(df, expected) + def test_setitem_frame_midx_columns(self): # GH#49121 df = DataFrame({("a", "b"): [10]}) From 24b95019710d23bec6a8f534178dfb84390e7ea0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 22 Nov 2023 10:42:24 +0100 Subject: [PATCH 26/32] Backport PR #55993 on branch 2.1.x (TST update and simplify consortium api test) (#56112) Backport PR #55993: TST update and simplify consortium api test Co-authored-by: Marco Edward Gorelli --- pandas/tests/test_downstream.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index c541c5792ec7c..8e3d7780224fe 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -348,11 +348,9 @@ def test_dataframe_consortium() -> None: expected_1 = ["a", "b"] assert result_1 == expected_1 - ser = Series([1, 2, 3]) + ser = Series([1, 2, 3], name="a") col = ser.__column_consortium_standard__() - result_2 = col.get_value(1) - expected_2 = 2 - assert result_2 == expected_2 + assert col.name == "a" def test_xarray_coerce_unit(): From eecb95fd9cc21e294aaf2962791346c7a770dab1 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Wed, 22 Nov 2023 20:55:51 +0100 Subject: [PATCH 27/32] Backport PR #56109 on branch 2.1.x (CI: Ignore EncodingWarning in assert_produces_warning) (#56116) Backport PR #56109: CI: Ignore EncodingWarning in assert_produces_warning Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_testing/_warnings.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py index 11cf60ef36a9c..c1c70605d9648 100644 --- a/pandas/_testing/_warnings.py +++ b/pandas/_testing/_warnings.py @@ -13,6 +13,8 @@ ) import warnings +from pandas.compat import PY311 + if TYPE_CHECKING: from collections.abc import ( Generator, @@ -179,6 +181,11 @@ def _assert_caught_no_extra_warnings( # due to these open files. 
if any("matplotlib" in mod for mod in sys.modules): continue + if PY311 and actual_warning.category == EncodingWarning: + # EncodingWarnings are checked in the CI + # pyproject.toml errors on EncodingWarnings in pandas + # Ignore EncodingWarnings from other libraries + continue extra_warnings.append( ( actual_warning.category.__name__, From 2e48c71d3fdb350bbb9ad63263062c0ef5ffb39b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Thu, 23 Nov 2023 00:19:34 +0100 Subject: [PATCH 28/32] Backport PR #55832 on branch 2.1.x (BUG: Index.getitem returning wrong result with negative step for arrow) (#56121) --- doc/source/whatsnew/v2.1.4.rst | 1 + pandas/core/arrays/arrow/array.py | 12 ++++++++++ pandas/tests/indexes/object/test_indexing.py | 25 +++++++++++++++++--- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 39cd6b39309cf..e4f973611c578 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -22,6 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55753`) +- Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 9c8f28d660450..a08f3ba44f417 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -552,6 +552,18 @@ def __getitem__(self, item: PositionalIndexer): ) # We are not an array indexer, so maybe e.g. a slice or integer # indexer. We dispatch to pyarrow. 
+ if isinstance(item, slice): + # Arrow bug https://github.com/apache/arrow/issues/38768 + if item.start == item.stop: + pass + elif ( + item.stop is not None + and item.stop < -len(self) + and item.step is not None + and item.step < 0 + ): + item = slice(item.start, None, item.step) + value = self._pa_array[item] if isinstance(value, pa.ChunkedArray): return type(self)(value) diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 87d3afc77d556..93d46ebdd0b51 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -4,6 +4,7 @@ import pytest from pandas._libs.missing import is_matching_na +import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -144,6 +145,13 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: + @pytest.mark.parametrize( + "dtype", + [ + "object", + pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), + ], + ) @pytest.mark.parametrize( "in_slice,expected", [ @@ -167,12 +175,23 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected): - index = Index(list("bcdxy")) + def test_slice_locs_negative_step(self, in_slice, expected, dtype): + index = Index(list("bcdxy"), dtype=dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected)) + expected = Index(list(expected), dtype=dtype) + tm.assert_index_equal(result, expected) + + @td.skip_if_no("pyarrow") + def test_slice_locs_negative_step_oob(self): + index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") + + result = index[-10:5:1] + tm.assert_index_equal(result, index) + + result = index[4:-10:-1] + expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): From 918193057c56e5a65d7e87ab211c92d43868c560 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Sun, 26 Nov 2023 21:11:45 +0100 Subject: [PATCH 29/32] Backport PR #56152 on branch 2.1.x (BUG: translate losing object dtype with new string dtype) (#56185) Backport PR #56152: BUG: translate losing object dtype with new string dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.4.rst | 2 +- pandas/core/strings/accessor.py | 22 ++++++++++++---------- pandas/tests/strings/test_find_replace.py | 6 +++++- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index e4f973611c578..eb28e42d303a1 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -25,7 +25,7 @@ Bug fixes - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) -- +- Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) .. --------------------------------------------------------------------------- .. 
_whatsnew_214.other: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index b299f5d6deab3..e2a3b9378a4f7 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -259,6 +259,7 @@ def _wrap_result( fill_value=np.nan, returns_string: bool = True, returns_bool: bool = False, + dtype=None, ): from pandas import ( Index, @@ -379,29 +380,29 @@ def cons_row(x): out = out.get_level_values(0) return out else: - return Index(result, name=name) + return Index(result, name=name, dtype=dtype) else: index = self._orig.index # This is a mess. - dtype: DtypeObj | str | None + _dtype: DtypeObj | str | None = dtype vdtype = getattr(result, "dtype", None) if self._is_string: if is_bool_dtype(vdtype): - dtype = result.dtype + _dtype = result.dtype elif returns_string: - dtype = self._orig.dtype + _dtype = self._orig.dtype else: - dtype = vdtype - else: - dtype = vdtype + _dtype = vdtype + elif vdtype is not None: + _dtype = vdtype if expand: cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index, dtype=dtype) + result = cons(result, columns=name, index=index, dtype=_dtype) else: # Must be a Series cons = self._orig._constructor - result = cons(result, name=name, index=index, dtype=dtype) + result = cons(result, name=name, index=index, dtype=_dtype) result = result.__finalize__(self._orig, method="str") if name is not None and result.ndim == 1: # __finalize__ might copy over the original name, but we may @@ -2317,7 +2318,8 @@ def translate(self, table): dtype: object """ result = self._data.array._str_translate(table) - return self._wrap_result(result) + dtype = object if self._data.dtype == "object" else None + return self._wrap_result(result, dtype=dtype) @forbid_nonstring_types(["bytes"]) def count(self, pat, flags: int = 0): diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 78f0730d730e8..bd64a5dce3b9a 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -5,6 +5,7 @@ import pytest from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -893,7 +894,10 @@ def test_find_nan(any_string_dtype): # -------------------------------------------------------------------------------------- -def test_translate(index_or_series, any_string_dtype): +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) +def test_translate(index_or_series, any_string_dtype, infer_string): obj = index_or_series( ["abcdefg", "abcc", "cdddfg", "cdefggg"], dtype=any_string_dtype ) From 2d812e2c7446e82e2e3cda251a106c4696b63bb0 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Mon, 27 Nov 2023 12:09:55 +0100 Subject: [PATCH 30/32] Backport PR #56194 on branch 2.1.x (BUG: hdf can't deal with ea dtype columns) (#56199) Backport PR #56194: BUG: hdf can't deal with ea dtype columns Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.4.rst | 1 + pandas/io/pytables.py | 3 +-- pandas/tests/io/pytables/test_round_trip.py | 15 +++++++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index eb28e42d303a1..684b68baa123c 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -24,6 +24,7 @@ Bug fixes - Bug in 
:meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55753`) - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) +- Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d3055f0ad2a38..2a72f7d32b1e7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -56,7 +56,6 @@ is_bool_dtype, is_complex_dtype, is_list_like, - is_object_dtype, is_string_dtype, needs_i8_conversion, ) @@ -2645,7 +2644,7 @@ class DataIndexableCol(DataCol): is_data_indexable = True def validate_names(self) -> None: - if not is_object_dtype(Index(self.values).dtype): + if not is_string_dtype(Index(self.values).dtype): # TODO: should the message here be more specifically non-str? raise ValueError("cannot have non-object label DataIndexableCol") diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 085db5f521a9f..3bc8c0c4f8e30 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -526,3 +526,18 @@ def test_round_trip_equals(tmp_path, setup_path): tm.assert_frame_equal(df, other) assert df.equals(other) assert other.equals(df) + + +def test_infer_string_columns(tmp_path, setup_path): + # GH# + pytest.importorskip("pyarrow") + path = tmp_path / setup_path + with pd.option_context("future.infer_string", True): + df = DataFrame(1, columns=list("ABCD"), index=list(range(10))).set_index( + ["A", "B"] + ) + expected = df.copy() + df.to_hdf(path, key="df", format="table") + + result = read_hdf(path, "df") + tm.assert_frame_equal(result, expected) From bec5226437905bda0918e28b541ff159a2feffed Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Tue, 28 Nov 2023 18:07:52 +0100 Subject: [PATCH 31/32] Backport PR #56222 on branch 2.1.x (Add doc notes for deprecations) (#56225) Backport PR #56222: Add doc notes for deprecations Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- pandas/core/generic.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3281d245dca56..564d572254f8d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9456,6 +9456,10 @@ def first(self, offset) -> Self: """ Select initial periods of time series data based on a date offset. + .. deprecated:: 2.1 + :meth:`.first` is deprecated and will be removed in a future version. + Please create a mask and filter using `.loc` instead. + For a DataFrame with a sorted DatetimeIndex, this function can select the first few rows based on a date offset. @@ -9535,6 +9539,10 @@ def last(self, offset) -> Self: """ Select final periods of time series data based on a date offset. + .. deprecated:: 2.1 + :meth:`.last` is deprecated and will be removed in a future version. + Please create a mask and filter using `.loc` instead. 
+ For a DataFrame with a sorted DatetimeIndex, this function selects the last few rows based on a date offset. From f99a1fcb906613d70dd5e25385211117cc3cda40 Mon Sep 17 00:00:00 2001 From: "Lumberbot (aka Jack)" <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 30 Nov 2023 20:59:33 +0100 Subject: [PATCH 32/32] Backport PR #56179 on branch 2.1.x (BUG: to_numeric casting to ea for new string dtype) (#56263) Backport PR #56179: BUG: to_numeric casting to ea for new string dtype Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v2.1.4.rst | 1 + pandas/core/tools/numeric.py | 4 +++- pandas/tests/tools/test_to_numeric.py | 13 ++++++++++--- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 684b68baa123c..3a72c0864d29c 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -23,6 +23,7 @@ Bug fixes ~~~~~~~~~ - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55753`) - Bug in :meth:`Index.__getitem__` returning wrong result for Arrow dtypes and negative stepsize (:issue:`55832`) +- Fixed bug in :func:`to_numeric` converting to extension dtype for ``string[pyarrow_numpy]`` dtype (:issue:`56179`) - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`DataFrame.to_hdf` raising when columns have ``StringDtype`` (:issue:`55088`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index a50dbeb110bff..f1b14cdc58b13 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -224,7 +224,8 @@ def to_numeric( set(), coerce_numeric=coerce_numeric, convert_to_masked_nullable=dtype_backend is not lib.no_default - or isinstance(values_dtype, StringDtype), + or isinstance(values_dtype, StringDtype) + and not values_dtype.storage == "pyarrow_numpy", ) except (ValueError, TypeError): if errors == "raise": @@ -239,6 +240,7 @@ def to_numeric( dtype_backend is not lib.no_default and new_mask is None or isinstance(values_dtype, StringDtype) + and not values_dtype.storage == "pyarrow_numpy" ): new_mask = np.zeros(values.shape, dtype=np.bool_) diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 1d969e648b752..7f37e6003f313 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -4,12 +4,15 @@ from numpy import iinfo import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( ArrowDtype, DataFrame, Index, Series, + option_context, to_numeric, ) import pandas._testing as tm @@ -67,10 +70,14 @@ def test_empty(input_kwargs, result_kwargs): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))] +) @pytest.mark.parametrize("last_val", ["7", 7]) -def test_series(last_val): - ser = Series(["1", "-3.14", last_val]) - result = to_numeric(ser) +def test_series(last_val, infer_string): + with option_context("future.infer_string", infer_string): + ser = Series(["1", "-3.14", last_val]) + result = to_numeric(ser) expected = Series([1, -3.14, 7]) tm.assert_series_equal(result, expected)
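
A minimal sketch of the behaviour this last backport restores (illustrative only; assumes a pandas build with pyarrow installed so that the ``future.infer_string`` option takes effect):

    import pandas as pd

    # With the future string dtype enabled, the Series below is inferred as
    # string[pyarrow_numpy] rather than object.
    with pd.option_context("future.infer_string", True):
        ser = pd.Series(["1", "-3.14", "7"])
        result = pd.to_numeric(ser)

    # After the fix, to_numeric keeps returning a plain NumPy dtype here;
    # before it, the result could come back as a masked extension dtype.
    print(result.dtype)  # float64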