From aa7b17e1f5d596f41950f85e9f00aa9258b5f2a7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Dec 2023 15:06:33 -1000 Subject: [PATCH] BUG: resample with ArrowDtype (#56371) * BUG: resample with ArrowDtype * Typing * xfail for windows * Fix again? * Avoid tuple * Add gh numbers --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/core/groupby/grouper.py | 1 - pandas/core/resample.py | 21 ++++++++++++++-- pandas/tests/resample/test_datetime_index.py | 26 ++++++++++++++++++++ pandas/tests/resample/test_timedelta.py | 11 +++++++++ 5 files changed, 57 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8a1906d20c243..bb16590b172ef 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -648,6 +648,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.asfreq` and :meth:`Series.asfreq` with a :class:`DatetimeIndex` with non-nanosecond resolution incorrectly converting to nanosecond resolution (:issue:`55958`) - Bug in :meth:`DataFrame.ewm` when passed ``times`` with non-nanosecond ``datetime64`` or :class:`DatetimeTZDtype` dtype (:issue:`56262`) - Bug in :meth:`DataFrame.resample` not respecting ``closed`` and ``label`` arguments for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55282`) +- Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`) - Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`) - diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index fc914831b7a72..4703c12db602d 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -330,7 +330,6 @@ def _get_grouper( return grouper, obj - @final def _set_grouper( self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 31e41acbf1774..7e9aa2d8a9e3c 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -38,6 +38,7 @@ rewrite_warning, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCSeries, @@ -48,6 +49,7 @@ ResamplerWindowApply, warn_alias_replacement, ) +from pandas.core.arrays import ArrowExtensionArray from pandas.core.base import ( PandasObject, SelectionMixin, @@ -68,6 +70,7 @@ from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.api import MultiIndex +from pandas.core.indexes.base import Index from pandas.core.indexes.datetimes import ( DatetimeIndex, date_range, @@ -109,7 +112,6 @@ from pandas import ( DataFrame, - Index, Series, ) @@ -511,6 +513,9 @@ def _wrap_result(self, result): result.index = _asfreq_compat(obj.index[:0], freq=self.freq) result.name = getattr(obj, "name", None) + if self._timegrouper._arrow_dtype is not None: + result.index = result.index.astype(self._timegrouper._arrow_dtype) + return result @final @@ -2163,6 +2168,7 @@ def __init__( self.fill_method = fill_method self.limit = limit self.group_keys = group_keys + self._arrow_dtype: ArrowDtype | None = None if origin in ("epoch", "start", "start_day", "end", "end_day"): # error: Incompatible types in assignment (expression has type "Union[Union[ @@ -2213,7 +2219,7 @@ def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler: TypeError if incompatible axis """ - _, ax, indexer = self._set_grouper(obj, gpr_index=None) + _, ax, _ = self._set_grouper(obj, gpr_index=None) if isinstance(ax, DatetimeIndex): return DatetimeIndexResampler( obj, @@ -2495,6 +2501,17 @@ def _get_period_bins(self, ax: PeriodIndex): return binner, bins, labels + def _set_grouper( + self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None + ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]: + obj, ax, indexer = super()._set_grouper(obj, sort, gpr_index=gpr_index) + if isinstance(ax.dtype, ArrowDtype) and ax.dtype.kind in "Mm": + self._arrow_dtype = ax.dtype + ax = Index( + cast(ArrowExtensionArray, ax.array)._maybe_convert_datelike_array() + ) + return obj, ax, indexer + def _take_new_index( obj: NDFrameT, indexer: npt.NDArray[np.intp], new_index: Index, axis: AxisInt = 0 diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 8a725c6e51e3f..760ed35bab678 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -7,6 +7,8 @@ from pandas._libs import lib from pandas._typing import DatetimeNaTType +from pandas.compat import is_platform_windows +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -2195,3 +2197,27 @@ def test_resample_b_55282(unit): index=exp_dti, ) tm.assert_series_equal(result, expected) + + +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize( + "tz", + [ + None, + pytest.param( + "UTC", + marks=pytest.mark.xfail( + condition=is_platform_windows(), + reason="TODO: Set ARROW_TIMEZONE_DATABASE env var in CI", + ), + ), + ], +) +def test_arrow_timestamp_resample(tz): + # GH 56371 + idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]") + if tz is not None: + idx = idx.dt.tz_localize(tz) + expected = Series(np.arange(5, dtype=np.float64), index=idx) + result = expected.resample("1D").mean() + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 5d6876343a0c9..7c70670d42908 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -207,3 +209,12 @@ def test_resample_closed_right(): ), ) tm.assert_series_equal(result, expected) + + +@td.skip_if_no("pyarrow") +def test_arrow_duration_resample(): + # GH 56371 + idx = pd.Index(timedelta_range("1 day", periods=5), dtype="duration[ns][pyarrow]") + expected = Series(np.arange(5, dtype=np.float64), index=idx) + result = expected.resample("1D").mean() + tm.assert_series_equal(result, expected)