Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: rolling with datetime ArrowDtype #56370

Merged
merged 9 commits into from
Dec 28, 2023
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -652,7 +652,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.resample` when resampling on a :class:`ArrowDtype` of ``pyarrow.timestamp`` or ``pyarrow.duration`` type (:issue:`55989`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.BusinessDay` (:issue:`55281`)
- Bug in :meth:`DataFrame.resample` where bin edges were not correct for :class:`~pandas.tseries.offsets.MonthBegin` (:issue:`55271`)
-
- Bug in :meth:`DataFrame.rolling` and :meth:`Series.rolling` where either the ``index`` or ``on`` column was :class:`ArrowDtype` with ``pyarrow.timestamp`` type (:issue:`55849`)

Reshaping
^^^^^^^^^
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
ArrowDtype,
CategoricalDtype,
DatetimeTZDtype,
ExtensionDtype,
Expand Down Expand Up @@ -2499,7 +2500,7 @@ def _validate_inferred_freq(
return freq


def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str:
def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype | ArrowDtype) -> str:
"""
Return the unit str corresponding to the dtype's resolution.

Expand All @@ -2514,4 +2515,8 @@ def dtype_to_unit(dtype: DatetimeTZDtype | np.dtype) -> str:
"""
if isinstance(dtype, DatetimeTZDtype):
return dtype.unit
elif isinstance(dtype, ArrowDtype):
if dtype.kind not in "mM":
raise ValueError(f"{dtype=} does not have a resolution.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so? (If you're referring to dtype=, it was intentional to render e.g. "dtype=int64[pyarrow] does not have a resolution")

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe im misunderstanding how f-strings work. id expect this to render "int64[pyarrow]= does not have a resolution"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In [2]: import pandas as pd

In [3]: import pyarrow as pa

In [4]: dtype = pd.ArrowDtype(pa.int64())

In [5]: f"{dtype=} does not have a resolution"
Out[5]: 'dtype=int64[pyarrow] does not have a resolution'

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

neat!

return dtype.pyarrow_dtype.unit
return np.datetime_data(dtype)[0]
24 changes: 14 additions & 10 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
Any,
Callable,
Literal,
cast,
)

import numpy as np
Expand All @@ -39,6 +38,7 @@
is_numeric_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import ArrowDtype
from pandas.core.dtypes.generic import (
ABCDataFrame,
ABCSeries,
Expand Down Expand Up @@ -104,6 +104,7 @@
NDFrameT,
QuantileInterpolation,
WindowingRankType,
npt,
)

from pandas import (
Expand Down Expand Up @@ -404,11 +405,12 @@ def _insert_on_column(self, result: DataFrame, obj: DataFrame) -> None:
result[name] = extra_col

@property
def _index_array(self):
def _index_array(self) -> npt.NDArray[np.int64] | None:
# TODO: why do we get here with e.g. MultiIndex?
if needs_i8_conversion(self._on.dtype):
idx = cast("PeriodIndex | DatetimeIndex | TimedeltaIndex", self._on)
return idx.asi8
if isinstance(self._on, (PeriodIndex, DatetimeIndex, TimedeltaIndex)):
return self._on.asi8
elif isinstance(self._on.dtype, ArrowDtype):
return self._on.to_numpy(dtype=np.int64)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment/assertion about self._on.dtype.kind?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup good point. Added a kind check

return None

def _resolve_output(self, out: DataFrame, obj: DataFrame) -> DataFrame:
Expand Down Expand Up @@ -439,7 +441,7 @@ def _apply_series(
self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None
) -> Series:
"""
Series version of _apply_blockwise
Series version of _apply_columnwise
"""
obj = self._create_data(self._selected_obj)

Expand All @@ -455,7 +457,7 @@ def _apply_series(
index = self._slice_axis_for_step(obj.index, result)
return obj._constructor(result, index=index, name=obj.name)

def _apply_blockwise(
def _apply_columnwise(
self,
homogeneous_func: Callable[..., ArrayLike],
name: str,
Expand Down Expand Up @@ -614,7 +616,7 @@ def calc(x):
return result

if self.method == "single":
return self._apply_blockwise(homogeneous_func, name, numeric_only)
return self._apply_columnwise(homogeneous_func, name, numeric_only)
else:
return self._apply_tablewise(homogeneous_func, name, numeric_only)

Expand Down Expand Up @@ -1236,7 +1238,9 @@ def calc(x):

return result

return self._apply_blockwise(homogeneous_func, name, numeric_only)[:: self.step]
return self._apply_columnwise(homogeneous_func, name, numeric_only)[
:: self.step
]

@doc(
_shared_docs["aggregate"],
Expand Down Expand Up @@ -1871,7 +1875,7 @@ def _validate(self):
# we allow rolling on a datetimelike index
if (
self.obj.empty
or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex))
or (isinstance(self._on, PeriodIndex) or self._on.dtype.kind in "Mm")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

id err on the side of being more explicit about the supported dtypes being ArrowDtype

) and isinstance(self.window, (str, BaseOffset, timedelta)):
self._validate_datetimelike_monotonic()

Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/window/test_timeseries_window.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import (
DataFrame,
DatetimeIndex,
Index,
MultiIndex,
NaT,
Series,
Expand Down Expand Up @@ -697,3 +700,16 @@ def test_nat_axis_error(msg, axis):
with pytest.raises(ValueError, match=f"{msg} values must not have NaT"):
with tm.assert_produces_warning(FutureWarning, match=warn_msg):
df.rolling("D", axis=axis).mean()


@td.skip_if_no("pyarrow")
def test_arrow_datetime_axis():
# GH 55849
expected = Series(
np.arange(5, dtype=np.float64),
index=Index(
date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]"
),
)
result = expected.rolling("1D").sum()
tm.assert_series_equal(result, expected)