Skip to content

Commit

Permalink
Backport PR pandas-dev#56650: ENH: Implement dt methods for pyarrow duration types
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored and meeseeksmachine committed Dec 28, 2023
1 parent f8e9892 commit 159f54a
Show file tree
Hide file tree
Showing 4 changed files with 231 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ Other enhancements
- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`)
- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
- Implemented :meth:`Series.dt` methods and attributes for :class:`ArrowDtype` with ``pyarrow.duration`` type (:issue:`52284`)
- Implemented :meth:`Series.str.extract` for :class:`ArrowDtype` (:issue:`56268`)
- Improved error message that appears in :meth:`DatetimeIndex.to_period` with frequencies which are not supported as period frequencies, such as ``"BMS"`` (:issue:`56243`)
- Improved error message when constructing :class:`Period` with invalid offsets such as ``"QS"`` (:issue:`55785`)
Expand Down
87 changes: 87 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from pandas._libs import lib
from pandas._libs.tslibs import (
NaT,
Timedelta,
Timestamp,
timezones,
Expand Down Expand Up @@ -2498,6 +2499,92 @@ def _str_wrap(self, width: int, **kwargs):
result = self._apply_elementwise(predicate)
return type(self)(pa.chunked_array(result))

@property
def _dt_days(self):
    """Days component of each element as an int32 Arrow array.

    ``from_pandas=True`` lets pyarrow map NaT entries to nulls.
    """
    day_values = self._to_timedeltaarray().days
    return type(self)(pa.array(day_values, type=pa.int32(), from_pandas=True))

@property
def _dt_hours(self):
    """Hours component of each element as an int32 Arrow array.

    NaT entries become nulls in the result.
    """
    hour_values = [
        None if value is NaT else value.components.hours
        for value in self._to_timedeltaarray()
    ]
    return type(self)(pa.array(hour_values, type=pa.int32()))

@property
def _dt_minutes(self):
    """Minutes component of each element as an int32 Arrow array.

    NaT entries become nulls in the result.
    """
    minute_values = [
        None if value is NaT else value.components.minutes
        for value in self._to_timedeltaarray()
    ]
    return type(self)(pa.array(minute_values, type=pa.int32()))

@property
def _dt_seconds(self):
    """Seconds component of each element as an int32 Arrow array.

    ``from_pandas=True`` lets pyarrow map NaT entries to nulls.
    """
    second_values = self._to_timedeltaarray().seconds
    return type(self)(pa.array(second_values, type=pa.int32(), from_pandas=True))

@property
def _dt_milliseconds(self):
    """Milliseconds component of each element as an int32 Arrow array.

    NaT entries become nulls in the result.
    """
    millisecond_values = [
        None if value is NaT else value.components.milliseconds
        for value in self._to_timedeltaarray()
    ]
    return type(self)(pa.array(millisecond_values, type=pa.int32()))

@property
def _dt_microseconds(self):
    """Microseconds component of each element as an int32 Arrow array.

    ``from_pandas=True`` lets pyarrow map NaT entries to nulls.
    """
    microsecond_values = self._to_timedeltaarray().microseconds
    return type(self)(
        pa.array(microsecond_values, type=pa.int32(), from_pandas=True)
    )

@property
def _dt_nanoseconds(self):
    """Nanoseconds component of each element as an int32 Arrow array.

    ``from_pandas=True`` lets pyarrow map NaT entries to nulls.
    """
    nanosecond_values = self._to_timedeltaarray().nanoseconds
    return type(self)(
        pa.array(nanosecond_values, type=pa.int32(), from_pandas=True)
    )

def _dt_to_pytimedelta(self):
    """Return the values as a NumPy object array of ``datetime.timedelta``."""
    data = self._pa_array.to_pylist()
    if self._dtype.pyarrow_dtype.unit == "ns":
        # NOTE(review): for ns-unit durations, to_pylist appears to yield
        # objects carrying a to_pytimedelta() method (presumably
        # pd.Timedelta); other units seem to already produce plain
        # datetime.timedelta — confirm against pyarrow's conversion rules.
        data = [None if ts is None else ts.to_pytimedelta() for ts in data]
    return np.array(data, dtype=object)

def _dt_total_seconds(self):
    """Total duration of each element in seconds, as a float Arrow array.

    ``from_pandas=True`` lets pyarrow map NaT entries to nulls.
    """
    seconds = self._to_timedeltaarray().total_seconds()
    return type(self)(pa.array(seconds, from_pandas=True))

def _dt_as_unit(self, unit: str):
    """Convert to the given time unit via the pandas datelike array.

    Raises
    ------
    NotImplementedError
        If the backing pyarrow type is a date type.
    """
    if pa.types.is_date(self.dtype.pyarrow_dtype):
        raise NotImplementedError("as_unit not implemented for date types")
    pd_array = self._maybe_convert_datelike_array()
    # Don't just cast _pa_array in order to follow pandas unit conversion rules
    return type(self)(pa.array(pd_array.as_unit(unit), from_pandas=True))

@property
def _dt_year(self):
    """Year component of each element, computed with pyarrow compute."""
    year_values = pc.year(self._pa_array)
    return type(self)(year_values)
Expand Down
39 changes: 38 additions & 1 deletion pandas/core/indexes/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,20 @@ def _delegate_method(self, name: str, *args, **kwargs):
return result


@delegate_names(
delegate=ArrowExtensionArray,
accessors=TimedeltaArray._datetimelike_ops,
typ="property",
accessor_mapping=lambda x: f"_dt_{x}",
raise_on_missing=False,
)
@delegate_names(
delegate=ArrowExtensionArray,
accessors=TimedeltaArray._datetimelike_methods,
typ="method",
accessor_mapping=lambda x: f"_dt_{x}",
raise_on_missing=False,
)
@delegate_names(
delegate=ArrowExtensionArray,
accessors=DatetimeArray._datetimelike_ops,
Expand Down Expand Up @@ -213,6 +227,9 @@ def _delegate_method(self, name: str, *args, **kwargs):

return result

def to_pytimedelta(self):
    # Delegate to the Arrow-backed array, which returns a NumPy object
    # array built from the pyarrow values.
    return cast(ArrowExtensionArray, self._parent.array)._dt_to_pytimedelta()

def to_pydatetime(self):
# GH#20306
warnings.warn(
Expand Down Expand Up @@ -241,6 +258,26 @@ def isocalendar(self) -> DataFrame:
)
return iso_calendar_df

@property
def components(self) -> DataFrame:
    """Return a DataFrame of the timedelta components, one column each.

    Columns are days, hours, minutes, seconds, milliseconds,
    microseconds and nanoseconds, pulled from the corresponding
    ``_dt_<name>`` accessors on the backing array.
    """
    from pandas import DataFrame

    component_names = [
        "days",
        "hours",
        "minutes",
        "seconds",
        "milliseconds",
        "microseconds",
        "nanoseconds",
    ]
    data = {
        name: getattr(self._parent.array, f"_dt_{name}")
        for name in component_names
    }
    return DataFrame(data)


@delegate_names(
delegate=DatetimeArray,
Expand Down Expand Up @@ -592,7 +629,7 @@ def __new__(cls, data: Series): # pyright: ignore[reportInconsistentConstructor
index=orig.index,
)

if isinstance(data.dtype, ArrowDtype) and data.dtype.kind == "M":
if isinstance(data.dtype, ArrowDtype) and data.dtype.kind in "Mm":
return ArrowTemporalProperties(data, orig)
if lib.is_np_dtype(data.dtype, "M"):
return DatetimeProperties(data, orig)
Expand Down
105 changes: 105 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2723,6 +2723,111 @@ def test_dt_tz_convert(unit):
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("dtype", ["timestamp[ms][pyarrow]", "duration[ms][pyarrow]"])
def test_as_unit(dtype):
    # GH 52284
    original = pd.Series([1000, None], dtype=dtype)
    converted = original.dt.as_unit("ns")
    # Converting ms -> ns must match an explicit astype to the ns dtype.
    expected = original.astype(dtype.replace("ms", "ns"))
    tm.assert_series_equal(converted, expected)


@pytest.mark.parametrize(
    "prop, expected",
    [
        ["days", 1],
        ["seconds", 2],
        ["microseconds", 3],
        ["nanoseconds", 4],
    ],
)
def test_dt_timedelta_properties(prop, expected):
    # GH 52284
    td = pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4)
    ser = pd.Series([td, None], dtype=ArrowDtype(pa.duration("ns")))
    result = getattr(ser.dt, prop)
    # Each component comes back as an int32-backed Arrow series with a
    # null for the missing entry.
    expected = pd.Series(
        ArrowExtensionArray(pa.array([expected, None], type=pa.int32()))
    )
    tm.assert_series_equal(result, expected)


def test_dt_timedelta_total_seconds():
    # GH 52284
    td = pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4)
    ser = pd.Series([td, None], dtype=ArrowDtype(pa.duration("ns")))
    result = ser.dt.total_seconds()
    # 1 day + 2 s + 3 us -> 86402.000003 (the 4 ns are below the shown
    # float precision); the null entry stays null.
    expected = pd.Series(
        ArrowExtensionArray(pa.array([86402.000003, None], type=pa.float64()))
    )
    tm.assert_series_equal(result, expected)


def test_dt_to_pytimedelta():
    # GH 52284
    values = [timedelta(1, 2, 3), timedelta(1, 2, 4)]
    ser = pd.Series(values, dtype=ArrowDtype(pa.duration("ns")))

    result = ser.dt.to_pytimedelta()
    tm.assert_numpy_array_equal(result, np.array(values, dtype=object))
    # Elements must be plain datetime.timedelta, not a subclass.
    assert all(type(item) is timedelta for item in result)

    # Must agree with the numpy-backed implementation.
    expected = ser.astype("timedelta64[ns]").dt.to_pytimedelta()
    tm.assert_numpy_array_equal(result, expected)


def test_dt_components():
    # GH 52284
    td = pd.Timedelta(days=1, seconds=2, microseconds=3, nanoseconds=4)
    ser = pd.Series([td, None], dtype=ArrowDtype(pa.duration("ns")))
    result = ser.dt.components
    component_columns = [
        "days",
        "hours",
        "minutes",
        "seconds",
        "milliseconds",
        "microseconds",
        "nanoseconds",
    ]
    # One row of decomposed components; the null element yields an
    # all-null row.
    expected = pd.DataFrame(
        [[1, 0, 0, 2, 0, 3, 4], [None] * 7],
        columns=component_columns,
        dtype="int32[pyarrow]",
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("skipna", [True, False])
def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna):
# GH51624
Expand Down

0 comments on commit 159f54a

Please sign in to comment.