diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 9acbab7a42800..af262f9e6c336 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -526,6 +526,23 @@ Sparse-dtype specific methods and attributes are provided under the Series.sparse.to_coo +.. _api.series.list: + +List accessor +~~~~~~~~~~~~~ + +Arrow list-dtype specific methods and attributes are provided under the +``Series.list`` accessor. + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + + Series.list.flatten + Series.list.len + Series.list.__getitem__ + + .. _api.series.struct: Struct accessor diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 26b5705a1f3db..65532c4cbd27c 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -64,10 +64,30 @@ DataFrame. (:issue:`54938`) ) series.struct.explode() -.. _whatsnew_220.enhancements.enhancement2: +.. _whatsnew_220.enhancements.list_accessor: -enhancement2 -^^^^^^^^^^^^ +Series.list accessor for PyArrow list data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``Series.list`` accessor provides attributes and methods for processing +data with ``list[pyarrow]`` dtype Series. For example, +:meth:`Series.list.__getitem__` allows indexing pyarrow lists in +a Series. (:issue:`55323`) + +.. ipython:: python + + import pyarrow as pa + series = pd.Series( + [ + [1, 2, 3], + [4, 5], + [6], + ], + dtype=pd.ArrowDtype( + pa.list_(pa.int64()) + ), + ) + series.list[0] .. _whatsnew_220.enhancements.other: diff --git a/pandas/conftest.py b/pandas/conftest.py index b62a4391ca654..efe263a41afe1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1295,6 +1295,14 @@ def utc_fixture(request): utc_fixture2 = utc_fixture +@pytest.fixture(params=["s", "ms", "us", "ns"]) +def unit(request): + """ + datetime64 units we support. + """ + return request.param + + # ---------------------------------------------------------------- # Dtypes # ---------------------------------------------------------------- diff --git a/pandas/core/arrays/arrow/__init__.py b/pandas/core/arrays/arrow/__init__.py index a3d33f91f597d..5fc50f786fc6a 100644 --- a/pandas/core/arrays/arrow/__init__.py +++ b/pandas/core/arrays/arrow/__init__.py @@ -1,4 +1,7 @@ -from pandas.core.arrays.arrow.accessors import StructAccessor +from pandas.core.arrays.arrow.accessors import ( + ListAccessor, + StructAccessor, +) from pandas.core.arrays.arrow.array import ArrowExtensionArray -__all__ = ["ArrowExtensionArray", "StructAccessor"] +__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"] diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py index cbd727d842d83..7f88267943526 100644 --- a/pandas/core/arrays/arrow/accessors.py +++ b/pandas/core/arrays/arrow/accessors.py @@ -2,9 +2,16 @@ from __future__ import annotations +from abc import ( + ABCMeta, + abstractmethod, +) from typing import TYPE_CHECKING -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + pa_version_under10p1, + pa_version_under11p0, +) if not pa_version_under10p1: import pyarrow as pa @@ -13,13 +20,194 @@ from pandas.core.dtypes.dtypes import ArrowDtype if TYPE_CHECKING: + from collections.abc import Iterator + from pandas import ( DataFrame, Series, ) -class StructAccessor: +class ArrowAccessor(metaclass=ABCMeta): + @abstractmethod + def __init__(self, data, validation_msg: str) -> None: + self._data = data + self._validation_msg = validation_msg + self._validate(data) + + @abstractmethod + def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: + pass + + def _validate(self, data): + dtype = data.dtype + if not isinstance(dtype, ArrowDtype): + # Raise AttributeError so that inspect can handle non-struct Series. + raise AttributeError(self._validation_msg.format(dtype=dtype)) + + if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype): + # Raise AttributeError so that inspect can handle invalid Series. + raise AttributeError(self._validation_msg.format(dtype=dtype)) + + @property + def _pa_array(self): + return self._data.array._pa_array + + +class ListAccessor(ArrowAccessor): + """ + Accessor object for list data properties of the Series values. + + Parameters + ---------- + data : Series + Series containing Arrow list data. + """ + + def __init__(self, data=None) -> None: + super().__init__( + data, + validation_msg="Can only use the '.list' accessor with " + "'list[pyarrow]' dtype, not {dtype}.", + ) + + def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: + return ( + pa.types.is_list(pyarrow_dtype) + or pa.types.is_fixed_size_list(pyarrow_dtype) + or pa.types.is_large_list(pyarrow_dtype) + ) + + def len(self) -> Series: + """ + Return the length of each list in the Series. + + Returns + ------- + pandas.Series + The length of each list. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=pd.ArrowDtype(pa.list_( + ... pa.int64() + ... )) + ... ) + >>> s.list.len() + 0 3 + 1 1 + dtype: int32[pyarrow] + """ + from pandas import Series + + value_lengths = pc.list_value_length(self._pa_array) + return Series(value_lengths, dtype=ArrowDtype(value_lengths.type)) + + def __getitem__(self, key: int | slice) -> Series: + """ + Index or slice lists in the Series. + + Parameters + ---------- + key : int | slice + Index or slice of indices to access from each list. + + Returns + ------- + pandas.Series + The list at requested index. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=pd.ArrowDtype(pa.list_( + ... pa.int64() + ... )) + ... ) + >>> s.list[0] + 0 1 + 1 3 + dtype: int64[pyarrow] + """ + from pandas import Series + + if isinstance(key, int): + # TODO: Support negative key but pyarrow does not allow + # element index to be an array. + # if key < 0: + # key = pc.add(key, pc.list_value_length(self._pa_array)) + element = pc.list_element(self._pa_array, key) + return Series(element, dtype=ArrowDtype(element.type)) + elif isinstance(key, slice): + if pa_version_under11p0: + raise NotImplementedError( + f"List slice not supported by pyarrow {pa.__version__}." + ) + + # TODO: Support negative start/stop/step, ideally this would be added + # upstream in pyarrow. + start, stop, step = key.start, key.stop, key.step + if start is None: + # TODO: When adding negative step support + # this should be setto last element of array + # when step is negative. + start = 0 + if step is None: + step = 1 + sliced = pc.list_slice(self._pa_array, start, stop, step) + return Series(sliced, dtype=ArrowDtype(sliced.type)) + else: + raise ValueError(f"key must be an int or slice, got {type(key).__name__}") + + def __iter__(self) -> Iterator: + raise TypeError(f"'{type(self).__name__}' object is not iterable") + + def flatten(self) -> Series: + """ + Flatten list values. + + Returns + ------- + pandas.Series + The data from all lists in the series flattened. + + Examples + -------- + >>> import pyarrow as pa + >>> s = pd.Series( + ... [ + ... [1, 2, 3], + ... [3], + ... ], + ... dtype=pd.ArrowDtype(pa.list_( + ... pa.int64() + ... )) + ... ) + >>> s.list.flatten() + 0 1 + 1 2 + 2 3 + 3 3 + dtype: int64[pyarrow] + """ + from pandas import Series + + flattened = pc.list_flatten(self._pa_array) + return Series(flattened, dtype=ArrowDtype(flattened.type)) + + +class StructAccessor(ArrowAccessor): """ Accessor object for structured data properties of the Series values. @@ -29,23 +217,17 @@ class StructAccessor: Series containing Arrow struct data. """ - _validation_msg = ( - "Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, not {dtype}." - ) - def __init__(self, data=None) -> None: - self._parent = data - self._validate(data) - - def _validate(self, data): - dtype = data.dtype - if not isinstance(dtype, ArrowDtype): - # Raise AttributeError so that inspect can handle non-struct Series. - raise AttributeError(self._validation_msg.format(dtype=dtype)) + super().__init__( + data, + validation_msg=( + "Can only use the '.struct' accessor with 'struct[pyarrow]' " + "dtype, not {dtype}." + ), + ) - if not pa.types.is_struct(dtype.pyarrow_dtype): - # Raise AttributeError so that inspect can handle non-struct Series. - raise AttributeError(self._validation_msg.format(dtype=dtype)) + def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool: + return pa.types.is_struct(pyarrow_dtype) @property def dtypes(self) -> Series: @@ -80,7 +262,7 @@ def dtypes(self) -> Series: Series, ) - pa_type = self._parent.dtype.pyarrow_dtype + pa_type = self._data.dtype.pyarrow_dtype types = [ArrowDtype(struct.type) for struct in pa_type] names = [struct.name for struct in pa_type] return Series(types, index=Index(names)) @@ -135,7 +317,7 @@ def field(self, name_or_index: str | int) -> Series: """ from pandas import Series - pa_arr = self._parent.array._pa_array + pa_arr = self._data.array._pa_array if isinstance(name_or_index, int): index = name_or_index elif isinstance(name_or_index, str): @@ -151,7 +333,7 @@ def field(self, name_or_index: str | int) -> Series: return Series( field_arr, dtype=ArrowDtype(field_arr.type), - index=self._parent.index, + index=self._data.index, name=pa_field.name, ) @@ -190,7 +372,7 @@ def explode(self) -> DataFrame: """ from pandas import concat - pa_type = self._parent.dtype.pyarrow_dtype + pa_type = self._pa_array.type return concat( [self.field(i) for i in range(pa_type.num_fields)], axis="columns" ) diff --git a/pandas/core/series.py b/pandas/core/series.py index c5f622a113258..88d3616423155 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -100,7 +100,10 @@ from pandas.core.accessor import CachedAccessor from pandas.core.apply import SeriesApply from pandas.core.arrays import ExtensionArray -from pandas.core.arrays.arrow import StructAccessor +from pandas.core.arrays.arrow import ( + ListAccessor, + StructAccessor, +) from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor from pandas.core.arrays.string_ import StringDtype @@ -5891,6 +5894,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) sparse = CachedAccessor("sparse", SparseAccessor) struct = CachedAccessor("struct", StructAccessor) + list = CachedAccessor("list", ListAccessor) # ---------------------------------------------------------------------- # Add plotting methods to Series diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index ec219941a3afc..4bbbd9266a94a 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -570,8 +570,7 @@ .. deprecated:: 2.1.0 regex : bool or same types as `to_replace`, default False Whether to interpret `to_replace` and/or `value` as regular - expressions. If this is ``True`` then `to_replace` *must* be a - string. Alternatively, this could be a regular expression or a + expressions. Alternatively, this could be a regular expression or a list, dict, or array of regular expressions in which case `to_replace` must be ``None``. method : {{'pad', 'ffill', 'bfill'}} @@ -790,6 +789,32 @@ .. versionchanged:: 1.4.0 Previously the explicit ``None`` was silently ignored. + + When ``regex=True``, ``value`` is not ``None`` and `to_replace` is a string, + the replacement will be applied in all columns of the DataFrame. + + >>> df = pd.DataFrame({{'A': [0, 1, 2, 3, 4], + ... 'B': ['a', 'b', 'c', 'd', 'e'], + ... 'C': ['f', 'g', 'h', 'i', 'j']}}) + + >>> df.replace(to_replace='^[a-g]', value = 'e', regex=True) + A B C + 0 0 e e + 1 1 e e + 2 2 e h + 3 3 e i + 4 4 e j + + If ``value`` is not ``None`` and `to_replace` is a dictionary, the dictionary + keys will be the DataFrame columns that the replacement will be applied. + + >>> df.replace(to_replace={{'B': '^[a-c]', 'C': '^[h-j]'}}, value = 'e', regex=True) + A B C + 0 0 e f + 1 1 e g + 2 2 e e + 3 3 d e + 4 4 e e """ _shared_docs[ diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 190ce0caceb20..928b5c5a24479 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -987,13 +987,13 @@ def test_dt64arr_sub_timestamp_tzaware(self, box_with_array): tm.assert_equal(ser - ts, expected) tm.assert_equal(ts - ser, -expected) - def test_dt64arr_sub_NaT(self, box_with_array): + def test_dt64arr_sub_NaT(self, box_with_array, unit): # GH#18808 - dti = DatetimeIndex([NaT, Timestamp("19900315")]) + dti = DatetimeIndex([NaT, Timestamp("19900315")]).as_unit(unit) ser = tm.box_expected(dti, box_with_array) result = ser - NaT - expected = Series([NaT, NaT], dtype="timedelta64[ns]") + expected = Series([NaT, NaT], dtype=f"timedelta64[{unit}]") expected = tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) @@ -1001,7 +1001,7 @@ def test_dt64arr_sub_NaT(self, box_with_array): ser_tz = tm.box_expected(dti_tz, box_with_array) result = ser_tz - NaT - expected = Series([NaT, NaT], dtype="timedelta64[ns]") + expected = Series([NaT, NaT], dtype=f"timedelta64[{unit}]") expected = tm.box_expected(expected, box_with_array) tm.assert_equal(result, expected) @@ -1872,15 +1872,15 @@ def test_operators_datetimelike_invalid( # Smoke test op(arg2) - def test_sub_single_tz(self): + def test_sub_single_tz(self, unit): # GH#12290 - s1 = Series([Timestamp("2016-02-10", tz="America/Sao_Paulo")]) - s2 = Series([Timestamp("2016-02-08", tz="America/Sao_Paulo")]) + s1 = Series([Timestamp("2016-02-10", tz="America/Sao_Paulo")]).dt.as_unit(unit) + s2 = Series([Timestamp("2016-02-08", tz="America/Sao_Paulo")]).dt.as_unit(unit) result = s1 - s2 - expected = Series([Timedelta("2days")]) + expected = Series([Timedelta("2days")]).dt.as_unit(unit) tm.assert_series_equal(result, expected) result = s2 - s1 - expected = Series([Timedelta("-2days")]) + expected = Series([Timedelta("-2days")]).dt.as_unit(unit) tm.assert_series_equal(result, expected) def test_dt64tz_series_sub_dtitz(self): @@ -1895,13 +1895,19 @@ def test_dt64tz_series_sub_dtitz(self): res = ser - dti tm.assert_series_equal(res, expected) - def test_sub_datetime_compat(self): + def test_sub_datetime_compat(self, unit): # see GH#14088 - s = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), NaT]) + ser = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), NaT]).dt.as_unit(unit) dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc) - exp = Series([Timedelta("1 days"), NaT]) - tm.assert_series_equal(s - dt, exp) - tm.assert_series_equal(s - Timestamp(dt), exp) + exp_unit = unit + if unit in ["s", "ms"]: + # The datetime object has "us" so we upcast + exp_unit = "us" + exp = Series([Timedelta("1 days"), NaT]).dt.as_unit(exp_unit) + result = ser - dt + tm.assert_series_equal(result, exp) + result2 = ser - Timestamp(dt) + tm.assert_series_equal(result2, exp) def test_dt64_series_add_mixed_tick_DateOffset(self): # GH#4532 @@ -1922,11 +1928,11 @@ def test_dt64_series_add_mixed_tick_DateOffset(self): ) tm.assert_series_equal(result, expected) - def test_datetime64_ops_nat(self): + def test_datetime64_ops_nat(self, unit): # GH#11349 - datetime_series = Series([NaT, Timestamp("19900315")]) - nat_series_dtype_timestamp = Series([NaT, NaT], dtype="datetime64[ns]") - single_nat_dtype_datetime = Series([NaT], dtype="datetime64[ns]") + datetime_series = Series([NaT, Timestamp("19900315")]).dt.as_unit(unit) + nat_series_dtype_timestamp = Series([NaT, NaT], dtype=f"datetime64[{unit}]") + single_nat_dtype_datetime = Series([NaT], dtype=f"datetime64[{unit}]") # subtraction tm.assert_series_equal(-NaT + datetime_series, nat_series_dtype_timestamp) @@ -2097,16 +2103,16 @@ def test_dti_sub_tdi(self, tz_naive_fixture): with pytest.raises(TypeError, match=msg): tdi.values - dti - def test_dti_isub_tdi(self, tz_naive_fixture): + def test_dti_isub_tdi(self, tz_naive_fixture, unit): # GH#17558 tz = tz_naive_fixture - dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) - tdi = pd.timedelta_range("0 days", periods=10) - expected = date_range("2017-01-01", periods=10, tz=tz, freq="-1D") + dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10).as_unit(unit) + tdi = pd.timedelta_range("0 days", periods=10, unit=unit) + expected = date_range("2017-01-01", periods=10, tz=tz, freq="-1D", unit=unit) expected = expected._with_freq(None) # isub with TimedeltaIndex - result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10).as_unit(unit) result -= tdi tm.assert_index_equal(result, expected) @@ -2124,7 +2130,7 @@ def test_dti_isub_tdi(self, tz_naive_fixture): tdi -= dti # isub with timedelta64 array - result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10) + result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10).as_unit(unit) result -= tdi.values tm.assert_index_equal(result, expected) @@ -2158,13 +2164,13 @@ def test_dta_add_sub_index(self, tz_naive_fixture): expected = dti - tdi tm.assert_index_equal(result, expected) - def test_sub_dti_dti(self): + def test_sub_dti_dti(self, unit): # previously performed setop (deprecated in 0.16.0), now changed to # return subtraction -> TimeDeltaIndex (GH ...) - dti = date_range("20130101", periods=3) - dti_tz = date_range("20130101", periods=3).tz_localize("US/Eastern") - expected = TimedeltaIndex([0, 0, 0]) + dti = date_range("20130101", periods=3, unit=unit) + dti_tz = date_range("20130101", periods=3, unit=unit).tz_localize("US/Eastern") + expected = TimedeltaIndex([0, 0, 0]).as_unit(unit) result = dti - dti tm.assert_index_equal(result, expected) @@ -2183,16 +2189,16 @@ def test_sub_dti_dti(self): tm.assert_index_equal(dti, expected) # different length raises ValueError - dti1 = date_range("20130101", periods=3) - dti2 = date_range("20130101", periods=4) + dti1 = date_range("20130101", periods=3, unit=unit) + dti2 = date_range("20130101", periods=4, unit=unit) msg = "cannot add indices of unequal length" with pytest.raises(ValueError, match=msg): dti1 - dti2 # NaN propagation - dti1 = DatetimeIndex(["2012-01-01", np.nan, "2012-01-03"]) - dti2 = DatetimeIndex(["2012-01-02", "2012-01-03", np.nan]) - expected = TimedeltaIndex(["1 days", np.nan, np.nan]) + dti1 = DatetimeIndex(["2012-01-01", np.nan, "2012-01-03"]).as_unit(unit) + dti2 = DatetimeIndex(["2012-01-02", "2012-01-03", np.nan]).as_unit(unit) + expected = TimedeltaIndex(["1 days", np.nan, np.nan]).as_unit(unit) result = dti2 - dti1 tm.assert_index_equal(result, expected) @@ -2295,17 +2301,17 @@ def test_ops_nat_mixed_datetime64_timedelta64(self): nat_series_dtype_timestamp, ) - def test_ufunc_coercions(self): - idx = date_range("2011-01-01", periods=3, freq="2D", name="x") + def test_ufunc_coercions(self, unit): + idx = date_range("2011-01-01", periods=3, freq="2D", name="x", unit=unit) delta = np.timedelta64(1, "D") - exp = date_range("2011-01-02", periods=3, freq="2D", name="x") + exp = date_range("2011-01-02", periods=3, freq="2D", name="x", unit=unit) for result in [idx + delta, np.add(idx, delta)]: assert isinstance(result, DatetimeIndex) tm.assert_index_equal(result, exp) assert result.freq == "2D" - exp = date_range("2010-12-31", periods=3, freq="2D", name="x") + exp = date_range("2010-12-31", periods=3, freq="2D", name="x", unit=unit) for result in [idx - delta, np.subtract(idx, delta)]: assert isinstance(result, DatetimeIndex) @@ -2318,13 +2324,17 @@ def test_ufunc_coercions(self): delta = np.array( [np.timedelta64(1, "D"), np.timedelta64(2, "D"), np.timedelta64(3, "D")] ) - exp = DatetimeIndex(["2011-01-02", "2011-01-05", "2011-01-08"], name="x") + exp = DatetimeIndex( + ["2011-01-02", "2011-01-05", "2011-01-08"], name="x" + ).as_unit(unit) for result in [idx + delta, np.add(idx, delta)]: tm.assert_index_equal(result, exp) assert result.freq == exp.freq - exp = DatetimeIndex(["2010-12-31", "2011-01-01", "2011-01-02"], name="x") + exp = DatetimeIndex( + ["2010-12-31", "2011-01-01", "2011-01-02"], name="x" + ).as_unit(unit) for result in [idx - delta, np.subtract(idx, delta)]: assert isinstance(result, DatetimeIndex) tm.assert_index_equal(result, exp) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 205e6472aaecb..18c6b437743e2 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -336,17 +336,17 @@ def test_subtraction_ops(self): result = tdi - td expected = TimedeltaIndex(["0 days", NaT, "1 days"], name="foo") - tm.assert_index_equal(result, expected, check_names=False) + tm.assert_index_equal(result, expected) result = td - tdi expected = TimedeltaIndex(["0 days", NaT, "-1 days"], name="foo") - tm.assert_index_equal(result, expected, check_names=False) + tm.assert_index_equal(result, expected) result = dti - td expected = DatetimeIndex( ["20121231", "20130101", "20130102"], freq="D", name="bar" ) - tm.assert_index_equal(result, expected, check_names=False) + tm.assert_index_equal(result, expected) result = dt - tdi expected = DatetimeIndex(["20121231", NaT, "20121230"], name="foo") diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 7589517f417e8..2fc6e786e3198 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -141,35 +141,41 @@ def test_categorial_datetimelike(self, method): result = method(i)[0] assert isinstance(result, Timestamp) - def test_iter_box(self): + def test_iter_box_dt64(self, unit): vals = [Timestamp("2011-01-01"), Timestamp("2011-01-02")] - s = Series(vals) - assert s.dtype == "datetime64[ns]" - for res, exp in zip(s, vals): + ser = Series(vals).dt.as_unit(unit) + assert ser.dtype == f"datetime64[{unit}]" + for res, exp in zip(ser, vals): assert isinstance(res, Timestamp) assert res.tz is None assert res == exp + assert res.unit == unit + def test_iter_box_dt64tz(self, unit): vals = [ Timestamp("2011-01-01", tz="US/Eastern"), Timestamp("2011-01-02", tz="US/Eastern"), ] - s = Series(vals) + ser = Series(vals).dt.as_unit(unit) - assert s.dtype == "datetime64[ns, US/Eastern]" - for res, exp in zip(s, vals): + assert ser.dtype == f"datetime64[{unit}, US/Eastern]" + for res, exp in zip(ser, vals): assert isinstance(res, Timestamp) assert res.tz == exp.tz assert res == exp + assert res.unit == unit + def test_iter_box_timedelta64(self, unit): # timedelta vals = [Timedelta("1 days"), Timedelta("2 days")] - s = Series(vals) - assert s.dtype == "timedelta64[ns]" - for res, exp in zip(s, vals): + ser = Series(vals).dt.as_unit(unit) + assert ser.dtype == f"timedelta64[{unit}]" + for res, exp in zip(ser, vals): assert isinstance(res, Timedelta) assert res == exp + assert res.unit == unit + def test_iter_box_period(self): # period vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")] s = Series(vals) @@ -370,7 +376,7 @@ def test_to_numpy_copy(arr, as_series): @pytest.mark.parametrize("as_series", [True, False]) -def test_to_numpy_dtype(as_series): +def test_to_numpy_dtype(as_series, unit): tz = "US/Eastern" obj = pd.DatetimeIndex(["2000", "2001"], tz=tz) if as_series: diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index 3cdfb7fe41e92..c42d064c476bb 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -216,7 +216,7 @@ def test_value_counts_bins(index_or_series): assert s.nunique() == 0 -def test_value_counts_datetime64(index_or_series): +def test_value_counts_datetime64(index_or_series, unit): klass = index_or_series # GH 3002, datetime64[ns] @@ -233,7 +233,7 @@ def test_value_counts_datetime64(index_or_series): "2008-09-09", "2008-09-09", ] - ), + ).as_unit(unit), "food": ["PIE", "GUM", "EGG", "EGG", "PIE", "GUM"], } ) @@ -242,44 +242,52 @@ def test_value_counts_datetime64(index_or_series): s.name = None idx = pd.to_datetime( ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"] - ) + ).as_unit(unit) expected_s = Series([3, 2, 1], index=idx, name="count") tm.assert_series_equal(s.value_counts(), expected_s) expected = pd.array( np.array( ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"], - dtype="datetime64[ns]", + dtype=f"datetime64[{unit}]", ) ) + result = s.unique() if isinstance(s, Index): - tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) + tm.assert_index_equal(result, DatetimeIndex(expected)) else: - tm.assert_extension_array_equal(s.unique(), expected) + tm.assert_extension_array_equal(result, expected) assert s.nunique() == 3 # with NaT s = df["dt"].copy() s = klass(list(s.values) + [pd.NaT] * 4) + if klass is Series: + s = s.dt.as_unit(unit) + else: + s = s.as_unit(unit) result = s.value_counts() - assert result.index.dtype == "datetime64[ns]" + assert result.index.dtype == f"datetime64[{unit}]" tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) expected_s = pd.concat( - [Series([4], index=DatetimeIndex([pd.NaT]), name="count"), expected_s] + [ + Series([4], index=DatetimeIndex([pd.NaT]).as_unit(unit), name="count"), + expected_s, + ] ) tm.assert_series_equal(result, expected_s) - assert s.dtype == "datetime64[ns]" + assert s.dtype == f"datetime64[{unit}]" unique = s.unique() - assert unique.dtype == "datetime64[ns]" + assert unique.dtype == f"datetime64[{unit}]" # numpy_array_equal cannot compare pd.NaT if isinstance(s, Index): - exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]) + exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]).as_unit(unit) tm.assert_index_equal(unique, exp_idx) else: tm.assert_extension_array_equal(unique[:3], expected) @@ -288,21 +296,29 @@ def test_value_counts_datetime64(index_or_series): assert s.nunique() == 3 assert s.nunique(dropna=False) == 4 + +def test_value_counts_timedelta64(index_or_series, unit): # timedelta64[ns] - td = df.dt - df.dt + timedelta(1) - td = klass(td, name="dt") + klass = index_or_series + + day = Timedelta(timedelta(1)).as_unit(unit) + tdi = TimedeltaIndex([day], name="dt").as_unit(unit) + + tdvals = np.zeros(6, dtype=f"m8[{unit}]") + day + td = klass(tdvals, name="dt") result = td.value_counts() - expected_s = Series([6], index=Index([Timedelta("1day")], name="dt"), name="count") + expected_s = Series([6], index=tdi, name="count") tm.assert_series_equal(result, expected_s) - expected = TimedeltaIndex(["1 days"], name="dt") + expected = tdi + result = td.unique() if isinstance(td, Index): - tm.assert_index_equal(td.unique(), expected) + tm.assert_index_equal(result, expected) else: - tm.assert_extension_array_equal(td.unique(), expected._values) + tm.assert_extension_array_equal(result, expected._values) - td2 = timedelta(1) + (df.dt - df.dt) + td2 = day + np.zeros(6, dtype=f"m8[{unit}]") td2 = klass(td2, name="dt") result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 156e50d50a9ef..1779f703dd2b7 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -218,15 +218,15 @@ def test_combine_first_align_nan(self): # TODO: this must be int64 assert res["b"].dtype == "int64" - def test_combine_first_timezone(self): + def test_combine_first_timezone(self, unit): # see gh-7630 - data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC") + data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC").as_unit(unit) df1 = DataFrame( columns=["UTCdatetime", "abc"], data=data1, index=pd.date_range("20140627", periods=1), ) - data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") + data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC").as_unit(unit) df2 = DataFrame( columns=["UTCdatetime", "xyz"], data=data2, @@ -243,29 +243,32 @@ def test_combine_first_timezone(self): }, columns=["UTCdatetime", "abc"], index=pd.date_range("20140627", periods=2, freq="D"), + dtype=f"datetime64[{unit}, UTC]", ) - assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]" - assert res["abc"].dtype == "datetime64[ns, UTC]" + assert res["UTCdatetime"].dtype == f"datetime64[{unit}, UTC]" + assert res["abc"].dtype == f"datetime64[{unit}, UTC]" tm.assert_frame_equal(res, exp) + def test_combine_first_timezone2(self, unit): # see gh-10567 - dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC") + dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC", unit=unit) df1 = DataFrame({"DATE": dts1}) - dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC") + dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC", unit=unit) df2 = DataFrame({"DATE": dts2}) res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) - assert res["DATE"].dtype == "datetime64[ns, UTC]" + assert res["DATE"].dtype == f"datetime64[{unit}, UTC]" + def test_combine_first_timezone3(self, unit): dts1 = pd.DatetimeIndex( ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern" - ) + ).as_unit(unit) df1 = DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) dts2 = pd.DatetimeIndex( ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern" - ) + ).as_unit(unit) df2 = DataFrame({"DATE": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) @@ -279,10 +282,12 @@ def test_combine_first_timezone(self): "2011-01-04", ], tz="US/Eastern", - ) + ).as_unit(unit) exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) + # FIXME: parametrizing over unit breaks on non-nano + def test_combine_first_timezone4(self): # different tz dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") df1 = DataFrame({"DATE": dts1}) @@ -294,9 +299,10 @@ def test_combine_first_timezone(self): tm.assert_frame_equal(res, df1) assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" - dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern") + def test_combine_first_timezone5(self, unit): + dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern", unit=unit) df1 = DataFrame({"DATE": dts1}) - dts2 = pd.date_range("2015-01-01", "2015-01-03") + dts2 = pd.date_range("2015-01-01", "2015-01-03", unit=unit) df2 = DataFrame({"DATE": dts2}) res = df1.combine_first(df2) diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index b401f182242b1..1b7e669232592 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -70,15 +70,17 @@ def test_diff_timedelta64_with_nat(self): tm.assert_equal(result, expected) @pytest.mark.parametrize("tz", [None, "UTC"]) - def test_diff_datetime_axis0_with_nat(self, tz): + def test_diff_datetime_axis0_with_nat(self, tz, unit): # GH#32441 - dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz) + dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz).as_unit(unit) ser = Series(dti) df = ser.to_frame() result = df.diff() - ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)]) + ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)]).as_unit( + unit + ) expected = Series(ex_index).to_frame() tm.assert_frame_equal(result, expected) @@ -140,7 +142,7 @@ def test_diff_datetime_axis1(self, tz): ) tm.assert_frame_equal(result, expected) - def test_diff_timedelta(self): + def test_diff_timedelta(self, unit): # GH#4533 df = DataFrame( { @@ -148,11 +150,13 @@ def test_diff_timedelta(self): "value": [1.0, 2.0], } ) + df["time"] = df["time"].dt.as_unit(unit) res = df.diff() exp = DataFrame( [[pd.NaT, np.nan], [pd.Timedelta("00:01:00"), 1]], columns=["time", "value"] ) + exp["time"] = exp["time"].dt.as_unit(unit) tm.assert_frame_equal(res, exp) def test_diff_mixed_dtype(self): diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 4bfe364e5eafc..1f4771f797ff9 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -352,8 +352,9 @@ def test_quantile_multi_empty(self, interp_method): ) tm.assert_frame_equal(result, expected) - def test_quantile_datetime(self): - df = DataFrame({"a": pd.to_datetime(["2010", "2011"]), "b": [0, 5]}) + def test_quantile_datetime(self, unit): + dti = pd.to_datetime(["2010", "2011"]).as_unit(unit) + df = DataFrame({"a": dti, "b": [0, 5]}) # exclude datetime result = df.quantile(0.5, numeric_only=True) @@ -370,17 +371,20 @@ def test_quantile_datetime(self): # datetime w/ multi result = df.quantile([0.5], numeric_only=False) expected = DataFrame( - [[Timestamp("2010-07-02 12:00:00"), 2.5]], index=[0.5], columns=["a", "b"] + {"a": Timestamp("2010-07-02 12:00:00").as_unit(unit), "b": 2.5}, + index=[0.5], ) + # expected["a"] = expected["a"].dt.as_unit(unit) tm.assert_frame_equal(result, expected) # axis = 1 - df["c"] = pd.to_datetime(["2011", "2012"]) + df["c"] = pd.to_datetime(["2011", "2012"]).as_unit(unit) result = df[["a", "c"]].quantile(0.5, axis=1, numeric_only=False) expected = Series( [Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")], index=[0, 1], name=0.5, + dtype=f"M8[{unit}]", ) tm.assert_series_equal(result, expected) @@ -389,6 +393,7 @@ def test_quantile_datetime(self): [[Timestamp("2010-07-02 12:00:00"), Timestamp("2011-07-02 12:00:00")]], index=[0.5], columns=[0, 1], + dtype=f"M8[{unit}]", ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/accessors/test_list_accessor.py b/pandas/tests/series/accessors/test_list_accessor.py new file mode 100644 index 0000000000000..1c60567c1a530 --- /dev/null +++ b/pandas/tests/series/accessors/test_list_accessor.py @@ -0,0 +1,129 @@ +import re + +import pytest + +from pandas import ( + ArrowDtype, + Series, +) +import pandas._testing as tm + +pa = pytest.importorskip("pyarrow") + +from pandas.compat import pa_version_under11p0 + + +@pytest.mark.parametrize( + "list_dtype", + ( + pa.list_(pa.int64()), + pa.list_(pa.int64(), list_size=3), + pa.large_list(pa.int64()), + ), +) +def test_list_getitem(list_dtype): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(list_dtype), + ) + actual = ser.list[1] + expected = Series([2, None, None], dtype="int64[pyarrow]") + tm.assert_series_equal(actual, expected) + + +def test_list_getitem_slice(): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + if pa_version_under11p0: + with pytest.raises( + NotImplementedError, match="List slice not supported by pyarrow " + ): + ser.list[1:None:None] + else: + actual = ser.list[1:None:None] + expected = Series( + [[2, 3], [None, 5], None], dtype=ArrowDtype(pa.list_(pa.int64())) + ) + tm.assert_series_equal(actual, expected) + + +def test_list_len(): + ser = Series( + [[1, 2, 3], [4, None], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + actual = ser.list.len() + expected = Series([3, 2, None], dtype=ArrowDtype(pa.int32())) + tm.assert_series_equal(actual, expected) + + +def test_list_flatten(): + ser = Series( + [[1, 2, 3], [4, None], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + actual = ser.list.flatten() + expected = Series([1, 2, 3, 4, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(actual, expected) + + +def test_list_getitem_slice_invalid(): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + if pa_version_under11p0: + with pytest.raises( + NotImplementedError, match="List slice not supported by pyarrow " + ): + ser.list[1:None:0] + else: + with pytest.raises(pa.lib.ArrowInvalid, match=re.escape("`step` must be >= 1")): + ser.list[1:None:0] + + +def test_list_accessor_non_list_dtype(): + ser = Series( + [1, 2, 4], + dtype=ArrowDtype(pa.int64()), + ) + with pytest.raises( + AttributeError, + match=re.escape( + "Can only use the '.list' accessor with 'list[pyarrow]' dtype, " + "not int64[pyarrow]." + ), + ): + ser.list[1:None:0] + + +@pytest.mark.parametrize( + "list_dtype", + ( + pa.list_(pa.int64()), + pa.list_(pa.int64(), list_size=3), + pa.large_list(pa.int64()), + ), +) +def test_list_getitem_invalid_index(list_dtype): + ser = Series( + [[1, 2, 3], [4, None, 5], None], + dtype=ArrowDtype(list_dtype), + ) + with pytest.raises(pa.lib.ArrowInvalid, match="Index -1 is out of bounds"): + ser.list[-1] + with pytest.raises(pa.lib.ArrowInvalid, match="Index 5 is out of bounds"): + ser.list[5] + with pytest.raises(ValueError, match="key must be an int or slice, got str"): + ser.list["abc"] + + +def test_list_accessor_not_iterable(): + ser = Series( + [[1, 2, 3], [4, None], None], + dtype=ArrowDtype(pa.list_(pa.int64())), + ) + with pytest.raises(TypeError, match="'ListAccessor' object is not iterable"): + iter(ser.list) diff --git a/pandas/tests/series/accessors/test_struct_accessor.py b/pandas/tests/series/accessors/test_struct_accessor.py index c645bb6807052..1ec5b3b726d17 100644 --- a/pandas/tests/series/accessors/test_struct_accessor.py +++ b/pandas/tests/series/accessors/test_struct_accessor.py @@ -9,7 +9,6 @@ Series, ) import pandas._testing as tm -from pandas.core.arrays.arrow.accessors import StructAccessor pa = pytest.importorskip("pyarrow") @@ -141,7 +140,11 @@ def test_struct_accessor_explode(): ], ) def test_struct_accessor_api_for_invalid(invalid): - msg = re.escape(StructAccessor._validation_msg.format(dtype=invalid.dtype)) - - with pytest.raises(AttributeError, match=msg): + with pytest.raises( + AttributeError, + match=re.escape( + "Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, " + f"not {invalid.dtype}." + ), + ): invalid.struct diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index fd5d92dc35249..a6d7bcb47fbb2 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -721,59 +721,31 @@ def test_categorical(self): result = pd.unique(ci) tm.assert_index_equal(result, expected) - def test_datetime64tz_aware(self): + def test_datetime64tz_aware(self, unit): # GH 15939 - result = Series( - Index( - [ - Timestamp("20160101", tz="US/Eastern"), - Timestamp("20160101", tz="US/Eastern"), - ] - ) - ).unique() - expected = DatetimeArray._from_sequence( - np.array([Timestamp("2016-01-01 00:00:00-0500", tz="US/Eastern")]) - ) - tm.assert_extension_array_equal(result, expected) - - result = Index( + dti = Index( [ Timestamp("20160101", tz="US/Eastern"), Timestamp("20160101", tz="US/Eastern"), ] - ).unique() - expected = DatetimeIndex( - ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None - ) + ).as_unit(unit) + ser = Series(dti) + + result = ser.unique() + expected = dti[:1]._data + tm.assert_extension_array_equal(result, expected) + + result = dti.unique() + expected = dti[:1] tm.assert_index_equal(result, expected) - result = pd.unique( - Series( - Index( - [ - Timestamp("20160101", tz="US/Eastern"), - Timestamp("20160101", tz="US/Eastern"), - ] - ) - ) - ) - expected = DatetimeArray._from_sequence( - np.array([Timestamp("2016-01-01", tz="US/Eastern")]) - ) + result = pd.unique(ser) + expected = dti[:1]._data tm.assert_extension_array_equal(result, expected) - result = pd.unique( - Index( - [ - Timestamp("20160101", tz="US/Eastern"), - Timestamp("20160101", tz="US/Eastern"), - ] - ) - ) - expected = DatetimeIndex( - ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None - ) + result = pd.unique(dti) + expected = dti[:1] tm.assert_index_equal(result, expected) def test_order_of_appearance(self):