Skip to content

Commit

Permalink
List accessor (#55777)
Browse files Browse the repository at this point in the history
* initial implementation, no documentation

* revert

* non list test

* docstring wip

* add list accessor to series.rst

* whatsnew

* fix

* fix typehint

* private

* fix docstring

* fail on iter

* list_slice only impl in pyarrow 11

* fix docstring?

* fix

* fix test

* fix validation msg

* fix

* fix

* remove private

* maybe fix

* one more remove

---------

Co-authored-by: Rohan Jain <[email protected]>
  • Loading branch information
rohanjain101 and Rohan Jain authored Nov 3, 2023
1 parent 8e0411f commit 2c1d4bb
Show file tree
Hide file tree
Showing 7 changed files with 389 additions and 31 deletions.
17 changes: 17 additions & 0 deletions doc/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,23 @@ Sparse-dtype specific methods and attributes are provided under the
Series.sparse.to_coo


.. _api.series.list:

List accessor
~~~~~~~~~~~~~

Arrow list-dtype specific methods and attributes are provided under the
``Series.list`` accessor.

.. autosummary::
   :toctree: api/
   :template: autosummary/accessor_method.rst

   Series.list.flatten
   Series.list.len
   Series.list.__getitem__


.. _api.series.struct:

Struct accessor
Expand Down
26 changes: 23 additions & 3 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,30 @@ DataFrame. (:issue:`54938`)
)
series.struct.explode()
.. _whatsnew_220.enhancements.enhancement2:
.. _whatsnew_220.enhancements.list_accessor:

enhancement2
^^^^^^^^^^^^
Series.list accessor for PyArrow list data
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The ``Series.list`` accessor provides attributes and methods for processing
data with ``list[pyarrow]`` dtype Series. For example,
:meth:`Series.list.__getitem__` allows indexing pyarrow lists in
a Series. (:issue:`55323`)

.. ipython:: python

    import pyarrow as pa
    series = pd.Series(
        [
            [1, 2, 3],
            [4, 5],
            [6],
        ],
        dtype=pd.ArrowDtype(
            pa.list_(pa.int64())
        ),
    )
    series.list[0]

.. _whatsnew_220.enhancements.other:

Expand Down
7 changes: 5 additions & 2 deletions pandas/core/arrays/arrow/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from pandas.core.arrays.arrow.accessors import StructAccessor
from pandas.core.arrays.arrow.accessors import (
ListAccessor,
StructAccessor,
)
from pandas.core.arrays.arrow.array import ArrowExtensionArray

__all__ = ["ArrowExtensionArray", "StructAccessor"]
__all__ = ["ArrowExtensionArray", "StructAccessor", "ListAccessor"]
224 changes: 203 additions & 21 deletions pandas/core/arrays/arrow/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,16 @@

from __future__ import annotations

from abc import (
ABCMeta,
abstractmethod,
)
from typing import TYPE_CHECKING

from pandas.compat import pa_version_under10p1
from pandas.compat import (
pa_version_under10p1,
pa_version_under11p0,
)

if not pa_version_under10p1:
import pyarrow as pa
Expand All @@ -13,13 +20,194 @@
from pandas.core.dtypes.dtypes import ArrowDtype

if TYPE_CHECKING:
from collections.abc import Iterator

from pandas import (
DataFrame,
Series,
)


class StructAccessor:
class ArrowAccessor(metaclass=ABCMeta):
@abstractmethod
def __init__(self, data, validation_msg: str) -> None:
self._data = data
self._validation_msg = validation_msg
self._validate(data)

@abstractmethod
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
pass

def _validate(self, data):
dtype = data.dtype
if not isinstance(dtype, ArrowDtype):
# Raise AttributeError so that inspect can handle non-struct Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))

if not self._is_valid_pyarrow_dtype(dtype.pyarrow_dtype):
# Raise AttributeError so that inspect can handle invalid Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))

@property
def _pa_array(self):
return self._data.array._pa_array


class ListAccessor(ArrowAccessor):
    """
    Accessor object for list data properties of the Series values.

    Parameters
    ----------
    data : Series
        Series containing Arrow list data.
    """

    def __init__(self, data=None) -> None:
        super().__init__(
            data,
            validation_msg="Can only use the '.list' accessor with "
            "'list[pyarrow]' dtype, not {dtype}.",
        )

    def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
        # Accept the variable-size, fixed-size and large list types.
        return (
            pa.types.is_list(pyarrow_dtype)
            or pa.types.is_fixed_size_list(pyarrow_dtype)
            or pa.types.is_large_list(pyarrow_dtype)
        )

    def len(self) -> Series:
        """
        Return the length of each list in the Series.

        The result keeps the index and name of the original Series.

        Returns
        -------
        pandas.Series
            The length of each list.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.len()
        0    3
        1    1
        dtype: int32[pyarrow]
        """
        from pandas import Series

        value_lengths = pc.list_value_length(self._pa_array)
        # One length per input row: carry the parent's labels over so the
        # result stays aligned with the original Series.
        return Series(
            value_lengths,
            dtype=ArrowDtype(value_lengths.type),
            index=self._data.index,
            name=self._data.name,
        )

    def __getitem__(self, key: int | slice) -> Series:
        """
        Index or slice lists in the Series.

        The result keeps the index and name of the original Series.

        Parameters
        ----------
        key : int | slice
            Index or slice of indices to access from each list.

        Returns
        -------
        pandas.Series
            The list at requested index.

        Raises
        ------
        NotImplementedError
            If ``key`` is a slice and ``pa_version_under11p0`` is set
            (``list_slice`` is not available in that pyarrow).
        ValueError
            If ``key`` is neither an int nor a slice.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list[0]
        0    1
        1    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        if isinstance(key, int):
            # TODO: Support negative key but pyarrow does not allow
            # element index to be an array.
            # if key < 0:
            #     key = pc.add(key, pc.list_value_length(self._pa_array))
            element = pc.list_element(self._pa_array, key)
            return Series(
                element,
                dtype=ArrowDtype(element.type),
                index=self._data.index,
                name=self._data.name,
            )
        elif isinstance(key, slice):
            if pa_version_under11p0:
                raise NotImplementedError(
                    f"List slice not supported by pyarrow {pa.__version__}."
                )

            # TODO: Support negative start/stop/step, ideally this would be added
            # upstream in pyarrow.
            start, stop, step = key.start, key.stop, key.step
            if start is None:
                # TODO: When adding negative step support
                # this should be set to last element of array
                # when step is negative.
                start = 0
            if step is None:
                step = 1
            sliced = pc.list_slice(self._pa_array, start, stop, step)
            return Series(
                sliced,
                dtype=ArrowDtype(sliced.type),
                index=self._data.index,
                name=self._data.name,
            )
        else:
            raise ValueError(f"key must be an int or slice, got {type(key).__name__}")

    def __iter__(self) -> Iterator:
        # Defining ``__getitem__`` would otherwise make the accessor iterable
        # through the legacy sequence protocol; refuse explicitly instead.
        raise TypeError(f"'{type(self).__name__}' object is not iterable")

    def flatten(self) -> Series:
        """
        Flatten list values.

        The result keeps the name of the original Series. Its index is a
        new default index, not the labels of the original rows.

        Returns
        -------
        pandas.Series
            The data from all lists in the series flattened.

        Examples
        --------
        >>> import pyarrow as pa
        >>> s = pd.Series(
        ...     [
        ...         [1, 2, 3],
        ...         [3],
        ...     ],
        ...     dtype=pd.ArrowDtype(pa.list_(
        ...         pa.int64()
        ...     ))
        ... )
        >>> s.list.flatten()
        0    1
        1    2
        2    3
        3    3
        dtype: int64[pyarrow]
        """
        from pandas import Series

        flattened = pc.list_flatten(self._pa_array)
        # The flattened length generally differs from the parent's, so the
        # parent's index cannot be reused as-is; only the name is carried.
        return Series(
            flattened,
            dtype=ArrowDtype(flattened.type),
            name=self._data.name,
        )


class StructAccessor(ArrowAccessor):
"""
Accessor object for structured data properties of the Series values.
Expand All @@ -29,23 +217,17 @@ class StructAccessor:
Series containing Arrow struct data.
"""

_validation_msg = (
"Can only use the '.struct' accessor with 'struct[pyarrow]' dtype, not {dtype}."
)

def __init__(self, data=None) -> None:
self._parent = data
self._validate(data)

def _validate(self, data):
dtype = data.dtype
if not isinstance(dtype, ArrowDtype):
# Raise AttributeError so that inspect can handle non-struct Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))
super().__init__(
data,
validation_msg=(
"Can only use the '.struct' accessor with 'struct[pyarrow]' "
"dtype, not {dtype}."
),
)

if not pa.types.is_struct(dtype.pyarrow_dtype):
# Raise AttributeError so that inspect can handle non-struct Series.
raise AttributeError(self._validation_msg.format(dtype=dtype))
def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
return pa.types.is_struct(pyarrow_dtype)

@property
def dtypes(self) -> Series:
Expand Down Expand Up @@ -80,7 +262,7 @@ def dtypes(self) -> Series:
Series,
)

pa_type = self._parent.dtype.pyarrow_dtype
pa_type = self._data.dtype.pyarrow_dtype
types = [ArrowDtype(struct.type) for struct in pa_type]
names = [struct.name for struct in pa_type]
return Series(types, index=Index(names))
Expand Down Expand Up @@ -135,7 +317,7 @@ def field(self, name_or_index: str | int) -> Series:
"""
from pandas import Series

pa_arr = self._parent.array._pa_array
pa_arr = self._data.array._pa_array
if isinstance(name_or_index, int):
index = name_or_index
elif isinstance(name_or_index, str):
Expand All @@ -151,7 +333,7 @@ def field(self, name_or_index: str | int) -> Series:
return Series(
field_arr,
dtype=ArrowDtype(field_arr.type),
index=self._parent.index,
index=self._data.index,
name=pa_field.name,
)

Expand Down Expand Up @@ -190,7 +372,7 @@ def explode(self) -> DataFrame:
"""
from pandas import concat

pa_type = self._parent.dtype.pyarrow_dtype
pa_type = self._pa_array.type
return concat(
[self.field(i) for i in range(pa_type.num_fields)], axis="columns"
)
6 changes: 5 additions & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,10 @@
from pandas.core.accessor import CachedAccessor
from pandas.core.apply import SeriesApply
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.arrow import StructAccessor
from pandas.core.arrays.arrow import (
ListAccessor,
StructAccessor,
)
from pandas.core.arrays.categorical import CategoricalAccessor
from pandas.core.arrays.sparse import SparseAccessor
from pandas.core.arrays.string_ import StringDtype
Expand Down Expand Up @@ -5891,6 +5894,7 @@ def to_period(self, freq: str | None = None, copy: bool | None = None) -> Series
plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
sparse = CachedAccessor("sparse", SparseAccessor)
struct = CachedAccessor("struct", StructAccessor)
list = CachedAccessor("list", ListAccessor)

# ----------------------------------------------------------------------
# Add plotting methods to Series
Expand Down
Loading

0 comments on commit 2c1d4bb

Please sign in to comment.