Merge remote-tracking branch 'upstream/2.3.x' into backport-60245
jorisvandenbossche committed Dec 18, 2024
2 parents 3aa4a70 + eb22bf8 commit 4e4079e
Showing 94 changed files with 964 additions and 599 deletions.
1 change: 0 additions & 1 deletion .circleci/config.yml
@@ -15,7 +15,6 @@ jobs:
- checkout
- run: .circleci/setup_env.sh
- run: |
sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH \
LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD \
ci/run_tests.sh
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v2.3.0.rst
@@ -107,10 +107,10 @@ Conversion
Strings
^^^^^^^
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
-
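
As an illustrative sketch of the ``Series.replace`` entry above (not part of this diff; exact output depends on the pandas build):

    import pandas as pd

    ser = pd.Series(["a", "b", "c"], dtype="string")
    # 1 cannot be held by a string dtype, so per GH 60282 the result is
    # expected to upcast to object dtype rather than raise
    result = ser.replace("a", 1)
    print(result.dtype)  # expected: object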

Interval
^^^^^^^^
@@ -119,7 +119,7 @@ Interval

Indexing
^^^^^^^^
-
- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
-

Missing
3 changes: 3 additions & 0 deletions pandas/_libs/index.pyi
@@ -68,6 +68,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ...
class MaskedUInt8Engine(MaskedIndexEngine): ...
class MaskedBoolEngine(MaskedUInt8Engine): ...

class StringObjectEngine(ObjectEngine):
def __init__(self, values: object, na_value) -> None: ...

class BaseMultiIndexCodesEngine:
levels: list[np.ndarray]
offsets: np.ndarray # ndarray[uint64_t, ndim=1]
26 changes: 26 additions & 0 deletions pandas/_libs/index.pyx
@@ -532,6 +532,32 @@ cdef class ObjectEngine(IndexEngine):
return loc


cdef class StringObjectEngine(ObjectEngine):

cdef:
object na_value
bint uses_na

def __init__(self, ndarray values, na_value):
super().__init__(values)
self.na_value = na_value
self.uses_na = na_value is C_NA

cdef bint _checknull(self, object val):
if self.uses_na:
return val is C_NA
else:
return util.is_nan(val)

cdef _check_type(self, object val):
if isinstance(val, str):
return val
elif self._checknull(val):
return self.na_value
else:
raise KeyError(val)


cdef class DatetimeEngine(Int64Engine):

cdef:
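
A rough usage sketch of what the new StringObjectEngine enables for string-dtype indexes (illustrative, not part of this diff; assumes the pandas 2.3 ``str`` dtype alias is available):

    import numpy as np
    import pandas as pd

    idx = pd.Index(["a", "b", None], dtype="str")
    idx.get_loc("b")     # expected: 1
    # _check_type maps recognized missing values to the dtype's na_value,
    # so NaN locates the missing entry for the NaN-backed variant
    idx.get_loc(np.nan)  # expected: 2
    # non-string scalars raise KeyError instead of silently not matching
    idx.get_loc(1)       # expected: KeyError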
3 changes: 3 additions & 0 deletions pandas/_libs/lib.pyi
@@ -86,6 +86,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: Literal[False] = ...,
convert_string: Literal[False] = ...,
convert_to_nullable_dtype: Literal[False] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> npt.NDArray[np.object_ | np.number]: ...
@@ -97,6 +98,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: bool = ...,
convert_string: bool = ...,
convert_to_nullable_dtype: Literal[True] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
@@ -108,6 +110,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: bool = ...,
convert_string: bool = ...,
convert_to_nullable_dtype: bool = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
13 changes: 12 additions & 1 deletion pandas/_libs/lib.pyx
@@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects,
bint convert_numeric=True, # NB: different default!
bint convert_to_nullable_dtype=False,
bint convert_non_numeric=False,
bint convert_string=True,
object dtype_if_all_nat=None) -> "ArrayLike":
"""
Type inference function-- convert object array to proper dtype
@@ -2741,7 +2742,17 @@ def maybe_convert_objects
seen.object_ = True

elif seen.str_:
if using_string_dtype() and is_string_array(objects, skipna=True):
if convert_to_nullable_dtype and is_string_array(objects, skipna=True):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype()
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

elif (
convert_string
and using_string_dtype()
and is_string_array(objects, skipna=True)
):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(na_value=np.nan)
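
A minimal sketch of the new ``convert_string`` switch (illustrative; ``maybe_convert_objects`` is an internal ``pandas._libs`` routine, not public API):

    import numpy as np
    from pandas._libs import lib

    objs = np.array(["a", "b"], dtype=object)
    # convert_string=False opts out of string-dtype inference even when the
    # future string dtype is enabled, leaving the object array as-is
    res = lib.maybe_convert_objects(objs, convert_string=False)
    print(res.dtype)  # expected: object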
2 changes: 2 additions & 0 deletions pandas/compat/__init__.py
@@ -33,6 +33,7 @@
pa_version_under14p1,
pa_version_under16p0,
pa_version_under17p0,
pa_version_under18p0,
)

if TYPE_CHECKING:
@@ -191,6 +192,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]:
"pa_version_under14p1",
"pa_version_under16p0",
"pa_version_under17p0",
"pa_version_under18p0",
"HAS_PYARROW",
"IS64",
"ISMUSL",
2 changes: 2 additions & 0 deletions pandas/compat/pyarrow.py
@@ -17,6 +17,7 @@
pa_version_under15p0 = _palv < Version("15.0.0")
pa_version_under16p0 = _palv < Version("16.0.0")
pa_version_under17p0 = _palv < Version("17.0.0")
pa_version_under18p0 = _palv < Version("18.0.0")
HAS_PYARROW = True
except ImportError:
pa_version_under10p1 = True
@@ -28,4 +29,5 @@
pa_version_under15p0 = True
pa_version_under16p0 = True
pa_version_under17p0 = True
pa_version_under18p0 = False
HAS_PYARROW = False
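
A minimal sketch of how such a version flag is typically consumed (illustrative):

    from pandas.compat import pa_version_under18p0

    if not pa_version_under18p0:
        # take a code path that relies on pyarrow >= 18.0.0
        ...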
20 changes: 19 additions & 1 deletion pandas/core/arrays/arrow/array.py
@@ -1633,7 +1633,11 @@ def _accumulate(
else:
data_to_accum = data_to_accum.cast(pa.int64())

result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
try:
result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
except pa.ArrowNotImplementedError as err:
msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
raise TypeError(msg) from err

if convert_to_int:
result = result.cast(pa_dtype)
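
Roughly, the effect of wrapping the pyarrow error (illustrative; the exact message depends on the dtype and operation):

    import pandas as pd

    ser = pd.Series(["a", "b"], dtype="string[pyarrow]")
    try:
        ser.cumprod()  # pyarrow has no string cumprod kernel
    except TypeError as err:
        # previously this surfaced as pyarrow's ArrowNotImplementedError
        print(err)  # e.g. "operation 'cumprod' not supported for dtype ..."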
@@ -2285,6 +2289,20 @@ def _groupby_op(
**kwargs,
):
if isinstance(self.dtype, StringDtype):
if how in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
)
return super()._groupby_op(
how=how,
has_dropped_na=has_dropped_na,
14 changes: 14 additions & 0 deletions pandas/core/arrays/base.py
@@ -2369,6 +2369,20 @@ def _groupby_op(
# GH#43682
if isinstance(self.dtype, StringDtype):
# StringArray
if op.how in [
"prod",
"mean",
"median",
"cumsum",
"cumprod",
"std",
"sem",
"var",
"skew",
]:
raise TypeError(
f"dtype '{self.dtype}' does not support operation '{how}'"
)
if op.how not in ["any", "all"]:
# Fail early to avoid conversion to object
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
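
A sketch of the behavior added by this blocklist, which applies to both the pyarrow-backed path above and this numpy-backed path (illustrative):

    import pandas as pd

    df = pd.DataFrame(
        {"key": ["x", "x", "y"], "val": pd.array(["a", "b", "c"], dtype="string")}
    )
    try:
        df.groupby("key")["val"].mean()  # mean is one of the blocked ops
    except TypeError as err:
        print(err)  # expected: dtype 'string' does not support operation 'mean'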
45 changes: 26 additions & 19 deletions pandas/core/arrays/string_.py
@@ -726,20 +726,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]:

return arr, self.dtype.na_value

def __setitem__(self, key, value) -> None:
value = extract_array(value, extract_numpy=True)
if isinstance(value, type(self)):
# extract_array doesn't extract NumpyExtensionArray subclasses
value = value._ndarray

key = check_array_indexer(self, key)
scalar_key = lib.is_scalar(key)
scalar_value = lib.is_scalar(value)
if scalar_key and not scalar_value:
raise ValueError("setting an array element with a sequence.")

# validate new items
if scalar_value:
def _maybe_convert_setitem_value(self, value):
"""Maybe convert value to be pyarrow compatible."""
if lib.is_scalar(value):
if isna(value):
value = self.dtype.na_value
elif not isinstance(value, str):
Expand All @@ -749,8 +738,11 @@ def __setitem__(self, key, value) -> None:
"instead."
)
else:
value = extract_array(value, extract_numpy=True)
if not is_array_like(value):
value = np.asarray(value, dtype=object)
elif isinstance(value.dtype, type(self.dtype)):
return value
else:
# cast categories and friends to arrays to see if values are
# compatible, compatibility with arrow backed strings
Expand All @@ -760,11 +752,26 @@ def __setitem__(self, key, value) -> None:
"Invalid value for dtype 'str'. Value should be a "
"string or missing value (or array of those)."
)
return value

mask = isna(value)
if mask.any():
value = value.copy()
value[isna(value)] = self.dtype.na_value
def __setitem__(self, key, value) -> None:
value = self._maybe_convert_setitem_value(value)

key = check_array_indexer(self, key)
scalar_key = lib.is_scalar(key)
scalar_value = lib.is_scalar(value)
if scalar_key and not scalar_value:
raise ValueError("setting an array element with a sequence.")

if not scalar_value:
if value.dtype == self.dtype:
value = value._ndarray
else:
value = np.asarray(value)
mask = isna(value)
if mask.any():
value = value.copy()
value[isna(value)] = self.dtype.na_value

super().__setitem__(key, value)

@@ -846,7 +853,7 @@ def _reduce(
else:
return nanops.nanall(self._ndarray, skipna=skipna)

if name in ["min", "max", "sum"]:
if name in ["min", "max", "argmin", "argmax", "sum"]:
result = getattr(self, name)(skipna=skipna, axis=axis, **kwargs)
if keepdims:
return self._from_sequence([result], dtype=self.dtype)
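
A short sketch of the validation now centralized in ``_maybe_convert_setitem_value`` (illustrative):

    import pandas as pd

    ser = pd.Series(["a", "b", "c"], dtype="string")
    ser[0] = None              # missing values map to the dtype's na_value
    ser[[1, 2]] = ["x", None]  # array values go through the same validation
    try:
        ser[0] = 1             # non-string scalars are rejected up front
    except TypeError as err:
        print(err)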
20 changes: 15 additions & 5 deletions pandas/core/dtypes/cast.py
@@ -87,8 +87,8 @@

if TYPE_CHECKING:
from collections.abc import (
Collection,
Sequence,
Sized,
)

from pandas._typing import (
@@ -1163,6 +1163,7 @@ def convert_dtypes(

def maybe_infer_to_datetimelike(
value: npt.NDArray[np.object_],
convert_to_nullable_dtype: bool = False,
) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray:
"""
we might have a array (or single object) that is datetime like,
Expand Down Expand Up @@ -1200,6 +1201,7 @@ def maybe_infer_to_datetimelike(
# numpy would have done it for us.
convert_numeric=False,
convert_non_numeric=True,
convert_to_nullable_dtype=convert_to_nullable_dtype,
dtype_if_all_nat=np.dtype("M8[ns]"),
)

@@ -1584,7 +1586,7 @@ def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj):
return _maybe_unbox_datetimelike(value, dtype)


def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray:
"""
Transform any list-like object in a 1-dimensional numpy array of object
dtype.
@@ -1602,10 +1604,11 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
-------
1-dimensional numpy array of dtype object
"""
# numpy will try to interpret nested lists as further dimensions, hence
# making a 1D array that contains list-likes is a bit tricky:
# numpy will try to interpret nested lists as further dimensions in np.array(),
# hence explicitly create a 1D object array and fill it element by element
result = np.empty(len(values), dtype="object")
result[:] = values
for i, obj in enumerate(values):
result[i] = obj
return result


@@ -1754,6 +1757,13 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool:
except (ValueError, TypeError):
return False

if dtype == "string":
try:
arr._maybe_convert_setitem_value(element) # type: ignore[union-attr]
return True
except (ValueError, TypeError):
return False

# This is technically incorrect, but maintains the behavior of
# ExtensionBlock._can_hold_element
return True
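
A quick sketch of the rewritten helper (illustrative; the function lives in the internal ``pandas.core.dtypes.cast`` module):

    from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike

    # filling element by element keeps nested list-likes as elements of a
    # 1D object array instead of letting numpy treat them as extra dimensions
    arr = construct_1d_object_array_from_listlike([[1, 2], [3, 4]])
    print(arr.shape)  # expected: (2,)
    print(arr[0])     # expected: [1, 2]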
4 changes: 2 additions & 2 deletions pandas/core/groupby/groupby.py
@@ -4394,9 +4394,9 @@ def quantile(
starts, ends = lib.generate_slices(splitter._slabels, splitter.ngroups)

def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
if is_object_dtype(vals.dtype):
if isinstance(vals.dtype, StringDtype) or is_object_dtype(vals.dtype):
raise TypeError(
"'quantile' cannot be performed against 'object' dtypes!"
f"dtype '{vals.dtype}' does not support operation 'quantile'"
)

inference: DtypeObj | None = None
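
The quantile pre-processor change mirrors the groupby blocklist above; roughly (illustrative):

    import pandas as pd

    df = pd.DataFrame({"key": [1, 1], "val": pd.array(["a", "b"], dtype="string")})
    try:
        df.groupby("key")["val"].quantile()
    except TypeError as err:
        print(err)  # expected: dtype 'string' does not support operation 'quantile'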
14 changes: 12 additions & 2 deletions pandas/core/indexes/base.py
@@ -884,6 +884,8 @@ def _engine(
# error: Item "ExtensionArray" of "Union[ExtensionArray,
# ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr]
target_values = self._data._ndarray # type: ignore[union-attr]
elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype):
return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr]

# error: Argument 1 to "ExtensionEngine" has incompatible type
# "ndarray[Any, Any]"; expected "ExtensionArray"
@@ -6133,7 +6135,6 @@ def _should_fallback_to_positional(self) -> bool:
def get_indexer_non_unique(
self, target
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
target = ensure_index(target)
target = self._maybe_cast_listlike_indexer(target)

if not self._should_compare(target) and not self._should_partial_index(target):
@@ -6695,7 +6696,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index:
"""
Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
"""
return ensure_index(target)
target_index = ensure_index(target)
if (
not hasattr(target, "dtype")
and self.dtype == object
and target_index.dtype == "string"
):
# If we started with a list-like, avoid inference to string dtype if self
# is object dtype (coercing to string dtype will alter the missing values)
target_index = Index(target, dtype=self.dtype)
return target_index

@final
def _validate_indexer(
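
A sketch of the ``_maybe_cast_listlike_indexer`` fix (GH 55834) in action (illustrative; assumes the ``future.infer_string`` option exists in this build):

    import pandas as pd

    pd.set_option("future.infer_string", True)
    idx = pd.Index(["a", "b", None], dtype=object)
    # the list-like target is no longer inferred to string dtype when self
    # is object dtype, so the missing value survives the round trip
    print(idx.get_indexer(["a", None]))  # expected: [0 2]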
(diff truncated: remaining changed files not shown)
