Skip to content

Commit

Permalink
Merge branch 'main' into fix-59242
Browse files Browse the repository at this point in the history
  • Loading branch information
kastkeepitjumpinlikekangaroos authored Dec 13, 2024
2 parents 3d83bab + c52846f commit 6ea5785
Show file tree
Hide file tree
Showing 24 changed files with 240 additions and 86 deletions.
1 change: 0 additions & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ jobs:
fi
python -m pip install --no-build-isolation -ve . -Csetup-args="--werror"
PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
ci/run_tests.sh
test-linux-musl:
docker:
Expand Down
4 changes: 0 additions & 4 deletions ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.arrays.IntervalArray.length SA01" \
-i "pandas.arrays.NumpyExtensionArray SA01" \
-i "pandas.arrays.TimedeltaArray PR07,SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
-i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
-i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \
Expand All @@ -95,9 +94,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.core.resample.Resampler.std SA01" \
-i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
-i "pandas.core.resample.Resampler.var SA01" \
-i "pandas.errors.NullFrequencyError SA01" \
-i "pandas.errors.NumbaUtilError SA01" \
-i "pandas.errors.PerformanceWarning SA01" \
-i "pandas.errors.UndefinedVariableError PR01,SA01" \
-i "pandas.errors.ValueLabelTypeMismatch SA01" \
-i "pandas.io.json.build_table_schema PR07,RT03,SA01" \
Expand Down
2 changes: 0 additions & 2 deletions doc/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,6 @@ Reindexing / selection / label manipulation
DataFrame.duplicated
DataFrame.equals
DataFrame.filter
DataFrame.head
DataFrame.idxmax
DataFrame.idxmin
DataFrame.reindex
Expand All @@ -196,7 +195,6 @@ Reindexing / selection / label manipulation
DataFrame.sample
DataFrame.set_axis
DataFrame.set_index
DataFrame.tail
DataFrame.take
DataFrame.truncate

Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,7 @@ Datetimelike
- Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`)
- Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`)
- Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`)
- Bug in :meth:`to_datetime` where passing a float32 :class:`DataFrame` with year, month, day etc. columns led to precision issues and incorrect results. (:issue:`60506`)
- Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`)
- Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`)
- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)
Expand Down Expand Up @@ -798,6 +799,7 @@ Other
- Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
- Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`)
- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)

.. ***DO NOT USE THIS SECTION***
Expand Down
24 changes: 20 additions & 4 deletions pandas/core/arrays/arrow/accessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,10 @@ def len(self) -> Series:

value_lengths = pc.list_value_length(self._pa_array)
return Series(
value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index
value_lengths,
dtype=ArrowDtype(value_lengths.type),
index=self._data.index,
name=self._data.name,
)

def __getitem__(self, key: int | slice) -> Series:
Expand Down Expand Up @@ -162,7 +165,10 @@ def __getitem__(self, key: int | slice) -> Series:
# key = pc.add(key, pc.list_value_length(self._pa_array))
element = pc.list_element(self._pa_array, key)
return Series(
element, dtype=ArrowDtype(element.type), index=self._data.index
element,
dtype=ArrowDtype(element.type),
index=self._data.index,
name=self._data.name,
)
elif isinstance(key, slice):
if pa_version_under11p0:
Expand All @@ -181,7 +187,12 @@ def __getitem__(self, key: int | slice) -> Series:
if step is None:
step = 1
sliced = pc.list_slice(self._pa_array, start, stop, step)
return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index)
return Series(
sliced,
dtype=ArrowDtype(sliced.type),
index=self._data.index,
name=self._data.name,
)
else:
raise ValueError(f"key must be an int or slice, got {type(key).__name__}")

Expand Down Expand Up @@ -223,7 +234,12 @@ def flatten(self) -> Series:
counts = pa.compute.list_value_length(self._pa_array)
flattened = pa.compute.list_flatten(self._pa_array)
index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type)))
return Series(flattened, dtype=ArrowDtype(flattened.type), index=index)
return Series(
flattened,
dtype=ArrowDtype(flattened.type),
index=index,
name=self._data.name,
)


class StructAccessor(ArrowAccessor):
Expand Down
73 changes: 38 additions & 35 deletions pandas/core/computation/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,23 +65,23 @@ def set_numexpr_threads(n=None) -> None:
ne.set_num_threads(n)


def _evaluate_standard(op, op_str, a, b):
def _evaluate_standard(op, op_str, left_op, right_op):
"""
Standard evaluation.
"""
if _TEST_MODE:
_store_test_result(False)
return op(a, b)
return op(left_op, right_op)


def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
"""return a boolean if we WILL be using numexpr"""
def _can_use_numexpr(op, op_str, left_op, right_op, dtype_check) -> bool:
"""return a boolean if we WILL be using numexpr"""
if op_str is not None:
# required min elements (otherwise we are adding overhead)
if a.size > _MIN_ELEMENTS:
if left_op.size > _MIN_ELEMENTS:
# check for dtype compatibility
dtypes: set[str] = set()
for o in [a, b]:
for o in [left_op, right_op]:
# ndarray and Series Case
if hasattr(o, "dtype"):
dtypes |= {o.dtype.name}
Expand All @@ -93,43 +93,43 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
return False


def _evaluate_numexpr(op, op_str, a, b):
def _evaluate_numexpr(op, op_str, left_op, right_op):
result = None

if _can_use_numexpr(op, op_str, a, b, "evaluate"):
if _can_use_numexpr(op, op_str, left_op, right_op, "evaluate"):
is_reversed = op.__name__.strip("_").startswith("r")
if is_reversed:
# we were originally called by a reversed op method
a, b = b, a
left_op, right_op = right_op, left_op

a_value = a
b_value = b
left_value = left_op
right_value = right_op

try:
result = ne.evaluate(
f"a_value {op_str} b_value",
local_dict={"a_value": a_value, "b_value": b_value},
f"left_value {op_str} right_value",
local_dict={"left_value": left_value, "right_value": right_value},
casting="safe",
)
except TypeError:
# numexpr raises eg for array ** array with integers
# (https://github.com/pydata/numexpr/issues/379)
pass
except NotImplementedError:
if _bool_arith_fallback(op_str, a, b):
if _bool_arith_fallback(op_str, left_op, right_op):
pass
else:
raise

if is_reversed:
# reverse order to original for fallback
a, b = b, a
left_op, right_op = right_op, left_op

if _TEST_MODE:
_store_test_result(result is not None)

if result is None:
result = _evaluate_standard(op, op_str, a, b)
result = _evaluate_standard(op, op_str, left_op, right_op)

return result

Expand Down Expand Up @@ -170,24 +170,24 @@ def _evaluate_numexpr(op, op_str, a, b):
}


def _where_standard(cond, a, b):
def _where_standard(cond, left_op, right_op):
# Caller is responsible for extracting ndarray if necessary
return np.where(cond, a, b)
return np.where(cond, left_op, right_op)


def _where_numexpr(cond, a, b):
def _where_numexpr(cond, left_op, right_op):
# Caller is responsible for extracting ndarray if necessary
result = None

if _can_use_numexpr(None, "where", a, b, "where"):
if _can_use_numexpr(None, "where", left_op, right_op, "where"):
result = ne.evaluate(
"where(cond_value, a_value, b_value)",
local_dict={"cond_value": cond, "a_value": a, "b_value": b},
local_dict={"cond_value": cond, "a_value": left_op, "b_value": right_op},
casting="safe",
)

if result is None:
result = _where_standard(cond, a, b)
result = _where_standard(cond, left_op, right_op)

return result

Expand All @@ -206,13 +206,13 @@ def _has_bool_dtype(x):
_BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"}


def _bool_arith_fallback(op_str, a, b) -> bool:
def _bool_arith_fallback(op_str, left_op, right_op) -> bool:
"""
Check if we should fallback to the python `_evaluate_standard` in case
of an unsupported operation by numexpr, which is the case for some
boolean ops.
"""
if _has_bool_dtype(a) and _has_bool_dtype(b):
if _has_bool_dtype(left_op) and _has_bool_dtype(right_op):
if op_str in _BOOL_OP_UNSUPPORTED:
warnings.warn(
f"evaluating in Python space because the {op_str!r} "
Expand All @@ -224,40 +224,43 @@ def _bool_arith_fallback(op_str, a, b) -> bool:
return False


def evaluate(op, a, b, use_numexpr: bool = True):
def evaluate(op, left_op, right_op, use_numexpr: bool = True):
"""
Evaluate and return the expression of the op on a and b.
Evaluate and return the expression of the op on left_op and right_op.
Parameters
----------
op : the actual operator
a : left operand
b : right operand
left_op : left operand
right_op : right operand
use_numexpr : bool, default True
Whether to try to use numexpr.
"""
op_str = _op_str_mapping[op]
if op_str is not None:
if use_numexpr:
# error: "None" not callable
return _evaluate(op, op_str, a, b) # type: ignore[misc]
return _evaluate_standard(op, op_str, a, b)
return _evaluate(op, op_str, left_op, right_op) # type: ignore[misc]
return _evaluate_standard(op, op_str, left_op, right_op)


def where(cond, a, b, use_numexpr: bool = True):
def where(cond, left_op, right_op, use_numexpr: bool = True):
"""
Evaluate the where condition cond on a and b.
Evaluate the where condition cond on left_op and right_op.
Parameters
----------
cond : np.ndarray[bool]
a : return if cond is True
b : return if cond is False
left_op : return if cond is True
right_op : return if cond is False
use_numexpr : bool, default True
Whether to try to use numexpr.
"""
assert _where is not None
return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b)
if use_numexpr:
return _where(cond, left_op, right_op)
else:
return _where_standard(cond, left_op, right_op)


def set_test_mode(v: bool = True) -> None:
Expand Down
58 changes: 30 additions & 28 deletions pandas/core/computation/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def generate(self, v) -> str:
val = v.tostring(self.encoding)
return f"({self.lhs} {self.op} {val})"

def convert_value(self, v) -> TermValue:
def convert_value(self, conv_val) -> TermValue:
"""
convert the expression that is in the term to something that is
accepted by pytables
Expand All @@ -219,44 +219,44 @@ def stringify(value):
kind = ensure_decoded(self.kind)
meta = ensure_decoded(self.meta)
if kind == "datetime" or (kind and kind.startswith("datetime64")):
if isinstance(v, (int, float)):
v = stringify(v)
v = ensure_decoded(v)
v = Timestamp(v).as_unit("ns")
if v.tz is not None:
v = v.tz_convert("UTC")
return TermValue(v, v._value, kind)
if isinstance(conv_val, (int, float)):
conv_val = stringify(conv_val)
conv_val = ensure_decoded(conv_val)
conv_val = Timestamp(conv_val).as_unit("ns")
if conv_val.tz is not None:
conv_val = conv_val.tz_convert("UTC")
return TermValue(conv_val, conv_val._value, kind)
elif kind in ("timedelta64", "timedelta"):
if isinstance(v, str):
v = Timedelta(v)
if isinstance(conv_val, str):
conv_val = Timedelta(conv_val)
else:
v = Timedelta(v, unit="s")
v = v.as_unit("ns")._value
return TermValue(int(v), v, kind)
conv_val = Timedelta(conv_val, unit="s")
conv_val = conv_val.as_unit("ns")._value
return TermValue(int(conv_val), conv_val, kind)
elif meta == "category":
metadata = extract_array(self.metadata, extract_numpy=True)
result: npt.NDArray[np.intp] | np.intp | int
if v not in metadata:
if conv_val not in metadata:
result = -1
else:
result = metadata.searchsorted(v, side="left")
result = metadata.searchsorted(conv_val, side="left")
return TermValue(result, result, "integer")
elif kind == "integer":
try:
v_dec = Decimal(v)
v_dec = Decimal(conv_val)
except InvalidOperation:
# GH 54186
# convert conv_val to float to raise float's ValueError
float(v)
float(conv_val)
else:
v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN"))
return TermValue(v, v, kind)
conv_val = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN"))
return TermValue(conv_val, conv_val, kind)
elif kind == "float":
v = float(v)
return TermValue(v, v, kind)
conv_val = float(conv_val)
return TermValue(conv_val, conv_val, kind)
elif kind == "bool":
if isinstance(v, str):
v = v.strip().lower() not in [
if isinstance(conv_val, str):
conv_val = conv_val.strip().lower() not in [
"false",
"f",
"no",
Expand All @@ -268,13 +268,15 @@ def stringify(value):
"",
]
else:
v = bool(v)
return TermValue(v, v, kind)
elif isinstance(v, str):
conv_val = bool(conv_val)
return TermValue(conv_val, conv_val, kind)
elif isinstance(conv_val, str):
# string quoting
return TermValue(v, stringify(v), "string")
return TermValue(conv_val, stringify(conv_val), "string")
else:
raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column")
raise TypeError(
f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column"
)

def convert_values(self) -> None:
pass
Expand Down
Loading

0 comments on commit 6ea5785

Please sign in to comment.