Merge branch 'main' into fix-59242

pandas-dev · Dec 13, 2024 · 6ea5785 · 6ea5785
2 parents 3d83bab + c52846f
commit 6ea5785
Show file tree

Hide file tree

Showing 24 changed files with 240 additions and 86 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -34,7 +34,6 @@ jobs:
             fi
             python -m pip install --no-build-isolation -ve . -Csetup-args="--werror"
             PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
-            sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
             ci/run_tests.sh
   test-linux-musl:
     docker:

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -84,7 +84,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.arrays.IntervalArray.length SA01" \
         -i "pandas.arrays.NumpyExtensionArray SA01" \
         -i "pandas.arrays.TimedeltaArray PR07,SA01" \
-        -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
         -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
         -i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \
@@ -95,9 +94,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.resample.Resampler.std SA01" \
         -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
         -i "pandas.core.resample.Resampler.var SA01" \
-        -i "pandas.errors.NullFrequencyError SA01" \
-        -i "pandas.errors.NumbaUtilError SA01" \
-        -i "pandas.errors.PerformanceWarning SA01" \
         -i "pandas.errors.UndefinedVariableError PR01,SA01" \
         -i "pandas.errors.ValueLabelTypeMismatch SA01" \
         -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \

diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
@@ -185,7 +185,6 @@ Reindexing / selection / label manipulation
    DataFrame.duplicated
    DataFrame.equals
    DataFrame.filter
-   DataFrame.head
    DataFrame.idxmax
    DataFrame.idxmin
    DataFrame.reindex
@@ -196,7 +195,6 @@ Reindexing / selection / label manipulation
    DataFrame.sample
    DataFrame.set_axis
    DataFrame.set_index
-   DataFrame.tail
    DataFrame.take
    DataFrame.truncate
 

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -626,6 +626,7 @@ Datetimelike
 - Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`)
 - Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`)
 - Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`)
+- Bug in :meth:`to_datetime` where a float32 :class:`DataFrame` with year, month, day, etc. columns led to precision issues and incorrect results. (:issue:`60506`)
 - Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`)
 - Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`)
 - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)
@@ -798,6 +799,7 @@ Other
 - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
 - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
 - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
+- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`)
 - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
 
 .. ***DO NOT USE THIS SECTION***

diff --git a/pandas/core/arrays/arrow/accessors.py b/pandas/core/arrays/arrow/accessors.py
@@ -117,7 +117,10 @@ def len(self) -> Series:
 
         value_lengths = pc.list_value_length(self._pa_array)
         return Series(
-            value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index
+            value_lengths,
+            dtype=ArrowDtype(value_lengths.type),
+            index=self._data.index,
+            name=self._data.name,
         )
 
     def __getitem__(self, key: int | slice) -> Series:
@@ -162,7 +165,10 @@ def __getitem__(self, key: int | slice) -> Series:
             #     key = pc.add(key, pc.list_value_length(self._pa_array))
             element = pc.list_element(self._pa_array, key)
             return Series(
-                element, dtype=ArrowDtype(element.type), index=self._data.index
+                element,
+                dtype=ArrowDtype(element.type),
+                index=self._data.index,
+                name=self._data.name,
             )
         elif isinstance(key, slice):
             if pa_version_under11p0:
@@ -181,7 +187,12 @@ def __getitem__(self, key: int | slice) -> Series:
             if step is None:
                 step = 1
             sliced = pc.list_slice(self._pa_array, start, stop, step)
-            return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index)
+            return Series(
+                sliced,
+                dtype=ArrowDtype(sliced.type),
+                index=self._data.index,
+                name=self._data.name,
+            )
         else:
             raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
 
@@ -223,7 +234,12 @@ def flatten(self) -> Series:
         counts = pa.compute.list_value_length(self._pa_array)
         flattened = pa.compute.list_flatten(self._pa_array)
         index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type)))
-        return Series(flattened, dtype=ArrowDtype(flattened.type), index=index)
+        return Series(
+            flattened,
+            dtype=ArrowDtype(flattened.type),
+            index=index,
+            name=self._data.name,
+        )
 
 
 class StructAccessor(ArrowAccessor):

diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py
@@ -65,23 +65,23 @@ def set_numexpr_threads(n=None) -> None:
         ne.set_num_threads(n)
 
 
-def _evaluate_standard(op, op_str, a, b):
+def _evaluate_standard(op, op_str, left_op, right_op):
     """
     Standard evaluation.
     """
     if _TEST_MODE:
         _store_test_result(False)
-    return op(a, b)
+    return op(left_op, right_op)
 
 
-def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
-    """return a boolean if we WILL be using numexpr"""
+def _can_use_numexpr(op, op_str, left_op, right_op, dtype_check) -> bool:
+    """return a boolean if we WILL be using numexpr"""
     if op_str is not None:
         # required min elements (otherwise we are adding overhead)
-        if a.size > _MIN_ELEMENTS:
+        if left_op.size > _MIN_ELEMENTS:
             # check for dtype compatibility
             dtypes: set[str] = set()
-            for o in [a, b]:
+            for o in [left_op, right_op]:
                 # ndarray and Series Case
                 if hasattr(o, "dtype"):
                     dtypes |= {o.dtype.name}
@@ -93,43 +93,43 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
     return False
 
 
-def _evaluate_numexpr(op, op_str, a, b):
+def _evaluate_numexpr(op, op_str, left_op, right_op):
     result = None
 
-    if _can_use_numexpr(op, op_str, a, b, "evaluate"):
+    if _can_use_numexpr(op, op_str, left_op, right_op, "evaluate"):
         is_reversed = op.__name__.strip("_").startswith("r")
         if is_reversed:
             # we were originally called by a reversed op method
-            a, b = b, a
+            left_op, right_op = right_op, left_op
 
-        a_value = a
-        b_value = b
+        left_value = left_op
+        right_value = right_op
 
         try:
             result = ne.evaluate(
-                f"a_value {op_str} b_value",
-                local_dict={"a_value": a_value, "b_value": b_value},
+                f"left_value {op_str} right_value",
+                local_dict={"left_value": left_value, "right_value": right_value},
                 casting="safe",
             )
         except TypeError:
             # numexpr raises eg for array ** array with integers
             # (https://github.com/pydata/numexpr/issues/379)
             pass
         except NotImplementedError:
-            if _bool_arith_fallback(op_str, a, b):
+            if _bool_arith_fallback(op_str, left_op, right_op):
                 pass
             else:
                 raise
 
         if is_reversed:
             # reverse order to original for fallback
-            a, b = b, a
+            left_op, right_op = right_op, left_op
 
     if _TEST_MODE:
         _store_test_result(result is not None)
 
     if result is None:
-        result = _evaluate_standard(op, op_str, a, b)
+        result = _evaluate_standard(op, op_str, left_op, right_op)
 
     return result
 
@@ -170,24 +170,24 @@ def _evaluate_numexpr(op, op_str, a, b):
 }
 
 
-def _where_standard(cond, a, b):
+def _where_standard(cond, left_op, right_op):
     # Caller is responsible for extracting ndarray if necessary
-    return np.where(cond, a, b)
+    return np.where(cond, left_op, right_op)
 
 
-def _where_numexpr(cond, a, b):
+def _where_numexpr(cond, left_op, right_op):
     # Caller is responsible for extracting ndarray if necessary
     result = None
 
-    if _can_use_numexpr(None, "where", a, b, "where"):
+    if _can_use_numexpr(None, "where", left_op, right_op, "where"):
         result = ne.evaluate(
             "where(cond_value, a_value, b_value)",
-            local_dict={"cond_value": cond, "a_value": a, "b_value": b},
+            local_dict={"cond_value": cond, "a_value": left_op, "b_value": right_op},
             casting="safe",
         )
 
     if result is None:
-        result = _where_standard(cond, a, b)
+        result = _where_standard(cond, left_op, right_op)
 
     return result
 
@@ -206,13 +206,13 @@ def _has_bool_dtype(x):
 _BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"}
 
 
-def _bool_arith_fallback(op_str, a, b) -> bool:
+def _bool_arith_fallback(op_str, left_op, right_op) -> bool:
     """
     Check if we should fallback to the python `_evaluate_standard` in case
     of an unsupported operation by numexpr, which is the case for some
     boolean ops.
     """
-    if _has_bool_dtype(a) and _has_bool_dtype(b):
+    if _has_bool_dtype(left_op) and _has_bool_dtype(right_op):
         if op_str in _BOOL_OP_UNSUPPORTED:
             warnings.warn(
                 f"evaluating in Python space because the {op_str!r} "
@@ -224,40 +224,43 @@ def _bool_arith_fallback(op_str, a, b) -> bool:
     return False
 
 
-def evaluate(op, a, b, use_numexpr: bool = True):
+def evaluate(op, left_op, right_op, use_numexpr: bool = True):
     """
-    Evaluate and return the expression of the op on a and b.
+    Evaluate and return the expression of the op on left_op and right_op.
 
     Parameters
     ----------
     op : the actual operator
-    a : left operand
-    b : right operand
+    left_op : left operand
+    right_op : right operand
     use_numexpr : bool, default True
         Whether to try to use numexpr.
     """
     op_str = _op_str_mapping[op]
     if op_str is not None:
         if use_numexpr:
             # error: "None" not callable
-            return _evaluate(op, op_str, a, b)  # type: ignore[misc]
-    return _evaluate_standard(op, op_str, a, b)
+            return _evaluate(op, op_str, left_op, right_op)  # type: ignore[misc]
+    return _evaluate_standard(op, op_str, left_op, right_op)
 
 
-def where(cond, a, b, use_numexpr: bool = True):
+def where(cond, left_op, right_op, use_numexpr: bool = True):
     """
-    Evaluate the where condition cond on a and b.
+    Evaluate the where condition cond on left_op and right_op.
 
     Parameters
     ----------
     cond : np.ndarray[bool]
-    a : return if cond is True
-    b : return if cond is False
+    left_op : return if cond is True
+    right_op : return if cond is False
     use_numexpr : bool, default True
         Whether to try to use numexpr.
     """
     assert _where is not None
-    return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b)
+    if use_numexpr:
+        return _where(cond, left_op, right_op)
+    else:
+        return _where_standard(cond, left_op, right_op)
 
 
 def set_test_mode(v: bool = True) -> None:

diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
@@ -205,7 +205,7 @@ def generate(self, v) -> str:
         val = v.tostring(self.encoding)
         return f"({self.lhs} {self.op} {val})"
 
-    def convert_value(self, v) -> TermValue:
+    def convert_value(self, conv_val) -> TermValue:
         """
         convert the expression that is in the term to something that is
         accepted by pytables
@@ -219,44 +219,44 @@ def stringify(value):
         kind = ensure_decoded(self.kind)
         meta = ensure_decoded(self.meta)
         if kind == "datetime" or (kind and kind.startswith("datetime64")):
-            if isinstance(v, (int, float)):
-                v = stringify(v)
-            v = ensure_decoded(v)
-            v = Timestamp(v).as_unit("ns")
-            if v.tz is not None:
-                v = v.tz_convert("UTC")
-            return TermValue(v, v._value, kind)
+            if isinstance(conv_val, (int, float)):
+                conv_val = stringify(conv_val)
+            conv_val = ensure_decoded(conv_val)
+            conv_val = Timestamp(conv_val).as_unit("ns")
+            if conv_val.tz is not None:
+                conv_val = conv_val.tz_convert("UTC")
+            return TermValue(conv_val, conv_val._value, kind)
         elif kind in ("timedelta64", "timedelta"):
-            if isinstance(v, str):
-                v = Timedelta(v)
+            if isinstance(conv_val, str):
+                conv_val = Timedelta(conv_val)
             else:
-                v = Timedelta(v, unit="s")
-            v = v.as_unit("ns")._value
-            return TermValue(int(v), v, kind)
+                conv_val = Timedelta(conv_val, unit="s")
+            conv_val = conv_val.as_unit("ns")._value
+            return TermValue(int(conv_val), conv_val, kind)
         elif meta == "category":
             metadata = extract_array(self.metadata, extract_numpy=True)
             result: npt.NDArray[np.intp] | np.intp | int
-            if v not in metadata:
+            if conv_val not in metadata:
                 result = -1
             else:
-                result = metadata.searchsorted(v, side="left")
+                result = metadata.searchsorted(conv_val, side="left")
             return TermValue(result, result, "integer")
         elif kind == "integer":
             try:
-                v_dec = Decimal(v)
+                v_dec = Decimal(conv_val)
             except InvalidOperation:
                 # GH 54186
                 # convert the value to float to raise float's ValueError
-                float(v)
+                float(conv_val)
             else:
-                v = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN"))
-            return TermValue(v, v, kind)
+                conv_val = int(v_dec.to_integral_exact(rounding="ROUND_HALF_EVEN"))
+            return TermValue(conv_val, conv_val, kind)
         elif kind == "float":
-            v = float(v)
-            return TermValue(v, v, kind)
+            conv_val = float(conv_val)
+            return TermValue(conv_val, conv_val, kind)
         elif kind == "bool":
-            if isinstance(v, str):
-                v = v.strip().lower() not in [
+            if isinstance(conv_val, str):
+                conv_val = conv_val.strip().lower() not in [
                     "false",
                     "f",
                     "no",
@@ -268,13 +268,15 @@ def stringify(value):
                     "",
                 ]
             else:
-                v = bool(v)
-            return TermValue(v, v, kind)
-        elif isinstance(v, str):
+                conv_val = bool(conv_val)
+            return TermValue(conv_val, conv_val, kind)
+        elif isinstance(conv_val, str):
             # string quoting
-            return TermValue(v, stringify(v), "string")
+            return TermValue(conv_val, stringify(conv_val), "string")
         else:
-            raise TypeError(f"Cannot compare {v} of type {type(v)} to {kind} column")
+            raise TypeError(
+                f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column"
+            )
 
     def convert_values(self) -> None:
         pass