Commit

Merge branch 'main' into issue-37210-to-sql-truncate
gmcrocetti authored Dec 16, 2024
2 parents 7533db8 + d41884b commit 9a96fa0
Showing 37 changed files with 389 additions and 169 deletions.
1 change: 0 additions & 1 deletion .circleci/config.yml
@@ -34,7 +34,6 @@ jobs:
fi
python -m pip install --no-build-isolation -ve . -Csetup-args="--werror"
PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
ci/run_tests.sh
test-linux-musl:
docker:
5 changes: 0 additions & 5 deletions ci/code_checks.sh
@@ -81,10 +81,8 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Timestamp.resolution PR02" \
-i "pandas.Timestamp.tzinfo GL08" \
-i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
-i "pandas.arrays.IntervalArray.length SA01" \
-i "pandas.arrays.NumpyExtensionArray SA01" \
-i "pandas.arrays.TimedeltaArray PR07,SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
-i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
-i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
-i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \
@@ -95,9 +93,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.core.resample.Resampler.std SA01" \
-i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
-i "pandas.core.resample.Resampler.var SA01" \
-i "pandas.errors.NullFrequencyError SA01" \
-i "pandas.errors.NumbaUtilError SA01" \
-i "pandas.errors.PerformanceWarning SA01" \
-i "pandas.errors.UndefinedVariableError PR01,SA01" \
-i "pandas.errors.ValueLabelTypeMismatch SA01" \
-i "pandas.io.json.build_table_schema PR07,RT03,SA01" \
2 changes: 0 additions & 2 deletions doc/source/reference/frame.rst
@@ -185,7 +185,6 @@ Reindexing / selection / label manipulation
DataFrame.duplicated
DataFrame.equals
DataFrame.filter
DataFrame.head
DataFrame.idxmax
DataFrame.idxmin
DataFrame.reindex
@@ -196,7 +195,6 @@
DataFrame.sample
DataFrame.set_axis
DataFrame.set_index
DataFrame.tail
DataFrame.take
DataFrame.truncate

3 changes: 3 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
@@ -56,6 +56,7 @@ Other enhancements
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
@@ -627,6 +628,7 @@ Datetimelike
- Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`)
- Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`)
- Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`)
- Bug in :meth:`to_datetime` where a ``float32`` :class:`DataFrame` with year, month, day, etc. columns caused precision loss and incorrect results. (:issue:`60506`)
- Bug in :meth:`to_datetime` reporting an incorrect index when parsing fails. (:issue:`58298`)
- Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`)
- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)
@@ -799,6 +801,7 @@ Other
- Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
- Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`)
- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)

.. ***DO NOT USE THIS SECTION***
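
The two enhancement entries above for :meth:`Rolling.agg` NamedAgg support (:issue:`28333`) and :meth:`Series.map` keyword forwarding (:issue:`59814`) are easiest to see in code. A minimal sketch, assuming a pandas build that includes these 3.0 changes (names and values are illustrative):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3, 4]})

    # NamedAgg aggregations passed through **kwargs on a rolling window;
    # each keyword becomes a column in the result.
    rolled = df.rolling(2).agg(total=pd.NamedAgg(column="a", aggfunc="sum"))

    # Series.map now forwards extra keyword arguments to the mapped function.
    scaled = df["a"].map(lambda x, factor: x * factor, factor=10)
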
24 changes: 20 additions & 4 deletions pandas/core/arrays/arrow/accessors.py
@@ -117,7 +117,10 @@ def len(self) -> Series:

value_lengths = pc.list_value_length(self._pa_array)
return Series(
value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index
value_lengths,
dtype=ArrowDtype(value_lengths.type),
index=self._data.index,
name=self._data.name,
)

def __getitem__(self, key: int | slice) -> Series:
@@ -162,7 +165,10 @@ def __getitem__(self, key: int | slice) -> Series:
# key = pc.add(key, pc.list_value_length(self._pa_array))
element = pc.list_element(self._pa_array, key)
return Series(
element, dtype=ArrowDtype(element.type), index=self._data.index
element,
dtype=ArrowDtype(element.type),
index=self._data.index,
name=self._data.name,
)
elif isinstance(key, slice):
if pa_version_under11p0:
@@ -181,7 +187,12 @@ def __getitem__(self, key: int | slice) -> Series:
if step is None:
step = 1
sliced = pc.list_slice(self._pa_array, start, stop, step)
return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index)
return Series(
sliced,
dtype=ArrowDtype(sliced.type),
index=self._data.index,
name=self._data.name,
)
else:
raise ValueError(f"key must be an int or slice, got {type(key).__name__}")

@@ -223,7 +234,12 @@ def flatten(self) -> Series:
counts = pa.compute.list_value_length(self._pa_array)
flattened = pa.compute.list_flatten(self._pa_array)
index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type)))
return Series(flattened, dtype=ArrowDtype(flattened.type), index=index)
return Series(
flattened,
dtype=ArrowDtype(flattened.type),
index=index,
name=self._data.name,
)


class StructAccessor(ArrowAccessor):
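
The accessor changes above thread the parent Series ``name`` through every Series these list methods return. A minimal sketch of the effect, assuming pyarrow is installed (before this change the result's name was dropped):

    import pandas as pd
    import pyarrow as pa

    s = pd.Series(
        [[1, 2, 3], [4]],
        name="values",
        dtype=pd.ArrowDtype(pa.list_(pa.int64())),
    )

    # Each result now carries the original name, "values".
    s.list.len().name      # "values"
    s.list[0].name         # "values"
    s.list.flatten().name  # "values"
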
14 changes: 14 additions & 0 deletions pandas/core/arrays/interval.py
@@ -1306,6 +1306,20 @@ def length(self) -> Index:
"""
Return an Index with entries denoting the length of each Interval.
The length of an interval is calculated as the difference between
its `right` and `left` bounds. This property is particularly useful
when working with intervals where the size of the interval is an important
attribute, such as in time-series analysis or spatial data analysis.
See Also
--------
arrays.IntervalArray.left : Return the left endpoints of each Interval in
the IntervalArray as an Index.
arrays.IntervalArray.right : Return the right endpoints of each Interval in
the IntervalArray as an Index.
arrays.IntervalArray.mid : Return the midpoint of each Interval in the
IntervalArray as an Index.
Examples
--------
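
The docstring's examples section is truncated in this view; a short illustration of the ``length`` property it documents (``right`` minus ``left`` for each interval):

    import pandas as pd

    arr = pd.arrays.IntervalArray.from_breaks([0, 1, 3, 6])
    arr.length  # Index([1, 2, 3], dtype='int64')
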
73 changes: 38 additions & 35 deletions pandas/core/computation/expressions.py
@@ -65,23 +65,23 @@ def set_numexpr_threads(n=None) -> None:
ne.set_num_threads(n)


def _evaluate_standard(op, op_str, a, b):
def _evaluate_standard(op, op_str, left_op, right_op):
"""
Standard evaluation.
"""
if _TEST_MODE:
_store_test_result(False)
return op(a, b)
return op(left_op, right_op)


def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
"""return a boolean if we WILL be using numexpr"""
def _can_use_numexpr(op, op_str, left_op, right_op, dtype_check) -> bool:
"""return left_op boolean if we WILL be using numexpr"""
if op_str is not None:
# required min elements (otherwise we are adding overhead)
if a.size > _MIN_ELEMENTS:
if left_op.size > _MIN_ELEMENTS:
# check for dtype compatibility
dtypes: set[str] = set()
for o in [a, b]:
for o in [left_op, right_op]:
# ndarray and Series Case
if hasattr(o, "dtype"):
dtypes |= {o.dtype.name}
@@ -93,43 +93,43 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
return False


def _evaluate_numexpr(op, op_str, a, b):
def _evaluate_numexpr(op, op_str, left_op, right_op):
result = None

if _can_use_numexpr(op, op_str, a, b, "evaluate"):
if _can_use_numexpr(op, op_str, left_op, right_op, "evaluate"):
is_reversed = op.__name__.strip("_").startswith("r")
if is_reversed:
# we were originally called by a reversed op method
a, b = b, a
left_op, right_op = right_op, left_op

a_value = a
b_value = b
left_value = left_op
right_value = right_op

try:
result = ne.evaluate(
f"a_value {op_str} b_value",
local_dict={"a_value": a_value, "b_value": b_value},
f"left_value {op_str} right_value",
local_dict={"left_value": left_value, "right_value": right_value},
casting="safe",
)
except TypeError:
# numexpr raises eg for array ** array with integers
# (https://github.com/pydata/numexpr/issues/379)
pass
except NotImplementedError:
if _bool_arith_fallback(op_str, a, b):
if _bool_arith_fallback(op_str, left_op, right_op):
pass
else:
raise

if is_reversed:
# reverse order to original for fallback
a, b = b, a
left_op, right_op = right_op, left_op

if _TEST_MODE:
_store_test_result(result is not None)

if result is None:
result = _evaluate_standard(op, op_str, a, b)
result = _evaluate_standard(op, op_str, left_op, right_op)

return result

@@ -170,24 +170,24 @@ def _evaluate_numexpr(op, op_str, a, b):
}


def _where_standard(cond, a, b):
def _where_standard(cond, left_op, right_op):
# Caller is responsible for extracting ndarray if necessary
return np.where(cond, a, b)
return np.where(cond, left_op, right_op)


def _where_numexpr(cond, a, b):
def _where_numexpr(cond, left_op, right_op):
# Caller is responsible for extracting ndarray if necessary
result = None

if _can_use_numexpr(None, "where", a, b, "where"):
if _can_use_numexpr(None, "where", left_op, right_op, "where"):
result = ne.evaluate(
"where(cond_value, a_value, b_value)",
local_dict={"cond_value": cond, "a_value": a, "b_value": b},
local_dict={"cond_value": cond, "a_value": left_op, "b_value": right_op},
casting="safe",
)

if result is None:
result = _where_standard(cond, a, b)
result = _where_standard(cond, left_op, right_op)

return result

@@ -206,13 +206,13 @@ def _has_bool_dtype(x):
_BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"}


def _bool_arith_fallback(op_str, a, b) -> bool:
def _bool_arith_fallback(op_str, left_op, right_op) -> bool:
"""
Check if we should fallback to the python `_evaluate_standard` in case
of an unsupported operation by numexpr, which is the case for some
boolean ops.
"""
if _has_bool_dtype(a) and _has_bool_dtype(b):
if _has_bool_dtype(left_op) and _has_bool_dtype(right_op):
if op_str in _BOOL_OP_UNSUPPORTED:
warnings.warn(
f"evaluating in Python space because the {op_str!r} "
@@ -224,40 +224,43 @@ def _bool_arith_fallback(op_str, a, b) -> bool:
return False


def evaluate(op, a, b, use_numexpr: bool = True):
def evaluate(op, left_op, right_op, use_numexpr: bool = True):
"""
Evaluate and return the expression of the op on a and b.
Evaluate and return the expression of the op on left_op and right_op.
Parameters
----------
op : the actual operand
a : left operand
b : right operand
left_op : left operand
right_op : right operand
use_numexpr : bool, default True
Whether to try to use numexpr.
"""
op_str = _op_str_mapping[op]
if op_str is not None:
if use_numexpr:
# error: "None" not callable
return _evaluate(op, op_str, a, b) # type: ignore[misc]
return _evaluate_standard(op, op_str, a, b)
return _evaluate(op, op_str, left_op, right_op) # type: ignore[misc]
return _evaluate_standard(op, op_str, left_op, right_op)


def where(cond, a, b, use_numexpr: bool = True):
def where(cond, left_op, right_op, use_numexpr: bool = True):
"""
Evaluate the where condition cond on a and b.
Evaluate the where condition cond on left_op and right_op.
Parameters
----------
cond : np.ndarray[bool]
a : return if cond is True
b : return if cond is False
left_op : return if cond is True
right_op : return if cond is False
use_numexpr : bool, default True
Whether to try to use numexpr.
"""
assert _where is not None
return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b)
if use_numexpr:
return _where(cond, left_op, right_op)
else:
return _where_standard(cond, left_op, right_op)


def set_test_mode(v: bool = True) -> None:
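
``pandas.core.computation.expressions`` is internal, but a rough usage sketch clarifies what the renamed ``evaluate`` and ``where`` entry points do: route large-array operations through numexpr when it is available and fall back to plain Python operators otherwise. Illustration only, against private APIs that may change:

    import operator

    import numpy as np

    from pandas.core.computation import expressions as expr

    left = np.arange(1_000_000, dtype="float64")
    right = np.ones_like(left)

    # Dispatches to numexpr once the arrays exceed _MIN_ELEMENTS,
    # otherwise falls back to _evaluate_standard (operator.add here).
    total = expr.evaluate(operator.add, left, right)

    # where() follows the same dispatch pattern for conditional selection.
    picked = expr.where(left > 500_000, left, right)
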
(Diffs for the remaining 30 of the 37 changed files are not shown.)
