Merge branch 'main' into lock-block-values-refs

pandas-dev · Dec 13, 2024 · commit 4417f0f
2 parents: e21c3c9 + 9501650
Show file tree

Hide file tree

Showing 20 changed files with 183 additions and 13 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -34,7 +34,6 @@ jobs:
             fi
             python -m pip install --no-build-isolation -ve . -Csetup-args="--werror"
             PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
-            sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
             ci/run_tests.sh
   test-linux-musl:
     docker:

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -81,7 +81,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timestamp.resolution PR02" \
         -i "pandas.Timestamp.tzinfo GL08" \
         -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
-        -i "pandas.arrays.IntervalArray.length SA01" \
         -i "pandas.arrays.NumpyExtensionArray SA01" \
         -i "pandas.arrays.TimedeltaArray PR07,SA01" \
         -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -56,6 +56,7 @@ Other enhancements
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
 - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
 - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
+- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
 - :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
 - :meth:`str.get_dummies` now accepts a  ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)

diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
@@ -1306,6 +1306,20 @@ def length(self) -> Index:
         """
         Return an Index with entries denoting the length of each Interval.
 
+        The length of an interval is calculated as the difference between
+        its `right` and `left` bounds. This property is particularly useful
+        when working with intervals where the size of the interval is an important
+        attribute, such as in time-series analysis or spatial data analysis.
+
+        See Also
+        --------
+        arrays.IntervalArray.left : Return the left endpoints of each Interval in
+            the IntervalArray as an Index.
+        arrays.IntervalArray.right : Return the right endpoints of each Interval in
+            the IntervalArray as an Index.
+        arrays.IntervalArray.mid : Return the midpoint of each Interval in the
+            IntervalArray as an Index.
+
         Examples
         --------
 

diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py
@@ -108,7 +108,7 @@ def _evaluate_numexpr(op, op_str, left_op, right_op):
         try:
             result = ne.evaluate(
                 f"left_value {op_str} right_value",
-                local_dict={"left_value": left_value, "right_value": right_op},
+                local_dict={"left_value": left_value, "right_value": right_value},
                 casting="safe",
             )
         except TypeError:
@@ -257,7 +257,10 @@ def where(cond, left_op, right_op, use_numexpr: bool = True):
         Whether to try to use numexpr.
     """
     assert _where is not None
-    return _where(cond, left_op, right_op) if use_numexpr else _where_standard(cond, left_op, right_op)
+    if use_numexpr:
+        return _where(cond, left_op, right_op)
+    else:
+        return _where_standard(cond, left_op, right_op)
 
 
 def set_test_mode(v: bool = True) -> None:

diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py
@@ -274,7 +274,9 @@ def stringify(value):
             # string quoting
             return TermValue(conv_val, stringify(conv_val), "string")
         else:
-            raise TypeError(f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column")
+            raise TypeError(
+                f"Cannot compare {conv_val} of type {type(conv_val)} to {kind} column"
+            )
 
     def convert_values(self) -> None:
         pass

diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py
@@ -490,7 +490,7 @@ def online(
         klass="Series/Dataframe",
         axis="",
     )
-    def aggregate(self, func, *args, **kwargs):
+    def aggregate(self, func=None, *args, **kwargs):
         return super().aggregate(func, *args, **kwargs)
 
     agg = aggregate
@@ -981,7 +981,7 @@ def reset(self) -> None:
         """
         self._mean.reset()
 
-    def aggregate(self, func, *args, **kwargs):
+    def aggregate(self, func=None, *args, **kwargs):
         raise NotImplementedError("aggregate is not implemented.")
 
     def std(self, bias: bool = False, *args, **kwargs):

diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
@@ -167,7 +167,7 @@ def _get_window_indexer(self) -> BaseIndexer:
         klass="Series/Dataframe",
         axis="",
     )
-    def aggregate(self, func, *args, **kwargs):
+    def aggregate(self, func=None, *args, **kwargs):
         return super().aggregate(func, *args, **kwargs)
 
     agg = aggregate

diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -44,7 +44,10 @@
 
 from pandas.core._numba import executor
 from pandas.core.algorithms import factorize
-from pandas.core.apply import ResamplerWindowApply
+from pandas.core.apply import (
+    ResamplerWindowApply,
+    reconstruct_func,
+)
 from pandas.core.arrays import ExtensionArray
 from pandas.core.base import SelectionMixin
 import pandas.core.common as com
@@ -646,8 +649,12 @@ def _numba_apply(
             out = obj._constructor(result, index=index, columns=columns)
             return self._resolve_output(out, obj)
 
-    def aggregate(self, func, *args, **kwargs):
+    def aggregate(self, func=None, *args, **kwargs):
+        relabeling, func, columns, order = reconstruct_func(func, **kwargs)
         result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
+        if isinstance(result, ABCDataFrame) and relabeling:
+            result = result.iloc[:, order]
+            result.columns = columns  # type: ignore[union-attr]
         if result is None:
             return self.apply(func, raw=False, args=args, kwargs=kwargs)
         return result
@@ -1239,7 +1246,7 @@ def calc(x):
         klass="Series/DataFrame",
         axis="",
     )
-    def aggregate(self, func, *args, **kwargs):
+    def aggregate(self, func=None, *args, **kwargs):
         result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
         if result is None:
             # these must apply directly
@@ -1951,7 +1958,7 @@ def _raise_monotonic_error(self, msg: str):
         klass="Series/Dataframe",
         axis="",
     )
-    def aggregate(self, func, *args, **kwargs):
+    def aggregate(self, func=None, *args, **kwargs):
         return super().aggregate(func, *args, **kwargs)
 
     agg = aggregate

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -1647,7 +1647,7 @@ def test_from_arrow_respecting_given_dtype():
 
 def test_from_arrow_respecting_given_dtype_unsafe():
     array = pa.array([1.5, 2.5], type=pa.float64())
-    with pytest.raises(pa.ArrowInvalid, match="Float value 1.5 was truncated"):
+    with tm.external_error_raised(pa.ArrowInvalid):
         array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get)
 
 

diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py
@@ -101,6 +101,31 @@ def test_fillna_limit_series(self, data_missing):
     def test_fillna_length_mismatch(self, data_missing):
         super().test_fillna_length_mismatch(data_missing)
 
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
+    def test_hash_pandas_object(self, data):
+        super().test_hash_pandas_object(data)
+
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
+    def test_hash_pandas_object_works(self, data, as_frame):
+        super().test_hash_pandas_object_works(data, as_frame)
+
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
+    @pytest.mark.parametrize("engine", ["c", "python"])
+    def test_EA_types(self, engine, data, request):
+        super().test_EA_types(engine, data, request)
+
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
+    def test_astype_str(self, data):
+        super().test_astype_str(data)
+
 
 # TODO: either belongs in tests.arrays.interval or move into base tests.
 def test_fillna_non_scalar_raises(data_missing):

diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pytest
 
 from pandas import (
     DataFrame,
@@ -31,6 +32,9 @@ def test_to_numpy_copy(self):
         #  and that can be respected because we are already numpy-float
         assert df.to_numpy(copy=False).base is df.values.base
 
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
     def test_to_numpy_mixed_dtype_to_str(self):
         # https://github.com/pandas-dev/pandas/issues/35455
         df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]])

diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -2404,6 +2404,9 @@ def test_construct_with_two_categoricalindex_series(self):
         )
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
     def test_constructor_series_nonexact_categoricalindex(self):
         # GH 42424
         ser = Series(range(100))

diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py
@@ -186,6 +186,12 @@ def test_subtype_datetimelike(self, index, subtype):
         with pytest.raises(TypeError, match=msg):
             index.astype(dtype)
 
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
+    def test_astype_category(self, index):
+        super().test_astype_category(index)
+
 
 class TestDatetimelikeSubtype(AstypeTests):
     """Tests specific to IntervalIndex with datetime-like subtype"""

diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py
@@ -59,6 +59,9 @@ def test_repr_floats(self):
         expected = "(329.973, 345.137]    1\n(345.137, 360.191]    2\ndtype: int64"
         assert result == expected
 
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
     @pytest.mark.parametrize(
         "tuples, closed, expected_data",
         [

diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py
@@ -340,6 +340,9 @@ def test_get_indexer_categorical(self, target, ordered):
         expected = index.get_indexer(target)
         tm.assert_numpy_array_equal(result, expected)
 
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
     def test_get_indexer_categorical_with_nans(self):
         # GH#41934 nans in both index and in target
         ii = IntervalIndex.from_breaks(range(5))

diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py
@@ -525,6 +525,7 @@ def test_intersection_difference_match_empty(self, index, sort):
         tm.assert_index_equal(inter, diff, exact=True)
 
 
+@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
 @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
 @pytest.mark.parametrize(
     "method", ["intersection", "union", "difference", "symmetric_difference"]

diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py
@@ -800,6 +800,9 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path):
         # we need to use df_expected to check the result.
         tm.assert_frame_equal(rs2, df_expected)
 
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
     def test_to_excel_interval_no_labels(self, tmp_excel, using_infer_string):
         # see gh-19242
         #

diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py
@@ -733,6 +733,7 @@ def test_cut_with_duplicated_index_lowest_included():
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
 def test_cut_with_nonexact_categorical_indices():
     # GH 42424
 

diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py
@@ -6,6 +6,7 @@
     DatetimeIndex,
     Index,
     MultiIndex,
+    NamedAgg,
     Series,
     Timestamp,
     date_range,
@@ -489,6 +490,36 @@ def test_groupby_rolling_subset_with_closed(self):
         )
         tm.assert_series_equal(result, expected)
 
+    def test_groupby_rolling_agg_namedagg(self):
+        # GH#28333
+        df = DataFrame(
+            {
+                "kind": ["cat", "dog", "cat", "dog", "cat", "dog"],
+                "height": [9.1, 6.0, 9.5, 34.0, 12.0, 8.0],
+                "weight": [7.9, 7.5, 9.9, 198.0, 10.0, 42.0],
+            }
+        )
+        result = (
+            df.groupby("kind")
+            .rolling(2)
+            .agg(
+                total_weight=NamedAgg(column="weight", aggfunc=sum),
+                min_height=NamedAgg(column="height", aggfunc=min),
+            )
+        )
+        expected = DataFrame(
+            {
+                "total_weight": [np.nan, 17.8, 19.9, np.nan, 205.5, 240.0],
+                "min_height": [np.nan, 9.1, 9.5, np.nan, 6.0, 8.0],
+            },
+            index=MultiIndex(
+                [["cat", "dog"], [0, 1, 2, 3, 4, 5]],
+                [[0, 0, 0, 1, 1, 1], [0, 2, 4, 1, 3, 5]],
+                names=["kind", None],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
     def test_groupby_subset_rolling_subset_with_closed(self):
         # GH 35549
         df = DataFrame(
@@ -1134,6 +1165,36 @@ def test_expanding_apply(self, raw, frame):
         expected.index = expected_index
         tm.assert_frame_equal(result, expected)
 
+    def test_groupby_expanding_agg_namedagg(self):
+        # GH#28333
+        df = DataFrame(
+            {
+                "kind": ["cat", "dog", "cat", "dog", "cat", "dog"],
+                "height": [9.1, 6.0, 9.5, 34.0, 12.0, 8.0],
+                "weight": [7.9, 7.5, 9.9, 198.0, 10.0, 42.0],
+            }
+        )
+        result = (
+            df.groupby("kind")
+            .expanding(1)
+            .agg(
+                total_weight=NamedAgg(column="weight", aggfunc=sum),
+                min_height=NamedAgg(column="height", aggfunc=min),
+            )
+        )
+        expected = DataFrame(
+            {
+                "total_weight": [7.9, 17.8, 27.8, 7.5, 205.5, 247.5],
+                "min_height": [9.1, 9.1, 9.1, 6.0, 6.0, 6.0],
+            },
+            index=MultiIndex(
+                [["cat", "dog"], [0, 1, 2, 3, 4, 5]],
+                [[0, 0, 0, 1, 1, 1], [0, 2, 4, 1, 3, 5]],
+                names=["kind", None],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
 
 class TestEWM:
     @pytest.mark.parametrize(
@@ -1162,6 +1223,41 @@ def test_methods(self, method, expected_data):
         )
         tm.assert_frame_equal(result, expected)
 
+    def test_groupby_ewm_agg_namedagg(self):
+        # GH#28333
+        df = DataFrame({"A": ["a"] * 4, "B": range(4)})
+        result = (
+            df.groupby("A")
+            .ewm(com=1.0)
+            .agg(
+                B_mean=NamedAgg(column="B", aggfunc="mean"),
+                B_std=NamedAgg(column="B", aggfunc="std"),
+                B_var=NamedAgg(column="B", aggfunc="var"),
+            )
+        )
+        expected = DataFrame(
+            {
+                "B_mean": [
+                    0.0,
+                    0.6666666666666666,
+                    1.4285714285714286,
+                    2.2666666666666666,
+                ],
+                "B_std": [np.nan, 0.707107, 0.963624, 1.177164],
+                "B_var": [np.nan, 0.5, 0.9285714285714286, 1.3857142857142857],
+            },
+            index=MultiIndex.from_tuples(
+                [
+                    ("a", 0),
+                    ("a", 1),
+                    ("a", 2),
+                    ("a", 3),
+                ],
+                names=["A", None],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
     @pytest.mark.parametrize(
         "method, expected_data",
         [["corr", [np.nan, 1.0, 1.0, 1]], ["cov", [np.nan, 0.5, 0.928571, 1.385714]]],