diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 005818b0779e6..1c474b16d22e8 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -54,6 +54,7 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
 - :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
 - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
 - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi
index a6cfbec9b15b9..ee735761e3dc6 100644
--- a/pandas/_libs/window/aggregations.pyi
+++ b/pandas/_libs/window/aggregations.pyi
@@ -60,6 +60,18 @@ def roll_min(
     end: np.ndarray,  # np.ndarray[np.int64]
     minp: int,  # int64_t
 ) -> np.ndarray: ...  # np.ndarray[float]
+def roll_first(
+    values: np.ndarray,  # np.ndarray[np.float64]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_last(
+    values: np.ndarray,  # np.ndarray[np.float64]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
 def roll_quantile(
     values: np.ndarray,  # const float64_t[:]
     start: np.ndarray,  # np.ndarray[np.int64]
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
index 5b9ee095d4643..d33c840371d2a 100644
--- a/pandas/_libs/window/aggregations.pyx
+++ b/pandas/_libs/window/aggregations.pyx
@@ -1133,6 +1133,89 @@ cdef _roll_min_max(ndarray[float64_t] values,
     return output


+# ----------------------------------------------------------------------
+# Rolling first, last
+
+
+def roll_first(const float64_t[:] values, ndarray[int64_t] start,
+               ndarray[int64_t] end, int64_t minp) -> np.ndarray:
+    return _roll_first_last(values, start, end, minp, is_first=1)
+
+
+def roll_last(const float64_t[:] values, ndarray[int64_t] start,
+              ndarray[int64_t] end, int64_t minp) -> np.ndarray:
+    return _roll_first_last(values, start, end, minp, is_first=0)
+
+
+cdef _roll_first_last(const float64_t[:] values, ndarray[int64_t] start,
+                      ndarray[int64_t] end, int64_t minp, bint is_first):
+    cdef:
+        Py_ssize_t i, j, fl_idx
+        bint is_monotonic_increasing_bounds
+        int64_t nobs = 0, N = len(start), s, e
+        float64_t val, res
+        ndarray[float64_t] output
+
+    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
+        start, end
+    )
+
+    output = np.empty(N, dtype=np.float64)
+
+    if (end - start).max() == 0:
+        output[:] = NaN
+        return output
+
+    with nogil:
+        for i in range(0, N):
+            s = start[i]
+            e = end[i]
+
+            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
+                fl_idx = -1
+                nobs = 0
+                for j in range(s, e):
+                    val = values[j]
+                    if val == val:
+                        if not is_first or fl_idx < s:
+                            fl_idx = j
+                        nobs += 1
+            else:
+                # handle deletes
+                for j in range(start[i - 1], s):
+                    val = values[j]
+                    if val == val:
+                        nobs -= 1
+
+                # update fl_idx if out of range, if first
+                if is_first and fl_idx < s:
+                    fl_idx = -1
+                    for j in range(s, end[i - 1]):
+                        val = values[j]
+                        if val == val:
+                            fl_idx = j
+                            break
+
+                # handle adds
+                for j in range(end[i - 1], e):
+                    val = values[j]
+                    if val == val:
+                        if not is_first or fl_idx < s:
+                            fl_idx = j
+                        nobs += 1
+
+            if nobs >= minp and fl_idx >= s:
+                res = values[fl_idx]
+            else:
+                res = NaN
+
+            output[i] = res
+
+            if not is_monotonic_increasing_bounds:
+                nobs = 0
+
+    return output
+
+
 cdef enum InterpolationType:
     LINEAR,
diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py
index bff3a1660eba9..17c93f7d8f34c 100644
--- a/pandas/core/window/expanding.py
+++ b/pandas/core/window/expanding.py
@@ -664,6 +664,64 @@ def skew(self, numeric_only: bool = False):
     def kurt(self, numeric_only: bool = False):
         return super().kurt(numeric_only=numeric_only)

+    @doc(
+        template_header,
+        create_section_header("Parameters"),
+        kwargs_numeric_only,
+        create_section_header("Returns"),
+        template_returns,
+        create_section_header("Examples"),
+        dedent(
+            """
+        The example below will show an expanding calculation with a minimum of
+        three periods.
+
+        >>> s = pd.Series(range(5))
+        >>> s.expanding(3).first()
+        0    NaN
+        1    NaN
+        2    0.0
+        3    0.0
+        4    0.0
+        dtype: float64
+        """
+        ).replace("\n", "", 1),
+        window_method="expanding",
+        aggregation_description="First (left-most) element of the window",
+        agg_method="first",
+    )
+    def first(self, numeric_only: bool = False):
+        return super().first(numeric_only=numeric_only)
+
+    @doc(
+        template_header,
+        create_section_header("Parameters"),
+        kwargs_numeric_only,
+        create_section_header("Returns"),
+        template_returns,
+        create_section_header("Examples"),
+        dedent(
+            """
+        The example below will show an expanding calculation with a minimum of
+        three periods.
+
+        >>> s = pd.Series(range(5))
+        >>> s.expanding(3).last()
+        0    NaN
+        1    NaN
+        2    2.0
+        3    3.0
+        4    4.0
+        dtype: float64
+        """
+        ).replace("\n", "", 1),
+        window_method="expanding",
+        aggregation_description="Last (right-most) element of the window",
+        agg_method="last",
+    )
+    def last(self, numeric_only: bool = False):
+        return super().last(numeric_only=numeric_only)
+
     @doc(
         template_header,
         create_section_header("Parameters"),
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
index 385ffb901acf0..7fd0c91aa0b26 100644
--- a/pandas/core/window/rolling.py
+++ b/pandas/core/window/rolling.py
@@ -1705,6 +1705,22 @@ def kurt(self, numeric_only: bool = False):
             numeric_only=numeric_only,
         )

+    def first(self, numeric_only: bool = False):
+        window_func = window_aggregations.roll_first
+        return self._apply(
+            window_func,
+            name="first",
+            numeric_only=numeric_only,
+        )
+
+    def last(self, numeric_only: bool = False):
+        window_func = window_aggregations.roll_last
+        return self._apply(
+            window_func,
+            name="last",
+            numeric_only=numeric_only,
+        )
+
     def quantile(
         self,
         q: float,
@@ -2539,6 +2555,64 @@ def sem(self, ddof: int = 1, numeric_only: bool = False):
     def kurt(self, numeric_only: bool = False):
         return super().kurt(numeric_only=numeric_only)

+    @doc(
+        template_header,
+        create_section_header("Parameters"),
+        kwargs_numeric_only,
+        create_section_header("Returns"),
+        template_returns,
+        create_section_header("Examples"),
+        dedent(
+            """
+        The example below will show a rolling calculation with a window size of
+        three.
+
+        >>> s = pd.Series(range(5))
+        >>> s.rolling(3).first()
+        0    NaN
+        1    NaN
+        2    0.0
+        3    1.0
+        4    2.0
+        dtype: float64
+        """
+        ).replace("\n", "", 1),
+        window_method="rolling",
+        aggregation_description="First (left-most) element of the window",
+        agg_method="first",
+    )
+    def first(self, numeric_only: bool = False):
+        return super().first(numeric_only=numeric_only)
+
+    @doc(
+        template_header,
+        create_section_header("Parameters"),
+        kwargs_numeric_only,
+        create_section_header("Returns"),
+        template_returns,
+        create_section_header("Examples"),
+        dedent(
+            """
+        The example below will show a rolling calculation with a window size of
+        three.
+
+        >>> s = pd.Series(range(5))
+        >>> s.rolling(3).last()
+        0    NaN
+        1    NaN
+        2    2.0
+        3    3.0
+        4    4.0
+        dtype: float64
+        """
+        ).replace("\n", "", 1),
+        window_method="rolling",
+        aggregation_description="Last (right-most) element of the window",
+        agg_method="last",
+    )
+    def last(self, numeric_only: bool = False):
+        return super().last(numeric_only=numeric_only)
+
     @doc(
         template_header,
         create_section_header("Parameters"),
diff --git a/pandas/tests/window/test_cython_aggregations.py b/pandas/tests/window/test_cython_aggregations.py
index c60cb6ea74ec0..feb25a294c540 100644
--- a/pandas/tests/window/test_cython_aggregations.py
+++ b/pandas/tests/window/test_cython_aggregations.py
@@ -30,6 +30,8 @@ def _get_rolling_aggregations():
         ("roll_median_c", window_aggregations.roll_median_c),
         ("roll_max", window_aggregations.roll_max),
         ("roll_min", window_aggregations.roll_min),
+        ("roll_first", window_aggregations.roll_first),
+        ("roll_last", window_aggregations.roll_last),
     ]
     + [
         (
diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py
index b2f76bdd0e2ad..39cedc3b692da 100644
--- a/pandas/tests/window/test_expanding.py
+++ b/pandas/tests/window/test_expanding.py
@@ -451,6 +451,8 @@ def test_moment_functions_zero_length_pairwise(f):
         lambda x: x.expanding(min_periods=5).corr(x, pairwise=False),
         lambda x: x.expanding(min_periods=5).max(),
         lambda x: x.expanding(min_periods=5).min(),
+        lambda x: x.expanding(min_periods=5).first(),
+        lambda x: x.expanding(min_periods=5).last(),
         lambda x: x.expanding(min_periods=5).sum(),
         lambda x: x.expanding(min_periods=5).mean(),
         lambda x: x.expanding(min_periods=5).std(),
@@ -596,6 +598,104 @@ def test_expanding_corr_pairwise_diff_length():
     tm.assert_frame_equal(result4, expected)


+@pytest.mark.parametrize(
+    "values,method,expected",
+    [
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "first",
+            [float("nan"), float("nan"), 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "first",
+            [
+                float("nan"),
+                float("nan"),
+                float("nan"),
+                float("nan"),
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+            ],
+        ),
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "last",
+            [float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "last",
+            [
+                float("nan"),
+                float("nan"),
+                float("nan"),
+                float("nan"),
+                5.0,
+                5.0,
+                7.0,
+                7.0,
+                9.0,
+                9.0,
+            ],
+        ),
+    ],
+)
+def test_expanding_first_last(values, method, expected):
+    # GH#33155
+    x = Series(values)
+    result = getattr(x.expanding(3), method)()
+    expected = Series(expected)
+    tm.assert_almost_equal(result, expected)
+
+    x = DataFrame({"A": values})
+    result = getattr(x.expanding(3), method)()
+    expected = DataFrame({"A": expected})
+    tm.assert_almost_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values,method,expected",
+    [
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "first",
+            [1.0] * 10,
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "first",
+            [1.0] * 10,
+        ),
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "last",
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "last",
+            [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0, 9.0],
+        ),
+    ],
+)
+def test_expanding_first_last_no_minp(values, method, expected):
+    # GH#33155
+    x = Series(values)
+    result = getattr(x.expanding(min_periods=0), method)()
+    expected = Series(expected)
+    tm.assert_almost_equal(result, expected)
+
+    x = DataFrame({"A": values})
+    result = getattr(x.expanding(min_periods=0), method)()
+    expected = DataFrame({"A": expected})
+    tm.assert_almost_equal(result, expected)
+
+
 def test_expanding_apply_args_kwargs(engine_and_raw):
     def mean_w_arg(x, const):
         return np.mean(x) + const
diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py
index f8e804bf434e9..38aae2f8f5fdc 100644
--- a/pandas/tests/window/test_groupby.py
+++ b/pandas/tests/window/test_groupby.py
@@ -91,6 +91,8 @@ def test_getitem_multiple(self, roll_frame):
             "mean",
             "min",
             "max",
+            "first",
+            "last",
             "count",
             "kurt",
             "skew",
@@ -1062,7 +1064,7 @@ def frame(self):
         return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)})

     @pytest.mark.parametrize(
-        "f", ["sum", "mean", "min", "max", "count", "kurt", "skew"]
+        "f", ["sum", "mean", "min", "max", "first", "last", "count", "kurt", "skew"]
     )
     def test_expanding(self, f, frame):
         g = frame.groupby("A", group_keys=False)
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
index af3194b5085c4..2aaa35ec5ec2c 100644
--- a/pandas/tests/window/test_rolling.py
+++ b/pandas/tests/window/test_rolling.py
@@ -1326,6 +1326,82 @@ def test_rolling_corr_timedelta_index(index, window):
     tm.assert_almost_equal(result, expected)


+@pytest.mark.parametrize(
+    "values,method,expected",
+    [
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "first",
+            [float("nan"), float("nan"), 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "first",
+            [float("nan")] * 10,
+        ),
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "last",
+            [float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "last",
+            [float("nan")] * 10,
+        ),
+    ],
+)
+def test_rolling_first_last(values, method, expected):
+    # GH#33155
+    x = Series(values)
+    result = getattr(x.rolling(3), method)()
+    expected = Series(expected)
+    tm.assert_almost_equal(result, expected)
+
+    x = DataFrame({"A": values})
+    result = getattr(x.rolling(3), method)()
+    expected = DataFrame({"A": expected})
+    tm.assert_almost_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values,method,expected",
+    [
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "first",
+            [1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "first",
+            [1.0, 1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0],
+        ),
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "last",
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "last",
+            [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0, 9.0],
+        ),
+    ],
+)
+def test_rolling_first_last_no_minp(values, method, expected):
+    # GH#33155
+    x = Series(values)
+    result = getattr(x.rolling(3, min_periods=0), method)()
+    expected = Series(expected)
+    tm.assert_almost_equal(result, expected)
+
+    x = DataFrame({"A": values})
+    result = getattr(x.rolling(3, min_periods=0), method)()
+    expected = DataFrame({"A": expected})
+    tm.assert_almost_equal(result, expected)
+
+
 def test_groupby_rolling_nan_included():
     # GH 35542
     data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py
index f77a98ae9a7d9..6820ab7332975 100644
--- a/pandas/tests/window/test_rolling_functions.py
+++ b/pandas/tests/window/test_rolling_functions.py
@@ -340,6 +340,8 @@ def test_center_reindex_frame(frame, roll_func, kwargs, minp, fill_value):
         lambda x: x.rolling(window=10, min_periods=5).var(),
         lambda x: x.rolling(window=10, min_periods=5).skew(),
         lambda x: x.rolling(window=10, min_periods=5).kurt(),
+        lambda x: x.rolling(window=10, min_periods=5).first(),
+        lambda x: x.rolling(window=10, min_periods=5).last(),
         lambda x: x.rolling(window=10, min_periods=5).quantile(q=0.5),
         lambda x: x.rolling(window=10, min_periods=5).median(),
         lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False),
@@ -501,6 +503,8 @@ def test_rolling_min_max_numeric_types(any_real_numpy_dtype):
         lambda x: x.rolling(window=10, min_periods=5).var(),
         lambda x: x.rolling(window=10, min_periods=5).skew(),
         lambda x: x.rolling(window=10, min_periods=5).kurt(),
+        lambda x: x.rolling(window=10, min_periods=5).first(),
+        lambda x: x.rolling(window=10, min_periods=5).last(),
         lambda x: x.rolling(window=10, min_periods=5).quantile(0.5),
         lambda x: x.rolling(window=10, min_periods=5).median(),
         lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False),
diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py
index eacdaddfa28b0..043f369566a5d 100644
--- a/pandas/tests/window/test_timeseries_window.py
+++ b/pandas/tests/window/test_timeseries_window.py
@@ -541,6 +541,42 @@ def test_ragged_max(self, ragged):
         expected["B"] = [0.0, 1, 2, 3, 4]
         tm.assert_frame_equal(result, expected)

+    def test_ragged_first(self, ragged):
+        df = ragged
+
+        result = df.rolling(window="1s", min_periods=1).first()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=1).first()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 1, 3, 3]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="5s", min_periods=1).first()
+        expected = df.copy()
+        expected["B"] = [0.0, 0, 0, 1, 1]
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_last(self, ragged):
+        df = ragged
+
+        result = df.rolling(window="1s", min_periods=1).last()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=1).last()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="5s", min_periods=1).last()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
     @pytest.mark.parametrize(
         "freq, op, result_data",
         [
@@ -586,6 +622,8 @@ def test_freqs_ops(self, freq, op, result_data):
             "skew",
             "min",
             "max",
+            "first",
+            "last",
         ],
     )
     def test_all(self, f, regular):
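
Editor's note, not part of the patch: the snippet below is a minimal usage sketch of the API this diff adds, assuming the patch is applied to a pandas 3.0 development build. The expected values in the comments mirror the behaviour pinned down by the docstring examples and tests above: NaN entries are skipped, and a window with fewer than ``min_periods`` valid observations yields NaN.

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0, np.nan, 5.0])

# Rolling window of size 3: first/last return the first/last valid (non-NaN)
# value inside each window.
print(s.rolling(3, min_periods=1).first())  # 1.0, 1.0, 1.0, 3.0, 3.0
print(s.rolling(3, min_periods=1).last())   # 1.0, 1.0, 3.0, 3.0, 5.0

# Expanding window: first stays at the first valid observation, while last
# tracks the most recent valid observation seen so far.
print(s.expanding(min_periods=1).first())   # 1.0, 1.0, 1.0, 1.0, 1.0
print(s.expanding(min_periods=1).last())    # 1.0, 1.0, 3.0, 3.0, 5.0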