From 53b84063c68c601f8d8f6d19f4eebdf8317f7a4e Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 26 Nov 2024 11:11:18 +0100 Subject: [PATCH 1/9] WIP --- docs/api-reference/expr.md | 2 + docs/api-reference/series.md | 2 + narwhals/_arrow/expr.py | 34 ++ narwhals/_arrow/series.py | 67 ++++ narwhals/_dask/expr.py | 58 ++++ narwhals/_pandas_like/expr.py | 34 ++ narwhals/_pandas_like/series.py | 26 ++ narwhals/expr.py | 187 +++++++++++ narwhals/series.py | 186 +++++++++++ narwhals/stable/v1/__init__.py | 379 ++++++++++++++++++++++ tests/expr_and_series/rolling_std_test.py | 103 ++++++ tests/expr_and_series/rolling_var_test.py | 114 +++++++ 12 files changed, 1192 insertions(+) create mode 100644 tests/expr_and_series/rolling_std_test.py create mode 100644 tests/expr_and_series/rolling_var_test.py diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md index 91f49a0a1..b589e2d81 100644 --- a/docs/api-reference/expr.md +++ b/docs/api-reference/expr.md @@ -46,7 +46,9 @@ - quantile - replace_strict - rolling_mean + - rolling_std - rolling_sum + - rolling_var - round - sample - shift diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index 7c7a5ed17..9a3b8901e 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -53,7 +53,9 @@ - rename - replace_strict - rolling_mean + - rolling_std - rolling_sum + - rolling_var - round - sample - scatter diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 4c04a1827..05465d928 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -486,6 +486,40 @@ def rolling_mean( center=center, ) + def rolling_var( + self: Self, + window_size: int, + *, + min_periods: int | None, + center: bool, + ddof: int, + ) -> Self: + return reuse_series_implementation( + self, + "rolling_var", + window_size=window_size, + min_periods=min_periods, + center=center, + ddof=ddof, + ) + + def rolling_std( + self: Self, + window_size: int, + *, + min_periods: int | None, + center: bool, + ddof: int, + ) -> Self: + return reuse_series_implementation( + self, + "rolling_std", + window_size=window_size, + min_periods=min_periods, + center=center, + ddof=ddof, + ) + @property def dt(self: Self) -> ArrowExprDateTimeNamespace: return ArrowExprDateTimeNamespace(self) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index f286b3000..4d3cb050c 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -993,6 +993,73 @@ def rolling_mean( result = result[offset_left + offset_right :] return result + def rolling_var( + self: Self, + window_size: int, + *, + min_periods: int | None, + center: bool, + ddof: int, + ) -> Self: + import pyarrow as pa # ignore-banned-import + import pyarrow.compute as pc # ignore-banned-import + + min_periods = min_periods if min_periods is not None else window_size + if center: + offset_left = window_size // 2 + offset_right = offset_left - ( + window_size % 2 == 0 + ) # subtract one if window_size is even + + native_series = self._native_series + + pad_left = pa.array([None] * offset_left, type=native_series.type) + pad_right = pa.array([None] * offset_right, type=native_series.type) + padded_arr = self._from_native_series( + pa.concat_arrays([pad_left, native_series.combine_chunks(), pad_right]) + ) + else: + padded_arr = self + + cum_sum = padded_arr.cum_sum(reverse=False).fill_null(strategy="forward") + rolling_sum = ( + cum_sum - cum_sum.shift(window_size).fill_null(0) + if window_size != 0 + else cum_sum + ) + + valid_count = padded_arr.cum_count(reverse=False) + count_in_window = valid_count - valid_count.shift(window_size).fill_null(0) + + result = ( + self._from_native_series( + pc.if_else( + (count_in_window >= min_periods)._native_series, + rolling_sum._native_series, + None, + ) + ) + / count_in_window + ) + if center: + result = result[offset_left + offset_right :] + return result + + def rolling_std( + self: Self, + window_size: int, + *, + min_periods: int | None, + center: bool, + ddof: int, + ) -> Self: + return ( + self.rolling_var( + window_size=window_size, min_periods=min_periods, center=center, ddof=ddof + ) + ** 0.5 + ) + def __iter__(self: Self) -> Iterator[Any]: yield from self._native_series.__iter__() diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 58e73792a..381693e81 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -913,6 +913,64 @@ def func( returns_scalar=False, ) + def rolling_var( + self: Self, + window_size: int, + *, + min_periods: int | None, + center: bool, + ddof: int, + ) -> Self: + def func( + _input: dask_expr.Series, + _window: int, + _min_periods: int | None, + _center: bool, # noqa: FBT001 + _ddof: int, + ) -> dask_expr.Series: + return _input.rolling( + window=_window, min_periods=_min_periods, center=_center + ).var(ddof=ddof) + + return self._from_call( + func, + "rolling_var", + window_size, + min_periods, + center, + ddof, + returns_scalar=False, + ) + + def rolling_std( + self: Self, + window_size: int, + *, + min_periods: int | None, + center: bool, + ddof: int, + ) -> Self: + def func( + _input: dask_expr.Series, + _window: int, + _min_periods: int | None, + _center: bool, # noqa: FBT001 + _ddof: int, + ) -> dask_expr.Series: + return _input.rolling( + window=_window, min_periods=_min_periods, center=_center + ).std(ddof=ddof) + + return self._from_call( + func, + "rolling_std", + window_size, + min_periods, + center, + ddof, + returns_scalar=False, + ) + class DaskExprStringNamespace: def __init__(self, expr: DaskExpr) -> None: diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 182ea980f..31961bb3e 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -520,6 +520,40 @@ def rolling_mean( center=center, ) + def rolling_var( + self: Self, + window_size: int, + *, + min_periods: int | None, + center: bool, + ddof: int, + ) -> Self: + return reuse_series_implementation( + self, + "rolling_var", + window_size=window_size, + min_periods=min_periods, + center=center, + ddof=ddof, + ) + + def rolling_std( + self: Self, + window_size: int, + *, + min_periods: int | None, + center: bool, + ddof: int, + ) -> Self: + return reuse_series_implementation( + self, + "rolling_std", + window_size=window_size, + min_periods=min_periods, + center=center, + ddof=ddof, + ) + @property def str(self: Self) -> PandasLikeExprStringNamespace: return PandasLikeExprStringNamespace(self) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index c8520529a..8e159ebd1 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -852,6 +852,32 @@ def rolling_mean( ).mean() return self._from_native_series(result) + def rolling_var( + self: Self, + window_size: int, + *, + min_periods: int | None, + center: bool, + ddof: int, + ) -> Self: + result = self._native_series.rolling( + window=window_size, min_periods=min_periods, center=center + ).var(ddof=ddof) + return self._from_native_series(result) + + def rolling_std( + self: Self, + window_size: int, + *, + min_periods: int | None, + center: bool, + ddof: int, + ) -> Self: + result = self._native_series.rolling( + window=window_size, min_periods=min_periods, center=center + ).std(ddof=ddof) + return self._from_native_series(result) + def __iter__(self: Self) -> Iterator[Any]: yield from self._native_series.__iter__() diff --git a/narwhals/expr.py b/narwhals/expr.py index 9c0a484a5..92d39ba6c 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -3377,6 +3377,193 @@ def rolling_mean( ) ) + def rolling_var( + self: Self, + window_size: int, + *, + min_periods: int | None = None, + center: bool = False, + ddof: int = 1, + ) -> Self: + """Apply a rolling variance (moving variance) over the values. + + !!! warning + This functionality is considered **unstable**. It may be changed at any point + without it being considered a breaking change. + + A window of length `window_size` will traverse the values. The resulting values + will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Arguments: + window_size: The length of the window in number of elements. It must be a + strictly positive integer. + min_periods: The number of values in the window that should be non-null before + computing a result. If set to `None` (default), it will be set equal to + `window_size`. If provided, it must be a strictly positive integer, and + less than or equal to `window_size`. + center: Set the labels at the center of the window. + ddof: Delta Degrees of Freedom; the divisor for a length N window is N - ddof. + + Returns: + A new expression. + + Examples: + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = {"a": [1.0, 2.0, None, 4.0]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a library agnostic function: + + >>> def agnostic_rolling_var(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... b=nw.col("a").rolling_var(window_size=3, min_periods=1) + ... ).to_native() + + We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + + >>> agnostic_rolling_var(df_pd) + a b + 0 1.0 1.0 + 1 2.0 1.5 + 2 NaN 1.5 + 3 4.0 3.0 + + >>> agnostic_rolling_var(df_pl) + shape: (4, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ 1.5 │ + │ null ┆ 1.5 │ + │ 4.0 ┆ 3.0 │ + └──────┴─────┘ + + >>> agnostic_rolling_var(df_pa) # doctest:+ELLIPSIS + pyarrow.Table + a: double + b: double + ---- + a: [[1,2,null,4]] + b: [[1,1.5,1.5,3]] + """ + window_size, min_periods = _validate_rolling_arguments( + window_size=window_size, min_periods=min_periods + ) + + return self.__class__( + lambda plx: self._call(plx).rolling_var( + window_size=window_size, min_periods=min_periods, center=center, ddof=ddof + ) + ) + + def rolling_std( + self: Self, + window_size: int, + *, + min_periods: int | None = None, + center: bool = False, + ddof: int = 1, + ) -> Self: + """Apply a rolling standard deviation (moving standard deviation) over the values. + + !!! warning + This functionality is considered **unstable**. It may be changed at any point + without it being considered a breaking change. + + A window of length `window_size` will traverse the values. The resulting values + will be aggregated to their standard deviation. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Arguments: + window_size: The length of the window in number of elements. It must be a + strictly positive integer. + min_periods: The number of values in the window that should be non-null before + computing a result. If set to `None` (default), it will be set equal to + `window_size`. If provided, it must be a strictly positive integer, and + less than or equal to `window_size`. + center: Set the labels at the center of the window. + ddof: Delta Degrees of Freedom; the divisor for a length N window is N - ddof. + + Returns: + A new expression. + + Examples: + >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = {"a": [1.0, 2.0, None, 4.0]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a library agnostic function: + + >>> def agnostic_rolling_std(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) + ... return df.with_columns( + ... b=nw.col("a").rolling_std(window_size=3, min_periods=1) + ... ).to_native() + + We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + + >>> agnostic_rolling_std(df_pd) + a b + 0 1.0 1.0 + 1 2.0 1.5 + 2 NaN 1.5 + 3 4.0 3.0 + + >>> agnostic_rolling_std(df_pl) + shape: (4, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ 1.5 │ + │ null ┆ 1.5 │ + │ 4.0 ┆ 3.0 │ + └──────┴─────┘ + + >>> agnostic_rolling_std(df_pa) # doctest:+ELLIPSIS + pyarrow.Table + a: double + b: double + ---- + a: [[1,2,null,4]] + b: [[1,1.5,1.5,3]] + """ + window_size, min_periods = _validate_rolling_arguments( + window_size=window_size, min_periods=min_periods + ) + + return self.__class__( + lambda plx: self._call(plx).rolling_std( + window_size=window_size, + min_periods=min_periods, + center=center, + ddof=ddof, + ) + ) + @property def str(self: Self) -> ExprStringNamespace[Self]: return ExprStringNamespace(self) diff --git a/narwhals/series.py b/narwhals/series.py index 8ffe0b0f4..6d5e372e5 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -3307,6 +3307,192 @@ def rolling_mean( ) ) + def rolling_var( + self: Self, + window_size: int, + *, + min_periods: int | None = None, + center: bool = False, + ddof: int = 1, + ) -> Self: + """Apply a rolling variance (moving variance) over the values. + + !!! warning + This functionality is considered **unstable**. It may be changed at any point + without it being considered a breaking change. + + A window of length `window_size` will traverse the values. The resulting values + will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Arguments: + window_size: The length of the window in number of elements. It must be a + strictly positive integer. + min_periods: The number of values in the window that should be non-null before + computing a result. If set to `None` (default), it will be set equal to + `window_size`. If provided, it must be a strictly positive integer, and + less than or equal to `window_size`. + center: Set the labels at the center of the window. + ddof: Delta Degrees of Freedom; the divisor for a length N window is N - ddof. + + Returns: + A new expression. + + Examples: + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = [1.0, 2.0, 3.0, 4.0] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) + + We define a library agnostic function: + + >>> def agnostic_rolling_var(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.rolling_var(window_size=2).to_native() + + We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + + >>> agnostic_rolling_var(s_pd) + 0 NaN + 1 1.5 + 2 2.5 + 3 3.5 + dtype: float64 + + >>> agnostic_rolling_var(s_pl) # doctest:+NORMALIZE_WHITESPACE + shape: (4,) + Series: '' [f64] + [ + null + 1.5 + 2.5 + 3.5 + ] + + >>> agnostic_rolling_var(s_pa) # doctest:+ELLIPSIS + + [ + [ + null, + 1.5, + 2.5, + 3.5 + ] + ] + """ + window_size, min_periods = _validate_rolling_arguments( + window_size=window_size, min_periods=min_periods + ) + + if len(self) == 0: # pragma: no cover + return self + + return self._from_compliant_series( + self._compliant_series.rolling_var( + window_size=window_size, min_periods=min_periods, center=center, ddof=ddof + ) + ) + + def rolling_std( + self: Self, + window_size: int, + *, + min_periods: int | None = None, + center: bool = False, + ddof: int = 1, + ) -> Self: + """Apply a rolling standard deviation (moving standard deviation) over the values. + + !!! warning + This functionality is considered **unstable**. It may be changed at any point + without it being considered a breaking change. + + A window of length `window_size` will traverse the values. The resulting values + will be aggregated to their standard deviation. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Arguments: + window_size: The length of the window in number of elements. It must be a + strictly positive integer. + min_periods: The number of values in the window that should be non-null before + computing a result. If set to `None` (default), it will be set equal to + `window_size`. If provided, it must be a strictly positive integer, and + less than or equal to `window_size`. + center: Set the labels at the center of the window. + ddof: Delta Degrees of Freedom; the divisor for a length N window is N - ddof. + + Returns: + A new expression. + + Examples: + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = [1.0, 2.0, 3.0, 4.0] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) + + We define a library agnostic function: + + >>> def agnostic_rolling_std(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.rolling_std(window_size=2).to_native() + + We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + + >>> agnostic_rolling_std(s_pd) + 0 NaN + 1 1.5 + 2 2.5 + 3 3.5 + dtype: float64 + + >>> agnostic_rolling_std(s_pl) # doctest:+NORMALIZE_WHITESPACE + shape: (4,) + Series: '' [f64] + [ + null + 1.5 + 2.5 + 3.5 + ] + + >>> agnostic_rolling_std(s_pa) # doctest:+ELLIPSIS + + [ + [ + null, + 1.5, + 2.5, + 3.5 + ] + ] + """ + window_size, min_periods = _validate_rolling_arguments( + window_size=window_size, min_periods=min_periods + ) + + if len(self) == 0: # pragma: no cover + return self + + return self._from_compliant_series( + self._compliant_series.rolling_std( + window_size=window_size, min_periods=min_periods, center=center, ddof=ddof + ) + ) + def __iter__(self: Self) -> Iterator[Any]: yield from self._compliant_series.__iter__() diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index f57212ee6..80234f62c 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -814,6 +814,196 @@ def rolling_mean( center=center, ) + def rolling_var( + self: Self, + window_size: int, + *, + min_periods: int | None = None, + center: bool = False, + ddof: int = 1, + ) -> Self: + """Apply a rolling variance (moving variance) over the values. + + !!! warning + This functionality is considered **unstable**. It may be changed at any point + without it being considered a breaking change. + + A window of length `window_size` will traverse the values. The resulting values + will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Arguments: + window_size: The length of the window in number of elements. It must be a + strictly positive integer. + min_periods: The number of values in the window that should be non-null before + computing a result. If set to `None` (default), it will be set equal to + `window_size`. If provided, it must be a strictly positive integer, and + less than or equal to `window_size`. + center: Set the labels at the center of the window. + ddof: Delta Degrees of Freedom; the divisor for a length N window is N - ddof. + + Returns: + A new expression. + + Examples: + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = [1.0, 2.0, 3.0, 4.0] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) + + We define a library agnostic function: + + >>> def agnostic_rolling_var(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.rolling_var(window_size=2).to_native() + + We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + + >>> agnostic_rolling_var(s_pd) + 0 NaN + 1 1.5 + 2 2.5 + 3 3.5 + dtype: float64 + + >>> agnostic_rolling_var(s_pl) # doctest:+NORMALIZE_WHITESPACE + shape: (4,) + Series: '' [f64] + [ + null + 1.5 + 2.5 + 3.5 + ] + + >>> agnostic_rolling_var(s_pa) # doctest:+ELLIPSIS + + [ + [ + null, + 1.5, + 2.5, + 3.5 + ] + ] + """ + from narwhals.exceptions import NarwhalsUnstableWarning + from narwhals.utils import find_stacklevel + + msg = ( + "`Series.rolling_var` is being called from the stable API although considered " + "an unstable feature." + ) + warn(message=msg, category=NarwhalsUnstableWarning, stacklevel=find_stacklevel()) + return super().rolling_var( + window_size=window_size, + min_periods=min_periods, + center=center, + ddof=ddof, + ) + + def rolling_std( + self: Self, + window_size: int, + *, + min_periods: int | None = None, + center: bool = False, + ddof: int = 1, + ) -> Self: + """Apply a rolling standard deviation (moving standard deviation) over the values. + + !!! warning + This functionality is considered **unstable**. It may be changed at any point + without it being considered a breaking change. + + A window of length `window_size` will traverse the values. The resulting values + will be aggregated to their standard deviation. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Arguments: + window_size: The length of the window in number of elements. It must be a + strictly positive integer. + min_periods: The number of values in the window that should be non-null before + computing a result. If set to `None` (default), it will be set equal to + `window_size`. If provided, it must be a strictly positive integer, and + less than or equal to `window_size`. + center: Set the labels at the center of the window. + ddof: Delta Degrees of Freedom; the divisor for a length N window is N - ddof. + + Returns: + A new expression. + + Examples: + >>> import narwhals as nw + >>> from narwhals.typing import IntoSeriesT + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = [1.0, 2.0, 3.0, 4.0] + >>> s_pd = pd.Series(data) + >>> s_pl = pl.Series(data) + >>> s_pa = pa.chunked_array([data]) + + We define a library agnostic function: + + >>> def agnostic_rolling_std(s_native: IntoSeriesT) -> IntoSeriesT: + ... s = nw.from_native(s_native, series_only=True) + ... return s.rolling_std(window_size=2).to_native() + + We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + + >>> agnostic_rolling_std(s_pd) + 0 NaN + 1 1.5 + 2 2.5 + 3 3.5 + dtype: float64 + + >>> agnostic_rolling_std(s_pl) # doctest:+NORMALIZE_WHITESPACE + shape: (4,) + Series: '' [f64] + [ + null + 1.5 + 2.5 + 3.5 + ] + + >>> agnostic_rolling_std(s_pa) # doctest:+ELLIPSIS + + [ + [ + null, + 1.5, + 2.5, + 3.5 + ] + ] + """ + from narwhals.exceptions import NarwhalsUnstableWarning + from narwhals.utils import find_stacklevel + + msg = ( + "`Series.rolling_std` is being called from the stable API although considered " + "an unstable feature." + ) + warn(message=msg, category=NarwhalsUnstableWarning, stacklevel=find_stacklevel()) + return super().rolling_std( + window_size=window_size, + min_periods=min_periods, + center=center, + ddof=ddof, + ) + class Expr(NwExpr): def _l1_norm(self) -> Self: @@ -1112,6 +1302,195 @@ def rolling_mean( center=center, ) + def rolling_var( + self: Self, + window_size: int, + *, + min_periods: int | None = None, + center: bool = False, + ddof: int = 1, + ) -> Self: + """Apply a rolling variance (moving variance) over the values. + + !!! warning + This functionality is considered **unstable**. It may be changed at any point + without it being considered a breaking change. + + A window of length `window_size` will traverse the values. The resulting values + will be aggregated to their variance. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Arguments: + window_size: The length of the window in number of elements. It must be a + strictly positive integer. + min_periods: The number of values in the window that should be non-null before + computing a result. If set to `None` (default), it will be set equal to + `window_size`. If provided, it must be a strictly positive integer, and + less than or equal to `window_size`. + center: Set the labels at the center of the window. + ddof: Delta Degrees of Freedom; the divisor for a length N window is N - ddof. + + Returns: + A new expression. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = {"a": [1.0, 2.0, None, 4.0]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a library agnostic function: + + >>> @nw.narwhalify + ... def agnostic_rolling_var(df): + ... return df.with_columns( + ... b=nw.col("a").rolling_var(window_size=3, min_periods=1) + ... ) + + We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + + >>> agnostic_rolling_var(df_pd) + a b + 0 1.0 1.0 + 1 2.0 1.5 + 2 NaN 1.5 + 3 4.0 3.0 + + >>> agnostic_rolling_var(df_pl) + shape: (4, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ 1.5 │ + │ null ┆ 1.5 │ + │ 4.0 ┆ 3.0 │ + └──────┴─────┘ + + >>> agnostic_rolling_var(df_pa) # doctest:+ELLIPSIS + pyarrow.Table + a: double + b: double + ---- + a: [[1,2,null,4]] + b: [[1,1.5,1.5,3]] + """ + from narwhals.exceptions import NarwhalsUnstableWarning + from narwhals.utils import find_stacklevel + + msg = ( + "`Expr.rolling_var` is being called from the stable API although considered " + "an unstable feature." + ) + warn(message=msg, category=NarwhalsUnstableWarning, stacklevel=find_stacklevel()) + return super().rolling_var( + window_size=window_size, min_periods=min_periods, center=center, ddof=ddof + ) + + def rolling_std( + self: Self, + window_size: int, + *, + min_periods: int | None = None, + center: bool = False, + ddof: int = 1, + ) -> Self: + """Apply a rolling standard deviation (moving standard deviation) over the values. + + !!! warning + This functionality is considered **unstable**. It may be changed at any point + without it being considered a breaking change. + + A window of length `window_size` will traverse the values. The resulting values + will be aggregated to their standard deviation. + + The window at a given row will include the row itself and the `window_size - 1` + elements before it. + + Arguments: + window_size: The length of the window in number of elements. It must be a + strictly positive integer. + min_periods: The number of values in the window that should be non-null before + computing a result. If set to `None` (default), it will be set equal to + `window_size`. If provided, it must be a strictly positive integer, and + less than or equal to `window_size` + center: Set the labels at the center of the window. + ddof: Delta Degrees of Freedom; the divisor for a length N window is N - ddof. + + Returns: + A new expression. + + Examples: + >>> import narwhals as nw + >>> import pandas as pd + >>> import polars as pl + >>> import pyarrow as pa + >>> data = {"a": [1.0, 2.0, None, 4.0]} + >>> df_pd = pd.DataFrame(data) + >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) + + We define a library agnostic function: + + >>> @nw.narwhalify + ... def agnostic_rolling_std(df): + ... return df.with_columns( + ... b=nw.col("a").rolling_std(window_size=3, min_periods=1) + ... ) + + We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: + + >>> agnostic_rolling_std(df_pd) + a b + 0 1.0 1.0 + 1 2.0 1.5 + 2 NaN 1.5 + 3 4.0 3.0 + + >>> agnostic_rolling_std(df_pl) + shape: (4, 2) + ┌──────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪═════╡ + │ 1.0 ┆ 1.0 │ + │ 2.0 ┆ 1.5 │ + │ null ┆ 1.5 │ + │ 4.0 ┆ 3.0 │ + └──────┴─────┘ + + >>> agnostic_rolling_std(df_pa) # doctest:+ELLIPSIS + pyarrow.Table + a: double + b: double + ---- + a: [[1,2,null,4]] + b: [[1,1.5,1.5,3]] + """ + from narwhals.exceptions import NarwhalsUnstableWarning + from narwhals.utils import find_stacklevel + + msg = ( + "`Expr.rolling_std` is being called from the stable API although considered " + "an unstable feature." + ) + warn(message=msg, category=NarwhalsUnstableWarning, stacklevel=find_stacklevel()) + return super().rolling_std( + window_size=window_size, + min_periods=min_periods, + center=center, + ddof=ddof, + ) + class Schema(NwSchema): """Ordered mapping of column names to their data type. diff --git a/tests/expr_and_series/rolling_std_test.py b/tests/expr_and_series/rolling_std_test.py new file mode 100644 index 000000000..1ba13a0c3 --- /dev/null +++ b/tests/expr_and_series/rolling_std_test.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import random + +import hypothesis.strategies as st +import pandas as pd +import pyarrow as pa +import pytest +from hypothesis import given + +import narwhals.stable.v1 as nw +from tests.utils import PANDAS_VERSION +from tests.utils import Constructor +from tests.utils import ConstructorEager +from tests.utils import assert_equal_data + +data = {"a": [None, 1, 2, None, 4, 6, 11]} + +kwargs_and_expected = { + "x1": {"kwargs": {"window_size": 3}, "expected": [float("nan")] * 6 + [7.0]}, + "x2": { + "kwargs": {"window_size": 3, "min_periods": 1}, + "expected": [float("nan"), 1.0, 1.5, 1.5, 3.0, 5.0, 7.0], + }, + "x3": { + "kwargs": {"window_size": 2, "min_periods": 1}, + "expected": [float("nan"), 1.0, 1.5, 2.0, 4.0, 5.0, 8.5], + }, + "x4": { + "kwargs": {"window_size": 5, "min_periods": 1, "center": True}, + "expected": [1.5, 1.5, 7 / 3, 3.25, 5.75, 7.0, 7.0], + }, + "x5": { + "kwargs": {"window_size": 4, "min_periods": 1, "center": True}, + "expected": [1.0, 1.5, 1.5, 7 / 3, 4.0, 7.0, 7.0], + }, +} + + +@pytest.mark.filterwarnings( + "ignore:`Expr.rolling_std` is being called from the stable API although considered an unstable feature." +) +def test_rolling_std_expr( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "dask" in str(constructor): + # TODO(FBruzzesi): Dask is raising the following error: + # NotImplementedError: Partition size is less than overlapping window size. + # Try using ``df.repartition`` to increase the partition size. + request.applymarker(pytest.mark.xfail) + + df = nw.from_native(constructor(data)) + result = df.select( + **{ + name: nw.col("a").rolling_std(**values["kwargs"]) # type: ignore[arg-type] + for name, values in kwargs_and_expected.items() + } + ) + expected = {name: values["expected"] for name, values in kwargs_and_expected.items()} + + assert_equal_data(result, expected) + + +@pytest.mark.filterwarnings( + "ignore:`Series.rolling_std` is being called from the stable API although considered an unstable feature." +) +def test_rolling_std_series(constructor_eager: ConstructorEager) -> None: + df = nw.from_native(constructor_eager(data), eager_only=True) + + result = df.select( + **{ + name: df["a"].rolling_std(**values["kwargs"]) # type: ignore[arg-type] + for name, values in kwargs_and_expected.items() + } + ) + expected = {name: values["expected"] for name, values in kwargs_and_expected.items()} + assert_equal_data(result, expected) + + +@given( # type: ignore[misc] + center=st.booleans(), + values=st.lists(st.floats(-10, 10), min_size=3, max_size=10), +) +@pytest.mark.skipif(PANDAS_VERSION < (1,), reason="too old for pyarrow") +@pytest.mark.filterwarnings("ignore:.*:narwhals.exceptions.NarwhalsUnstableWarning") +def test_rolling_std_hypothesis(center: bool, values: list[float]) -> None: # noqa: FBT001 + s = pd.Series(values) + n_missing = random.randint(0, len(s) - 1) # noqa: S311 + window_size = random.randint(1, len(s)) # noqa: S311 + min_periods = random.randint(1, window_size) # noqa: S311 + mask = random.sample(range(len(s)), n_missing) + s[mask] = None + df = pd.DataFrame({"a": s}) + expected = ( + s.rolling(window=window_size, center=center, min_periods=min_periods) + .mean() + .to_frame("a") + ) + result = nw.from_native(pa.Table.from_pandas(df)).select( + nw.col("a").rolling_std(window_size, center=center, min_periods=min_periods) + ) + expected_dict = nw.from_native(expected, eager_only=True).to_dict(as_series=False) + assert_equal_data(result, expected_dict) diff --git a/tests/expr_and_series/rolling_var_test.py b/tests/expr_and_series/rolling_var_test.py new file mode 100644 index 000000000..017c46457 --- /dev/null +++ b/tests/expr_and_series/rolling_var_test.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import random + +import hypothesis.strategies as st +import pandas as pd +import pyarrow as pa +import pytest +from hypothesis import given + +import narwhals.stable.v1 as nw +from tests.utils import PANDAS_VERSION +from tests.utils import Constructor +from tests.utils import ConstructorEager +from tests.utils import assert_equal_data + +data = {"a": [1.0, 2.0, 4.0, 8.0, 12.0, 16.0, 2.0]} + +kwargs_and_expected = { + "x1": {"kwargs": {"window_size": 3}, "expected": [float("nan")] * 6 + [13.0]}, + "x2": { + "kwargs": {"window_size": 3, "min_periods": 1}, + "expected": [float("nan"), 0.0, 0.5, 0.5, 2.0, 2.0, 13.0], + }, + "x3": { + "kwargs": {"window_size": 2, "min_periods": 1}, + "expected": [float("nan"), 0.0, 0.5, 0.0, 0.0, 2.0, 12.5], + }, + "x4": { + "kwargs": {"window_size": 5, "min_periods": 1, "center": True}, + "expected": [ + 0.5, + 0.5, + 2.333333333333332, + 4.916666666666667, + 14.916666666666666, + 13.0, + 13.0, + ], + }, + "x5": { + "kwargs": {"window_size": 4, "min_periods": 1, "center": True}, + "expected": [0.0, 0.5, 0.5, 2.333333333333332, 4.0, 13.0, 13.0], + }, +} + + +@pytest.mark.filterwarnings( + "ignore:`Expr.rolling_var` is being called from the stable API although considered an unstable feature." +) +def test_rolling_var_expr( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + if "dask" in str(constructor): + # TODO(FBruzzesi): Dask is raising the following error: + # NotImplementedError: Partition size is less than overlapping window size. + # Try using ``df.repartition`` to increase the partition size. + request.applymarker(pytest.mark.xfail) + + if "polars" not in str(constructor): + pytest.skip() + + df = nw.from_native(constructor(data)) + result = df.select( + **{ + name: nw.col("a").rolling_var(**values["kwargs"]) # type: ignore[arg-type] + for name, values in kwargs_and_expected.items() + } + ) + expected = {name: values["expected"] for name, values in kwargs_and_expected.items()} + + assert_equal_data(result, expected) + + +@pytest.mark.filterwarnings( + "ignore:`Series.rolling_var` is being called from the stable API although considered an unstable feature." +) +def test_rolling_var_series(constructor_eager: ConstructorEager) -> None: + df = nw.from_native(constructor_eager(data), eager_only=True) + + result = df.select( + **{ + name: df["a"].rolling_var(**values["kwargs"]) # type: ignore[arg-type] + for name, values in kwargs_and_expected.items() + } + ) + expected = {name: values["expected"] for name, values in kwargs_and_expected.items()} + assert_equal_data(result, expected) + + +@given( # type: ignore[misc] + center=st.booleans(), + values=st.lists(st.floats(-10, 10), min_size=3, max_size=10), +) +@pytest.mark.skipif(PANDAS_VERSION < (1,), reason="too old for pyarrow") +@pytest.mark.filterwarnings("ignore:.*:narwhals.exceptions.NarwhalsUnstableWarning") +def test_rolling_var_hypothesis(center: bool, values: list[float]) -> None: # noqa: FBT001 + s = pd.Series(values) + n_missing = random.randint(0, len(s) - 1) # noqa: S311 + window_size = random.randint(1, len(s)) # noqa: S311 + min_periods = random.randint(1, window_size) # noqa: S311 + mask = random.sample(range(len(s)), n_missing) + s[mask] = None + df = pd.DataFrame({"a": s}) + expected = ( + s.rolling(window=window_size, center=center, min_periods=min_periods) + .mean() + .to_frame("a") + ) + result = nw.from_native(pa.Table.from_pandas(df)).select( + nw.col("a").rolling_var(window_size, center=center, min_periods=min_periods) + ) + expected_dict = nw.from_native(expected, eager_only=True).to_dict(as_series=False) + assert_equal_data(result, expected_dict) From 98399ec390285826c983d2ab28067bb6780c4b69 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 26 Nov 2024 23:33:31 +0100 Subject: [PATCH 2/9] WIP --- narwhals/series.py | 8 ++++---- narwhals/stable/v1/__init__.py | 8 ++++---- tests/expr_and_series/rolling_var_test.py | 23 +++++++++-------------- 3 files changed, 17 insertions(+), 22 deletions(-) diff --git a/narwhals/series.py b/narwhals/series.py index 6d5e372e5..7ac122d39 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -3150,7 +3150,7 @@ def rolling_sum( center: Set the labels at the center of the window. Returns: - A new expression. + A new series. Examples: >>> import narwhals as nw @@ -3243,7 +3243,7 @@ def rolling_mean( center: Set the labels at the center of the window. Returns: - A new expression. + A new series. Examples: >>> import narwhals as nw @@ -3338,7 +3338,7 @@ def rolling_var( ddof: Delta Degrees of Freedom; the divisor for a length N window is N - ddof. Returns: - A new expression. + A new series. Examples: >>> import narwhals as nw @@ -3431,7 +3431,7 @@ def rolling_std( ddof: Delta Degrees of Freedom; the divisor for a length N window is N - ddof. Returns: - A new expression. + A new series. Examples: >>> import narwhals as nw diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 80234f62c..9fe731c07 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -659,7 +659,7 @@ def rolling_sum( center: Set the labels at the center of the window. Returns: - A new expression. + A new series. Examples: >>> import narwhals as nw @@ -751,7 +751,7 @@ def rolling_mean( center: Set the labels at the center of the window. Returns: - A new expression. + A new series. Examples: >>> import narwhals as nw @@ -845,7 +845,7 @@ def rolling_var( ddof: Delta Degrees of Freedom; the divisor for a length N window is N - ddof. Returns: - A new expression. + A new series. Examples: >>> import narwhals as nw @@ -940,7 +940,7 @@ def rolling_std( ddof: Delta Degrees of Freedom; the divisor for a length N window is N - ddof. Returns: - A new expression. + A new series. Examples: >>> import narwhals as nw diff --git a/tests/expr_and_series/rolling_var_test.py b/tests/expr_and_series/rolling_var_test.py index 017c46457..73267c564 100644 --- a/tests/expr_and_series/rolling_var_test.py +++ b/tests/expr_and_series/rolling_var_test.py @@ -14,33 +14,28 @@ from tests.utils import ConstructorEager from tests.utils import assert_equal_data -data = {"a": [1.0, 2.0, 4.0, 8.0, 12.0, 16.0, 2.0]} +data = {"a": [1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 1.0]} kwargs_and_expected = { - "x1": {"kwargs": {"window_size": 3}, "expected": [float("nan")] * 6 + [13.0]}, + "x1": { + "kwargs": {"window_size": 3}, + "expected": [float("nan"), float("nan"), 1/3, 1, 4/3, 7/3, 3] + }, "x2": { "kwargs": {"window_size": 3, "min_periods": 1}, - "expected": [float("nan"), 0.0, 0.5, 0.5, 2.0, 2.0, 13.0], + "expected": [float("nan"), 0.5, 1/3, 1., 4/3, 7/3, 3], }, "x3": { "kwargs": {"window_size": 2, "min_periods": 1}, - "expected": [float("nan"), 0.0, 0.5, 0.0, 0.0, 2.0, 12.5], + "expected": [float("nan"), 0.5, 0.5, 2.0, 2.0, 4.5, 4.5], }, "x4": { "kwargs": {"window_size": 5, "min_periods": 1, "center": True}, - "expected": [ - 0.5, - 0.5, - 2.333333333333332, - 4.916666666666667, - 14.916666666666666, - 13.0, - 13.0, - ], + "expected": [1/3, 29/30, 4/5, 17/10, 2., 2.25, 3] }, "x5": { "kwargs": {"window_size": 4, "min_periods": 1, "center": True}, - "expected": [0.0, 0.5, 0.5, 2.333333333333332, 4.0, 13.0, 13.0], + "expected": [0.5, 1/3, 29/30, 29/30, 2.25, 2.25, 3], }, } From d92ae41547d12d0309bc50921006a61ec88c8298 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 26 Nov 2024 23:40:21 +0100 Subject: [PATCH 3/9] it's working --- narwhals/_arrow/series.py | 24 ++++++++++++++--------- tests/expr_and_series/rolling_var_test.py | 20 +++++++++---------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 4d3cb050c..2fc05b838 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -1028,19 +1028,25 @@ def rolling_var( else cum_sum ) + cum_sum_sq = ( + padded_arr.__pow__(2).cum_sum(reverse=False).fill_null(strategy="forward") + ) + rolling_sum_sq = ( + cum_sum_sq - cum_sum_sq.shift(window_size).fill_null(0) + if window_size != 0 + else cum_sum_sq + ) + valid_count = padded_arr.cum_count(reverse=False) count_in_window = valid_count - valid_count.shift(window_size).fill_null(0) - result = ( - self._from_native_series( - pc.if_else( - (count_in_window >= min_periods)._native_series, - rolling_sum._native_series, - None, - ) + result = self._from_native_series( + pc.if_else( + (count_in_window >= min_periods)._native_series, + (rolling_sum_sq - (rolling_sum**2 / count_in_window))._native_series, + None, ) - / count_in_window - ) + ) / (count_in_window - ddof) if center: result = result[offset_left + offset_right :] return result diff --git a/tests/expr_and_series/rolling_var_test.py b/tests/expr_and_series/rolling_var_test.py index 73267c564..7b384b809 100644 --- a/tests/expr_and_series/rolling_var_test.py +++ b/tests/expr_and_series/rolling_var_test.py @@ -19,11 +19,11 @@ kwargs_and_expected = { "x1": { "kwargs": {"window_size": 3}, - "expected": [float("nan"), float("nan"), 1/3, 1, 4/3, 7/3, 3] + "expected": [float("nan"), float("nan"), 1 / 3, 1, 4 / 3, 7 / 3, 3], }, "x2": { "kwargs": {"window_size": 3, "min_periods": 1}, - "expected": [float("nan"), 0.5, 1/3, 1., 4/3, 7/3, 3], + "expected": [float("nan"), 0.5, 1 / 3, 1.0, 4 / 3, 7 / 3, 3], }, "x3": { "kwargs": {"window_size": 2, "min_periods": 1}, @@ -31,11 +31,11 @@ }, "x4": { "kwargs": {"window_size": 5, "min_periods": 1, "center": True}, - "expected": [1/3, 29/30, 4/5, 17/10, 2., 2.25, 3] + "expected": [1 / 3, 11 / 12, 4 / 5, 17 / 10, 2.0, 2.25, 3], }, "x5": { "kwargs": {"window_size": 4, "min_periods": 1, "center": True}, - "expected": [0.5, 1/3, 29/30, 29/30, 2.25, 2.25, 3], + "expected": [0.5, 1 / 3, 11 / 12, 11 / 12, 2.25, 2.25, 3], }, } @@ -52,9 +52,6 @@ def test_rolling_var_expr( # Try using ``df.repartition`` to increase the partition size. request.applymarker(pytest.mark.xfail) - if "polars" not in str(constructor): - pytest.skip() - df = nw.from_native(constructor(data)) result = df.select( **{ @@ -86,10 +83,11 @@ def test_rolling_var_series(constructor_eager: ConstructorEager) -> None: @given( # type: ignore[misc] center=st.booleans(), values=st.lists(st.floats(-10, 10), min_size=3, max_size=10), + ddof=st.integers(min_value=0), ) @pytest.mark.skipif(PANDAS_VERSION < (1,), reason="too old for pyarrow") @pytest.mark.filterwarnings("ignore:.*:narwhals.exceptions.NarwhalsUnstableWarning") -def test_rolling_var_hypothesis(center: bool, values: list[float]) -> None: # noqa: FBT001 +def test_rolling_var_hypothesis(center: bool, values: list[float], ddof: int) -> None: # noqa: FBT001 s = pd.Series(values) n_missing = random.randint(0, len(s) - 1) # noqa: S311 window_size = random.randint(1, len(s)) # noqa: S311 @@ -99,11 +97,13 @@ def test_rolling_var_hypothesis(center: bool, values: list[float]) -> None: # n df = pd.DataFrame({"a": s}) expected = ( s.rolling(window=window_size, center=center, min_periods=min_periods) - .mean() + .var(ddof=ddof) .to_frame("a") ) result = nw.from_native(pa.Table.from_pandas(df)).select( - nw.col("a").rolling_var(window_size, center=center, min_periods=min_periods) + nw.col("a").rolling_var( + window_size, center=center, min_periods=min_periods, ddof=ddof + ) ) expected_dict = nw.from_native(expected, eager_only=True).to_dict(as_series=False) assert_equal_data(result, expected_dict) From a6b315b1a0aae4f31c7fb636ddce411790c6376d Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Wed, 27 Nov 2024 10:27:05 +0100 Subject: [PATCH 4/9] all good for finite --- narwhals/_arrow/series.py | 79 +++------------ narwhals/_arrow/utils.py | 36 +++++++ narwhals/expr.py | 50 +++++----- narwhals/series.py | 50 +++++----- narwhals/stable/v1/__init__.py | 114 +++++++++++----------- tests/expr_and_series/rolling_std_test.py | 57 +++-------- tests/expr_and_series/rolling_var_test.py | 4 + 7 files changed, 179 insertions(+), 211 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 2fc05b838..4827acc95 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -12,6 +12,7 @@ from narwhals._arrow.utils import floordiv_compat from narwhals._arrow.utils import narwhals_to_native_dtype from narwhals._arrow.utils import native_to_narwhals_dtype +from narwhals._arrow.utils import pad_series from narwhals._arrow.utils import parse_datetime_format from narwhals._arrow.utils import validate_column_comparand from narwhals.translate import to_py_scalar @@ -901,34 +902,19 @@ def rolling_sum( min_periods: int | None, center: bool, ) -> Self: - import pyarrow as pa # ignore-banned-import import pyarrow.compute as pc # ignore-banned-import min_periods = min_periods if min_periods is not None else window_size - if center: - offset_left = window_size // 2 - offset_right = offset_left - ( - window_size % 2 == 0 - ) # subtract one if window_size is even - - native_series = self._native_series - - pad_left = pa.array([None] * offset_left, type=native_series.type) - pad_right = pa.array([None] * offset_right, type=native_series.type) - padded_arr = self._from_native_series( - pa.concat_arrays([pad_left, native_series.combine_chunks(), pad_right]) - ) - else: - padded_arr = self + padded_series, offset = pad_series(self, window_size=window_size, center=center) - cum_sum = padded_arr.cum_sum(reverse=False).fill_null(strategy="forward") + cum_sum = padded_series.cum_sum(reverse=False).fill_null(strategy="forward") rolling_sum = ( cum_sum - cum_sum.shift(window_size).fill_null(0) if window_size != 0 else cum_sum ) - valid_count = padded_arr.cum_count(reverse=False) + valid_count = padded_series.cum_count(reverse=False) count_in_window = valid_count - valid_count.shift(window_size).fill_null(0) result = self._from_native_series( @@ -938,9 +924,7 @@ def rolling_sum( None, ) ) - if center: - result = result[offset_left + offset_right :] - return result + return result[offset:] def rolling_mean( self: Self, @@ -949,34 +933,19 @@ def rolling_mean( min_periods: int | None, center: bool, ) -> Self: - import pyarrow as pa # ignore-banned-import import pyarrow.compute as pc # ignore-banned-import min_periods = min_periods if min_periods is not None else window_size - if center: - offset_left = window_size // 2 - offset_right = offset_left - ( - window_size % 2 == 0 - ) # subtract one if window_size is even - - native_series = self._native_series - - pad_left = pa.array([None] * offset_left, type=native_series.type) - pad_right = pa.array([None] * offset_right, type=native_series.type) - padded_arr = self._from_native_series( - pa.concat_arrays([pad_left, native_series.combine_chunks(), pad_right]) - ) - else: - padded_arr = self + padded_series, offset = pad_series(self, window_size=window_size, center=center) - cum_sum = padded_arr.cum_sum(reverse=False).fill_null(strategy="forward") + cum_sum = padded_series.cum_sum(reverse=False).fill_null(strategy="forward") rolling_sum = ( cum_sum - cum_sum.shift(window_size).fill_null(0) if window_size != 0 else cum_sum ) - valid_count = padded_arr.cum_count(reverse=False) + valid_count = padded_series.cum_count(reverse=False) count_in_window = valid_count - valid_count.shift(window_size).fill_null(0) result = ( @@ -989,9 +958,7 @@ def rolling_mean( ) / count_in_window ) - if center: - result = result[offset_left + offset_right :] - return result + return result[offset:] def rolling_var( self: Self, @@ -1001,27 +968,12 @@ def rolling_var( center: bool, ddof: int, ) -> Self: - import pyarrow as pa # ignore-banned-import import pyarrow.compute as pc # ignore-banned-import min_periods = min_periods if min_periods is not None else window_size - if center: - offset_left = window_size // 2 - offset_right = offset_left - ( - window_size % 2 == 0 - ) # subtract one if window_size is even - - native_series = self._native_series - - pad_left = pa.array([None] * offset_left, type=native_series.type) - pad_right = pa.array([None] * offset_right, type=native_series.type) - padded_arr = self._from_native_series( - pa.concat_arrays([pad_left, native_series.combine_chunks(), pad_right]) - ) - else: - padded_arr = self + padded_series, offset = pad_series(self, window_size=window_size, center=center) - cum_sum = padded_arr.cum_sum(reverse=False).fill_null(strategy="forward") + cum_sum = padded_series.cum_sum(reverse=False).fill_null(strategy="forward") rolling_sum = ( cum_sum - cum_sum.shift(window_size).fill_null(0) if window_size != 0 @@ -1029,7 +981,7 @@ def rolling_var( ) cum_sum_sq = ( - padded_arr.__pow__(2).cum_sum(reverse=False).fill_null(strategy="forward") + padded_series.__pow__(2).cum_sum(reverse=False).fill_null(strategy="forward") ) rolling_sum_sq = ( cum_sum_sq - cum_sum_sq.shift(window_size).fill_null(0) @@ -1037,7 +989,7 @@ def rolling_var( else cum_sum_sq ) - valid_count = padded_arr.cum_count(reverse=False) + valid_count = padded_series.cum_count(reverse=False) count_in_window = valid_count - valid_count.shift(window_size).fill_null(0) result = self._from_native_series( @@ -1047,9 +999,8 @@ def rolling_var( None, ) ) / (count_in_window - ddof) - if center: - result = result[offset_left + offset_right :] - return result + + return result[offset:] def rolling_std( self: Self, diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index 2ad2b3344..385301ea2 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -446,3 +446,39 @@ def _parse_time_format(arr: pa.Array) -> str: if pc.all(matches.is_valid()).as_py(): return time_fmt return "" + + +def pad_series( + series: ArrowSeries, *, window_size: int, center: bool +) -> tuple[ArrowSeries, int]: + """Pad series with None values on the left and/or right side, depending on the specified parameters. + + Arguments: + series: The input ArrowSeries to be padded. + window_size: The desired size of the window. + center: Specifies whether to center the padding or not. + + Returns: + A tuple containing the padded ArrowSeries and the offset value. + """ + import pyarrow as pa # ignore-banned-import + + if center: + offset_left = window_size // 2 + offset_right = offset_left - ( + window_size % 2 == 0 + ) # subtract one if window_size is even + + native_series = series._native_series + + pad_left = pa.array([None] * offset_left, type=native_series.type) + pad_right = pa.array([None] * offset_right, type=native_series.type) + padded_arr = series._from_native_series( + pa.concat_arrays([pad_left, native_series.combine_chunks(), pad_right]) + ) + offset = offset_left + offset_right + else: + padded_arr = series + offset = 0 + + return padded_arr, offset diff --git a/narwhals/expr.py b/narwhals/expr.py index 92d39ba6c..ec95a2beb 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -3433,10 +3433,10 @@ def rolling_var( >>> agnostic_rolling_var(df_pd) a b - 0 1.0 1.0 - 1 2.0 1.5 - 2 NaN 1.5 - 3 4.0 3.0 + 0 1.0 NaN + 1 2.0 0.5 + 2 NaN 0.5 + 3 4.0 2.0 >>> agnostic_rolling_var(df_pl) shape: (4, 2) @@ -3445,10 +3445,10 @@ def rolling_var( │ --- ┆ --- │ │ f64 ┆ f64 │ ╞══════╪═════╡ - │ 1.0 ┆ 1.0 │ - │ 2.0 ┆ 1.5 │ - │ null ┆ 1.5 │ - │ 4.0 ┆ 3.0 │ + │ 1.0 ┆ 0.0 │ + │ 2.0 ┆ 0.5 │ + │ null ┆ 0.5 │ + │ 4.0 ┆ 2.0 │ └──────┴─────┘ >>> agnostic_rolling_var(df_pa) # doctest:+ELLIPSIS @@ -3457,7 +3457,7 @@ def rolling_var( b: double ---- a: [[1,2,null,4]] - b: [[1,1.5,1.5,3]] + b: [[nan,0.5,0.5,2]] """ window_size, min_periods = _validate_rolling_arguments( window_size=window_size, min_periods=min_periods @@ -3524,24 +3524,24 @@ def rolling_std( We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> agnostic_rolling_std(df_pd) - a b - 0 1.0 1.0 - 1 2.0 1.5 - 2 NaN 1.5 - 3 4.0 3.0 + a b + 0 1.0 NaN + 1 2.0 0.707107 + 2 NaN 0.707107 + 3 4.0 1.414214 >>> agnostic_rolling_std(df_pl) shape: (4, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪═════╡ - │ 1.0 ┆ 1.0 │ - │ 2.0 ┆ 1.5 │ - │ null ┆ 1.5 │ - │ 4.0 ┆ 3.0 │ - └──────┴─────┘ + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 1.0 ┆ 0.0 │ + │ 2.0 ┆ 0.707107 │ + │ null ┆ 0.707107 │ + │ 4.0 ┆ 1.414214 │ + └──────┴──────────┘ >>> agnostic_rolling_std(df_pa) # doctest:+ELLIPSIS pyarrow.Table @@ -3549,7 +3549,7 @@ def rolling_std( b: double ---- a: [[1,2,null,4]] - b: [[1,1.5,1.5,3]] + b: [[nan,0.7071067811865476,0.7071067811865476,1.4142135623730951]] """ window_size, min_periods = _validate_rolling_arguments( window_size=window_size, min_periods=min_periods diff --git a/narwhals/series.py b/narwhals/series.py index 7ac122d39..e7768dfc4 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -3346,7 +3346,7 @@ def rolling_var( >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> data = [1.0, 2.0, 3.0, 4.0] + >>> data = [1.0, 3.0, 1.0, 4.0] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) >>> s_pa = pa.chunked_array([data]) @@ -3355,15 +3355,15 @@ def rolling_var( >>> def agnostic_rolling_var(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) - ... return s.rolling_var(window_size=2).to_native() + ... return s.rolling_var(window_size=2, min_periods=1).to_native() We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> agnostic_rolling_var(s_pd) 0 NaN - 1 1.5 - 2 2.5 - 3 3.5 + 1 2.0 + 2 2.0 + 3 4.5 dtype: float64 >>> agnostic_rolling_var(s_pl) # doctest:+NORMALIZE_WHITESPACE @@ -3371,19 +3371,19 @@ def rolling_var( Series: '' [f64] [ null - 1.5 - 2.5 - 3.5 + 2.0 + 2.0 + 4.5 ] >>> agnostic_rolling_var(s_pa) # doctest:+ELLIPSIS [ [ - null, - 1.5, - 2.5, - 3.5 + nan, + 2, + 2, + 4.5 ] ] """ @@ -3439,7 +3439,7 @@ def rolling_std( >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> data = [1.0, 2.0, 3.0, 4.0] + >>> data = [1.0, 3.0, 1.0, 4.0] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) >>> s_pa = pa.chunked_array([data]) @@ -3448,15 +3448,15 @@ def rolling_std( >>> def agnostic_rolling_std(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) - ... return s.rolling_std(window_size=2).to_native() + ... return s.rolling_std(window_size=2, min_periods=1).to_native() We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> agnostic_rolling_std(s_pd) - 0 NaN - 1 1.5 - 2 2.5 - 3 3.5 + 0 NaN + 1 1.414214 + 2 1.414214 + 3 2.121320 dtype: float64 >>> agnostic_rolling_std(s_pl) # doctest:+NORMALIZE_WHITESPACE @@ -3464,19 +3464,19 @@ def rolling_std( Series: '' [f64] [ null - 1.5 - 2.5 - 3.5 + 1.414214 + 1.414214 + 2.12132 ] >>> agnostic_rolling_std(s_pa) # doctest:+ELLIPSIS [ [ - null, - 1.5, - 2.5, - 3.5 + nan, + 1.4142135623730951, + 1.4142135623730951, + 2.1213203435596424 ] ] """ diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 9fe731c07..7cacdc27c 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -853,7 +853,7 @@ def rolling_var( >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> data = [1.0, 2.0, 3.0, 4.0] + >>> data = [1.0, 3.0, 1.0, 4.0] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) >>> s_pa = pa.chunked_array([data]) @@ -862,15 +862,15 @@ def rolling_var( >>> def agnostic_rolling_var(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) - ... return s.rolling_var(window_size=2).to_native() + ... return s.rolling_var(window_size=2, min_periods=1).to_native() We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> agnostic_rolling_var(s_pd) 0 NaN - 1 1.5 - 2 2.5 - 3 3.5 + 1 2.0 + 2 2.0 + 3 4.5 dtype: float64 >>> agnostic_rolling_var(s_pl) # doctest:+NORMALIZE_WHITESPACE @@ -878,19 +878,19 @@ def rolling_var( Series: '' [f64] [ null - 1.5 - 2.5 - 3.5 + 2.0 + 2.0 + 4.5 ] >>> agnostic_rolling_var(s_pa) # doctest:+ELLIPSIS [ [ - null, - 1.5, - 2.5, - 3.5 + nan, + 2, + 2, + 4.5 ] ] """ @@ -948,7 +948,7 @@ def rolling_std( >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa - >>> data = [1.0, 2.0, 3.0, 4.0] + >>> data = [1.0, 3.0, 1.0, 4.0] >>> s_pd = pd.Series(data) >>> s_pl = pl.Series(data) >>> s_pa = pa.chunked_array([data]) @@ -957,15 +957,15 @@ def rolling_std( >>> def agnostic_rolling_std(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) - ... return s.rolling_std(window_size=2).to_native() + ... return s.rolling_std(window_size=2, min_periods=1).to_native() We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> agnostic_rolling_std(s_pd) - 0 NaN - 1 1.5 - 2 2.5 - 3 3.5 + 0 NaN + 1 1.414214 + 2 1.414214 + 3 2.121320 dtype: float64 >>> agnostic_rolling_std(s_pl) # doctest:+NORMALIZE_WHITESPACE @@ -973,19 +973,19 @@ def rolling_std( Series: '' [f64] [ null - 1.5 - 2.5 - 3.5 + 1.414214 + 1.414214 + 2.12132 ] >>> agnostic_rolling_std(s_pa) # doctest:+ELLIPSIS [ [ - null, - 1.5, - 2.5, - 3.5 + nan, + 1.4142135623730951, + 1.4142135623730951, + 2.1213203435596424 ] ] """ @@ -1337,6 +1337,7 @@ def rolling_var( Examples: >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa @@ -1347,20 +1348,20 @@ def rolling_var( We define a library agnostic function: - >>> @nw.narwhalify - ... def agnostic_rolling_var(df): + >>> def agnostic_rolling_var(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) ... return df.with_columns( ... b=nw.col("a").rolling_var(window_size=3, min_periods=1) - ... ) + ... ).to_native() We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> agnostic_rolling_var(df_pd) a b - 0 1.0 1.0 - 1 2.0 1.5 - 2 NaN 1.5 - 3 4.0 3.0 + 0 1.0 NaN + 1 2.0 0.5 + 2 NaN 0.5 + 3 4.0 2.0 >>> agnostic_rolling_var(df_pl) shape: (4, 2) @@ -1369,10 +1370,10 @@ def rolling_var( │ --- ┆ --- │ │ f64 ┆ f64 │ ╞══════╪═════╡ - │ 1.0 ┆ 1.0 │ - │ 2.0 ┆ 1.5 │ - │ null ┆ 1.5 │ - │ 4.0 ┆ 3.0 │ + │ 1.0 ┆ 0.0 │ + │ 2.0 ┆ 0.5 │ + │ null ┆ 0.5 │ + │ 4.0 ┆ 2.0 │ └──────┴─────┘ >>> agnostic_rolling_var(df_pa) # doctest:+ELLIPSIS @@ -1381,7 +1382,7 @@ def rolling_var( b: double ---- a: [[1,2,null,4]] - b: [[1,1.5,1.5,3]] + b: [[nan,0.5,0.5,2]] """ from narwhals.exceptions import NarwhalsUnstableWarning from narwhals.utils import find_stacklevel @@ -1430,6 +1431,7 @@ def rolling_std( Examples: >>> import narwhals as nw + >>> from narwhals.typing import IntoFrameT >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa @@ -1440,33 +1442,33 @@ def rolling_std( We define a library agnostic function: - >>> @nw.narwhalify - ... def agnostic_rolling_std(df): + >>> def agnostic_rolling_std(df_native: IntoFrameT) -> IntoFrameT: + ... df = nw.from_native(df_native) ... return df.with_columns( ... b=nw.col("a").rolling_std(window_size=3, min_periods=1) - ... ) + ... ).to_native() We can then pass any supported library such as Pandas, Polars, or PyArrow to `func`: >>> agnostic_rolling_std(df_pd) - a b - 0 1.0 1.0 - 1 2.0 1.5 - 2 NaN 1.5 - 3 4.0 3.0 + a b + 0 1.0 NaN + 1 2.0 0.707107 + 2 NaN 0.707107 + 3 4.0 1.414214 >>> agnostic_rolling_std(df_pl) shape: (4, 2) - ┌──────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ f64 ┆ f64 │ - ╞══════╪═════╡ - │ 1.0 ┆ 1.0 │ - │ 2.0 ┆ 1.5 │ - │ null ┆ 1.5 │ - │ 4.0 ┆ 3.0 │ - └──────┴─────┘ + ┌──────┬──────────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞══════╪══════════╡ + │ 1.0 ┆ 0.0 │ + │ 2.0 ┆ 0.707107 │ + │ null ┆ 0.707107 │ + │ 4.0 ┆ 1.414214 │ + └──────┴──────────┘ >>> agnostic_rolling_std(df_pa) # doctest:+ELLIPSIS pyarrow.Table @@ -1474,7 +1476,7 @@ def rolling_std( b: double ---- a: [[1,2,null,4]] - b: [[1,1.5,1.5,3]] + b: [[nan,0.7071067811865476,0.7071067811865476,1.4142135623730951]] """ from narwhals.exceptions import NarwhalsUnstableWarning from narwhals.utils import find_stacklevel diff --git a/tests/expr_and_series/rolling_std_test.py b/tests/expr_and_series/rolling_std_test.py index 1ba13a0c3..172654f26 100644 --- a/tests/expr_and_series/rolling_std_test.py +++ b/tests/expr_and_series/rolling_std_test.py @@ -1,38 +1,39 @@ from __future__ import annotations -import random - -import hypothesis.strategies as st -import pandas as pd -import pyarrow as pa +import numpy as np import pytest -from hypothesis import given import narwhals.stable.v1 as nw -from tests.utils import PANDAS_VERSION from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data -data = {"a": [None, 1, 2, None, 4, 6, 11]} +data = {"a": [1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 1.0]} kwargs_and_expected = { - "x1": {"kwargs": {"window_size": 3}, "expected": [float("nan")] * 6 + [7.0]}, + "x1": { + "kwargs": {"window_size": 3}, + "expected": np.sqrt([float("nan"), float("nan"), 1 / 3, 1, 4 / 3, 7 / 3, 3]), + }, "x2": { "kwargs": {"window_size": 3, "min_periods": 1}, - "expected": [float("nan"), 1.0, 1.5, 1.5, 3.0, 5.0, 7.0], + "expected": np.sqrt([float("nan"), 0.5, 1 / 3, 1.0, 4 / 3, 7 / 3, 3]), }, "x3": { "kwargs": {"window_size": 2, "min_periods": 1}, - "expected": [float("nan"), 1.0, 1.5, 2.0, 4.0, 5.0, 8.5], + "expected": np.sqrt([float("nan"), 0.5, 0.5, 2.0, 2.0, 4.5, 4.5]), }, "x4": { "kwargs": {"window_size": 5, "min_periods": 1, "center": True}, - "expected": [1.5, 1.5, 7 / 3, 3.25, 5.75, 7.0, 7.0], + "expected": np.sqrt([1 / 3, 11 / 12, 4 / 5, 17 / 10, 2.0, 2.25, 3]), }, "x5": { "kwargs": {"window_size": 4, "min_periods": 1, "center": True}, - "expected": [1.0, 1.5, 1.5, 7 / 3, 4.0, 7.0, 7.0], + "expected": np.sqrt([0.5, 1 / 3, 11 / 12, 11 / 12, 2.25, 2.25, 3]), + }, + "x6": { + "kwargs": {"window_size": 3, "ddof": 2}, + "expected": np.sqrt([float("nan"), float("nan"), 2 / 3, 2.0, 8 / 3, 14 / 3, 6.0]), }, } @@ -52,7 +53,7 @@ def test_rolling_std_expr( df = nw.from_native(constructor(data)) result = df.select( **{ - name: nw.col("a").rolling_std(**values["kwargs"]) # type: ignore[arg-type] + name: nw.col("a").rolling_std(**values["kwargs"]) for name, values in kwargs_and_expected.items() } ) @@ -69,35 +70,9 @@ def test_rolling_std_series(constructor_eager: ConstructorEager) -> None: result = df.select( **{ - name: df["a"].rolling_std(**values["kwargs"]) # type: ignore[arg-type] + name: df["a"].rolling_std(**values["kwargs"]) for name, values in kwargs_and_expected.items() } ) expected = {name: values["expected"] for name, values in kwargs_and_expected.items()} assert_equal_data(result, expected) - - -@given( # type: ignore[misc] - center=st.booleans(), - values=st.lists(st.floats(-10, 10), min_size=3, max_size=10), -) -@pytest.mark.skipif(PANDAS_VERSION < (1,), reason="too old for pyarrow") -@pytest.mark.filterwarnings("ignore:.*:narwhals.exceptions.NarwhalsUnstableWarning") -def test_rolling_std_hypothesis(center: bool, values: list[float]) -> None: # noqa: FBT001 - s = pd.Series(values) - n_missing = random.randint(0, len(s) - 1) # noqa: S311 - window_size = random.randint(1, len(s)) # noqa: S311 - min_periods = random.randint(1, window_size) # noqa: S311 - mask = random.sample(range(len(s)), n_missing) - s[mask] = None - df = pd.DataFrame({"a": s}) - expected = ( - s.rolling(window=window_size, center=center, min_periods=min_periods) - .mean() - .to_frame("a") - ) - result = nw.from_native(pa.Table.from_pandas(df)).select( - nw.col("a").rolling_std(window_size, center=center, min_periods=min_periods) - ) - expected_dict = nw.from_native(expected, eager_only=True).to_dict(as_series=False) - assert_equal_data(result, expected_dict) diff --git a/tests/expr_and_series/rolling_var_test.py b/tests/expr_and_series/rolling_var_test.py index 7b384b809..1470a7efc 100644 --- a/tests/expr_and_series/rolling_var_test.py +++ b/tests/expr_and_series/rolling_var_test.py @@ -37,6 +37,10 @@ "kwargs": {"window_size": 4, "min_periods": 1, "center": True}, "expected": [0.5, 1 / 3, 11 / 12, 11 / 12, 2.25, 2.25, 3], }, + "x6": { + "kwargs": {"window_size": 3, "ddof": 2}, + "expected": [float("nan"), float("nan"), 2 / 3, 2.0, 8 / 3, 14 / 3, 6.0], + }, } From ff52d0c19206101ba0fe23b662114feda7c97e13 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Thu, 28 Nov 2024 22:13:39 +0100 Subject: [PATCH 5/9] fix hyp test --- tests/expr_and_series/rolling_var_test.py | 24 ++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/tests/expr_and_series/rolling_var_test.py b/tests/expr_and_series/rolling_var_test.py index 1470a7efc..60702a4f1 100644 --- a/tests/expr_and_series/rolling_var_test.py +++ b/tests/expr_and_series/rolling_var_test.py @@ -4,6 +4,7 @@ import hypothesis.strategies as st import pandas as pd +import polars as pl import pyarrow as pa import pytest from hypothesis import given @@ -86,17 +87,17 @@ def test_rolling_var_series(constructor_eager: ConstructorEager) -> None: @given( # type: ignore[misc] center=st.booleans(), - values=st.lists(st.floats(-10, 10), min_size=3, max_size=10), - ddof=st.integers(min_value=0), + values=st.lists(st.floats(-10, 10), min_size=5, max_size=10), ) @pytest.mark.skipif(PANDAS_VERSION < (1,), reason="too old for pyarrow") @pytest.mark.filterwarnings("ignore:.*:narwhals.exceptions.NarwhalsUnstableWarning") -def test_rolling_var_hypothesis(center: bool, values: list[float], ddof: int) -> None: # noqa: FBT001 +def test_rolling_var_hypothesis(center: bool, values: list[float]) -> None: # noqa: FBT001 s = pd.Series(values) - n_missing = random.randint(0, len(s) - 1) # noqa: S311 - window_size = random.randint(1, len(s)) # noqa: S311 - min_periods = random.randint(1, window_size) # noqa: S311 - mask = random.sample(range(len(s)), n_missing) + window_size = random.randint(2, len(s)) # noqa: S311 + min_periods = random.randint(2, window_size) # noqa: S311 + ddof = random.randint(0, min_periods - 1) # noqa: S311 + mask = random.sample(range(len(s)), 2) + s[mask] = None df = pd.DataFrame({"a": s}) expected = ( @@ -104,6 +105,7 @@ def test_rolling_var_hypothesis(center: bool, values: list[float], ddof: int) -> .var(ddof=ddof) .to_frame("a") ) + result = nw.from_native(pa.Table.from_pandas(df)).select( nw.col("a").rolling_var( window_size, center=center, min_periods=min_periods, ddof=ddof @@ -111,3 +113,11 @@ def test_rolling_var_hypothesis(center: bool, values: list[float], ddof: int) -> ) expected_dict = nw.from_native(expected, eager_only=True).to_dict(as_series=False) assert_equal_data(result, expected_dict) + + result = nw.from_native(pl.from_pandas(df)).select( + nw.col("a").rolling_var( + window_size, center=center, min_periods=min_periods, ddof=ddof + ) + ) + expected_dict = nw.from_native(expected, eager_only=True).to_dict(as_series=False) + assert_equal_data(result, expected_dict) From ef61859bc16818944d42f4e152f8eb60a485848a Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Thu, 28 Nov 2024 22:21:48 +0100 Subject: [PATCH 6/9] pyarrow denominator fix --- narwhals/_arrow/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 21e50be1b..baae9f38a 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -1000,13 +1000,13 @@ def rolling_var( valid_count = padded_series.cum_count(reverse=False) count_in_window = valid_count - valid_count.shift(window_size).fill_null(0) - result = self._from_native_series( + result: Self = self._from_native_series( pc.if_else( (count_in_window >= min_periods)._native_series, (rolling_sum_sq - (rolling_sum**2 / count_in_window))._native_series, None, ) - ) / (count_in_window - ddof) + ) / max(count_in_window - ddof, 0) # type: ignore[call-overload] return result[offset:] From 688b69219eb568bdad4860817c6b160dd21dd706 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Thu, 28 Nov 2024 22:27:48 +0100 Subject: [PATCH 7/9] use max_element_wise --- narwhals/_arrow/series.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index baae9f38a..f2b2c82e4 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -1000,13 +1000,15 @@ def rolling_var( valid_count = padded_series.cum_count(reverse=False) count_in_window = valid_count - valid_count.shift(window_size).fill_null(0) - result: Self = self._from_native_series( + result = self._from_native_series( pc.if_else( (count_in_window >= min_periods)._native_series, (rolling_sum_sq - (rolling_sum**2 / count_in_window))._native_series, None, ) - ) / max(count_in_window - ddof, 0) # type: ignore[call-overload] + ) / self._from_native_series( + pc.max_element_wise((count_in_window - ddof)._native_series, 0) + ) return result[offset:] From a3cf75a84c62e72a240da19723dd6206ab3ea196 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Thu, 28 Nov 2024 22:35:24 +0100 Subject: [PATCH 8/9] split test via parametrize for better log --- tests/expr_and_series/rolling_var_test.py | 61 +++++++++++++---------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/tests/expr_and_series/rolling_var_test.py b/tests/expr_and_series/rolling_var_test.py index 60702a4f1..74b8b0b24 100644 --- a/tests/expr_and_series/rolling_var_test.py +++ b/tests/expr_and_series/rolling_var_test.py @@ -1,6 +1,7 @@ from __future__ import annotations import random +from typing import Any import hypothesis.strategies as st import pandas as pd @@ -17,40 +18,53 @@ data = {"a": [1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 1.0]} -kwargs_and_expected = { - "x1": { +kwargs_and_expected = ( + { + "name": "x1", "kwargs": {"window_size": 3}, "expected": [float("nan"), float("nan"), 1 / 3, 1, 4 / 3, 7 / 3, 3], }, - "x2": { + { + "name": "x2", "kwargs": {"window_size": 3, "min_periods": 1}, "expected": [float("nan"), 0.5, 1 / 3, 1.0, 4 / 3, 7 / 3, 3], }, - "x3": { + { + "name": "x3", "kwargs": {"window_size": 2, "min_periods": 1}, "expected": [float("nan"), 0.5, 0.5, 2.0, 2.0, 4.5, 4.5], }, - "x4": { + { + "name": "x4", "kwargs": {"window_size": 5, "min_periods": 1, "center": True}, "expected": [1 / 3, 11 / 12, 4 / 5, 17 / 10, 2.0, 2.25, 3], }, - "x5": { + { + "name": "x5", "kwargs": {"window_size": 4, "min_periods": 1, "center": True}, "expected": [0.5, 1 / 3, 11 / 12, 11 / 12, 2.25, 2.25, 3], }, - "x6": { + { + "name": "x6", "kwargs": {"window_size": 3, "ddof": 2}, "expected": [float("nan"), float("nan"), 2 / 3, 2.0, 8 / 3, 14 / 3, 6.0], }, -} +) @pytest.mark.filterwarnings( "ignore:`Expr.rolling_var` is being called from the stable API although considered an unstable feature." ) +@pytest.mark.parametrize("kwargs_and_expected", kwargs_and_expected) def test_rolling_var_expr( - request: pytest.FixtureRequest, constructor: Constructor + request: pytest.FixtureRequest, + constructor: Constructor, + kwargs_and_expected: dict[str, Any], ) -> None: + name = kwargs_and_expected["name"] + kwargs = kwargs_and_expected["kwargs"] + expected = kwargs_and_expected["expected"] + if "dask" in str(constructor): # TODO(FBruzzesi): Dask is raising the following error: # NotImplementedError: Partition size is less than overlapping window size. @@ -58,31 +72,26 @@ def test_rolling_var_expr( request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select( - **{ - name: nw.col("a").rolling_var(**values["kwargs"]) # type: ignore[arg-type] - for name, values in kwargs_and_expected.items() - } - ) - expected = {name: values["expected"] for name, values in kwargs_and_expected.items()} + result = df.select(nw.col("a").rolling_var(**kwargs).alias(name)) - assert_equal_data(result, expected) + assert_equal_data(result, {name: expected}) @pytest.mark.filterwarnings( "ignore:`Series.rolling_var` is being called from the stable API although considered an unstable feature." ) -def test_rolling_var_series(constructor_eager: ConstructorEager) -> None: +@pytest.mark.parametrize("kwargs_and_expected", kwargs_and_expected) +def test_rolling_var_series( + constructor_eager: ConstructorEager, kwargs_and_expected: dict[str, Any] +) -> None: + name = kwargs_and_expected["name"] + kwargs = kwargs_and_expected["kwargs"] + expected = kwargs_and_expected["expected"] + df = nw.from_native(constructor_eager(data), eager_only=True) + result = df.select(df["a"].rolling_var(**kwargs).alias(name)) - result = df.select( - **{ - name: df["a"].rolling_var(**values["kwargs"]) # type: ignore[arg-type] - for name, values in kwargs_and_expected.items() - } - ) - expected = {name: values["expected"] for name, values in kwargs_and_expected.items()} - assert_equal_data(result, expected) + assert_equal_data(result, {name: expected}) @given( # type: ignore[misc] From a3af2dd84ae22bc61932a8ee819018ed03d59db8 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Thu, 28 Nov 2024 22:40:15 +0100 Subject: [PATCH 9/9] split test via parametrize for better log in std --- tests/expr_and_series/rolling_std_test.py | 62 +++++++++++++---------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/tests/expr_and_series/rolling_std_test.py b/tests/expr_and_series/rolling_std_test.py index 172654f26..57b994310 100644 --- a/tests/expr_and_series/rolling_std_test.py +++ b/tests/expr_and_series/rolling_std_test.py @@ -1,5 +1,7 @@ from __future__ import annotations +from typing import Any + import numpy as np import pytest @@ -10,40 +12,53 @@ data = {"a": [1.0, 2.0, 1.0, 3.0, 1.0, 4.0, 1.0]} -kwargs_and_expected = { - "x1": { +kwargs_and_expected = ( + { + "name": "x1", "kwargs": {"window_size": 3}, "expected": np.sqrt([float("nan"), float("nan"), 1 / 3, 1, 4 / 3, 7 / 3, 3]), }, - "x2": { + { + "name": "x2", "kwargs": {"window_size": 3, "min_periods": 1}, "expected": np.sqrt([float("nan"), 0.5, 1 / 3, 1.0, 4 / 3, 7 / 3, 3]), }, - "x3": { + { + "name": "x3", "kwargs": {"window_size": 2, "min_periods": 1}, "expected": np.sqrt([float("nan"), 0.5, 0.5, 2.0, 2.0, 4.5, 4.5]), }, - "x4": { + { + "name": "x4", "kwargs": {"window_size": 5, "min_periods": 1, "center": True}, "expected": np.sqrt([1 / 3, 11 / 12, 4 / 5, 17 / 10, 2.0, 2.25, 3]), }, - "x5": { + { + "name": "x5", "kwargs": {"window_size": 4, "min_periods": 1, "center": True}, "expected": np.sqrt([0.5, 1 / 3, 11 / 12, 11 / 12, 2.25, 2.25, 3]), }, - "x6": { + { + "name": "x6", "kwargs": {"window_size": 3, "ddof": 2}, "expected": np.sqrt([float("nan"), float("nan"), 2 / 3, 2.0, 8 / 3, 14 / 3, 6.0]), }, -} +) @pytest.mark.filterwarnings( "ignore:`Expr.rolling_std` is being called from the stable API although considered an unstable feature." ) +@pytest.mark.parametrize("kwargs_and_expected", kwargs_and_expected) def test_rolling_std_expr( - request: pytest.FixtureRequest, constructor: Constructor + request: pytest.FixtureRequest, + constructor: Constructor, + kwargs_and_expected: dict[str, Any], ) -> None: + name = kwargs_and_expected["name"] + kwargs = kwargs_and_expected["kwargs"] + expected = kwargs_and_expected["expected"] + if "dask" in str(constructor): # TODO(FBruzzesi): Dask is raising the following error: # NotImplementedError: Partition size is less than overlapping window size. @@ -51,28 +66,23 @@ def test_rolling_std_expr( request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.select( - **{ - name: nw.col("a").rolling_std(**values["kwargs"]) - for name, values in kwargs_and_expected.items() - } - ) - expected = {name: values["expected"] for name, values in kwargs_and_expected.items()} + result = df.select(nw.col("a").rolling_std(**kwargs).alias(name)) - assert_equal_data(result, expected) + assert_equal_data(result, {name: expected}) @pytest.mark.filterwarnings( "ignore:`Series.rolling_std` is being called from the stable API although considered an unstable feature." ) -def test_rolling_std_series(constructor_eager: ConstructorEager) -> None: +@pytest.mark.parametrize("kwargs_and_expected", kwargs_and_expected) +def test_rolling_std_series( + constructor_eager: ConstructorEager, kwargs_and_expected: dict[str, Any] +) -> None: + name = kwargs_and_expected["name"] + kwargs = kwargs_and_expected["kwargs"] + expected = kwargs_and_expected["expected"] + df = nw.from_native(constructor_eager(data), eager_only=True) + result = df.select(df["a"].rolling_std(**kwargs).alias(name)) - result = df.select( - **{ - name: df["a"].rolling_std(**values["kwargs"]) - for name, values in kwargs_and_expected.items() - } - ) - expected = {name: values["expected"] for name, values in kwargs_and_expected.items()} - assert_equal_data(result, expected) + assert_equal_data(result, {name: expected})