Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add limit_area to ffill/bfill #56531

Merged
merged 5 commits into from
Dec 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ Other enhancements
- :func:`tseries.api.guess_datetime_format` is now part of the public API (:issue:`54727`)
- :meth:`ExtensionArray._explode` interface method added to allow extension type implementations of the ``explode`` method (:issue:`54833`)
- :meth:`ExtensionArray.duplicated` added to allow extension type implementations of the ``duplicated`` method (:issue:`55255`)
- :meth:`Series.ffill`, :meth:`Series.bfill`, :meth:`DataFrame.ffill`, and :meth:`DataFrame.bfill` have gained the argument ``limit_area`` (:issue:`56492`)
- Allow passing ``read_only``, ``data_only`` and ``keep_links`` arguments to openpyxl using ``engine_kwargs`` of :func:`read_excel` (:issue:`55027`)
- DataFrame.apply now allows the usage of numba (via ``engine="numba"``) to JIT compile the passed function, allowing for potential speedups (:issue:`54666`)
- Implement masked algorithms for :meth:`Series.value_counts` (:issue:`54984`)
Expand Down
37 changes: 36 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -7097,6 +7097,7 @@ def _pad_or_backfill(
axis: None | Axis = None,
inplace: bool_t = False,
limit: None | int = None,
limit_area: Literal["inside", "outside"] | None = None,
downcast: dict | None = None,
):
if axis is None:
Expand All @@ -7110,14 +7111,17 @@ def _pad_or_backfill(
# in all axis=1 cases, and remove axis kward from mgr.pad_or_backfill.
if inplace:
raise NotImplementedError()
result = self.T._pad_or_backfill(method=method, limit=limit).T
result = self.T._pad_or_backfill(
method=method, limit=limit, limit_area=limit_area
).T

return result

new_mgr = self._mgr.pad_or_backfill(
method=method,
axis=self._get_block_manager_axis(axis),
limit=limit,
limit_area=limit_area,
inplace=inplace,
downcast=downcast,
)
Expand Down Expand Up @@ -7477,6 +7481,7 @@ def ffill(
axis: None | Axis = ...,
inplace: Literal[False] = ...,
limit: None | int = ...,
limit_area: Literal["inside", "outside"] | None = ...,
downcast: dict | None | lib.NoDefault = ...,
) -> Self:
...
Expand All @@ -7488,6 +7493,7 @@ def ffill(
axis: None | Axis = ...,
inplace: Literal[True],
limit: None | int = ...,
limit_area: Literal["inside", "outside"] | None = ...,
downcast: dict | None | lib.NoDefault = ...,
) -> None:
...
Expand All @@ -7499,6 +7505,7 @@ def ffill(
axis: None | Axis = ...,
inplace: bool_t = ...,
limit: None | int = ...,
limit_area: Literal["inside", "outside"] | None = ...,
downcast: dict | None | lib.NoDefault = ...,
) -> Self | None:
...
Expand All @@ -7514,6 +7521,7 @@ def ffill(
axis: None | Axis = None,
inplace: bool_t = False,
limit: None | int = None,
limit_area: Literal["inside", "outside"] | None = None,
downcast: dict | None | lib.NoDefault = lib.no_default,
) -> Self | None:
"""
Expand All @@ -7535,6 +7543,17 @@ def ffill(
be partially filled. If method is not specified, this is the
maximum number of entries along the entire axis where NaNs will be
filled. Must be greater than 0 if not None.
limit_area : {{`None`, 'inside', 'outside'}}, default None
If limit is specified, consecutive NaNs will be filled with this
restriction.

* ``None``: No fill restriction.
* 'inside': Only fill NaNs surrounded by valid values
(interpolate).
* 'outside': Only fill NaNs outside valid values (extrapolate).

.. versionadded:: 2.2.0

downcast : dict, default is None
A dict of item->dtype of what to downcast if possible,
or the string 'infer' which will try to downcast to an appropriate
Expand Down Expand Up @@ -7606,6 +7625,7 @@ def ffill(
axis=axis,
inplace=inplace,
limit=limit,
limit_area=limit_area,
# error: Argument "downcast" to "_fillna_with_method" of "NDFrame"
# has incompatible type "Union[Dict[Any, Any], None,
# Literal[_NoDefault.no_default]]"; expected "Optional[Dict[Any, Any]]"
Expand Down Expand Up @@ -7653,6 +7673,7 @@ def bfill(
axis: None | Axis = ...,
inplace: Literal[False] = ...,
limit: None | int = ...,
limit_area: Literal["inside", "outside"] | None = ...,
downcast: dict | None | lib.NoDefault = ...,
) -> Self:
...
Expand All @@ -7675,6 +7696,7 @@ def bfill(
axis: None | Axis = ...,
inplace: bool_t = ...,
limit: None | int = ...,
limit_area: Literal["inside", "outside"] | None = ...,
downcast: dict | None | lib.NoDefault = ...,
) -> Self | None:
...
Expand All @@ -7690,6 +7712,7 @@ def bfill(
axis: None | Axis = None,
inplace: bool_t = False,
limit: None | int = None,
limit_area: Literal["inside", "outside"] | None = None,
downcast: dict | None | lib.NoDefault = lib.no_default,
) -> Self | None:
"""
Expand All @@ -7711,6 +7734,17 @@ def bfill(
be partially filled. If method is not specified, this is the
maximum number of entries along the entire axis where NaNs will be
filled. Must be greater than 0 if not None.
limit_area : {{`None`, 'inside', 'outside'}}, default None
If limit is specified, consecutive NaNs will be filled with this
restriction.

* ``None``: No fill restriction.
* 'inside': Only fill NaNs surrounded by valid values
(interpolate).
* 'outside': Only fill NaNs outside valid values (extrapolate).

.. versionadded:: 2.2.0

downcast : dict, default is None
A dict of item->dtype of what to downcast if possible,
or the string 'infer' which will try to downcast to an appropriate
Expand Down Expand Up @@ -7793,6 +7827,7 @@ def bfill(
axis=axis,
inplace=inplace,
limit=limit,
limit_area=limit_area,
# error: Argument "downcast" to "_fillna_with_method" of "NDFrame"
# has incompatible type "Union[Dict[Any, Any], None,
# Literal[_NoDefault.no_default]]"; expected "Optional[Dict[Any, Any]]"
Expand Down
97 changes: 97 additions & 0 deletions pandas/tests/frame/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,3 +857,100 @@ def test_pad_backfill_deprecated(func):
df = DataFrame({"a": [1, 2, 3]})
with tm.assert_produces_warning(FutureWarning):
getattr(df, func)()


@pytest.mark.parametrize(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

im a bit late here, but these tests look really similar to the series ones. is there a viable way to share them?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes - but where would they go? Just all in frame/methods?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

usually we put them in the frame/methods/test_foo.py file and parametrize over frame_or_series

"data, expected_data, method, kwargs",
(
pytest.param(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan],
"ffill",
{"limit_area": "inside"},
marks=pytest.mark.xfail(
reason="GH#41813 - limit_area applied to the wrong axis"
),
),
pytest.param(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan],
"ffill",
{"limit_area": "inside", "limit": 1},
marks=pytest.mark.xfail(
reason="GH#41813 - limit_area applied to the wrong axis"
),
),
pytest.param(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0],
"ffill",
{"limit_area": "outside"},
marks=pytest.mark.xfail(
reason="GH#41813 - limit_area applied to the wrong axis"
),
),
pytest.param(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan],
"ffill",
{"limit_area": "outside", "limit": 1},
marks=pytest.mark.xfail(
reason="GH#41813 - limit_area applied to the wrong axis"
),
),
(
[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
"ffill",
{"limit_area": "outside", "limit": 1},
),
(
range(5),
range(5),
"ffill",
{"limit_area": "outside", "limit": 1},
),
pytest.param(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan],
"bfill",
{"limit_area": "inside"},
marks=pytest.mark.xfail(
reason="GH#41813 - limit_area applied to the wrong axis"
),
),
pytest.param(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan],
"bfill",
{"limit_area": "inside", "limit": 1},
marks=pytest.mark.xfail(
reason="GH#41813 - limit_area applied to the wrong axis"
),
),
pytest.param(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
"bfill",
{"limit_area": "outside"},
marks=pytest.mark.xfail(
reason="GH#41813 - limit_area applied to the wrong axis"
),
),
pytest.param(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
"bfill",
{"limit_area": "outside", "limit": 1},
marks=pytest.mark.xfail(
reason="GH#41813 - limit_area applied to the wrong axis"
),
),
),
)
def test_ffill_bfill_limit_area(data, expected_data, method, kwargs):
# GH#56492
df = DataFrame(data)
expected = DataFrame(expected_data)
result = getattr(df, method)(**kwargs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like the tests all use numpy dtypes; would this work with EAs? id expect the limit_area kwd to be added to EA._pad_of_backfill but dont see it there

Copy link
Member Author

@rhshadrach rhshadrach Dec 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks - I missed this, interpolate currently ignores limit_area with ffill/bfill and EAs, so this will too. The implementation appears to be straightforward enough to add. I think I should be able to do this in the next week, but do we want to revert this PR for the RC?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

im not going to lose sleep either way

Copy link
Member

@mroeschke mroeschke Dec 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I should be able to do this in the next week, but do we want to revert this PR for the RC?

I would say it's fine to just add this after the RC

tm.assert_frame_equal(result, expected)
4 changes: 2 additions & 2 deletions pandas/tests/groupby/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def test_frame_consistency(groupby_func):
elif groupby_func in ("median", "prod", "sem"):
exclude_expected = {"axis", "kwargs", "skipna"}
elif groupby_func in ("backfill", "bfill", "ffill", "pad"):
exclude_expected = {"downcast", "inplace", "axis"}
exclude_expected = {"downcast", "inplace", "axis", "limit_area"}
elif groupby_func in ("cummax", "cummin"):
exclude_expected = {"skipna", "args"}
exclude_result = {"numeric_only"}
Expand Down Expand Up @@ -240,7 +240,7 @@ def test_series_consistency(request, groupby_func):
elif groupby_func in ("median", "prod", "sem"):
exclude_expected = {"axis", "kwargs", "skipna"}
elif groupby_func in ("backfill", "bfill", "ffill", "pad"):
exclude_expected = {"downcast", "inplace", "axis"}
exclude_expected = {"downcast", "inplace", "axis", "limit_area"}
elif groupby_func in ("cummax", "cummin"):
exclude_expected = {"skipna", "args"}
exclude_result = {"numeric_only"}
Expand Down
73 changes: 73 additions & 0 deletions pandas/tests/series/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -1080,3 +1080,76 @@ def test_pad_backfill_deprecated(self, func):
ser = Series([1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
getattr(ser, func)()


@pytest.mark.parametrize(
"data, expected_data, method, kwargs",
(
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, 3.0, 3.0, 3.0, 7.0, np.nan, np.nan],
"ffill",
{"limit_area": "inside"},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, 3.0, np.nan, np.nan, 7.0, np.nan, np.nan],
"ffill",
{"limit_area": "inside", "limit": 1},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, 7.0],
"ffill",
{"limit_area": "outside"},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan],
"ffill",
{"limit_area": "outside", "limit": 1},
),
(
[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
[np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
"ffill",
{"limit_area": "outside", "limit": 1},
),
(
range(5),
range(5),
"ffill",
{"limit_area": "outside", "limit": 1},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, 7.0, 7.0, 7.0, 7.0, np.nan, np.nan],
"bfill",
{"limit_area": "inside"},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, np.nan, 3.0, np.nan, np.nan, 7.0, 7.0, np.nan, np.nan],
"bfill",
{"limit_area": "inside", "limit": 1},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
"bfill",
{"limit_area": "outside"},
),
(
[np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan],
[np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan],
"bfill",
{"limit_area": "outside", "limit": 1},
),
),
)
def test_ffill_bfill_limit_area(data, expected_data, method, kwargs):
# GH#56492
s = Series(data)
expected = Series(expected_data)
result = getattr(s, method)(**kwargs)
tm.assert_series_equal(result, expected)