From ae76ad49003536f02e697fb6077c4ec3a90ce048 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 4 Dec 2023 11:08:25 +0100 Subject: [PATCH] CoW: Warn for cases that go through putmask (#56168) Co-authored-by: Joris Van den Bossche --- pandas/core/generic.py | 3 +- pandas/core/internals/base.py | 24 +++++++- pandas/core/internals/blocks.py | 58 ++++++++++++++++++- pandas/core/internals/managers.py | 25 +------- pandas/core/series.py | 2 +- .../test_chained_assignment_deprecation.py | 2 +- pandas/tests/copy_view/test_clip.py | 8 ++- pandas/tests/copy_view/test_indexing.py | 19 ++---- pandas/tests/copy_view/test_methods.py | 25 +++++--- pandas/tests/copy_view/test_replace.py | 8 ++- pandas/tests/frame/methods/test_replace.py | 6 +- 11 files changed, 120 insertions(+), 60 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 376354aedea63..ace6323e84c77 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10578,6 +10578,7 @@ def _where( inplace: bool_t = False, axis: Axis | None = None, level=None, + warn: bool_t = True, ): """ Equivalent to public method `where`, except that `other` is not @@ -10708,7 +10709,7 @@ def _where( # we may have different type blocks come out of putmask, so # reconstruct the block manager - new_data = self._mgr.putmask(mask=cond, new=other, align=align) + new_data = self._mgr.putmask(mask=cond, new=other, align=align, warn=warn) result = self._constructor_from_mgr(new_data, axes=new_data.axes) return self._update_inplace(result) diff --git a/pandas/core/internals/base.py b/pandas/core/internals/base.py index fe366c7375c6a..664856b828347 100644 --- a/pandas/core/internals/base.py +++ b/pandas/core/internals/base.py @@ -14,7 +14,10 @@ import numpy as np -from pandas._config import using_copy_on_write +from pandas._config import ( + using_copy_on_write, + warn_copy_on_write, +) from pandas._libs import ( algos as libalgos, @@ -49,6 +52,16 @@ ) +class _AlreadyWarned: + def __init__(self): + # This class is used on the manager level to the block level to + # ensure that we warn only once. The block method can update the + # warned_already option without returning a value to keep the + # interface consistent. This is only a temporary solution for + # CoW warnings. + self.warned_already = False + + class DataManager(PandasObject): # TODO share more methods/attributes @@ -196,19 +209,26 @@ def where(self, other, cond, align: bool) -> Self: ) @final - def putmask(self, mask, new, align: bool = True) -> Self: + def putmask(self, mask, new, align: bool = True, warn: bool = True) -> Self: if align: align_keys = ["new", "mask"] else: align_keys = ["mask"] new = extract_array(new, extract_numpy=True) + already_warned = None + if warn_copy_on_write(): + already_warned = _AlreadyWarned() + if not warn: + already_warned.warned_already = True + return self.apply_with_block( "putmask", align_keys=align_keys, mask=mask, new=new, using_cow=using_copy_on_write(), + already_warned=already_warned, ) @final diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 535d18f99f0ef..a06d266870edc 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -18,6 +18,7 @@ from pandas._config import ( get_option, using_copy_on_write, + warn_copy_on_write, ) from pandas._libs import ( @@ -136,6 +137,29 @@ _dtype_obj = np.dtype("object") +COW_WARNING_GENERAL_MSG = """\ +Setting a value on a view: behaviour will change in pandas 3.0. +You are mutating a Series or DataFrame object, and currently this mutation will +also have effect on other Series or DataFrame objects that share data with this +object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object +will never modify another. +""" + + +COW_WARNING_SETITEM_MSG = """\ +Setting a value on a view: behaviour will change in pandas 3.0. +Currently, the mutation will also have effect on the object that shares data +with this object. For example, when setting a value in a Series that was +extracted from a column of a DataFrame, that DataFrame will also be updated: + + ser = df["col"] + ser[0] = 0 <--- in pandas 2, this also updates `df` + +In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never +modify another, and thus in the example above, `df` will not be changed. +""" + + def maybe_split(meth: F) -> F: """ If we have a multi-column block, split and operate block-wise. Otherwise @@ -1355,7 +1379,9 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block: values[indexer] = casted return self - def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: + def putmask( + self, mask, new, using_cow: bool = False, already_warned=None + ) -> list[Block]: """ putmask the data to the block; it is possible that we may create a new dtype of block @@ -1388,6 +1414,19 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: return [self.copy(deep=False)] return [self] + if ( + warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + try: casted = np_can_hold_element(values.dtype, new) @@ -2020,7 +2059,9 @@ def where( return [nb] @final - def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: + def putmask( + self, mask, new, using_cow: bool = False, already_warned=None + ) -> list[Block]: """ See Block.putmask.__doc__ """ @@ -2038,6 +2079,19 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]: return [self.copy(deep=False)] return [self] + if ( + warn_copy_on_write() + and already_warned is not None + and not already_warned.warned_already + ): + if self.refs.has_reference(): + warnings.warn( + COW_WARNING_GENERAL_MSG, + FutureWarning, + stacklevel=find_stack_level(), + ) + already_warned.warned_already = True + self = self._maybe_copy(using_cow, inplace=True) values = self.values if values.ndim == 2: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c11150eb4c4d7..a02f31d4483b2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -72,6 +72,8 @@ interleaved_dtype, ) from pandas.core.internals.blocks import ( + COW_WARNING_GENERAL_MSG, + COW_WARNING_SETITEM_MSG, Block, NumpyBlock, ensure_block_shape, @@ -100,29 +102,6 @@ from pandas.api.extensions import ExtensionArray -COW_WARNING_GENERAL_MSG = """\ -Setting a value on a view: behaviour will change in pandas 3.0. -You are mutating a Series or DataFrame object, and currently this mutation will -also have effect on other Series or DataFrame objects that share data with this -object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object -will never modify another. -""" - - -COW_WARNING_SETITEM_MSG = """\ -Setting a value on a view: behaviour will change in pandas 3.0. -Currently, the mutation will also have effect on the object that shares data -with this object. For example, when setting a value in a Series that was -extracted from a column of a DataFrame, that DataFrame will also be updated: - - ser = df["col"] - ser[0] = 0 <--- in pandas 2, this also updates `df` - -In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never -modify another, and thus in the example above, `df` will not be changed. -""" - - class BaseBlockManager(DataManager): """ Core internal data structure to implement DataFrame, Series, etc. diff --git a/pandas/core/series.py b/pandas/core/series.py index 6da0351766c91..ff03b5071e3b1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1320,7 +1320,7 @@ def __setitem__(self, key, value) -> None: # otherwise with listlike other we interpret series[mask] = other # as series[mask] = other[mask] try: - self._where(~key, value, inplace=True) + self._where(~key, value, inplace=True, warn=warn) except InvalidIndexError: # test_where_dups self.iloc[key] = value diff --git a/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/pandas/tests/copy_view/test_chained_assignment_deprecation.py index 32d0f55f67185..b51a5920917d6 100644 --- a/pandas/tests/copy_view/test_chained_assignment_deprecation.py +++ b/pandas/tests/copy_view/test_chained_assignment_deprecation.py @@ -76,7 +76,7 @@ def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write): @pytest.mark.parametrize( "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])] ) -def test_series_setitem(indexer, using_copy_on_write): +def test_series_setitem(indexer, using_copy_on_write, warn_copy_on_write): # ensure we only get a single warning for those typical cases of chained # assignment df = DataFrame({"a": [1, 2, 3], "b": 1}) diff --git a/pandas/tests/copy_view/test_clip.py b/pandas/tests/copy_view/test_clip.py index 13ddd479c7c67..7ed6a1f803ead 100644 --- a/pandas/tests/copy_view/test_clip.py +++ b/pandas/tests/copy_view/test_clip.py @@ -8,12 +8,16 @@ from pandas.tests.copy_view.util import get_array -def test_clip_inplace_reference(using_copy_on_write): +def test_clip_inplace_reference(using_copy_on_write, warn_copy_on_write): df = DataFrame({"a": [1.5, 2, 3]}) df_copy = df.copy() arr_a = get_array(df, "a") view = df[:] - df.clip(lower=2, inplace=True) + if warn_copy_on_write: + with tm.assert_cow_warning(): + df.clip(lower=2, inplace=True) + else: + df.clip(lower=2, inplace=True) if using_copy_on_write: assert not np.shares_memory(get_array(df, "a"), arr_a) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 72b7aea3709c0..355eb2db0ef09 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -367,10 +367,11 @@ def test_subset_set_with_mask(backend, using_copy_on_write, warn_copy_on_write): mask = subset > 3 - # TODO(CoW-warn) should warn -> mask is a DataFrame, which ends up going through - # DataFrame._where(..., inplace=True) - if using_copy_on_write or warn_copy_on_write: + if using_copy_on_write: subset[mask] = 0 + elif warn_copy_on_write: + with tm.assert_cow_warning(): + subset[mask] = 0 else: with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning(SettingWithCopyWarning): @@ -867,18 +868,8 @@ def test_series_subset_set_with_indexer( and indexer.dtype.kind == "i" ): warn = FutureWarning - is_mask = ( - indexer_si is tm.setitem - and isinstance(indexer, np.ndarray) - and indexer.dtype.kind == "b" - ) if warn_copy_on_write: - # TODO(CoW-warn) should also warn for setting with mask - # -> Series.__setitem__ with boolean mask ends up using Series._set_values - # or Series._where depending on value being set - with tm.assert_cow_warning( - not is_mask, raise_on_extra_warnings=warn is not None - ): + with tm.assert_cow_warning(raise_on_extra_warnings=warn is not None): indexer_si(subset)[indexer] = 0 else: with tm.assert_produces_warning(warn, match=msg): diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 806842dcab57a..ba8e4bd684198 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1407,11 +1407,12 @@ def test_items(using_copy_on_write, warn_copy_on_write): @pytest.mark.parametrize("dtype", ["int64", "Int64"]) -def test_putmask(using_copy_on_write, dtype): +def test_putmask(using_copy_on_write, dtype, warn_copy_on_write): df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype) view = df[:] df_orig = df.copy() - df[df == df] = 5 + with tm.assert_cow_warning(warn_copy_on_write): + df[df == df] = 5 if using_copy_on_write: assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) @@ -1445,15 +1446,21 @@ def test_putmask_aligns_rhs_no_reference(using_copy_on_write, dtype): @pytest.mark.parametrize( "val, exp, warn", [(5.5, True, FutureWarning), (5, False, None)] ) -def test_putmask_dont_copy_some_blocks(using_copy_on_write, val, exp, warn): +def test_putmask_dont_copy_some_blocks( + using_copy_on_write, val, exp, warn, warn_copy_on_write +): df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5}) view = df[:] df_orig = df.copy() indexer = DataFrame( [[True, False, False], [True, False, False]], columns=list("abc") ) - with tm.assert_produces_warning(warn, match="incompatible dtype"): - df[indexer] = val + if warn_copy_on_write: + with tm.assert_cow_warning(): + df[indexer] = val + else: + with tm.assert_produces_warning(warn, match="incompatible dtype"): + df[indexer] = val if using_copy_on_write: assert not np.shares_memory(get_array(view, "a"), get_array(df, "a")) @@ -1796,13 +1803,17 @@ def test_update_frame(using_copy_on_write, warn_copy_on_write): tm.assert_frame_equal(view, expected) -def test_update_series(using_copy_on_write): +def test_update_series(using_copy_on_write, warn_copy_on_write): ser1 = Series([1.0, 2.0, 3.0]) ser2 = Series([100.0], index=[1]) ser1_orig = ser1.copy() view = ser1[:] - ser1.update(ser2) + if warn_copy_on_write: + with tm.assert_cow_warning(): + ser1.update(ser2) + else: + ser1.update(ser2) expected = Series([1.0, 100.0, 3.0]) tm.assert_series_equal(ser1, expected) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index d11a2893becdc..3d8559a1905fc 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -279,14 +279,18 @@ def test_replace_categorical(using_copy_on_write, val): @pytest.mark.parametrize("method", ["where", "mask"]) -def test_masking_inplace(using_copy_on_write, method): +def test_masking_inplace(using_copy_on_write, method, warn_copy_on_write): df = DataFrame({"a": [1.5, 2, 3]}) df_orig = df.copy() arr_a = get_array(df, "a") view = df[:] method = getattr(df, method) - method(df["a"] > 1.6, -1, inplace=True) + if warn_copy_on_write: + with tm.assert_cow_warning(): + method(df["a"] > 1.6, -1, inplace=True) + else: + method(df["a"] > 1.6, -1, inplace=True) if using_copy_on_write: assert not np.shares_memory(get_array(df, "a"), arr_a) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 4b32d3de59ca2..13e2c1a249ac2 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -729,11 +729,7 @@ def test_replace_for_new_dtypes(self, datetime_frame): tsframe.loc[tsframe.index[:5], "A"] = np.nan tsframe.loc[tsframe.index[-5:], "A"] = np.nan - tsframe.loc[tsframe.index[:5], "B"] = -1e8 - - b = tsframe["B"] - b[b == -1e8] = np.nan - tsframe["B"] = b + tsframe.loc[tsframe.index[:5], "B"] = np.nan msg = "DataFrame.fillna with 'method' is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): # TODO: what is this even testing?