Skip to content

Commit

Permalink
CoW: Warn for cases that go through putmask (#56168)
Browse files Browse the repository at this point in the history
Co-authored-by: Joris Van den Bossche <[email protected]>
  • Loading branch information
phofl and jorisvandenbossche authored Dec 4, 2023
1 parent 5165102 commit ae76ad4
Show file tree
Hide file tree
Showing 11 changed files with 120 additions and 60 deletions.
3 changes: 2 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10578,6 +10578,7 @@ def _where(
inplace: bool_t = False,
axis: Axis | None = None,
level=None,
warn: bool_t = True,
):
"""
Equivalent to public method `where`, except that `other` is not
Expand Down Expand Up @@ -10708,7 +10709,7 @@ def _where(
# we may have different type blocks come out of putmask, so
# reconstruct the block manager

new_data = self._mgr.putmask(mask=cond, new=other, align=align)
new_data = self._mgr.putmask(mask=cond, new=other, align=align, warn=warn)
result = self._constructor_from_mgr(new_data, axes=new_data.axes)
return self._update_inplace(result)

Expand Down
24 changes: 22 additions & 2 deletions pandas/core/internals/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

import numpy as np

from pandas._config import using_copy_on_write
from pandas._config import (
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs import (
algos as libalgos,
Expand Down Expand Up @@ -49,6 +52,16 @@
)


class _AlreadyWarned:
def __init__(self):
# This class is used on the manager level to the block level to
# ensure that we warn only once. The block method can update the
# warned_already option without returning a value to keep the
# interface consistent. This is only a temporary solution for
# CoW warnings.
self.warned_already = False


class DataManager(PandasObject):
# TODO share more methods/attributes

Expand Down Expand Up @@ -196,19 +209,26 @@ def where(self, other, cond, align: bool) -> Self:
)

@final
def putmask(self, mask, new, align: bool = True) -> Self:
def putmask(self, mask, new, align: bool = True, warn: bool = True) -> Self:
if align:
align_keys = ["new", "mask"]
else:
align_keys = ["mask"]
new = extract_array(new, extract_numpy=True)

already_warned = None
if warn_copy_on_write():
already_warned = _AlreadyWarned()
if not warn:
already_warned.warned_already = True

return self.apply_with_block(
"putmask",
align_keys=align_keys,
mask=mask,
new=new,
using_cow=using_copy_on_write(),
already_warned=already_warned,
)

@final
Expand Down
58 changes: 56 additions & 2 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pandas._config import (
get_option,
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs import (
Expand Down Expand Up @@ -136,6 +137,29 @@
_dtype_obj = np.dtype("object")


COW_WARNING_GENERAL_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
You are mutating a Series or DataFrame object, and currently this mutation will
also have effect on other Series or DataFrame objects that share data with this
object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object
will never modify another.
"""


COW_WARNING_SETITEM_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
Currently, the mutation will also have effect on the object that shares data
with this object. For example, when setting a value in a Series that was
extracted from a column of a DataFrame, that DataFrame will also be updated:
ser = df["col"]
ser[0] = 0 <--- in pandas 2, this also updates `df`
In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never
modify another, and thus in the example above, `df` will not be changed.
"""


def maybe_split(meth: F) -> F:
"""
If we have a multi-column block, split and operate block-wise. Otherwise
Expand Down Expand Up @@ -1355,7 +1379,9 @@ def setitem(self, indexer, value, using_cow: bool = False) -> Block:
values[indexer] = casted
return self

def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
def putmask(
self, mask, new, using_cow: bool = False, already_warned=None
) -> list[Block]:
"""
putmask the data to the block; it is possible that we may create a
new dtype of block
Expand Down Expand Up @@ -1388,6 +1414,19 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
return [self.copy(deep=False)]
return [self]

if (
warn_copy_on_write()
and already_warned is not None
and not already_warned.warned_already
):
if self.refs.has_reference():
warnings.warn(
COW_WARNING_GENERAL_MSG,
FutureWarning,
stacklevel=find_stack_level(),
)
already_warned.warned_already = True

try:
casted = np_can_hold_element(values.dtype, new)

Expand Down Expand Up @@ -2020,7 +2059,9 @@ def where(
return [nb]

@final
def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
def putmask(
self, mask, new, using_cow: bool = False, already_warned=None
) -> list[Block]:
"""
See Block.putmask.__doc__
"""
Expand All @@ -2038,6 +2079,19 @@ def putmask(self, mask, new, using_cow: bool = False) -> list[Block]:
return [self.copy(deep=False)]
return [self]

if (
warn_copy_on_write()
and already_warned is not None
and not already_warned.warned_already
):
if self.refs.has_reference():
warnings.warn(
COW_WARNING_GENERAL_MSG,
FutureWarning,
stacklevel=find_stack_level(),
)
already_warned.warned_already = True

self = self._maybe_copy(using_cow, inplace=True)
values = self.values
if values.ndim == 2:
Expand Down
25 changes: 2 additions & 23 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@
interleaved_dtype,
)
from pandas.core.internals.blocks import (
COW_WARNING_GENERAL_MSG,
COW_WARNING_SETITEM_MSG,
Block,
NumpyBlock,
ensure_block_shape,
Expand Down Expand Up @@ -100,29 +102,6 @@
from pandas.api.extensions import ExtensionArray


COW_WARNING_GENERAL_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
You are mutating a Series or DataFrame object, and currently this mutation will
also have effect on other Series or DataFrame objects that share data with this
object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object
will never modify another.
"""


COW_WARNING_SETITEM_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
Currently, the mutation will also have effect on the object that shares data
with this object. For example, when setting a value in a Series that was
extracted from a column of a DataFrame, that DataFrame will also be updated:
ser = df["col"]
ser[0] = 0 <--- in pandas 2, this also updates `df`
In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never
modify another, and thus in the example above, `df` will not be changed.
"""


class BaseBlockManager(DataManager):
"""
Core internal data structure to implement DataFrame, Series, etc.
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1320,7 +1320,7 @@ def __setitem__(self, key, value) -> None:
# otherwise with listlike other we interpret series[mask] = other
# as series[mask] = other[mask]
try:
self._where(~key, value, inplace=True)
self._where(~key, value, inplace=True, warn=warn)
except InvalidIndexError:
# test_where_dups
self.iloc[key] = value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write):
@pytest.mark.parametrize(
"indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
)
def test_series_setitem(indexer, using_copy_on_write):
def test_series_setitem(indexer, using_copy_on_write, warn_copy_on_write):
# ensure we only get a single warning for those typical cases of chained
# assignment
df = DataFrame({"a": [1, 2, 3], "b": 1})
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/copy_view/test_clip.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,16 @@
from pandas.tests.copy_view.util import get_array


def test_clip_inplace_reference(using_copy_on_write):
def test_clip_inplace_reference(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1.5, 2, 3]})
df_copy = df.copy()
arr_a = get_array(df, "a")
view = df[:]
df.clip(lower=2, inplace=True)
if warn_copy_on_write:
with tm.assert_cow_warning():
df.clip(lower=2, inplace=True)
else:
df.clip(lower=2, inplace=True)

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), arr_a)
Expand Down
19 changes: 5 additions & 14 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,10 +367,11 @@ def test_subset_set_with_mask(backend, using_copy_on_write, warn_copy_on_write):

mask = subset > 3

# TODO(CoW-warn) should warn -> mask is a DataFrame, which ends up going through
# DataFrame._where(..., inplace=True)
if using_copy_on_write or warn_copy_on_write:
if using_copy_on_write:
subset[mask] = 0
elif warn_copy_on_write:
with tm.assert_cow_warning():
subset[mask] = 0
else:
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(SettingWithCopyWarning):
Expand Down Expand Up @@ -867,18 +868,8 @@ def test_series_subset_set_with_indexer(
and indexer.dtype.kind == "i"
):
warn = FutureWarning
is_mask = (
indexer_si is tm.setitem
and isinstance(indexer, np.ndarray)
and indexer.dtype.kind == "b"
)
if warn_copy_on_write:
# TODO(CoW-warn) should also warn for setting with mask
# -> Series.__setitem__ with boolean mask ends up using Series._set_values
# or Series._where depending on value being set
with tm.assert_cow_warning(
not is_mask, raise_on_extra_warnings=warn is not None
):
with tm.assert_cow_warning(raise_on_extra_warnings=warn is not None):
indexer_si(subset)[indexer] = 0
else:
with tm.assert_produces_warning(warn, match=msg):
Expand Down
25 changes: 18 additions & 7 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -1407,11 +1407,12 @@ def test_items(using_copy_on_write, warn_copy_on_write):


@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_putmask(using_copy_on_write, dtype):
def test_putmask(using_copy_on_write, dtype, warn_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype)
view = df[:]
df_orig = df.copy()
df[df == df] = 5
with tm.assert_cow_warning(warn_copy_on_write):
df[df == df] = 5

if using_copy_on_write:
assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
Expand Down Expand Up @@ -1445,15 +1446,21 @@ def test_putmask_aligns_rhs_no_reference(using_copy_on_write, dtype):
@pytest.mark.parametrize(
"val, exp, warn", [(5.5, True, FutureWarning), (5, False, None)]
)
def test_putmask_dont_copy_some_blocks(using_copy_on_write, val, exp, warn):
def test_putmask_dont_copy_some_blocks(
using_copy_on_write, val, exp, warn, warn_copy_on_write
):
df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5})
view = df[:]
df_orig = df.copy()
indexer = DataFrame(
[[True, False, False], [True, False, False]], columns=list("abc")
)
with tm.assert_produces_warning(warn, match="incompatible dtype"):
df[indexer] = val
if warn_copy_on_write:
with tm.assert_cow_warning():
df[indexer] = val
else:
with tm.assert_produces_warning(warn, match="incompatible dtype"):
df[indexer] = val

if using_copy_on_write:
assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
Expand Down Expand Up @@ -1796,13 +1803,17 @@ def test_update_frame(using_copy_on_write, warn_copy_on_write):
tm.assert_frame_equal(view, expected)


def test_update_series(using_copy_on_write):
def test_update_series(using_copy_on_write, warn_copy_on_write):
ser1 = Series([1.0, 2.0, 3.0])
ser2 = Series([100.0], index=[1])
ser1_orig = ser1.copy()
view = ser1[:]

ser1.update(ser2)
if warn_copy_on_write:
with tm.assert_cow_warning():
ser1.update(ser2)
else:
ser1.update(ser2)

expected = Series([1.0, 100.0, 3.0])
tm.assert_series_equal(ser1, expected)
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,14 +279,18 @@ def test_replace_categorical(using_copy_on_write, val):


@pytest.mark.parametrize("method", ["where", "mask"])
def test_masking_inplace(using_copy_on_write, method):
def test_masking_inplace(using_copy_on_write, method, warn_copy_on_write):
df = DataFrame({"a": [1.5, 2, 3]})
df_orig = df.copy()
arr_a = get_array(df, "a")
view = df[:]

method = getattr(df, method)
method(df["a"] > 1.6, -1, inplace=True)
if warn_copy_on_write:
with tm.assert_cow_warning():
method(df["a"] > 1.6, -1, inplace=True)
else:
method(df["a"] > 1.6, -1, inplace=True)

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), arr_a)
Expand Down
6 changes: 1 addition & 5 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -729,11 +729,7 @@ def test_replace_for_new_dtypes(self, datetime_frame):

tsframe.loc[tsframe.index[:5], "A"] = np.nan
tsframe.loc[tsframe.index[-5:], "A"] = np.nan
tsframe.loc[tsframe.index[:5], "B"] = -1e8

b = tsframe["B"]
b[b == -1e8] = np.nan
tsframe["B"] = b
tsframe.loc[tsframe.index[:5], "B"] = np.nan
msg = "DataFrame.fillna with 'method' is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
# TODO: what is this even testing?
Expand Down

0 comments on commit ae76ad4

Please sign in to comment.