Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CoW: add warning mode for cases that will change behaviour #55428

Merged
merged 11 commits into from
Oct 30, 2023
4 changes: 4 additions & 0 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ jobs:
env_file: actions-311.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "1"
- name: "Copy-on-Write (warnings)"
env_file: actions-311.yaml
pattern: "not slow and not network and not single_cpu"
pandas_copy_on_write: "warn"
- name: "Pypy"
env_file: actions-pypy-39.yaml
pattern: "not slow and not network and not single_cpu"
Expand Down
14 changes: 13 additions & 1 deletion pandas/_config/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"option_context",
"options",
"using_copy_on_write",
"warn_copy_on_write",
]
from pandas._config import config
from pandas._config import dates # pyright: ignore[reportUnusedImport] # noqa: F401
Expand All @@ -32,7 +33,18 @@

def using_copy_on_write() -> bool:
_mode_options = _global_config["mode"]
return _mode_options["copy_on_write"] and _mode_options["data_manager"] == "block"
return (
_mode_options["copy_on_write"] is True
and _mode_options["data_manager"] == "block"
)


def warn_copy_on_write() -> bool:
_mode_options = _global_config["mode"]
return (
_mode_options["copy_on_write"] == "warn"
and _mode_options["data_manager"] == "block"
)


def using_nullable_dtypes() -> bool:
Expand Down
13 changes: 12 additions & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -1994,7 +1994,18 @@ def using_copy_on_write() -> bool:
Fixture to check if Copy-on-Write is enabled.
"""
return (
pd.options.mode.copy_on_write
pd.options.mode.copy_on_write is True
and _get_option("mode.data_manager", silent=True) == "block"
)


@pytest.fixture
def warn_copy_on_write() -> bool:
"""
Fixture to check if Copy-on-Write is enabled.
"""
return (
pd.options.mode.copy_on_write == "warn"
and _get_option("mode.data_manager", silent=True) == "block"
)

Expand Down
6 changes: 4 additions & 2 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,9 +476,11 @@ def use_inf_as_na_cb(key) -> None:
"copy_on_write",
# Get the default from an environment variable, if set, otherwise defaults
# to False. This environment variable can be set for testing.
os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1",
"warn"
if os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "warn"
else os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1",
copy_on_write_doc,
validator=is_bool,
validator=is_one_of_factory([True, False, "warn"]),
)


Expand Down
11 changes: 9 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from pandas._config import (
config,
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs import lib
Expand Down Expand Up @@ -4396,7 +4397,7 @@ def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
df.iloc[0:5]['group'] = 'a'

"""
if using_copy_on_write():
if using_copy_on_write() or warn_copy_on_write():
return

# return early if the check is not needed
Expand Down Expand Up @@ -12374,14 +12375,20 @@ def _inplace_method(self, other, op) -> Self:
"""
Wrap arithmetic method to operate inplace.
"""
warn = True
if not PYPY and warn_copy_on_write():
if sys.getrefcount(self) <= 5:
# we are probably in an inplace setitem context (e.g. df['a'] += 1)
warn = False

result = op(self, other)

if self.ndim == 1 and result._indexed_same(self) and result.dtype == self.dtype:
# GH#36498 this inplace op can _actually_ be inplace.
# Item "ArrayManager" of "Union[ArrayManager, SingleArrayManager,
# BlockManager, SingleBlockManager]" has no attribute "setitem_inplace"
self._mgr.setitem_inplace( # type: ignore[union-attr]
slice(None), result._values
slice(None), result._values, warn=warn
)
return self

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def array(self) -> ArrayLike:
# error: "SingleDataManager" has no attribute "arrays"; maybe "array"
return self.arrays[0] # type: ignore[attr-defined]

def setitem_inplace(self, indexer, value) -> None:
def setitem_inplace(self, indexer, value, warn: bool = True) -> None:
"""
Set values with indexer.

Expand Down
36 changes: 31 additions & 5 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@

import numpy as np

from pandas._config import using_copy_on_write
from pandas._config import (
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs import (
internals as libinternals,
Expand Down Expand Up @@ -96,6 +99,20 @@
from pandas.api.extensions import ExtensionArray


COW_WARNING_SETITEM_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
Currently, the mutation will also have effect on the object that shares data
with this object. For example, when setting a value in a Series that was
extracted from a column of a DataFrame, that DataFrame will also be updated:

ser = df["col"]
ser[0] = 0 <--- in pandas 2, this also updates `df`

In pandas 3.0 (with Copy-on-Write), updating one Series/DataFrame will never
modify another, and thus in the example above, `df` will not be changed.
"""
Comment on lines +103 to +114
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One question is: how elaborate do we want to make this warning message?

One the one hand, I think one can argue that it can be short and point to a doc page where it is explained in more detail (which I actually should add anyway).

On the other hand, I also want it to be understandable for the many people that won't read the docs about this in detail.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would anticipate users also wanting to remove/address this warning, so in light of that, I would opt to move the explanation to a doc page if we can include potential actions users can take to remove this warning (e.g. set the copy-on-write mode to True)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it OK to leave this as is, for now, and do a separate PR to write the documentation and update the warnings (add links to the docs, potentially shorten it)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah sure thing 👍

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree with the doc page



class BaseBlockManager(DataManager):
"""
Core internal data structure to implement DataFrame, Series, etc.
Expand Down Expand Up @@ -1978,7 +1995,7 @@ def get_numeric_data(self, copy: bool = False) -> Self:
def _can_hold_na(self) -> bool:
return self._block._can_hold_na

def setitem_inplace(self, indexer, value) -> None:
def setitem_inplace(self, indexer, value, warn: bool = True) -> None:
"""
Set values with indexer.

Expand All @@ -1988,9 +2005,18 @@ def setitem_inplace(self, indexer, value) -> None:
in place, not returning a new Manager (and Block), and thus never changing
the dtype.
"""
if using_copy_on_write() and not self._has_no_reference(0):
self.blocks = (self._block.copy(),)
self._cache.clear()
using_cow = using_copy_on_write()
warn_cow = warn_copy_on_write()
if using_cow or warn_cow and not self._has_no_reference(0):
if using_cow:
self.blocks = (self._block.copy(),)
self._cache.clear()
elif warn and warn_cow:
warnings.warn(
COW_WARNING_SETITEM_MSG,
FutureWarning,
stacklevel=find_stack_level(),
)

super().setitem_inplace(indexer, value)

Expand Down
68 changes: 47 additions & 21 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,9 @@ def test_subset_row_slice(backend, using_copy_on_write):
@pytest.mark.parametrize(
"dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_column_slice(backend, using_copy_on_write, using_array_manager, dtype):
def test_subset_column_slice(
backend, using_copy_on_write, warn_copy_on_write, using_array_manager, dtype
):
# Case: taking a subset of the columns of a DataFrame using a slice
# + afterwards modifying the subset
dtype_backend, DataFrame, _ = backend
Expand All @@ -159,10 +161,14 @@ def test_subset_column_slice(backend, using_copy_on_write, using_array_manager,

subset.iloc[0, 0] = 0
assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b"))

else:
# we only get a warning in case of a single block
warn = SettingWithCopyWarning if single_block else None
# TODO(CoW-warn) should warn
warn = (
SettingWithCopyWarning
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am confused, shouldn't this be silent when CoW is enabled as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, and that using_copy_on_Write case is already handled in the if block above, so in this else I still need to handle the warn_copy_on_write case.

I could actually do a elif warn_copy_on_write separate block, that might be easier to read and which is also the pattern I started to use in many other places.

if (single_block and not warn_copy_on_write)
else None
)
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(warn):
subset.iloc[0, 0] = 0
Expand Down Expand Up @@ -303,7 +309,9 @@ def test_subset_iloc_rows_columns(
[slice(0, 2), np.array([True, True, False]), np.array([0, 1])],
ids=["slice", "mask", "array"],
)
def test_subset_set_with_row_indexer(backend, indexer_si, indexer, using_copy_on_write):
def test_subset_set_with_row_indexer(
backend, indexer_si, indexer, using_copy_on_write, warn_copy_on_write
):
# Case: setting values with a row indexer on a viewing subset
# subset[indexer] = value and subset.iloc[indexer] = value
_, DataFrame, _ = backend
Expand All @@ -318,7 +326,8 @@ def test_subset_set_with_row_indexer(backend, indexer_si, indexer, using_copy_on
):
pytest.skip("setitem with labels selects on columns")

if using_copy_on_write:
# TODO(CoW-warn) should warn
if using_copy_on_write or warn_copy_on_write:
indexer_si(subset)[indexer] = 0
else:
# INFO iloc no longer raises warning since pandas 1.4
Expand All @@ -340,7 +349,7 @@ def test_subset_set_with_row_indexer(backend, indexer_si, indexer, using_copy_on
tm.assert_frame_equal(df, df_orig)


def test_subset_set_with_mask(backend, using_copy_on_write):
def test_subset_set_with_mask(backend, using_copy_on_write, warn_copy_on_write):
# Case: setting values with a mask on a viewing subset: subset[mask] = value
_, DataFrame, _ = backend
df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]})
Expand All @@ -349,7 +358,8 @@ def test_subset_set_with_mask(backend, using_copy_on_write):

mask = subset > 3

if using_copy_on_write:
# TODO(CoW-warn) should warn
if using_copy_on_write or warn_copy_on_write:
subset[mask] = 0
else:
with pd.option_context("chained_assignment", "warn"):
Expand All @@ -370,7 +380,7 @@ def test_subset_set_with_mask(backend, using_copy_on_write):
tm.assert_frame_equal(df, df_orig)


def test_subset_set_column(backend, using_copy_on_write):
def test_subset_set_column(backend, using_copy_on_write, warn_copy_on_write):
# Case: setting a single column on a viewing subset -> subset[col] = value
dtype_backend, DataFrame, _ = backend
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
Expand All @@ -382,7 +392,8 @@ def test_subset_set_column(backend, using_copy_on_write):
else:
arr = pd.array([10, 11], dtype="Int64")

if using_copy_on_write:
# TODO(CoW-warn) should warn
if using_copy_on_write or warn_copy_on_write:
subset["a"] = arr
else:
with pd.option_context("chained_assignment", "warn"):
Expand Down Expand Up @@ -472,7 +483,7 @@ def test_subset_set_column_with_loc2(backend, using_copy_on_write, using_array_m
@pytest.mark.parametrize(
"dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
)
def test_subset_set_columns(backend, using_copy_on_write, dtype):
def test_subset_set_columns(backend, using_copy_on_write, warn_copy_on_write, dtype):
# Case: setting multiple columns on a viewing subset
# -> subset[[col1, col2]] = value
dtype_backend, DataFrame, _ = backend
Expand All @@ -482,7 +493,8 @@ def test_subset_set_columns(backend, using_copy_on_write, dtype):
df_orig = df.copy()
subset = df[1:3]

if using_copy_on_write:
# TODO(CoW-warn) should warn
if using_copy_on_write or warn_copy_on_write:
subset[["a", "c"]] = 0
else:
with pd.option_context("chained_assignment", "warn"):
Expand Down Expand Up @@ -879,7 +891,9 @@ def test_del_series(backend):
# Accessing column as Series


def test_column_as_series(backend, using_copy_on_write, using_array_manager):
def test_column_as_series(
backend, using_copy_on_write, warn_copy_on_write, using_array_manager
):
# Case: selecting a single column now also uses Copy-on-Write
dtype_backend, DataFrame, Series = backend
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
Expand All @@ -892,10 +906,14 @@ def test_column_as_series(backend, using_copy_on_write, using_array_manager):
if using_copy_on_write or using_array_manager:
s[0] = 0
else:
warn = SettingWithCopyWarning if dtype_backend == "numpy" else None
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(warn):
if warn_copy_on_write:
with tm.assert_produces_warning(FutureWarning):
s[0] = 0
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How important do we find it that we ensure the exact message? (given that this is only temporary, maybe not super important?)

(although I suppose it is not difficult to just add match="Setting a value on a view" to all cases I am adding in this PR.

I don't know how useful it will be to add a custom tm.assert_produces_cow_warning, but it would make the tm.assert_produces_warning(FutureWarning, match="Setting a value on a view"): a bit shorter.

But when adding warnings for more cases, the exact warning message might also differ.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's okay being lax with matching the warning message during testing (but noting a comment that it's CoW related if it's not obvious from the test)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but noting a comment that it's CoW related if it's not obvious from the test

That might be a reason to actually make something like a tm.assert_produces_cow_warning, then it's also clear without adding a comment everywhere, and will be easier to remove all of those.

else:
warn = SettingWithCopyWarning if dtype_backend == "numpy" else None
with pd.option_context("chained_assignment", "warn"):
with tm.assert_produces_warning(warn):
s[0] = 0

expected = Series([0, 2, 3], name="a")
tm.assert_series_equal(s, expected)
Expand All @@ -910,7 +928,7 @@ def test_column_as_series(backend, using_copy_on_write, using_array_manager):


def test_column_as_series_set_with_upcast(
backend, using_copy_on_write, using_array_manager
backend, using_copy_on_write, using_array_manager, warn_copy_on_write
):
# Case: selecting a single column now also uses Copy-on-Write -> when
# setting a value causes an upcast, we don't need to update the parent
Expand All @@ -921,10 +939,12 @@ def test_column_as_series_set_with_upcast(

s = df["a"]
if dtype_backend == "nullable":
with pytest.raises(TypeError, match="Invalid value"):
s[0] = "foo"
warn = FutureWarning if warn_copy_on_write else None
with tm.assert_produces_warning(warn):
with pytest.raises(TypeError, match="Invalid value"):
s[0] = "foo"
expected = Series([1, 2, 3], name="a")
elif using_copy_on_write or using_array_manager:
elif using_copy_on_write or warn_copy_on_write or using_array_manager:
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
s[0] = "foo"
expected = Series(["foo", 2, 3], dtype=object, name="a")
Expand Down Expand Up @@ -962,7 +982,12 @@ def test_column_as_series_set_with_upcast(
ids=["getitem", "loc", "iloc"],
)
def test_column_as_series_no_item_cache(
request, backend, method, using_copy_on_write, using_array_manager
request,
backend,
method,
using_copy_on_write,
warn_copy_on_write,
using_array_manager,
):
# Case: selecting a single column (which now also uses Copy-on-Write to protect
# the view) should always give a new object (i.e. not make use of a cache)
Expand All @@ -979,7 +1004,8 @@ def test_column_as_series_no_item_cache(
else:
assert s1 is s2

if using_copy_on_write or using_array_manager:
# TODO(CoW-warn) should warn
if using_copy_on_write or warn_copy_on_write or using_array_manager:
s1.iloc[0] = 0
else:
warn = SettingWithCopyWarning if dtype_backend == "numpy" else None
Expand Down
Loading
Loading