Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CoW warning mode: warn in case of chained assignment #55522

Merged
merged 15 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,9 @@ repos:
# pytest raises without context
|\s\ pytest.raises

# TODO
# pytest.warns (use tm.assert_produces_warning instead)
|pytest\.warns
# |pytest\.warns

# os.remove
|os\.remove
Expand Down
2 changes: 1 addition & 1 deletion doc/source/user_guide/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ using ``fillna`` if you wish).
.. ipython:: python

df2 = df.copy()
df2["three"]["a"] = 1.0
df2.loc["a", "three"] = 1.0
df
df2
df + df2
Expand Down
1 change: 1 addition & 0 deletions doc/source/user_guide/copy_on_write.rst
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ Chained assignment references a technique where an object is updated through
two subsequent indexing operations, e.g.

.. ipython:: python
:okwarning:

with pd.option_context("mode.copy_on_write", False):
df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
Expand Down
3 changes: 1 addition & 2 deletions doc/source/whatsnew/v0.13.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,10 @@ Highlights include:

This would previously segfault:

.. ipython:: python
.. code-block:: python

df = pd.DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])})
df["A"].iloc[0] = np.nan
df

The recommended way to do this type of assignment is:

Expand Down
27 changes: 21 additions & 6 deletions pandas/_testing/contexts.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
)
import uuid

from pandas._config import using_copy_on_write

from pandas.compat import PYPY
from pandas.errors import ChainedAssignmentError

Expand Down Expand Up @@ -193,9 +195,14 @@ def use_numexpr(use, min_elements=None) -> Generator[None, None, None]:
set_option("compute.use_numexpr", olduse)


def raises_chained_assignment_error(extra_warnings=(), extra_match=()):
def raises_chained_assignment_error(warn=True, extra_warnings=(), extra_match=()):
from pandas._testing import assert_produces_warning

if not warn:
from contextlib import nullcontext

return nullcontext()

if PYPY and not extra_warnings:
from contextlib import nullcontext

Expand All @@ -206,12 +213,20 @@ def raises_chained_assignment_error(extra_warnings=(), extra_match=()):
match="|".join(extra_match),
)
else:
match = (
"A value is trying to be set on a copy of a DataFrame or Series "
"through chained assignment"
)
if using_copy_on_write():
warning = ChainedAssignmentError
match = (
"A value is trying to be set on a copy of a DataFrame or Series "
"through chained assignment"
)
else:
warning = FutureWarning # type: ignore[assignment]
# TODO update match
match = "ChainedAssignmentError"
if extra_warnings:
warning = (warning, *extra_warnings) # type: ignore[assignment]
return assert_produces_warning(
(ChainedAssignmentError, *extra_warnings),
warning,
match="|".join((match, *extra_match)),
)

Expand Down
13 changes: 13 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
_chained_assignment_method_msg,
_chained_assignment_msg,
_chained_assignment_warning_method_msg,
_chained_assignment_warning_msg,
)
from pandas.util._decorators import (
Appender,
Expand Down Expand Up @@ -4205,6 +4206,18 @@ def __setitem__(self, key, value) -> None:
warnings.warn(
_chained_assignment_msg, ChainedAssignmentError, stacklevel=2
)
# elif not PYPY and not using_copy_on_write():
elif not PYPY and warn_copy_on_write():
if sys.getrefcount(self) <= 3: # and (
# warn_copy_on_write()
# or (
# not warn_copy_on_write()
# and self._mgr.blocks[0].refs.has_reference()
# )
# ):
warnings.warn(
_chained_assignment_warning_msg, FutureWarning, stacklevel=2
)

key = com.apply_if_callable(key, self)

Expand Down
17 changes: 16 additions & 1 deletion pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@

import numpy as np

from pandas._config import using_copy_on_write
from pandas._config import (
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs.indexing import NDFrameIndexerBase
from pandas._libs.lib import item_from_zerodim
Expand All @@ -25,6 +28,8 @@
InvalidIndexError,
LossySetitemError,
_chained_assignment_msg,
_chained_assignment_warning_msg,
_check_cacher,
)
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
Expand Down Expand Up @@ -881,6 +886,16 @@ def __setitem__(self, key, value) -> None:
warnings.warn(
_chained_assignment_msg, ChainedAssignmentError, stacklevel=2
)
elif not PYPY and not using_copy_on_write():
ctr = sys.getrefcount(self.obj)
ref_count = 2
if not warn_copy_on_write() and _check_cacher(self.obj):
# see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
ref_count += 1
if ctr <= ref_count:
warnings.warn(
_chained_assignment_warning_msg, FutureWarning, stacklevel=2
)

check_dict_or_set_indexers(key)
if isinstance(key, tuple):
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ def apply_with_block(self, f, align_keys=None, **kwargs) -> Self:

return type(self)(result_arrays, self._axes)

def setitem(self, indexer, value) -> Self:
def setitem(self, indexer, value, warn: bool = True) -> Self:
return self.apply_with_block("setitem", indexer=indexer, value=value)

def diff(self, n: int) -> Self:
Expand Down Expand Up @@ -1187,7 +1187,7 @@ def apply(self, func, **kwargs) -> Self: # type: ignore[override]
new_array = getattr(self.array, func)(**kwargs)
return type(self)([new_array], self._axes)

def setitem(self, indexer, value) -> SingleArrayManager:
def setitem(self, indexer, value, warn: bool = True) -> SingleArrayManager:
"""
Set values with indexer.

Expand Down
6 changes: 3 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ def apply(
# Alias so we can share code with ArrayManager
apply_with_block = apply

def setitem(self, indexer, value) -> Self:
def setitem(self, indexer, value, warn: bool = True) -> Self:
"""
Set values with indexer.

Expand All @@ -396,7 +396,7 @@ def setitem(self, indexer, value) -> Self:
if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
raise ValueError(f"Cannot set values with ndim > {self.ndim}")

if warn_copy_on_write() and not self._has_no_reference(0):
if warn and warn_copy_on_write() and not self._has_no_reference(0):
warnings.warn(
COW_WARNING_GENERAL_MSG,
FutureWarning,
Expand Down Expand Up @@ -2059,7 +2059,7 @@ def setitem_inplace(self, indexer, value, warn: bool = True) -> None:
if using_cow:
self.blocks = (self._block.copy(),)
self._cache.clear()
elif warn and warn_cow:
elif warn_cow and warn:
warnings.warn(
COW_WARNING_SETITEM_MSG,
FutureWarning,
Expand Down
50 changes: 36 additions & 14 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@

import numpy as np

from pandas._config import using_copy_on_write
from pandas._config import (
using_copy_on_write,
warn_copy_on_write,
)
from pandas._config.config import _get_option

from pandas._libs import (
Expand All @@ -45,6 +48,7 @@
_chained_assignment_method_msg,
_chained_assignment_msg,
_chained_assignment_warning_method_msg,
_chained_assignment_warning_msg,
_check_cacher,
)
from pandas.util._decorators import (
Expand Down Expand Up @@ -1209,11 +1213,29 @@ def _get_value(self, label, takeable: bool = False):
return self.iloc[loc]

def __setitem__(self, key, value) -> None:
warn = True
if not PYPY and using_copy_on_write():
if sys.getrefcount(self) <= 3:
warnings.warn(
_chained_assignment_msg, ChainedAssignmentError, stacklevel=2
)
elif not PYPY and not using_copy_on_write():
ctr = sys.getrefcount(self)
ref_count = 3
if not warn_copy_on_write() and _check_cacher(self):
# see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
ref_count += 1
if ctr <= ref_count and (
warn_copy_on_write()
or (
not warn_copy_on_write()
and self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr]
)
):
warn = False
warnings.warn(
_chained_assignment_warning_msg, FutureWarning, stacklevel=2
)

check_dict_or_set_indexers(key)
key = com.apply_if_callable(key, self)
Expand All @@ -1224,10 +1246,10 @@ def __setitem__(self, key, value) -> None:

if isinstance(key, slice):
indexer = self.index._convert_slice_indexer(key, kind="getitem")
return self._set_values(indexer, value)
return self._set_values(indexer, value, warn=warn)

try:
self._set_with_engine(key, value)
self._set_with_engine(key, value, warn=warn)
except KeyError:
# We have a scalar (or for MultiIndex or object-dtype, scalar-like)
# key that is not present in self.index.
Expand Down Expand Up @@ -1293,18 +1315,18 @@ def __setitem__(self, key, value) -> None:
return

else:
self._set_with(key, value)
self._set_with(key, value, warn=warn)

if cacher_needs_updating:
self._maybe_update_cacher(inplace=True)

def _set_with_engine(self, key, value) -> None:
def _set_with_engine(self, key, value, warn: bool = True) -> None:
loc = self.index.get_loc(key)

# this is equivalent to self._values[key] = value
self._mgr.setitem_inplace(loc, value)
self._mgr.setitem_inplace(loc, value, warn=warn)

def _set_with(self, key, value) -> None:
def _set_with(self, key, value, warn: bool = True) -> None:
# We got here via exception-handling off of InvalidIndexError, so
# key should always be listlike at this point.
assert not isinstance(key, tuple)
Expand All @@ -1315,7 +1337,7 @@ def _set_with(self, key, value) -> None:

if not self.index._should_fallback_to_positional:
# Regardless of the key type, we're treating it as labels
self._set_labels(key, value)
self._set_labels(key, value, warn=warn)

else:
# Note: key_type == "boolean" should not occur because that
Expand All @@ -1332,23 +1354,23 @@ def _set_with(self, key, value) -> None:
FutureWarning,
stacklevel=find_stack_level(),
)
self._set_values(key, value)
self._set_values(key, value, warn=warn)
else:
self._set_labels(key, value)
self._set_labels(key, value, warn=warn)

def _set_labels(self, key, value) -> None:
def _set_labels(self, key, value, warn: bool = True) -> None:
key = com.asarray_tuplesafe(key)
indexer: np.ndarray = self.index.get_indexer(key)
mask = indexer == -1
if mask.any():
raise KeyError(f"{key[mask]} not in index")
self._set_values(indexer, value)
self._set_values(indexer, value, warn=warn)

def _set_values(self, key, value) -> None:
def _set_values(self, key, value, warn: bool = True) -> None:
if isinstance(key, (Index, Series)):
key = key._values

self._mgr = self._mgr.setitem(indexer=key, value=value)
self._mgr = self._mgr.setitem(indexer=key, value=value, warn=warn)
self._maybe_update_cacher()

def _set_value(self, label, value, takeable: bool = False) -> None:
Expand Down
18 changes: 18 additions & 0 deletions pandas/errors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,24 @@ class ChainedAssignmentError(Warning):
)


_chained_assignment_warning_msg = (
"ChainedAssignmentError: behaviour will change in pandas 3.0!\n"
"You are setting values through chained assignment. Currently this works "
"in certain cases, but when using Copy-on-Write (which will become the "
"default behaviour in pandas 3.0) this will never work to update the "
"original DataFrame or Series, because the intermediate object on which "
"we are setting values will behave as a copy.\n"
"A typical example is when you are setting values in a column of a "
"DataFrame, like:\n\n"
'df["col"][row_indexer] = value\n\n'
'Use `df.loc[row_indexer, "col"] = values` instead, to perform the '
"assignment in a single step and ensure this keeps updating the original `df`.\n\n"
"See the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy\n"
)


_chained_assignment_warning_method_msg = (
"A value is trying to be set on a copy of a DataFrame or Series "
"through chained assignment using an inplace method.\n"
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/copy_view/test_chained_assignment_deprecation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import numpy as np
import pytest

from pandas.errors import ChainedAssignmentError

from pandas import DataFrame
import pandas._testing as tm

Expand Down Expand Up @@ -61,3 +64,24 @@ def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write):
else:
with tm.assert_cow_warning(match="A value"):
df["a"].fillna(0, inplace=True)


# TODO(CoW-warn) expand the cases
@pytest.mark.parametrize(
"indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
)
def test_series_setitem(indexer, using_copy_on_write):
# ensure we only get a single warning for those typical cases of chained
# assignment
df = DataFrame({"a": [1, 2, 3], "b": 1})

# using custom check instead of tm.assert_produces_warning because that doesn't
# fail if multiple warnings are raised
with pytest.warns() as record:
df["a"][indexer] = 0
assert len(record) == 1
if using_copy_on_write:
assert record[0].category == ChainedAssignmentError
else:
assert record[0].category == FutureWarning
assert "ChainedAssignmentError" in record[0].message.args[0]
Loading
Loading