Skip to content

Commit

Permalink
CoW warning mode: warn in case of chained assignment (#55522)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored Nov 28, 2023
1 parent ae6d279 commit 7012d6a
Show file tree
Hide file tree
Showing 23 changed files with 219 additions and 188 deletions.
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,9 @@ repos:
# pytest raises without context
|\s\ pytest.raises
# TODO
# pytest.warns (use tm.assert_produces_warning instead)
|pytest\.warns
# |pytest\.warns
# os.remove
|os\.remove
Expand Down
2 changes: 1 addition & 1 deletion doc/source/user_guide/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ using ``fillna`` if you wish).
.. ipython:: python
df2 = df.copy()
df2["three"]["a"] = 1.0
df2.loc["a", "three"] = 1.0
df
df2
df + df2
Expand Down
1 change: 1 addition & 0 deletions doc/source/user_guide/copy_on_write.rst
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ Chained assignment references a technique where an object is updated through
two subsequent indexing operations, e.g.

.. ipython:: python
:okwarning:
with pd.option_context("mode.copy_on_write", False):
df = pd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
Expand Down
3 changes: 1 addition & 2 deletions doc/source/whatsnew/v0.13.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,10 @@ Highlights include:

This would previously segfault:

.. ipython:: python
.. code-block:: python
df = pd.DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])})
df["A"].iloc[0] = np.nan
df
The recommended way to do this type of assignment is:

Expand Down
27 changes: 21 additions & 6 deletions pandas/_testing/contexts.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
)
import uuid

from pandas._config import using_copy_on_write

from pandas.compat import PYPY
from pandas.errors import ChainedAssignmentError

Expand Down Expand Up @@ -193,9 +195,14 @@ def use_numexpr(use, min_elements=None) -> Generator[None, None, None]:
set_option("compute.use_numexpr", olduse)


def raises_chained_assignment_error(extra_warnings=(), extra_match=()):
def raises_chained_assignment_error(warn=True, extra_warnings=(), extra_match=()):
from pandas._testing import assert_produces_warning

if not warn:
from contextlib import nullcontext

return nullcontext()

if PYPY and not extra_warnings:
from contextlib import nullcontext

Expand All @@ -206,12 +213,20 @@ def raises_chained_assignment_error(extra_warnings=(), extra_match=()):
match="|".join(extra_match),
)
else:
match = (
"A value is trying to be set on a copy of a DataFrame or Series "
"through chained assignment"
)
if using_copy_on_write():
warning = ChainedAssignmentError
match = (
"A value is trying to be set on a copy of a DataFrame or Series "
"through chained assignment"
)
else:
warning = FutureWarning # type: ignore[assignment]
# TODO update match
match = "ChainedAssignmentError"
if extra_warnings:
warning = (warning, *extra_warnings) # type: ignore[assignment]
return assert_produces_warning(
(ChainedAssignmentError, *extra_warnings),
warning,
match="|".join((match, *extra_match)),
)

Expand Down
13 changes: 13 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
_chained_assignment_method_msg,
_chained_assignment_msg,
_chained_assignment_warning_method_msg,
_chained_assignment_warning_msg,
)
from pandas.util._decorators import (
Appender,
Expand Down Expand Up @@ -4205,6 +4206,18 @@ def __setitem__(self, key, value) -> None:
warnings.warn(
_chained_assignment_msg, ChainedAssignmentError, stacklevel=2
)
# elif not PYPY and not using_copy_on_write():
elif not PYPY and warn_copy_on_write():
if sys.getrefcount(self) <= 3: # and (
# warn_copy_on_write()
# or (
# not warn_copy_on_write()
# and self._mgr.blocks[0].refs.has_reference()
# )
# ):
warnings.warn(
_chained_assignment_warning_msg, FutureWarning, stacklevel=2
)

key = com.apply_if_callable(key, self)

Expand Down
17 changes: 16 additions & 1 deletion pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@

import numpy as np

from pandas._config import using_copy_on_write
from pandas._config import (
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs.indexing import NDFrameIndexerBase
from pandas._libs.lib import item_from_zerodim
Expand All @@ -25,6 +28,8 @@
InvalidIndexError,
LossySetitemError,
_chained_assignment_msg,
_chained_assignment_warning_msg,
_check_cacher,
)
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
Expand Down Expand Up @@ -881,6 +886,16 @@ def __setitem__(self, key, value) -> None:
warnings.warn(
_chained_assignment_msg, ChainedAssignmentError, stacklevel=2
)
elif not PYPY and not using_copy_on_write():
ctr = sys.getrefcount(self.obj)
ref_count = 2
if not warn_copy_on_write() and _check_cacher(self.obj):
# see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
ref_count += 1
if ctr <= ref_count:
warnings.warn(
_chained_assignment_warning_msg, FutureWarning, stacklevel=2
)

check_dict_or_set_indexers(key)
if isinstance(key, tuple):
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ def apply_with_block(self, f, align_keys=None, **kwargs) -> Self:

return type(self)(result_arrays, self._axes)

def setitem(self, indexer, value) -> Self:
def setitem(self, indexer, value, warn: bool = True) -> Self:
return self.apply_with_block("setitem", indexer=indexer, value=value)

def diff(self, n: int) -> Self:
Expand Down Expand Up @@ -1187,7 +1187,7 @@ def apply(self, func, **kwargs) -> Self: # type: ignore[override]
new_array = getattr(self.array, func)(**kwargs)
return type(self)([new_array], self._axes)

def setitem(self, indexer, value) -> SingleArrayManager:
def setitem(self, indexer, value, warn: bool = True) -> SingleArrayManager:
"""
Set values with indexer.
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ def apply(
# Alias so we can share code with ArrayManager
apply_with_block = apply

def setitem(self, indexer, value) -> Self:
def setitem(self, indexer, value, warn: bool = True) -> Self:
"""
Set values with indexer.
Expand All @@ -396,7 +396,7 @@ def setitem(self, indexer, value) -> Self:
if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
raise ValueError(f"Cannot set values with ndim > {self.ndim}")

if warn_copy_on_write() and not self._has_no_reference(0):
if warn and warn_copy_on_write() and not self._has_no_reference(0):
warnings.warn(
COW_WARNING_GENERAL_MSG,
FutureWarning,
Expand Down Expand Up @@ -2059,7 +2059,7 @@ def setitem_inplace(self, indexer, value, warn: bool = True) -> None:
if using_cow:
self.blocks = (self._block.copy(),)
self._cache.clear()
elif warn and warn_cow:
elif warn_cow and warn:
warnings.warn(
COW_WARNING_SETITEM_MSG,
FutureWarning,
Expand Down
50 changes: 36 additions & 14 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@

import numpy as np

from pandas._config import using_copy_on_write
from pandas._config import (
using_copy_on_write,
warn_copy_on_write,
)
from pandas._config.config import _get_option

from pandas._libs import (
Expand All @@ -45,6 +48,7 @@
_chained_assignment_method_msg,
_chained_assignment_msg,
_chained_assignment_warning_method_msg,
_chained_assignment_warning_msg,
_check_cacher,
)
from pandas.util._decorators import (
Expand Down Expand Up @@ -1221,11 +1225,29 @@ def _get_value(self, label, takeable: bool = False):
return self.iloc[loc]

def __setitem__(self, key, value) -> None:
warn = True
if not PYPY and using_copy_on_write():
if sys.getrefcount(self) <= 3:
warnings.warn(
_chained_assignment_msg, ChainedAssignmentError, stacklevel=2
)
elif not PYPY and not using_copy_on_write():
ctr = sys.getrefcount(self)
ref_count = 3
if not warn_copy_on_write() and _check_cacher(self):
# see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
ref_count += 1
if ctr <= ref_count and (
warn_copy_on_write()
or (
not warn_copy_on_write()
and self._mgr.blocks[0].refs.has_reference() # type: ignore[union-attr]
)
):
warn = False
warnings.warn(
_chained_assignment_warning_msg, FutureWarning, stacklevel=2
)

check_dict_or_set_indexers(key)
key = com.apply_if_callable(key, self)
Expand All @@ -1236,10 +1258,10 @@ def __setitem__(self, key, value) -> None:

if isinstance(key, slice):
indexer = self.index._convert_slice_indexer(key, kind="getitem")
return self._set_values(indexer, value)
return self._set_values(indexer, value, warn=warn)

try:
self._set_with_engine(key, value)
self._set_with_engine(key, value, warn=warn)
except KeyError:
# We have a scalar (or for MultiIndex or object-dtype, scalar-like)
# key that is not present in self.index.
Expand Down Expand Up @@ -1305,18 +1327,18 @@ def __setitem__(self, key, value) -> None:
return

else:
self._set_with(key, value)
self._set_with(key, value, warn=warn)

if cacher_needs_updating:
self._maybe_update_cacher(inplace=True)

def _set_with_engine(self, key, value) -> None:
def _set_with_engine(self, key, value, warn: bool = True) -> None:
loc = self.index.get_loc(key)

# this is equivalent to self._values[key] = value
self._mgr.setitem_inplace(loc, value)
self._mgr.setitem_inplace(loc, value, warn=warn)

def _set_with(self, key, value) -> None:
def _set_with(self, key, value, warn: bool = True) -> None:
# We got here via exception-handling off of InvalidIndexError, so
# key should always be listlike at this point.
assert not isinstance(key, tuple)
Expand All @@ -1327,7 +1349,7 @@ def _set_with(self, key, value) -> None:

if not self.index._should_fallback_to_positional:
# Regardless of the key type, we're treating it as labels
self._set_labels(key, value)
self._set_labels(key, value, warn=warn)

else:
# Note: key_type == "boolean" should not occur because that
Expand All @@ -1344,23 +1366,23 @@ def _set_with(self, key, value) -> None:
FutureWarning,
stacklevel=find_stack_level(),
)
self._set_values(key, value)
self._set_values(key, value, warn=warn)
else:
self._set_labels(key, value)
self._set_labels(key, value, warn=warn)

def _set_labels(self, key, value) -> None:
def _set_labels(self, key, value, warn: bool = True) -> None:
key = com.asarray_tuplesafe(key)
indexer: np.ndarray = self.index.get_indexer(key)
mask = indexer == -1
if mask.any():
raise KeyError(f"{key[mask]} not in index")
self._set_values(indexer, value)
self._set_values(indexer, value, warn=warn)

def _set_values(self, key, value) -> None:
def _set_values(self, key, value, warn: bool = True) -> None:
if isinstance(key, (Index, Series)):
key = key._values

self._mgr = self._mgr.setitem(indexer=key, value=value)
self._mgr = self._mgr.setitem(indexer=key, value=value, warn=warn)
self._maybe_update_cacher()

def _set_value(self, label, value, takeable: bool = False) -> None:
Expand Down
18 changes: 18 additions & 0 deletions pandas/errors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,24 @@ class ChainedAssignmentError(Warning):
)


_chained_assignment_warning_msg = (
"ChainedAssignmentError: behaviour will change in pandas 3.0!\n"
"You are setting values through chained assignment. Currently this works "
"in certain cases, but when using Copy-on-Write (which will become the "
"default behaviour in pandas 3.0) this will never work to update the "
"original DataFrame or Series, because the intermediate object on which "
"we are setting values will behave as a copy.\n"
"A typical example is when you are setting values in a column of a "
"DataFrame, like:\n\n"
'df["col"][row_indexer] = value\n\n'
'Use `df.loc[row_indexer, "col"] = values` instead, to perform the '
"assignment in a single step and ensure this keeps updating the original `df`.\n\n"
"See the caveats in the documentation: "
"https://pandas.pydata.org/pandas-docs/stable/user_guide/"
"indexing.html#returning-a-view-versus-a-copy\n"
)


_chained_assignment_warning_method_msg = (
"A value is trying to be set on a copy of a DataFrame or Series "
"through chained assignment using an inplace method.\n"
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/copy_view/test_chained_assignment_deprecation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import numpy as np
import pytest

from pandas.errors import ChainedAssignmentError

from pandas import DataFrame
import pandas._testing as tm

Expand Down Expand Up @@ -61,3 +64,24 @@ def test_methods_iloc_getitem_item_cache(func, args, using_copy_on_write):
else:
with tm.assert_cow_warning(match="A value"):
df["a"].fillna(0, inplace=True)


# TODO(CoW-warn) expand the cases
@pytest.mark.parametrize(
"indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
)
def test_series_setitem(indexer, using_copy_on_write):
# ensure we only get a single warning for those typical cases of chained
# assignment
df = DataFrame({"a": [1, 2, 3], "b": 1})

# using custom check instead of tm.assert_produces_warning because that doesn't
# fail if multiple warnings are raised
with pytest.warns() as record:
df["a"][indexer] = 0
assert len(record) == 1
if using_copy_on_write:
assert record[0].category == ChainedAssignmentError
else:
assert record[0].category == FutureWarning
assert "ChainedAssignmentError" in record[0].message.args[0]
Loading

0 comments on commit 7012d6a

Please sign in to comment.