Skip to content

Commit

Permalink
CoW warning mode: add warning for single block setitem (#55838)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored Nov 10, 2023
1 parent c684830 commit 09ed69e
Show file tree
Hide file tree
Showing 40 changed files with 433 additions and 194 deletions.
3 changes: 2 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from pandas._config import (
get_option,
using_copy_on_write,
warn_copy_on_write,
)
from pandas._config.config import _get_option

Expand Down Expand Up @@ -4538,7 +4539,7 @@ def _clear_item_cache(self) -> None:

def _get_item_cache(self, item: Hashable) -> Series:
"""Return the cached item, item represents a label indexer."""
if using_copy_on_write():
if using_copy_on_write() or warn_copy_on_write():
loc = self.columns.get_loc(item)
return self._ixs(loc, axis=1)

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -12392,7 +12392,7 @@ def _inplace_method(self, other, op) -> Self:
"""
warn = True
if not PYPY and warn_copy_on_write():
if sys.getrefcount(self) <= 5:
if sys.getrefcount(self) <= 4:
# we are probably in an inplace setitem context (e.g. df['a'] += 1)
warn = False

Expand Down
7 changes: 5 additions & 2 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@

import numpy as np

from pandas._config import using_copy_on_write
from pandas._config import (
using_copy_on_write,
warn_copy_on_write,
)

from pandas._libs import lib
from pandas._libs.tslibs import OutOfBoundsDatetime
Expand Down Expand Up @@ -966,7 +969,7 @@ def is_in_axis(key) -> bool:
def is_in_obj(gpr) -> bool:
if not hasattr(gpr, "name"):
return False
if using_copy_on_write():
if using_copy_on_write() or warn_copy_on_write():
# For the CoW case, we check the references to determine if the
# series is part of the object
try:
Expand Down
28 changes: 25 additions & 3 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,15 @@
from pandas.api.extensions import ExtensionArray


COW_WARNING_GENERAL_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
You are mutating a Series or DataFrame object, and currently this mutation will
also have effect on other Series or DataFrame objects that share data with this
object. In pandas 3.0 (with Copy-on-Write), updating one Series or DataFrame object
will never modify another.
"""


COW_WARNING_SETITEM_MSG = """\
Setting a value on a view: behaviour will change in pandas 3.0.
Currently, the mutation will also have effect on the object that shares data
Expand Down Expand Up @@ -387,7 +396,14 @@ def setitem(self, indexer, value) -> Self:
if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim:
raise ValueError(f"Cannot set values with ndim > {self.ndim}")

if using_copy_on_write() and not self._has_no_reference(0):
if warn_copy_on_write() and not self._has_no_reference(0):
warnings.warn(
COW_WARNING_GENERAL_MSG,
FutureWarning,
stacklevel=find_stack_level(),
)

elif using_copy_on_write() and not self._has_no_reference(0):
# this method is only called if there is a single block -> hardcoded 0
# Split blocks to only copy the columns we want to modify
if self.ndim == 2 and isinstance(indexer, tuple):
Expand Down Expand Up @@ -1951,9 +1967,15 @@ def get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Self:
return type(self)(blk.copy(deep=False), self.index)
array = blk.values[indexer]

if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "b":
# boolean indexing always gives a copy with numpy
refs = None
else:
# TODO(CoW) in theory only need to track reference if new_array is a view
refs = blk.refs

bp = BlockPlacement(slice(0, len(array)))
# TODO(CoW) in theory only need to track reference if new_array is a view
block = type(blk)(array, placement=bp, ndim=1, refs=blk.refs)
block = type(blk)(array, placement=bp, ndim=1, refs=refs)

new_idx = self.index[indexer]
return type(self)(block, new_idx)
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -1453,7 +1453,7 @@ def test_apply_dtype(col):
tm.assert_series_equal(result, expected)


def test_apply_mutating(using_array_manager, using_copy_on_write):
def test_apply_mutating(using_array_manager, using_copy_on_write, warn_copy_on_write):
# GH#35462 case where applied func pins a new BlockManager to a row
df = DataFrame({"a": range(100), "b": range(100, 200)})
df_orig = df.copy()
Expand All @@ -1467,7 +1467,8 @@ def func(row):
expected = df.copy()
expected["a"] += 1

result = df.apply(func, axis=1)
with tm.assert_cow_warning(warn_copy_on_write):
result = df.apply(func, axis=1)

tm.assert_frame_equal(result, expected)
if using_copy_on_write or using_array_manager:
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/copy_view/index/test_datetimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
"ignore:Setting a value on a view:FutureWarning"
)


@pytest.mark.parametrize(
"cons",
Expand Down
30 changes: 18 additions & 12 deletions pandas/tests/copy_view/index/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ def index_view(index_data=[1, 2]):
return idx, view


def test_set_index_update_column(using_copy_on_write):
def test_set_index_update_column(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1})
df = df.set_index("a", drop=False)
expected = df.index.copy(deep=True)
df.iloc[0, 0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
df.iloc[0, 0] = 100
if using_copy_on_write:
tm.assert_index_equal(df.index, expected)
else:
Expand All @@ -39,49 +40,53 @@ def test_set_index_drop_update_column(using_copy_on_write):
tm.assert_index_equal(df.index, expected)


def test_set_index_series(using_copy_on_write):
def test_set_index_series(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1.5})
ser = Series([10, 11])
df = df.set_index(ser)
expected = df.index.copy(deep=True)
ser.iloc[0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
ser.iloc[0] = 100
if using_copy_on_write:
tm.assert_index_equal(df.index, expected)
else:
tm.assert_index_equal(df.index, Index([100, 11]))


def test_assign_index_as_series(using_copy_on_write):
def test_assign_index_as_series(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1.5})
ser = Series([10, 11])
df.index = ser
expected = df.index.copy(deep=True)
ser.iloc[0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
ser.iloc[0] = 100
if using_copy_on_write:
tm.assert_index_equal(df.index, expected)
else:
tm.assert_index_equal(df.index, Index([100, 11]))


def test_assign_index_as_index(using_copy_on_write):
def test_assign_index_as_index(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1, 2], "b": 1.5})
ser = Series([10, 11])
rhs_index = Index(ser)
df.index = rhs_index
rhs_index = None # overwrite to clear reference
expected = df.index.copy(deep=True)
ser.iloc[0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
ser.iloc[0] = 100
if using_copy_on_write:
tm.assert_index_equal(df.index, expected)
else:
tm.assert_index_equal(df.index, Index([100, 11]))


def test_index_from_series(using_copy_on_write):
def test_index_from_series(using_copy_on_write, warn_copy_on_write):
ser = Series([1, 2])
idx = Index(ser)
expected = idx.copy(deep=True)
ser.iloc[0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
ser.iloc[0] = 100
if using_copy_on_write:
tm.assert_index_equal(idx, expected)
else:
Expand All @@ -96,12 +101,13 @@ def test_index_from_series_copy(using_copy_on_write):
assert np.shares_memory(get_array(ser), arr)


def test_index_from_index(using_copy_on_write):
def test_index_from_index(using_copy_on_write, warn_copy_on_write):
ser = Series([1, 2])
idx = Index(ser)
idx = Index(idx)
expected = idx.copy(deep=True)
ser.iloc[0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
ser.iloc[0] = 100
if using_copy_on_write:
tm.assert_index_equal(idx, expected)
else:
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/copy_view/index/test_periodindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
"ignore:Setting a value on a view:FutureWarning"
)


@pytest.mark.parametrize(
"cons",
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/copy_view/index/test_timedeltaindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
)
import pandas._testing as tm

pytestmark = pytest.mark.filterwarnings(
"ignore:Setting a value on a view:FutureWarning"
)


@pytest.mark.parametrize(
"cons",
Expand Down
40 changes: 27 additions & 13 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@


@pytest.mark.parametrize("dtype", [None, "int64"])
def test_series_from_series(dtype, using_copy_on_write):
def test_series_from_series(dtype, using_copy_on_write, warn_copy_on_write):
# Case: constructing a Series from another Series object follows CoW rules:
# a new object is returned and thus mutations are not propagated
ser = Series([1, 2, 3], name="name")
Expand All @@ -43,7 +43,8 @@ def test_series_from_series(dtype, using_copy_on_write):
assert not np.shares_memory(get_array(ser), get_array(result))
else:
# mutating shallow copy does mutate original
result.iloc[0] = 0
with tm.assert_cow_warning(warn_copy_on_write):
result.iloc[0] = 0
assert ser.iloc[0] == 0
# and still shares memory
assert np.shares_memory(get_array(ser), get_array(result))
Expand All @@ -57,11 +58,12 @@ def test_series_from_series(dtype, using_copy_on_write):
assert result.iloc[0] == 1
else:
# mutating original does mutate shallow copy
ser.iloc[0] = 0
with tm.assert_cow_warning(warn_copy_on_write):
ser.iloc[0] = 0
assert result.iloc[0] == 0


def test_series_from_series_with_reindex(using_copy_on_write):
def test_series_from_series_with_reindex(using_copy_on_write, warn_copy_on_write):
# Case: constructing a Series from another Series with specifying an index
# that potentially requires a reindex of the values
ser = Series([1, 2, 3], name="name")
Expand All @@ -76,7 +78,8 @@ def test_series_from_series_with_reindex(using_copy_on_write):
]:
result = Series(ser, index=index)
assert np.shares_memory(ser.values, result.values)
result.iloc[0] = 0
with tm.assert_cow_warning(warn_copy_on_write):
result.iloc[0] = 0
if using_copy_on_write:
assert ser.iloc[0] == 1
else:
Expand Down Expand Up @@ -153,6 +156,7 @@ def test_series_from_index_different_dtypes(using_copy_on_write):
assert ser._mgr._has_no_reference(0)


@pytest.mark.filterwarnings("ignore:Setting a value on a view:FutureWarning")
@pytest.mark.parametrize("fastpath", [False, True])
@pytest.mark.parametrize("dtype", [None, "int64"])
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
Expand Down Expand Up @@ -186,7 +190,9 @@ def test_series_from_block_manager_different_dtype(using_copy_on_write):

@pytest.mark.parametrize("use_mgr", [True, False])
@pytest.mark.parametrize("columns", [None, ["a"]])
def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, use_mgr):
def test_dataframe_constructor_mgr_or_df(
using_copy_on_write, warn_copy_on_write, columns, use_mgr
):
df = DataFrame({"a": [1, 2, 3]})
df_orig = df.copy()

Expand All @@ -201,7 +207,8 @@ def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, use_mgr):
new_df = DataFrame(data)

assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
new_df.iloc[0] = 100
with tm.assert_cow_warning(warn_copy_on_write and not use_mgr):
new_df.iloc[0] = 100

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
Expand All @@ -215,7 +222,7 @@ def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, use_mgr):
@pytest.mark.parametrize("index", [None, [0, 1, 2]])
@pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]])
def test_dataframe_from_dict_of_series(
request, using_copy_on_write, columns, index, dtype
request, using_copy_on_write, warn_copy_on_write, columns, index, dtype
):
# Case: constructing a DataFrame from Series objects with copy=False
# has to do a lazy following CoW rules
Expand All @@ -235,6 +242,7 @@ def test_dataframe_from_dict_of_series(
assert np.shares_memory(get_array(result, "a"), get_array(s1))

# mutating the new dataframe doesn't mutate original
# TODO(CoW-warn) this should also warn
result.iloc[0, 0] = 10
if using_copy_on_write:
assert not np.shares_memory(get_array(result, "a"), get_array(s1))
Expand All @@ -248,7 +256,8 @@ def test_dataframe_from_dict_of_series(
result = DataFrame(
{"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False
)
s1.iloc[0] = 10
with tm.assert_cow_warning(warn_copy_on_write):
s1.iloc[0] = 10
if using_copy_on_write:
assert not np.shares_memory(get_array(result, "a"), get_array(s1))
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -278,15 +287,19 @@ def test_dataframe_from_dict_of_series_with_reindex(dtype):
@pytest.mark.parametrize(
"data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)]
)
def test_dataframe_from_series_or_index(using_copy_on_write, data, dtype, cons):
def test_dataframe_from_series_or_index(
using_copy_on_write, warn_copy_on_write, data, dtype, cons
):
obj = cons(data, dtype=dtype)
obj_orig = obj.copy()
df = DataFrame(obj, dtype=dtype)
assert np.shares_memory(get_array(obj), get_array(df, 0))
if using_copy_on_write:
assert not df._mgr._has_no_reference(0)

df.iloc[0, 0] = data[-1]
# TODO(CoW-warn) should not warn for an index?
with tm.assert_cow_warning(warn_copy_on_write):
df.iloc[0, 0] = data[-1]
if using_copy_on_write:
tm.assert_equal(obj, obj_orig)

Expand Down Expand Up @@ -341,15 +354,16 @@ def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager):
assert np.shares_memory(get_array(df, 0), arr)


def test_dataframe_from_records_with_dataframe(using_copy_on_write):
def test_dataframe_from_records_with_dataframe(using_copy_on_write, warn_copy_on_write):
df = DataFrame({"a": [1, 2, 3]})
df_orig = df.copy()
with tm.assert_produces_warning(FutureWarning):
df2 = DataFrame.from_records(df)
if using_copy_on_write:
assert not df._mgr._has_no_reference(0)
assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
df2.iloc[0, 0] = 100
with tm.assert_cow_warning(warn_copy_on_write):
df2.iloc[0, 0] = 100
if using_copy_on_write:
tm.assert_frame_equal(df, df_orig)
else:
Expand Down
Loading

0 comments on commit 09ed69e

Please sign in to comment.