Skip to content

Commit

Permalink
ENH: Avoid copying whole block for single block case (pandas-dev#51435)
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored Aug 11, 2023
1 parent 248c966 commit 0582e35
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 15 deletions.
24 changes: 23 additions & 1 deletion pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,8 +370,30 @@ def setitem(self, indexer, value) -> Self:
raise ValueError(f"Cannot set values with ndim > {self.ndim}")

if using_copy_on_write() and not self._has_no_reference(0):
# if being referenced -> perform Copy-on-Write and clear the reference
# this method is only called if there is a single block -> hardcoded 0
# Split blocks to only copy the columns we want to modify
if self.ndim == 2 and isinstance(indexer, tuple):
blk_loc = self.blklocs[indexer[1]]
if is_list_like(blk_loc) and blk_loc.ndim == 2:
blk_loc = np.squeeze(blk_loc, axis=0)
elif not is_list_like(blk_loc):
# Keep dimension and copy data later
blk_loc = [blk_loc] # type: ignore[assignment]
if len(blk_loc) == 0:
return self.copy(deep=False)

values = self.blocks[0].values
if values.ndim == 2:
values = values[blk_loc]
# "T" has no attribute "_iset_split_block"
self._iset_split_block( # type: ignore[attr-defined]
0, blk_loc, values
)
# first block equals values
self.blocks[0].setitem((indexer[0], np.arange(len(blk_loc))), value)
return self
# No need to split if we are either setting all columns or operating on a
# single block manager
self = self.copy()

return self.apply("setitem", indexer=indexer, value=value)
Expand Down
11 changes: 5 additions & 6 deletions pandas/tests/copy_view/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,16 +1028,15 @@ def test_dataframe_add_column_from_series(backend, using_copy_on_write):
(tm.iloc, (slice(None), 0)),
],
)
@pytest.mark.parametrize(
"col", [[0.1, 0.2, 0.3], [7, 8, 9]], ids=["mixed-block", "single-block"]
)
def test_set_value_copy_only_necessary_column(
using_copy_on_write,
indexer_func,
indexer,
val,
using_copy_on_write, indexer_func, indexer, val, col
):
# When setting inplace, only copy column that is modified instead of the whole
# block (by splitting the block)
# TODO multi-block only for now
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": col})
df_orig = df.copy()
view = df[:]

Expand Down
22 changes: 14 additions & 8 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,8 @@ def test_shift_no_op(using_copy_on_write):

df.iloc[0, 0] = 0
if using_copy_on_write:
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
tm.assert_frame_equal(df2, df_orig)


Expand Down Expand Up @@ -532,16 +533,16 @@ def test_shift_columns(using_copy_on_write):
df2 = df.shift(periods=1, axis=1)

assert np.shares_memory(get_array(df2, "2020-01-02"), get_array(df, "2020-01-01"))
df.iloc[0, 1] = 0
df.iloc[0, 0] = 0
if using_copy_on_write:
assert not np.shares_memory(
get_array(df2, "2020-01-02"), get_array(df, "2020-01-01")
)
expected = DataFrame(
[[np.nan, 1], [np.nan, 3], [np.nan, 5]],
columns=date_range("2020-01-01", "2020-01-02"),
)
tm.assert_frame_equal(df2, expected)
expected = DataFrame(
[[np.nan, 1], [np.nan, 3], [np.nan, 5]],
columns=date_range("2020-01-01", "2020-01-02"),
)
tm.assert_frame_equal(df2, expected)


def test_pop(using_copy_on_write):
Expand Down Expand Up @@ -1335,13 +1336,18 @@ def test_droplevel(using_copy_on_write):

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

# mutating df2 triggers a copy-on-write for that column / block
df2.iloc[0, 0] = 0

assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
if using_copy_on_write:
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

tm.assert_frame_equal(df, df_orig)


Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2646,6 +2646,13 @@ def test_loc_indexer_all_false_broadcast(self):
df.loc[np.array([False], dtype=np.bool_), ["a"]] = df["b"]
tm.assert_frame_equal(df, expected)

def test_loc_indexer_length_one(self):
# GH#51435
df = DataFrame({"a": ["x"], "b": ["y"]}, dtype=object)
expected = DataFrame({"a": ["y"], "b": ["y"]}, dtype=object)
df.loc[np.array([True], dtype=np.bool_), ["a"]] = df["b"]
tm.assert_frame_equal(df, expected)


class TestLocListlike:
@pytest.mark.parametrize("box", [lambda x: x, np.asarray, list])
Expand Down

0 comments on commit 0582e35

Please sign in to comment.