Skip to content

Commit

Permalink
BUG (string dtype): fix inplace mutation with copy=False in ensure_st…
Browse files Browse the repository at this point in the history
…ring_array (pandas-dev#59756) (#2)

* BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array

* update

Co-authored-by: Joris Van den Bossche <[email protected]>
  • Loading branch information
tigerhawkvok and jorisvandenbossche authored Sep 9, 2024
1 parent d3121ba commit 21b574e
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 11 deletions.
18 changes: 12 additions & 6 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array(
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
copy : bool, default True
Whether to ensure that a new array is returned.
Whether to ensure that a new array is returned. When True, a new array
is always returned. When False, a new array is only returned when needed
to avoid mutating the input array.
skipna : bool, default True
Whether or not to coerce nulls to their stringified form
(e.g. if False, NaN becomes 'nan').
Expand Down Expand Up @@ -762,11 +764,15 @@ cpdef ndarray[object] ensure_string_array(

result = np.asarray(arr, dtype="object")

if copy and (result is arr or np.shares_memory(arr, result)):
# GH#54654
result = result.copy()
elif not copy and result is arr:
already_copied = False
if result is arr or np.may_share_memory(arr, result):
# if np.asarray(..) did not make a copy of the input arr, we still need
# to do that to avoid mutating the input array
# GH#54654: share_memory check is needed for rare cases where np.asarray
# returns a new object without making a copy of the actual data
if copy:
result = result.copy()
else:
already_copied = False
elif not copy and not result.flags.writeable:
# Weird edge case where result is a view
already_copied = False
Expand Down
18 changes: 13 additions & 5 deletions pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from pandas.compat import HAS_PYARROW
from pandas.compat.pyarrow import pa_version_under12p0
import pandas.util._test_decorators as td

from pandas import (
DataFrame,
Expand Down Expand Up @@ -111,7 +110,8 @@ def test_astype_string_and_object_update_original(dtype, new_dtype):
tm.assert_frame_equal(df2, df_orig)


def test_astype_string_copy_on_pickle_roundrip():
def test_astype_str_copy_on_pickle_roundrip():
# TODO(infer_string) this test can be removed after 3.0 (once str is the default)
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
Expand All @@ -120,14 +120,22 @@ def test_astype_string_copy_on_pickle_roundrip():
tm.assert_series_equal(base, base_copy)


@td.skip_if_no("pyarrow")
def test_astype_string_read_only_on_pickle_roundrip():
def test_astype_string_copy_on_pickle_roundrip(any_string_dtype):
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
base_copy = pickle.loads(pickle.dumps(base))
base_copy.astype(any_string_dtype)
tm.assert_series_equal(base, base_copy)


def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype):
# https://github.com/pandas-dev/pandas/issues/54654
# ensure_string_array may alter read-only array inplace
base = Series(np.array([(1, 2), None, 1], dtype="object"))
base_copy = pickle.loads(pickle.dumps(base))
base_copy._values.flags.writeable = False
base_copy.astype("string[pyarrow]")
base_copy.astype(any_string_dtype)
tm.assert_series_equal(base, base_copy)


Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/libs/test_lib.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pickle

import numpy as np
import pytest

Expand Down Expand Up @@ -283,3 +285,15 @@ def test_no_default_pickle():
# GH#40397
obj = tm.round_trip_pickle(lib.no_default)
assert obj is lib.no_default


def test_ensure_string_array_copy():
# ensure the original array is not modified in case of copy=False with
# pickle-roundtripped object dtype array
# https://github.com/pandas-dev/pandas/issues/54654
arr = np.array(["a", None], dtype=object)
arr = pickle.loads(pickle.dumps(arr))
result = lib.ensure_string_array(arr, copy=False)
assert not np.shares_memory(arr, result)
assert arr[1] is None
assert result[1] is np.nan

0 comments on commit 21b574e

Please sign in to comment.