Skip to content

Commit

Permalink
[backport 2.3.x] TST (string dtype): resolve xfails in pandas/tests/c…
Browse files Browse the repository at this point in the history
…opy_view (#60245) (#60257)
  • Loading branch information
jorisvandenbossche authored Dec 19, 2024
1 parent 75a1007 commit c079337
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 61 deletions.
28 changes: 9 additions & 19 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
TYPE_CHECKING,
Callable,
ContextManager,
cast,
)
import warnings

Expand All @@ -23,8 +22,6 @@

from pandas.compat import pa_version_under10p1

from pandas.core.dtypes.common import is_string_dtype

import pandas as pd
from pandas import (
ArrowDtype,
Expand Down Expand Up @@ -83,8 +80,8 @@
with_csv_dialect,
)
from pandas.core.arrays import (
ArrowExtensionArray,
BaseMaskedArray,
ExtensionArray,
NumpyExtensionArray,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
Expand All @@ -96,7 +93,6 @@
NpDtype,
)

from pandas.core.arrays import ArrowExtensionArray

UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
Expand Down Expand Up @@ -530,24 +526,18 @@ def shares_memory(left, right) -> bool:
if isinstance(left, pd.core.arrays.IntervalArray):
return shares_memory(left._left, right) or shares_memory(left._right, right)

if (
isinstance(left, ExtensionArray)
and is_string_dtype(left.dtype)
and left.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left = cast("ArrowExtensionArray", left)
if (
isinstance(right, ExtensionArray)
and is_string_dtype(right.dtype)
and right.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
right = cast("ArrowExtensionArray", right)
if isinstance(left, ArrowExtensionArray):
if isinstance(right, ArrowExtensionArray):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left_pa_data = left._pa_array
right_pa_data = right._pa_array
left_buf1 = left_pa_data.chunk(0).buffers()[1]
right_buf1 = right_pa_data.chunk(0).buffers()[1]
return left_buf1 == right_buf1
return left_buf1.address == right_buf1.address
else:
# if we have one ArrowExtensionArray and one other array, assume
# they can only share memory if they share the same numpy buffer
return np.shares_memory(left, right)

if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
# By convention, we'll say these share memory if they share *either*
Expand Down
22 changes: 12 additions & 10 deletions pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW
from pandas.compat.pyarrow import pa_version_under12p0
import pandas.util._test_decorators as td
Expand Down Expand Up @@ -244,7 +242,6 @@ def test_astype_arrow_timestamp(using_copy_on_write):
)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_convert_dtypes_infer_objects(using_copy_on_write):
ser = Series(["a", "b", "c"])
ser_orig = ser.copy()
Expand All @@ -256,30 +253,35 @@ def test_convert_dtypes_infer_objects(using_copy_on_write):
)

if using_copy_on_write:
assert np.shares_memory(get_array(ser), get_array(result))
assert tm.shares_memory(get_array(ser), get_array(result))
else:
assert not np.shares_memory(get_array(ser), get_array(result))

result.iloc[0] = "x"
tm.assert_series_equal(ser, ser_orig)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_convert_dtypes(using_copy_on_write):
def test_convert_dtypes(using_copy_on_write, using_infer_string):
df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
df_orig = df.copy()
df2 = df.convert_dtypes()

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
if using_infer_string and HAS_PYARROW:
# TODO the default nullable string dtype still uses python storage
# this should be changed to pyarrow if installed
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c"))
else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
assert not np.shares_memory(get_array(df2, "d"), get_array(df, "d"))

df2.iloc[0, 0] = "x"
df2.iloc[0, 1] = 10
tm.assert_frame_equal(df, df_orig)
1 change: 0 additions & 1 deletion pandas/tests/copy_view/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,6 @@ def test_concat_copy_keyword(using_copy_on_write, copy):
assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))


# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize(
"func",
[
Expand Down
43 changes: 24 additions & 19 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW
from pandas.errors import SettingWithCopyWarning

Expand Down Expand Up @@ -953,15 +951,19 @@ def test_head_tail(method, using_copy_on_write, warn_copy_on_write):
tm.assert_frame_equal(df, df_orig)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_infer_objects(using_copy_on_write):
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
def test_infer_objects(using_copy_on_write, using_infer_string):
df = DataFrame(
{"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"}
)
df_orig = df.copy()
df2 = df.infer_objects()

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
if using_infer_string:
assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
else:
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
Expand All @@ -975,19 +977,16 @@ def test_infer_objects(using_copy_on_write):
tm.assert_frame_equal(df, df_orig)


@pytest.mark.xfail(
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
)
def test_infer_objects_no_reference(using_copy_on_write):
def test_infer_objects_no_reference(using_copy_on_write, using_infer_string):
df = DataFrame(
{
"a": [1, 2],
"b": "c",
"b": Series(["x", "y"], dtype=object),
"c": 1,
"d": Series(
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
),
"e": "b",
"e": Series(["z", "w"], dtype=object),
}
)
df = df.infer_objects()
Expand All @@ -1001,16 +1000,22 @@ def test_infer_objects_no_reference(using_copy_on_write):
df.iloc[0, 3] = Timestamp("2018-12-31")
if using_copy_on_write:
assert np.shares_memory(arr_a, get_array(df, "a"))
# TODO(CoW): Block splitting causes references here
assert not np.shares_memory(arr_b, get_array(df, "b"))
if using_infer_string:
# note that the underlying memory of arr_b has been copied anyway
# because of the assignment, but the EA is updated inplace so still
# appears to share memory
assert tm.shares_memory(arr_b, get_array(df, "b"))
else:
# TODO(CoW): Block splitting causes references here
assert not np.shares_memory(arr_b, get_array(df, "b"))
assert np.shares_memory(arr_d, get_array(df, "d"))


def test_infer_objects_reference(using_copy_on_write):
def test_infer_objects_reference(using_copy_on_write, using_infer_string):
df = DataFrame(
{
"a": [1, 2],
"b": "c",
"b": Series(["x", "y"], dtype=object),
"c": 1,
"d": Series(
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
Expand All @@ -1029,7 +1034,8 @@ def test_infer_objects_reference(using_copy_on_write):
df.iloc[0, 3] = Timestamp("2018-12-31")
if using_copy_on_write:
assert not np.shares_memory(arr_a, get_array(df, "a"))
assert not np.shares_memory(arr_b, get_array(df, "b"))
if not using_infer_string or HAS_PYARROW:
assert not np.shares_memory(arr_b, get_array(df, "b"))
assert np.shares_memory(arr_d, get_array(df, "d"))


Expand Down Expand Up @@ -1184,15 +1190,14 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ
assert np.shares_memory(get_array(obj, "a"), get_array(view, "a"))


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize("decimals", [-1, 0, 1])
def test_round(using_copy_on_write, warn_copy_on_write, decimals):
df = DataFrame({"a": [1, 2], "b": "c"})
df_orig = df.copy()
df2 = df.round(decimals=decimals)

if using_copy_on_write:
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
# TODO: Make inplace by using out parameter of ndarray.round?
if decimals >= 0:
# Ensure lazy copy if no-op
Expand Down
18 changes: 6 additions & 12 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW

from pandas import (
Categorical,
DataFrame,
Expand All @@ -14,7 +10,6 @@
from pandas.tests.copy_view.util import get_array


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize(
"replace_kwargs",
[
Expand All @@ -31,15 +26,15 @@
],
)
def test_replace(using_copy_on_write, replace_kwargs):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
df_orig = df.copy()

df_replaced = df.replace(**replace_kwargs)

if using_copy_on_write:
if (df_replaced["b"] == df["b"]).all():
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))

# mutating squeezed df triggers a copy-on-write for that column/block
df_replaced.loc[0, "c"] = -1
Expand All @@ -61,26 +56,25 @@ def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write):
with tm.assert_cow_warning(warn_copy_on_write):
df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
if using_copy_on_write:
assert not np.shares_memory(arr, get_array(df, "a"))
assert not tm.shares_memory(arr, get_array(df, "a"))
assert df._mgr._has_no_reference(0)
tm.assert_frame_equal(view, df_orig)
else:
assert np.shares_memory(arr, get_array(df, "a"))


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_replace_regex_inplace(using_copy_on_write):
df = DataFrame({"a": ["aaa", "bbb"]})
arr = get_array(df, "a")
df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
if using_copy_on_write:
assert df._mgr._has_no_reference(0)
assert np.shares_memory(arr, get_array(df, "a"))
assert tm.shares_memory(arr, get_array(df, "a"))

df_orig = df.copy()
df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True)
tm.assert_frame_equal(df_orig, df)
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))


def test_replace_regex_inplace_no_op(using_copy_on_write):
Expand Down Expand Up @@ -362,7 +356,7 @@ def test_replace_object_list_inplace(using_copy_on_write, value):
arr = get_array(df, "a")
df.replace(["c"], value, inplace=True)
if using_copy_on_write or value is None:
assert np.shares_memory(arr, get_array(df, "a"))
assert tm.shares_memory(arr, get_array(df, "a"))
else:
# This could be inplace
assert not np.shares_memory(arr, get_array(df, "a"))
Expand Down

0 comments on commit c079337

Please sign in to comment.