Skip to content

Commit

Permalink
TST (string dtype): resolve xfails in pandas/tests/copy_view (#60245)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored Nov 8, 2024
1 parent 7740c4e commit 084b199
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 57 deletions.
28 changes: 9 additions & 19 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from typing import (
TYPE_CHECKING,
ContextManager,
cast,
)

import numpy as np
Expand All @@ -21,8 +20,6 @@

from pandas.compat import pa_version_under10p1

from pandas.core.dtypes.common import is_string_dtype

import pandas as pd
from pandas import (
ArrowDtype,
Expand Down Expand Up @@ -77,8 +74,8 @@
with_csv_dialect,
)
from pandas.core.arrays import (
ArrowExtensionArray,
BaseMaskedArray,
ExtensionArray,
NumpyExtensionArray,
)
from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
Expand All @@ -92,7 +89,6 @@
NpDtype,
)

from pandas.core.arrays import ArrowExtensionArray

UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
Expand Down Expand Up @@ -512,24 +508,18 @@ def shares_memory(left, right) -> bool:
if isinstance(left, pd.core.arrays.IntervalArray):
return shares_memory(left._left, right) or shares_memory(left._right, right)

if (
isinstance(left, ExtensionArray)
and is_string_dtype(left.dtype)
and left.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left = cast("ArrowExtensionArray", left)
if (
isinstance(right, ExtensionArray)
and is_string_dtype(right.dtype)
and right.dtype.storage == "pyarrow" # type: ignore[attr-defined]
):
right = cast("ArrowExtensionArray", right)
if isinstance(left, ArrowExtensionArray):
if isinstance(right, ArrowExtensionArray):
# https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
left_pa_data = left._pa_array
right_pa_data = right._pa_array
left_buf1 = left_pa_data.chunk(0).buffers()[1]
right_buf1 = right_pa_data.chunk(0).buffers()[1]
return left_buf1 == right_buf1
return left_buf1.address == right_buf1.address
else:
# if we have one one ArrowExtensionArray and one other array, assume
# they can only share memory if they share the same numpy buffer
return np.shares_memory(left, right)

if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
# By convention, we'll say these share memory if they share *either*
Expand Down
22 changes: 12 additions & 10 deletions pandas/tests/copy_view/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW
from pandas.compat.pyarrow import pa_version_under12p0

Expand Down Expand Up @@ -206,7 +204,6 @@ def test_astype_arrow_timestamp():
assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_convert_dtypes_infer_objects():
ser = Series(["a", "b", "c"])
ser_orig = ser.copy()
Expand All @@ -217,20 +214,25 @@ def test_convert_dtypes_infer_objects():
convert_string=False,
)

assert np.shares_memory(get_array(ser), get_array(result))
assert tm.shares_memory(get_array(ser), get_array(result))
result.iloc[0] = "x"
tm.assert_series_equal(ser, ser_orig)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_convert_dtypes():
def test_convert_dtypes(using_infer_string):
df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
df_orig = df.copy()
df2 = df.convert_dtypes()

assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
if using_infer_string and HAS_PYARROW:
# TODO the default nullable string dtype still uses python storage
# this should be changed to pyarrow if installed
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
else:
assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c"))
df2.iloc[0, 0] = "x"
df2.iloc[0, 1] = 10
tm.assert_frame_equal(df, df_orig)
1 change: 0 additions & 1 deletion pandas/tests/copy_view/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,6 @@ def test_concat_copy_keyword():
assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))


# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize(
"func",
[
Expand Down
38 changes: 21 additions & 17 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW

import pandas as pd
Expand Down Expand Up @@ -716,14 +714,18 @@ def test_head_tail(method):
tm.assert_frame_equal(df, df_orig)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_infer_objects():
df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
def test_infer_objects(using_infer_string):
df = DataFrame(
{"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"}
)
df_orig = df.copy()
df2 = df.infer_objects()

assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
if using_infer_string and HAS_PYARROW:
assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
else:
assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))

df2.iloc[0, 0] = 0
df2.iloc[0, 1] = "d"
Expand All @@ -732,19 +734,16 @@ def test_infer_objects():
tm.assert_frame_equal(df, df_orig)


@pytest.mark.xfail(
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
)
def test_infer_objects_no_reference():
def test_infer_objects_no_reference(using_infer_string):
df = DataFrame(
{
"a": [1, 2],
"b": "c",
"b": Series(["x", "y"], dtype=object),
"c": 1,
"d": Series(
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
),
"e": "b",
"e": Series(["z", "w"], dtype=object),
}
)
df = df.infer_objects()
Expand All @@ -757,16 +756,22 @@ def test_infer_objects_no_reference():
df.iloc[0, 1] = "d"
df.iloc[0, 3] = Timestamp("2018-12-31")
assert np.shares_memory(arr_a, get_array(df, "a"))
# TODO(CoW): Block splitting causes references here
assert not np.shares_memory(arr_b, get_array(df, "b"))
if using_infer_string and HAS_PYARROW:
# note that the underlying memory of arr_b has been copied anyway
# because of the assignment, but the EA is updated inplace so still
# appears the share memory
assert tm.shares_memory(arr_b, get_array(df, "b"))
else:
# TODO(CoW): Block splitting causes references here
assert not np.shares_memory(arr_b, get_array(df, "b"))
assert np.shares_memory(arr_d, get_array(df, "d"))


def test_infer_objects_reference():
df = DataFrame(
{
"a": [1, 2],
"b": "c",
"b": Series(["x", "y"], dtype=object),
"c": 1,
"d": Series(
[Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
Expand Down Expand Up @@ -904,14 +909,13 @@ def test_sort_values_inplace(obj, kwargs):
tm.assert_equal(view, obj_orig)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
@pytest.mark.parametrize("decimals", [-1, 0, 1])
def test_round(decimals):
df = DataFrame({"a": [1, 2], "b": "c"})
df_orig = df.copy()
df2 = df.round(decimals=decimals)

assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
# TODO: Make inplace by using out parameter of ndarray.round?
if decimals >= 0:
# Ensure lazy copy if no-op
Expand Down
14 changes: 4 additions & 10 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import HAS_PYARROW

from pandas import (
Categorical,
DataFrame,
Expand All @@ -13,7 +9,6 @@
from pandas.tests.copy_view.util import get_array


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.parametrize(
"replace_kwargs",
[
Expand All @@ -30,14 +25,14 @@
],
)
def test_replace(replace_kwargs):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
df_orig = df.copy()

df_replaced = df.replace(**replace_kwargs)

if (df_replaced["b"] == df["b"]).all():
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))

# mutating squeezed df triggers a copy-on-write for that column/block
df_replaced.loc[0, "c"] = -1
Expand All @@ -61,18 +56,17 @@ def test_replace_regex_inplace_refs():
tm.assert_frame_equal(view, df_orig)


@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
def test_replace_regex_inplace():
df = DataFrame({"a": ["aaa", "bbb"]})
arr = get_array(df, "a")
df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
assert df._mgr._has_no_reference(0)
assert np.shares_memory(arr, get_array(df, "a"))
assert tm.shares_memory(arr, get_array(df, "a"))

df_orig = df.copy()
df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True)
tm.assert_frame_equal(df_orig, df)
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))


def test_replace_regex_inplace_no_op():
Expand Down

0 comments on commit 084b199

Please sign in to comment.