[backport 2.3.x] TST (string dtype): resolve xfails in pandas/tests/copy_view (#60245) (#60257)
pandas-dev · Dec 19, 2024 · c079337 · c079337
1 parent 75a1007
commit c079337
Show file tree

Hide file tree

Showing 5 changed files with 51 additions and 61 deletions.
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -8,7 +8,6 @@
     TYPE_CHECKING,
     Callable,
     ContextManager,
-    cast,
 )
 import warnings
 
@@ -23,8 +22,6 @@
 
 from pandas.compat import pa_version_under10p1
 
-from pandas.core.dtypes.common import is_string_dtype
-
 import pandas as pd
 from pandas import (
     ArrowDtype,
@@ -83,8 +80,8 @@
     with_csv_dialect,
 )
 from pandas.core.arrays import (
+    ArrowExtensionArray,
     BaseMaskedArray,
-    ExtensionArray,
     NumpyExtensionArray,
 )
 from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
@@ -96,7 +93,6 @@
         NpDtype,
     )
 
-    from pandas.core.arrays import ArrowExtensionArray
 
 UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
 UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
@@ -530,24 +526,18 @@ def shares_memory(left, right) -> bool:
     if isinstance(left, pd.core.arrays.IntervalArray):
         return shares_memory(left._left, right) or shares_memory(left._right, right)
 
-    if (
-        isinstance(left, ExtensionArray)
-        and is_string_dtype(left.dtype)
-        and left.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
-    ):
-        # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
-        left = cast("ArrowExtensionArray", left)
-        if (
-            isinstance(right, ExtensionArray)
-            and is_string_dtype(right.dtype)
-            and right.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
-        ):
-            right = cast("ArrowExtensionArray", right)
+    if isinstance(left, ArrowExtensionArray):
+        if isinstance(right, ArrowExtensionArray):
+            # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
             left_pa_data = left._pa_array
             right_pa_data = right._pa_array
             left_buf1 = left_pa_data.chunk(0).buffers()[1]
             right_buf1 = right_pa_data.chunk(0).buffers()[1]
-            return left_buf1 == right_buf1
+            return left_buf1.address == right_buf1.address
+        else:
+            # if we have one ArrowExtensionArray and one other array, assume
+            # they can only share memory if they share the same numpy buffer
+            return np.shares_memory(left, right)
 
     if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
         # By convention, we'll say these share memory if they share *either*

diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import HAS_PYARROW
 from pandas.compat.pyarrow import pa_version_under12p0
 import pandas.util._test_decorators as td
@@ -244,7 +242,6 @@ def test_astype_arrow_timestamp(using_copy_on_write):
             )
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_convert_dtypes_infer_objects(using_copy_on_write):
     ser = Series(["a", "b", "c"])
     ser_orig = ser.copy()
@@ -256,30 +253,35 @@ def test_convert_dtypes_infer_objects(using_copy_on_write):
     )
 
     if using_copy_on_write:
-        assert np.shares_memory(get_array(ser), get_array(result))
+        assert tm.shares_memory(get_array(ser), get_array(result))
     else:
         assert not np.shares_memory(get_array(ser), get_array(result))
 
     result.iloc[0] = "x"
     tm.assert_series_equal(ser, ser_orig)
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
-def test_convert_dtypes(using_copy_on_write):
+def test_convert_dtypes(using_copy_on_write, using_infer_string):
     df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
     df_orig = df.copy()
     df2 = df.convert_dtypes()
 
     if using_copy_on_write:
-        assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
-        assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
-        assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
-        assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+        if using_infer_string and HAS_PYARROW:
+            # TODO the default nullable string dtype still uses python storage
+            # this should be changed to pyarrow if installed
+            assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+        else:
+            assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+        assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
+        assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+        assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c"))
     else:
         assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
         assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
         assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
         assert not np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
 
     df2.iloc[0, 0] = "x"
+    df2.iloc[0, 1] = 10
     tm.assert_frame_equal(df, df_orig)
diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py
@@ -201,7 +201,6 @@ def test_concat_copy_keyword(using_copy_on_write, copy):
         assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
 
 
-# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "func",
     [

diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import HAS_PYARROW
 from pandas.errors import SettingWithCopyWarning
 
@@ -953,15 +951,19 @@ def test_head_tail(method, using_copy_on_write, warn_copy_on_write):
     tm.assert_frame_equal(df, df_orig)
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
-def test_infer_objects(using_copy_on_write):
-    df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
+def test_infer_objects(using_copy_on_write, using_infer_string):
+    df = DataFrame(
+        {"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"}
+    )
     df_orig = df.copy()
     df2 = df.infer_objects()
 
     if using_copy_on_write:
         assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
-        assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+        if using_infer_string:
+            assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+        else:
+            assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
 
     else:
         assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
@@ -975,19 +977,16 @@ def test_infer_objects(using_copy_on_write):
     tm.assert_frame_equal(df, df_orig)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
-def test_infer_objects_no_reference(using_copy_on_write):
+def test_infer_objects_no_reference(using_copy_on_write, using_infer_string):
     df = DataFrame(
         {
             "a": [1, 2],
-            "b": "c",
+            "b": Series(["x", "y"], dtype=object),
             "c": 1,
             "d": Series(
                 [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
             ),
-            "e": "b",
+            "e": Series(["z", "w"], dtype=object),
         }
     )
     df = df.infer_objects()
@@ -1001,16 +1000,22 @@ def test_infer_objects_no_reference(using_copy_on_write):
     df.iloc[0, 3] = Timestamp("2018-12-31")
     if using_copy_on_write:
         assert np.shares_memory(arr_a, get_array(df, "a"))
-        # TODO(CoW): Block splitting causes references here
-        assert not np.shares_memory(arr_b, get_array(df, "b"))
+        if using_infer_string:
+            # note that the underlying memory of arr_b has been copied anyway
+            # because of the assignment, but the EA is updated inplace so still
+            # appears to share memory
+            assert tm.shares_memory(arr_b, get_array(df, "b"))
+        else:
+            # TODO(CoW): Block splitting causes references here
+            assert not np.shares_memory(arr_b, get_array(df, "b"))
         assert np.shares_memory(arr_d, get_array(df, "d"))
 
 
-def test_infer_objects_reference(using_copy_on_write):
+def test_infer_objects_reference(using_copy_on_write, using_infer_string):
     df = DataFrame(
         {
             "a": [1, 2],
-            "b": "c",
+            "b": Series(["x", "y"], dtype=object),
             "c": 1,
             "d": Series(
                 [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
@@ -1029,7 +1034,8 @@ def test_infer_objects_reference(using_copy_on_write):
     df.iloc[0, 3] = Timestamp("2018-12-31")
     if using_copy_on_write:
         assert not np.shares_memory(arr_a, get_array(df, "a"))
-        assert not np.shares_memory(arr_b, get_array(df, "b"))
+        if not using_infer_string or HAS_PYARROW:
+            assert not np.shares_memory(arr_b, get_array(df, "b"))
         assert np.shares_memory(arr_d, get_array(df, "d"))
 
 
@@ -1184,15 +1190,14 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ
         assert np.shares_memory(get_array(obj, "a"), get_array(view, "a"))
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 @pytest.mark.parametrize("decimals", [-1, 0, 1])
 def test_round(using_copy_on_write, warn_copy_on_write, decimals):
     df = DataFrame({"a": [1, 2], "b": "c"})
     df_orig = df.copy()
     df2 = df.round(decimals=decimals)
 
     if using_copy_on_write:
-        assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+        assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
         # TODO: Make inplace by using out parameter of ndarray.round?
         if decimals >= 0:
             # Ensure lazy copy if no-op

diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
@@ -1,10 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
-from pandas.compat import HAS_PYARROW
-
 from pandas import (
     Categorical,
     DataFrame,
@@ -14,7 +10,6 @@
 from pandas.tests.copy_view.util import get_array
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "replace_kwargs",
     [
@@ -31,15 +26,15 @@
     ],
 )
 def test_replace(using_copy_on_write, replace_kwargs):
-    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
     df_orig = df.copy()
 
     df_replaced = df.replace(**replace_kwargs)
 
     if using_copy_on_write:
         if (df_replaced["b"] == df["b"]).all():
             assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
-        assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
+        assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
 
     # mutating squeezed df triggers a copy-on-write for that column/block
     df_replaced.loc[0, "c"] = -1
@@ -61,26 +56,25 @@ def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write):
     with tm.assert_cow_warning(warn_copy_on_write):
         df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
     if using_copy_on_write:
-        assert not np.shares_memory(arr, get_array(df, "a"))
+        assert not tm.shares_memory(arr, get_array(df, "a"))
         assert df._mgr._has_no_reference(0)
         tm.assert_frame_equal(view, df_orig)
     else:
         assert np.shares_memory(arr, get_array(df, "a"))
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_replace_regex_inplace(using_copy_on_write):
     df = DataFrame({"a": ["aaa", "bbb"]})
     arr = get_array(df, "a")
     df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
     if using_copy_on_write:
         assert df._mgr._has_no_reference(0)
-    assert np.shares_memory(arr, get_array(df, "a"))
+    assert tm.shares_memory(arr, get_array(df, "a"))
 
     df_orig = df.copy()
     df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True)
     tm.assert_frame_equal(df_orig, df)
-    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
 
 
 def test_replace_regex_inplace_no_op(using_copy_on_write):
@@ -362,7 +356,7 @@ def test_replace_object_list_inplace(using_copy_on_write, value):
     arr = get_array(df, "a")
     df.replace(["c"], value, inplace=True)
     if using_copy_on_write or value is None:
-        assert np.shares_memory(arr, get_array(df, "a"))
+        assert tm.shares_memory(arr, get_array(df, "a"))
     else:
         # This could be inplace
         assert not np.shares_memory(arr, get_array(df, "a"))
-Original file line number
+Diff line change
@@ Expand Up / @@ -201,7 +201,6 @@ def test_concat_copy_keyword(using_copy_on_write, copy): @@
             assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
-    # @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
     @pytest.mark.parametrize(
         "func",
         [
@@ Expand Down @@