From 75a1007e6c40ec765fb3764935e84bb34acf0163 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Wed, 18 Dec 2024 21:21:32 +0100
Subject: [PATCH 1/2] [backport 2.3.x] TST (string dtype): un-xfail string
 tests specific to object dtype (#59433) (#60180)

Co-authored-by: jbrockmendel <jbrockmendel@gmail.com>
---
 pandas/tests/copy_view/test_interp_fillna.py | 13 ++++------
 pandas/tests/copy_view/test_replace.py       |  3 +--
 pandas/tests/test_algos.py                   | 26 ++++++++++++++------
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py
index 338b76cbf1e7a..d0c4fa53faab9 100644
--- a/pandas/tests/copy_view/test_interp_fillna.py
+++ b/pandas/tests/copy_view/test_interp_fillna.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     NA,
     ArrowDtype,
@@ -137,10 +135,9 @@ def test_interp_fill_functions_inplace(
         assert np.shares_memory(arr, get_array(df, "a")) is (dtype == "float64")
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_interpolate_cleaned_fill_method(using_copy_on_write):
-    # Check that "method is set to None" case works correctly
+def test_interpolate_cannot_with_object_dtype(using_copy_on_write):
     df = DataFrame({"a": ["a", np.nan, "c"], "b": 1})
+    df["a"] = df["a"].astype(object)
     df_orig = df.copy()
 
     msg = "DataFrame.interpolate with object dtype"
@@ -159,16 +156,16 @@ def test_interpolate_cleaned_fill_method(using_copy_on_write):
     tm.assert_frame_equal(df, df_orig)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_interpolate_object_convert_no_op(using_copy_on_write):
+def test_interpolate_object_convert_no_op(using_copy_on_write, using_infer_string):
     df = DataFrame({"a": ["a", "b", "c"], "b": 1})
+    df["a"] = df["a"].astype(object)
     arr_a = get_array(df, "a")
     msg = "DataFrame.interpolate with method=pad is deprecated"
     with tm.assert_produces_warning(FutureWarning, match=msg):
         df.interpolate(method="pad", inplace=True)
 
     # Now CoW makes a copy, it should not!
-    if using_copy_on_write:
+    if using_copy_on_write and not using_infer_string:
         assert df._mgr._has_no_reference(0)
         assert np.shares_memory(arr_a, get_array(df, "a"))
 
diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
index bc3edb1f72214..9e24ce319e3bf 100644
--- a/pandas/tests/copy_view/test_replace.py
+++ b/pandas/tests/copy_view/test_replace.py
@@ -356,10 +356,9 @@ def test_replace_empty_list(using_copy_on_write):
         assert not df2._mgr._has_no_reference(0)
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 @pytest.mark.parametrize("value", ["d", None])
 def test_replace_object_list_inplace(using_copy_on_write, value):
-    df = DataFrame({"a": ["a", "b", "c"]})
+    df = DataFrame({"a": ["a", "b", "c"]}, dtype=object)
     arr = get_array(df, "a")
     df.replace(["c"], value, inplace=True)
     if using_copy_on_write or value is None:
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index d1e69cfa2b4ee..80ee0f6e067f9 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1704,12 +1704,17 @@ def test_unique_complex_numbers(self, array, expected):
 
 
 class TestHashTable:
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize(
         "htable, data",
         [
-            (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]),
-            (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]),
+            (
+                ht.PyObjectHashTable,
+                np.array([f"foo_{i}" for i in range(1000)], dtype=object),
+            ),
+            (
+                ht.StringHashTable,
+                np.array([f"foo_{i}" for i in range(1000)], dtype=object),
+            ),
             (ht.Float64HashTable, np.arange(1000, dtype=np.float64)),
             (ht.Int64HashTable, np.arange(1000, dtype=np.int64)),
             (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)),
@@ -1717,7 +1722,7 @@ class TestHashTable:
     )
     def test_hashtable_unique(self, htable, data, writable):
         # output of maker has guaranteed unique elements
-        s = Series(data)
+        s = Series(data, dtype=data.dtype)
         if htable == ht.Float64HashTable:
             # add NaN for float column
             s.loc[500] = np.nan
@@ -1744,12 +1749,17 @@ def test_hashtable_unique(self, htable, data, writable):
         reconstr = result_unique[result_inverse]
         tm.assert_numpy_array_equal(reconstr, s_duplicated.values)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize(
         "htable, data",
         [
-            (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]),
-            (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]),
+            (
+                ht.PyObjectHashTable,
+                np.array([f"foo_{i}" for i in range(1000)], dtype=object),
+            ),
+            (
+                ht.StringHashTable,
+                np.array([f"foo_{i}" for i in range(1000)], dtype=object),
+            ),
             (ht.Float64HashTable, np.arange(1000, dtype=np.float64)),
             (ht.Int64HashTable, np.arange(1000, dtype=np.int64)),
             (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)),
@@ -1757,7 +1767,7 @@ def test_hashtable_unique(self, htable, data, writable):
     )
     def test_hashtable_factorize(self, htable, writable, data):
         # output of maker has guaranteed unique elements
-        s = Series(data)
+        s = Series(data, dtype=data.dtype)
         if htable == ht.Float64HashTable:
             # add NaN for float column
             s.loc[500] = np.nan

From c07933716ef30860e66373b10fd0177c22cb5970 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 19 Dec 2024 09:42:30 +0100
Subject: [PATCH 2/2] [backport 2.3.x] TST (string dtype): resolve xfails in
 pandas/tests/copy_view (#60245) (#60257)

---
 pandas/_testing/__init__.py              | 28 +++++----------
 pandas/tests/copy_view/test_astype.py    | 22 ++++++------
 pandas/tests/copy_view/test_functions.py |  1 -
 pandas/tests/copy_view/test_methods.py   | 43 +++++++++++++-----------
 pandas/tests/copy_view/test_replace.py   | 18 ++++------
 5 files changed, 51 insertions(+), 61 deletions(-)

diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
index 2d066b581f1c6..d7197f23ce1e4 100644
--- a/pandas/_testing/__init__.py
+++ b/pandas/_testing/__init__.py
@@ -8,7 +8,6 @@
     TYPE_CHECKING,
     Callable,
     ContextManager,
-    cast,
 )
 import warnings
 
@@ -23,8 +22,6 @@
 
 from pandas.compat import pa_version_under10p1
 
-from pandas.core.dtypes.common import is_string_dtype
-
 import pandas as pd
 from pandas import (
     ArrowDtype,
@@ -83,8 +80,8 @@
     with_csv_dialect,
 )
 from pandas.core.arrays import (
+    ArrowExtensionArray,
     BaseMaskedArray,
-    ExtensionArray,
     NumpyExtensionArray,
 )
 from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
@@ -96,7 +93,6 @@
         NpDtype,
     )
 
-    from pandas.core.arrays import ArrowExtensionArray
 
 UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
 UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
@@ -530,24 +526,18 @@ def shares_memory(left, right) -> bool:
     if isinstance(left, pd.core.arrays.IntervalArray):
         return shares_memory(left._left, right) or shares_memory(left._right, right)
 
-    if (
-        isinstance(left, ExtensionArray)
-        and is_string_dtype(left.dtype)
-        and left.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
-    ):
-        # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
-        left = cast("ArrowExtensionArray", left)
-        if (
-            isinstance(right, ExtensionArray)
-            and is_string_dtype(right.dtype)
-            and right.dtype.storage == "pyarrow"  # type: ignore[attr-defined]
-        ):
-            right = cast("ArrowExtensionArray", right)
+    if isinstance(left, ArrowExtensionArray):
+        if isinstance(right, ArrowExtensionArray):
+            # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
             left_pa_data = left._pa_array
             right_pa_data = right._pa_array
             left_buf1 = left_pa_data.chunk(0).buffers()[1]
             right_buf1 = right_pa_data.chunk(0).buffers()[1]
-            return left_buf1 == right_buf1
+            return left_buf1.address == right_buf1.address
+        else:
+            # if we have one ArrowExtensionArray and one other array, assume
+            # they can only share memory if they share the same numpy buffer
+            return np.shares_memory(left, right)
 
     if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
         # By convention, we'll say these share memory if they share *either*
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
index e0e3f6dc058a4..45fc3333c49a7 100644
--- a/pandas/tests/copy_view/test_astype.py
+++ b/pandas/tests/copy_view/test_astype.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import HAS_PYARROW
 from pandas.compat.pyarrow import pa_version_under12p0
 import pandas.util._test_decorators as td
@@ -244,7 +242,6 @@ def test_astype_arrow_timestamp(using_copy_on_write):
             )
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_convert_dtypes_infer_objects(using_copy_on_write):
     ser = Series(["a", "b", "c"])
     ser_orig = ser.copy()
@@ -256,7 +253,7 @@ def test_convert_dtypes_infer_objects(using_copy_on_write):
     )
 
     if using_copy_on_write:
-        assert np.shares_memory(get_array(ser), get_array(result))
+        assert tm.shares_memory(get_array(ser), get_array(result))
     else:
         assert not np.shares_memory(get_array(ser), get_array(result))
 
@@ -264,17 +261,21 @@ def test_convert_dtypes_infer_objects(using_copy_on_write):
     tm.assert_series_equal(ser, ser_orig)
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
-def test_convert_dtypes(using_copy_on_write):
+def test_convert_dtypes(using_copy_on_write, using_infer_string):
     df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
     df_orig = df.copy()
     df2 = df.convert_dtypes()
 
     if using_copy_on_write:
-        assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
-        assert np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
-        assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
-        assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+        if using_infer_string and HAS_PYARROW:
+            # TODO the default nullable string dtype still uses python storage
+            # this should be changed to pyarrow if installed
+            assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+        else:
+            assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+        assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
+        assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+        assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c"))
     else:
         assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
         assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
@@ -282,4 +283,5 @@ def test_convert_dtypes(using_copy_on_write):
         assert not np.shares_memory(get_array(df2, "d"), get_array(df, "d"))
 
     df2.iloc[0, 0] = "x"
+    df2.iloc[0, 1] = 10
     tm.assert_frame_equal(df, df_orig)
diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py
index 23ed7f9edcd22..eefd27964e6ae 100644
--- a/pandas/tests/copy_view/test_functions.py
+++ b/pandas/tests/copy_view/test_functions.py
@@ -201,7 +201,6 @@ def test_concat_copy_keyword(using_copy_on_write, copy):
         assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
 
 
-# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "func",
     [
diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
index 295d93580f451..09738fe1023fb 100644
--- a/pandas/tests/copy_view/test_methods.py
+++ b/pandas/tests/copy_view/test_methods.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import HAS_PYARROW
 from pandas.errors import SettingWithCopyWarning
 
@@ -953,15 +951,19 @@ def test_head_tail(method, using_copy_on_write, warn_copy_on_write):
     tm.assert_frame_equal(df, df_orig)
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
-def test_infer_objects(using_copy_on_write):
-    df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"})
+def test_infer_objects(using_copy_on_write, using_infer_string):
+    df = DataFrame(
+        {"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"}
+    )
     df_orig = df.copy()
     df2 = df.infer_objects()
 
     if using_copy_on_write:
         assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
-        assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+        if using_infer_string:
+            assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+        else:
+            assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
 
     else:
         assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
@@ -975,19 +977,16 @@ def test_infer_objects(using_copy_on_write):
     tm.assert_frame_equal(df, df_orig)
 
 
-@pytest.mark.xfail(
-    using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-)
-def test_infer_objects_no_reference(using_copy_on_write):
+def test_infer_objects_no_reference(using_copy_on_write, using_infer_string):
     df = DataFrame(
         {
             "a": [1, 2],
-            "b": "c",
+            "b": Series(["x", "y"], dtype=object),
             "c": 1,
             "d": Series(
                 [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
             ),
-            "e": "b",
+            "e": Series(["z", "w"], dtype=object),
         }
     )
     df = df.infer_objects()
@@ -1001,16 +1000,22 @@ def test_infer_objects_no_reference(using_copy_on_write):
     df.iloc[0, 3] = Timestamp("2018-12-31")
     if using_copy_on_write:
         assert np.shares_memory(arr_a, get_array(df, "a"))
-        # TODO(CoW): Block splitting causes references here
-        assert not np.shares_memory(arr_b, get_array(df, "b"))
+        if using_infer_string:
+            # note that the underlying memory of arr_b has been copied anyway
+            # because of the assignment, but the EA is updated inplace so still
+            # appears to share memory
+            assert tm.shares_memory(arr_b, get_array(df, "b"))
+        else:
+            # TODO(CoW): Block splitting causes references here
+            assert not np.shares_memory(arr_b, get_array(df, "b"))
         assert np.shares_memory(arr_d, get_array(df, "d"))
 
 
-def test_infer_objects_reference(using_copy_on_write):
+def test_infer_objects_reference(using_copy_on_write, using_infer_string):
     df = DataFrame(
         {
             "a": [1, 2],
-            "b": "c",
+            "b": Series(["x", "y"], dtype=object),
             "c": 1,
             "d": Series(
                 [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
@@ -1029,7 +1034,8 @@ def test_infer_objects_reference(using_copy_on_write):
     df.iloc[0, 3] = Timestamp("2018-12-31")
     if using_copy_on_write:
         assert not np.shares_memory(arr_a, get_array(df, "a"))
-        assert not np.shares_memory(arr_b, get_array(df, "b"))
+        if not using_infer_string or HAS_PYARROW:
+            assert not np.shares_memory(arr_b, get_array(df, "b"))
         assert np.shares_memory(arr_d, get_array(df, "d"))
 
 
@@ -1184,7 +1190,6 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ
         assert np.shares_memory(get_array(obj, "a"), get_array(view, "a"))
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 @pytest.mark.parametrize("decimals", [-1, 0, 1])
 def test_round(using_copy_on_write, warn_copy_on_write, decimals):
     df = DataFrame({"a": [1, 2], "b": "c"})
@@ -1192,7 +1197,7 @@ def test_round(using_copy_on_write, warn_copy_on_write, decimals):
     df2 = df.round(decimals=decimals)
 
     if using_copy_on_write:
-        assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+        assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
         # TODO: Make inplace by using out parameter of ndarray.round?
         if decimals >= 0:
             # Ensure lazy copy if no-op
diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
index 9e24ce319e3bf..c6c9eca47f3f4 100644
--- a/pandas/tests/copy_view/test_replace.py
+++ b/pandas/tests/copy_view/test_replace.py
@@ -1,10 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
-from pandas.compat import HAS_PYARROW
-
 from pandas import (
     Categorical,
     DataFrame,
@@ -14,7 +10,6 @@
 from pandas.tests.copy_view.util import get_array
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize(
     "replace_kwargs",
     [
@@ -31,7 +26,7 @@
     ],
 )
 def test_replace(using_copy_on_write, replace_kwargs):
-    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
     df_orig = df.copy()
 
     df_replaced = df.replace(**replace_kwargs)
@@ -39,7 +34,7 @@ def test_replace(using_copy_on_write, replace_kwargs):
     if using_copy_on_write:
         if (df_replaced["b"] == df["b"]).all():
             assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
-        assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
+        assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
 
     # mutating squeezed df triggers a copy-on-write for that column/block
     df_replaced.loc[0, "c"] = -1
@@ -61,26 +56,25 @@ def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write):
     with tm.assert_cow_warning(warn_copy_on_write):
         df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
     if using_copy_on_write:
-        assert not np.shares_memory(arr, get_array(df, "a"))
+        assert not tm.shares_memory(arr, get_array(df, "a"))
         assert df._mgr._has_no_reference(0)
         tm.assert_frame_equal(view, df_orig)
     else:
         assert np.shares_memory(arr, get_array(df, "a"))
 
 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)")
 def test_replace_regex_inplace(using_copy_on_write):
     df = DataFrame({"a": ["aaa", "bbb"]})
     arr = get_array(df, "a")
     df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
     if using_copy_on_write:
         assert df._mgr._has_no_reference(0)
-    assert np.shares_memory(arr, get_array(df, "a"))
+    assert tm.shares_memory(arr, get_array(df, "a"))
 
     df_orig = df.copy()
     df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True)
     tm.assert_frame_equal(df_orig, df)
-    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
 
 
 def test_replace_regex_inplace_no_op(using_copy_on_write):
@@ -362,7 +356,7 @@ def test_replace_object_list_inplace(using_copy_on_write, value):
     arr = get_array(df, "a")
     df.replace(["c"], value, inplace=True)
     if using_copy_on_write or value is None:
-        assert np.shares_memory(arr, get_array(df, "a"))
+        assert tm.shares_memory(arr, get_array(df, "a"))
     else:
         # This could be inplace
         assert not np.shares_memory(arr, get_array(df, "a"))