From 75a1007e6c40ec765fb3764935e84bb34acf0163 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 18 Dec 2024 21:21:32 +0100 Subject: [PATCH 1/2] [backport 2.3.x] TST (string dtype): un-xfail string tests specific to object dtype (#59433) (#60180) Co-authored-by: jbrockmendel --- pandas/tests/copy_view/test_interp_fillna.py | 13 ++++------ pandas/tests/copy_view/test_replace.py | 3 +-- pandas/tests/test_algos.py | 26 ++++++++++++++------ 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py index 338b76cbf1e7a..d0c4fa53faab9 100644 --- a/pandas/tests/copy_view/test_interp_fillna.py +++ b/pandas/tests/copy_view/test_interp_fillna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas import ( NA, ArrowDtype, @@ -137,10 +135,9 @@ def test_interp_fill_functions_inplace( assert np.shares_memory(arr, get_array(df, "a")) is (dtype == "float64") -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_interpolate_cleaned_fill_method(using_copy_on_write): - # Check that "method is set to None" case works correctly +def test_interpolate_cannot_with_object_dtype(using_copy_on_write): df = DataFrame({"a": ["a", np.nan, "c"], "b": 1}) + df["a"] = df["a"].astype(object) df_orig = df.copy() msg = "DataFrame.interpolate with object dtype" @@ -159,16 +156,16 @@ def test_interpolate_cleaned_fill_method(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") -def test_interpolate_object_convert_no_op(using_copy_on_write): +def test_interpolate_object_convert_no_op(using_copy_on_write, using_infer_string): df = DataFrame({"a": ["a", "b", "c"], "b": 1}) + df["a"] = df["a"].astype(object) arr_a = get_array(df, "a") msg = "DataFrame.interpolate with method=pad is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): df.interpolate(method="pad", inplace=True) # Now CoW makes a copy, it should not! - if using_copy_on_write: + if using_copy_on_write and not using_infer_string: assert df._mgr._has_no_reference(0) assert np.shares_memory(arr_a, get_array(df, "a")) diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index bc3edb1f72214..9e24ce319e3bf 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -356,10 +356,9 @@ def test_replace_empty_list(using_copy_on_write): assert not df2._mgr._has_no_reference(0) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize("value", ["d", None]) def test_replace_object_list_inplace(using_copy_on_write, value): - df = DataFrame({"a": ["a", "b", "c"]}) + df = DataFrame({"a": ["a", "b", "c"]}, dtype=object) arr = get_array(df, "a") df.replace(["c"], value, inplace=True) if using_copy_on_write or value is None: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index d1e69cfa2b4ee..80ee0f6e067f9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1704,12 +1704,17 @@ def test_unique_complex_numbers(self, array, expected): class TestHashTable: - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "htable, data", [ - (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), - (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + ( + ht.PyObjectHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), + ( + ht.StringHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), @@ -1717,7 +1722,7 @@ class TestHashTable: ) def test_hashtable_unique(self, htable, data, writable): # output of maker has guaranteed unique elements - s = Series(data) + s = Series(data, dtype=data.dtype) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan @@ -1744,12 +1749,17 @@ def test_hashtable_unique(self, htable, data, writable): reconstr = result_unique[result_inverse] tm.assert_numpy_array_equal(reconstr, s_duplicated.values) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "htable, data", [ - (ht.PyObjectHashTable, [f"foo_{i}" for i in range(1000)]), - (ht.StringHashTable, [f"foo_{i}" for i in range(1000)]), + ( + ht.PyObjectHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), + ( + ht.StringHashTable, + np.array([f"foo_{i}" for i in range(1000)], dtype=object), + ), (ht.Float64HashTable, np.arange(1000, dtype=np.float64)), (ht.Int64HashTable, np.arange(1000, dtype=np.int64)), (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)), @@ -1757,7 +1767,7 @@ def test_hashtable_unique(self, htable, data, writable): ) def test_hashtable_factorize(self, htable, writable, data): # output of maker has guaranteed unique elements - s = Series(data) + s = Series(data, dtype=data.dtype) if htable == ht.Float64HashTable: # add NaN for float column s.loc[500] = np.nan From c07933716ef30860e66373b10fd0177c22cb5970 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 19 Dec 2024 09:42:30 +0100 Subject: [PATCH 2/2] [backport 2.3.x] TST (string dtype): resolve xfails in pandas/tests/copy_view (#60245) (#60257) --- pandas/_testing/__init__.py | 28 +++++---------- pandas/tests/copy_view/test_astype.py | 22 ++++++------ pandas/tests/copy_view/test_functions.py | 1 - pandas/tests/copy_view/test_methods.py | 43 +++++++++++++----------- pandas/tests/copy_view/test_replace.py | 18 ++++------ 5 files changed, 51 insertions(+), 61 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 2d066b581f1c6..d7197f23ce1e4 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -8,7 +8,6 @@ TYPE_CHECKING, Callable, ContextManager, - cast, ) import warnings @@ -23,8 +22,6 @@ from pandas.compat import pa_version_under10p1 -from pandas.core.dtypes.common import is_string_dtype - import pandas as pd from pandas import ( ArrowDtype, @@ -83,8 +80,8 @@ with_csv_dialect, ) from pandas.core.arrays import ( + ArrowExtensionArray, BaseMaskedArray, - ExtensionArray, NumpyExtensionArray, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray @@ -96,7 +93,6 @@ NpDtype, ) - from pandas.core.arrays import ArrowExtensionArray UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"] UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"] @@ -530,24 +526,18 @@ def shares_memory(left, right) -> bool: if isinstance(left, pd.core.arrays.IntervalArray): return shares_memory(left._left, right) or shares_memory(left._right, right) - if ( - isinstance(left, ExtensionArray) - and is_string_dtype(left.dtype) - and left.dtype.storage == "pyarrow" # type: ignore[attr-defined] - ): - # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 - left = cast("ArrowExtensionArray", left) - if ( - isinstance(right, ExtensionArray) - and is_string_dtype(right.dtype) - and right.dtype.storage == "pyarrow" # type: ignore[attr-defined] - ): - right = cast("ArrowExtensionArray", right) + if isinstance(left, ArrowExtensionArray): + if isinstance(right, ArrowExtensionArray): + # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 left_pa_data = left._pa_array right_pa_data = right._pa_array left_buf1 = left_pa_data.chunk(0).buffers()[1] right_buf1 = right_pa_data.chunk(0).buffers()[1] - return left_buf1 == right_buf1 + return left_buf1.address == right_buf1.address + else: + # if we have one one ArrowExtensionArray and one other array, assume + # they can only share memory if they share the same numpy buffer + return np.shares_memory(left, right) if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray): # By convention, we'll say these share memory if they share *either* diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index e0e3f6dc058a4..45fc3333c49a7 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 import pandas.util._test_decorators as td @@ -244,7 +242,6 @@ def test_astype_arrow_timestamp(using_copy_on_write): ) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_convert_dtypes_infer_objects(using_copy_on_write): ser = Series(["a", "b", "c"]) ser_orig = ser.copy() @@ -256,7 +253,7 @@ def test_convert_dtypes_infer_objects(using_copy_on_write): ) if using_copy_on_write: - assert np.shares_memory(get_array(ser), get_array(result)) + assert tm.shares_memory(get_array(ser), get_array(result)) else: assert not np.shares_memory(get_array(ser), get_array(result)) @@ -264,17 +261,21 @@ def test_convert_dtypes_infer_objects(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") -def test_convert_dtypes(using_copy_on_write): +def test_convert_dtypes(using_copy_on_write, using_infer_string): df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) df_orig = df.copy() df2 = df.convert_dtypes() if using_copy_on_write: - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + if using_infer_string and HAS_PYARROW: + # TODO the default nullable string dtype still uses python storage + # this should be changed to pyarrow if installed + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d")) + assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c")) else: assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) @@ -282,4 +283,5 @@ def test_convert_dtypes(using_copy_on_write): assert not np.shares_memory(get_array(df2, "d"), get_array(df, "d")) df2.iloc[0, 0] = "x" + df2.iloc[0, 1] = 10 tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index 23ed7f9edcd22..eefd27964e6ae 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -201,7 +201,6 @@ def test_concat_copy_keyword(using_copy_on_write, copy): assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 295d93580f451..09738fe1023fb 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import HAS_PYARROW from pandas.errors import SettingWithCopyWarning @@ -953,15 +951,19 @@ def test_head_tail(method, using_copy_on_write, warn_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") -def test_infer_objects(using_copy_on_write): - df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) +def test_infer_objects(using_copy_on_write, using_infer_string): + df = DataFrame( + {"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"} + ) df_orig = df.copy() df2 = df.infer_objects() if using_copy_on_write: assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + if using_infer_string: + assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) + else: + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) else: assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) @@ -975,19 +977,16 @@ def test_infer_objects(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) -def test_infer_objects_no_reference(using_copy_on_write): +def test_infer_objects_no_reference(using_copy_on_write, using_infer_string): df = DataFrame( { "a": [1, 2], - "b": "c", + "b": Series(["x", "y"], dtype=object), "c": 1, "d": Series( [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" ), - "e": "b", + "e": Series(["z", "w"], dtype=object), } ) df = df.infer_objects() @@ -1001,16 +1000,22 @@ def test_infer_objects_no_reference(using_copy_on_write): df.iloc[0, 3] = Timestamp("2018-12-31") if using_copy_on_write: assert np.shares_memory(arr_a, get_array(df, "a")) - # TODO(CoW): Block splitting causes references here - assert not np.shares_memory(arr_b, get_array(df, "b")) + if using_infer_string: + # note that the underlying memory of arr_b has been copied anyway + # because of the assignment, but the EA is updated inplace so still + # appears the share memory + assert tm.shares_memory(arr_b, get_array(df, "b")) + else: + # TODO(CoW): Block splitting causes references here + assert not np.shares_memory(arr_b, get_array(df, "b")) assert np.shares_memory(arr_d, get_array(df, "d")) -def test_infer_objects_reference(using_copy_on_write): +def test_infer_objects_reference(using_copy_on_write, using_infer_string): df = DataFrame( { "a": [1, 2], - "b": "c", + "b": Series(["x", "y"], dtype=object), "c": 1, "d": Series( [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" @@ -1029,7 +1034,8 @@ def test_infer_objects_reference(using_copy_on_write): df.iloc[0, 3] = Timestamp("2018-12-31") if using_copy_on_write: assert not np.shares_memory(arr_a, get_array(df, "a")) - assert not np.shares_memory(arr_b, get_array(df, "b")) + if not using_infer_string or HAS_PYARROW: + assert not np.shares_memory(arr_b, get_array(df, "b")) assert np.shares_memory(arr_d, get_array(df, "d")) @@ -1184,7 +1190,6 @@ def test_sort_values_inplace(using_copy_on_write, obj, kwargs, warn_copy_on_writ assert np.shares_memory(get_array(obj, "a"), get_array(view, "a")) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize("decimals", [-1, 0, 1]) def test_round(using_copy_on_write, warn_copy_on_write, decimals): df = DataFrame({"a": [1, 2], "b": "c"}) @@ -1192,7 +1197,7 @@ def test_round(using_copy_on_write, warn_copy_on_write, decimals): df2 = df.round(decimals=decimals) if using_copy_on_write: - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) # TODO: Make inplace by using out parameter of ndarray.round? if decimals >= 0: # Ensure lazy copy if no-op diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 9e24ce319e3bf..c6c9eca47f3f4 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas import ( Categorical, DataFrame, @@ -14,7 +10,6 @@ from pandas.tests.copy_view.util import get_array -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "replace_kwargs", [ @@ -31,7 +26,7 @@ ], ) def test_replace(using_copy_on_write, replace_kwargs): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df_replaced = df.replace(**replace_kwargs) @@ -39,7 +34,7 @@ def test_replace(using_copy_on_write, replace_kwargs): if using_copy_on_write: if (df_replaced["b"] == df["b"]).all(): assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) # mutating squeezed df triggers a copy-on-write for that column/block df_replaced.loc[0, "c"] = -1 @@ -61,26 +56,25 @@ def test_replace_regex_inplace_refs(using_copy_on_write, warn_copy_on_write): with tm.assert_cow_warning(warn_copy_on_write): df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) if using_copy_on_write: - assert not np.shares_memory(arr, get_array(df, "a")) + assert not tm.shares_memory(arr, get_array(df, "a")) assert df._mgr._has_no_reference(0) tm.assert_frame_equal(view, df_orig) else: assert np.shares_memory(arr, get_array(df, "a")) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_replace_regex_inplace(using_copy_on_write): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) if using_copy_on_write: assert df._mgr._has_no_reference(0) - assert np.shares_memory(arr, get_array(df, "a")) + assert tm.shares_memory(arr, get_array(df, "a")) df_orig = df.copy() df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True) tm.assert_frame_equal(df_orig, df) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) def test_replace_regex_inplace_no_op(using_copy_on_write): @@ -362,7 +356,7 @@ def test_replace_object_list_inplace(using_copy_on_write, value): arr = get_array(df, "a") df.replace(["c"], value, inplace=True) if using_copy_on_write or value is None: - assert np.shares_memory(arr, get_array(df, "a")) + assert tm.shares_memory(arr, get_array(df, "a")) else: # This could be inplace assert not np.shares_memory(arr, get_array(df, "a"))