From 084b1999cffde35bf9e49e5e5b8a5a0482bf927d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 8 Nov 2024 22:47:07 +0100 Subject: [PATCH] TST (string dtype): resolve xfails in pandas/tests/copy_view (#60245) --- pandas/_testing/__init__.py | 28 ++++++----------- pandas/tests/copy_view/test_astype.py | 22 +++++++------- pandas/tests/copy_view/test_functions.py | 1 - pandas/tests/copy_view/test_methods.py | 38 +++++++++++++----------- pandas/tests/copy_view/test_replace.py | 14 +++------ 5 files changed, 46 insertions(+), 57 deletions(-) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 0a110d69c7a70..e092d65f08dd4 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -7,7 +7,6 @@ from typing import ( TYPE_CHECKING, ContextManager, - cast, ) import numpy as np @@ -21,8 +20,6 @@ from pandas.compat import pa_version_under10p1 -from pandas.core.dtypes.common import is_string_dtype - import pandas as pd from pandas import ( ArrowDtype, @@ -77,8 +74,8 @@ with_csv_dialect, ) from pandas.core.arrays import ( + ArrowExtensionArray, BaseMaskedArray, - ExtensionArray, NumpyExtensionArray, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray @@ -92,7 +89,6 @@ NpDtype, ) - from pandas.core.arrays import ArrowExtensionArray UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"] UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"] @@ -512,24 +508,18 @@ def shares_memory(left, right) -> bool: if isinstance(left, pd.core.arrays.IntervalArray): return shares_memory(left._left, right) or shares_memory(left._right, right) - if ( - isinstance(left, ExtensionArray) - and is_string_dtype(left.dtype) - and left.dtype.storage == "pyarrow" # type: ignore[attr-defined] - ): - # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 - left = cast("ArrowExtensionArray", left) - if ( - isinstance(right, ExtensionArray) - and 
is_string_dtype(right.dtype) - and right.dtype.storage == "pyarrow" # type: ignore[attr-defined] - ): - right = cast("ArrowExtensionArray", right) + if isinstance(left, ArrowExtensionArray): + if isinstance(right, ArrowExtensionArray): + # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669 left_pa_data = left._pa_array right_pa_data = right._pa_array left_buf1 = left_pa_data.chunk(0).buffers()[1] right_buf1 = right_pa_data.chunk(0).buffers()[1] - return left_buf1 == right_buf1 + return left_buf1.address == right_buf1.address + else: + # if we have one ArrowExtensionArray and one other array, assume + # they can only share memory if they share the same numpy buffer + return np.shares_memory(left, right) if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray): # By convention, we'll say these share memory if they share *either* diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 80c30f2d0c26e..91f5badeb9728 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 @@ -206,7 +204,6 @@ def test_astype_arrow_timestamp(): assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_convert_dtypes_infer_objects(): ser = Series(["a", "b", "c"]) ser_orig = ser.copy() @@ -217,20 +214,25 @@ def test_convert_dtypes_infer_objects(): convert_string=False, ) - assert np.shares_memory(get_array(ser), get_array(result)) + assert tm.shares_memory(get_array(ser), get_array(result)) result.iloc[0] = "x" tm.assert_series_equal(ser, ser_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") -def test_convert_dtypes(): +def 
test_convert_dtypes(using_infer_string): df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]}) df_orig = df.copy() df2 = df.convert_dtypes() - assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(df2, "d"), get_array(df, "d")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) + if using_infer_string and HAS_PYARROW: + # TODO the default nullable string dtype still uses python storage + # this should be changed to pyarrow if installed + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + else: + assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d")) + assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c")) df2.iloc[0, 0] = "x" + df2.iloc[0, 1] = 10 tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py index fcdece6077829..32fea794975b6 100644 --- a/pandas/tests/copy_view/test_functions.py +++ b/pandas/tests/copy_view/test_functions.py @@ -153,7 +153,6 @@ def test_concat_copy_keyword(): assert np.shares_memory(get_array(df2, "b"), get_array(result, "b")) -# @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize( "func", [ diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 92e1ba750fae2..250697c91ff13 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import HAS_PYARROW import pandas as pd @@ -716,14 +714,18 @@ def test_head_tail(method): tm.assert_frame_equal(df, df_orig) 
-@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") -def test_infer_objects(): - df = DataFrame({"a": [1, 2], "b": "c", "c": 1, "d": "x"}) +def test_infer_objects(using_infer_string): + df = DataFrame( + {"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"} + ) df_orig = df.copy() df2 = df.infer_objects() assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + if using_infer_string and HAS_PYARROW: + assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) + else: + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) df2.iloc[0, 0] = 0 df2.iloc[0, 1] = "d" @@ -732,19 +734,16 @@ def test_infer_objects(): tm.assert_frame_equal(df, df_orig) -@pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" -) -def test_infer_objects_no_reference(): +def test_infer_objects_no_reference(using_infer_string): df = DataFrame( { "a": [1, 2], - "b": "c", + "b": Series(["x", "y"], dtype=object), "c": 1, "d": Series( [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" ), - "e": "b", + "e": Series(["z", "w"], dtype=object), } ) df = df.infer_objects() @@ -757,8 +756,14 @@ def test_infer_objects_no_reference(): df.iloc[0, 1] = "d" df.iloc[0, 3] = Timestamp("2018-12-31") assert np.shares_memory(arr_a, get_array(df, "a")) - # TODO(CoW): Block splitting causes references here - assert not np.shares_memory(arr_b, get_array(df, "b")) + if using_infer_string and HAS_PYARROW: + # note that the underlying memory of arr_b has been copied anyway + # because of the assignment, but the EA is updated inplace so still + # appears to share memory + assert tm.shares_memory(arr_b, get_array(df, "b")) + else: + # TODO(CoW): Block splitting causes references here + assert not np.shares_memory(arr_b, get_array(df, "b")) assert np.shares_memory(arr_d, get_array(df, "d")) @@ -766,7 +771,7 @@ def 
test_infer_objects_reference(): df = DataFrame( { "a": [1, 2], - "b": "c", + "b": Series(["x", "y"], dtype=object), "c": 1, "d": Series( [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object" @@ -904,14 +909,13 @@ def test_sort_values_inplace(obj, kwargs): tm.assert_equal(view, obj_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @pytest.mark.parametrize("decimals", [-1, 0, 1]) def test_round(decimals): df = DataFrame({"a": [1, 2], "b": "c"}) df_orig = df.copy() df2 = df.round(decimals=decimals) - assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b")) # TODO: Make inplace by using out parameter of ndarray.round? if decimals >= 0: # Ensure lazy copy if no-op diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index e57514bffdf1e..d4838a5e68ab8 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -1,10 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - -from pandas.compat import HAS_PYARROW - from pandas import ( Categorical, DataFrame, @@ -13,7 +9,6 @@ from pandas.tests.copy_view.util import get_array -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "replace_kwargs", [ @@ -30,14 +25,14 @@ ], ) def test_replace(replace_kwargs): - df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df_replaced = df.replace(**replace_kwargs) if (df_replaced["b"] == df["b"]).all(): assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b")) - assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) + assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c")) # mutating squeezed df triggers a copy-on-write for that column/block 
df_replaced.loc[0, "c"] = -1 @@ -61,18 +56,17 @@ def test_replace_regex_inplace_refs(): tm.assert_frame_equal(view, df_orig) -@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") def test_replace_regex_inplace(): df = DataFrame({"a": ["aaa", "bbb"]}) arr = get_array(df, "a") df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True) assert df._mgr._has_no_reference(0) - assert np.shares_memory(arr, get_array(df, "a")) + assert tm.shares_memory(arr, get_array(df, "a")) df_orig = df.copy() df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True) tm.assert_frame_equal(df_orig, df) - assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a")) def test_replace_regex_inplace_no_op():