diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index 4e954891c2d98..f3ac60f672ee1 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -138,7 +138,9 @@ class TestConstruction: "object-string", ], ) - def test_constructor_datetime_outofbound(self, a, constructor): + def test_constructor_datetime_outofbound( + self, a, constructor, request, using_infer_string + ): # GH-26853 (+ bug GH-26206 out of bound non-ns unit) # No dtype specified (dtype inference) @@ -150,7 +152,10 @@ def test_constructor_datetime_outofbound(self, a, constructor): assert result.dtype == "M8[s]" else: result = constructor(a) - assert result.dtype == "object" + if using_infer_string and "object-string" in request.node.callspec.id: + assert result.dtype == "string" + else: + assert result.dtype == "object" tm.assert_numpy_array_equal(result.to_numpy(), a) # Explicit dtype specified diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 2fc6e786e3198..4f3e4d3365179 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -20,6 +20,7 @@ SparseArray, TimedeltaArray, ) +from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics class TestToIterable: @@ -215,7 +216,9 @@ def test_iter_box_period(self): ), ], ) -def test_values_consistent(arr, expected_type, dtype): +def test_values_consistent(arr, expected_type, dtype, using_infer_string): + if using_infer_string and dtype == "object": + expected_type = ArrowStringArrayNumpySemantics l_values = Series(arr)._values r_values = pd.Index(arr)._values assert type(l_values) is expected_type @@ -358,17 +361,23 @@ def test_to_numpy(arr, expected, index_or_series_or_array, request): @pytest.mark.parametrize( "arr", [np.array([1, 2, 3], dtype="int64"), np.array(["a", "b", "c"], dtype=object)] ) -def test_to_numpy_copy(arr, as_series): +def test_to_numpy_copy(arr, as_series, using_infer_string): obj = pd.Index(arr, copy=False) if as_series: obj = Series(obj.values, copy=False) # no copy by default result = obj.to_numpy() - assert np.shares_memory(arr, result) is True + if using_infer_string and arr.dtype == object: + assert np.shares_memory(arr, result) is False + else: + assert np.shares_memory(arr, result) is True result = obj.to_numpy(copy=False) - assert np.shares_memory(arr, result) is True + if using_infer_string and arr.dtype == object: + assert np.shares_memory(arr, result) is False + else: + assert np.shares_memory(arr, result) is True # copy=True result = obj.to_numpy(copy=True) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index c6fd4955d2d63..15daca86b14ee 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import PYPY from pandas.core.dtypes.common import ( @@ -80,7 +82,10 @@ def test_ndarray_compat_properties(index_or_series_obj): assert Series([1]).item() == 1 -@pytest.mark.skipif(PYPY, reason="not relevant for PyPy") +@pytest.mark.skipif( + PYPY or using_pyarrow_string_dtype(), + reason="not relevant for PyPy doesn't work properly for arrow strings", +) def test_memory_usage(index_or_series_memory_obj): obj = index_or_series_memory_obj # Clear index caches so that len(obj) == 0 report 0 memory usage @@ -175,7 +180,9 @@ def test_access_by_position(index_flat): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - if is_dtype_equal(index.dtype, "string[pyarrow]"): + if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal( + index.dtype, "string[pyarrow_numpy]" + ): msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 4c845d8f24d01..d3fe144f70cfc 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd import pandas._testing as tm from pandas.tests.base.common import allow_na_ops @@ -98,6 +100,7 @@ def test_nunique_null(null_obj, index_or_series_obj): @pytest.mark.single_cpu +@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="decoding fails") def test_unique_bad_unicode(index_or_series): # regression test for #34550 uval = "\ud83d" # smiley emoji diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index c42d064c476bb..bdec72a0c7ccb 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -14,6 +14,7 @@ Series, Timedelta, TimedeltaIndex, + array, ) import pandas._testing as tm from pandas.tests.base.common import allow_na_ops @@ -113,7 +114,7 @@ def test_value_counts_null(null_obj, index_or_series_obj): tm.assert_series_equal(result, expected) -def test_value_counts_inferred(index_or_series): +def test_value_counts_inferred(index_or_series, using_infer_string): klass = index_or_series s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) @@ -125,7 +126,9 @@ def test_value_counts_inferred(index_or_series): tm.assert_index_equal(s.unique(), exp) else: exp = np.unique(np.array(s_values, dtype=np.object_)) - tm.assert_numpy_array_equal(s.unique(), exp) + if using_infer_string: + exp = array(exp) + tm.assert_equal(s.unique(), exp) assert s.nunique() == 4 # don't sort, have to sort after the fact as not sorting is @@ -147,7 +150,7 @@ def test_value_counts_inferred(index_or_series): tm.assert_series_equal(hist, expected) -def test_value_counts_bins(index_or_series): +def test_value_counts_bins(index_or_series, using_infer_string): klass = index_or_series s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) @@ -201,7 +204,9 @@ def test_value_counts_bins(index_or_series): tm.assert_index_equal(s.unique(), exp) else: exp = np.array(["a", "b", np.nan, "d"], dtype=object) - tm.assert_numpy_array_equal(s.unique(), exp) + if using_infer_string: + exp = array(exp) + tm.assert_equal(s.unique(), exp) assert s.nunique() == 3 s = klass({}) if klass is dict else klass({}, dtype=object) @@ -246,7 +251,7 @@ def test_value_counts_datetime64(index_or_series, unit): expected_s = Series([3, 2, 1], index=idx, name="count") tm.assert_series_equal(s.value_counts(), expected_s) - expected = pd.array( + expected = array( np.array( ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"], dtype=f"datetime64[{unit}]",