From 9d8bce74f5387826dfaf28dc1b87e849efc3e8f7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 26 Aug 2023 12:40:38 +0200 Subject: [PATCH] Use NaN as na_value for new pyarrow_numpy StringDtype (#54585) --- pandas/core/arrays/string_.py | 10 +++-- pandas/tests/arrays/string_/test_string.py | 41 +++++++++++++------- pandas/tests/strings/__init__.py | 9 +++-- pandas/tests/strings/test_split_partition.py | 9 +++-- 4 files changed, 45 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index f0e1d194cd88f..2394b9af2015e 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -101,10 +101,14 @@ class StringDtype(StorageExtensionDtype): # base class "StorageExtensionDtype") with class variable name: ClassVar[str] = "string" # type: ignore[misc] - #: StringDtype().na_value uses pandas.NA + #: StringDtype().na_value uses pandas.NA except the implementation that + # follows NumPy semantics, which uses nan. 
@property - def na_value(self) -> libmissing.NAType: - return libmissing.NA + def na_value(self) -> libmissing.NAType | float: # type: ignore[override] + if self.storage == "pyarrow_numpy": + return np.nan + else: + return libmissing.NA _metadata = ("storage",) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b8f872529bc1a..24d8e43708b91 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -17,6 +17,13 @@ ) +def na_val(dtype): + if dtype.storage == "pyarrow_numpy": + return np.nan + else: + return pd.NA + + @pytest.fixture def dtype(string_storage): """Fixture giving StringDtype from parametrized 'string_storage'""" @@ -31,26 +38,34 @@ def cls(dtype): def test_repr(dtype): df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype=dtype)}) - expected = " A\n0 a\n1 <NA>\n2 b" + if dtype.storage == "pyarrow_numpy": + expected = " A\n0 a\n1 NaN\n2 b" + else: + expected = " A\n0 a\n1 <NA>\n2 b" assert repr(df) == expected - expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string" + if dtype.storage == "pyarrow_numpy": + expected = "0 a\n1 NaN\n2 b\nName: A, dtype: string" + else: + expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string" assert repr(df.A) == expected if dtype.storage == "pyarrow": arr_name = "ArrowStringArray" + expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" elif dtype.storage == "pyarrow_numpy": arr_name = "ArrowStringArrayNumpySemantics" + expected = f"<{arr_name}>\n['a', nan, 'b']\nLength: 3, dtype: string" else: arr_name = "StringArray" - expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" + expected = f"<{arr_name}>\n['a', <NA>, 'b']\nLength: 3, dtype: string" assert repr(df.A.array) == expected def test_none_to_nan(cls): a = cls._from_sequence(["a", None, "b"]) assert a[1] is not None - assert a[1] is pd.NA + assert a[1] is na_val(a.dtype) def test_setitem_validates(cls): @@ -213,13 +228,9 @@ def 
test_comparison_methods_scalar(comparison_op, dtype): other = "a" result = getattr(a, op_name)(other) if dtype.storage == "pyarrow_numpy": - expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) - expected = ( - pd.array(expected, dtype="boolean") - .to_numpy(na_value=False) - .astype(np.bool_) - ) - tm.assert_numpy_array_equal(result, expected) + expected = np.array([getattr(item, op_name)(other) for item in a]) + expected[1] = False + tm.assert_numpy_array_equal(result, expected.astype(np.bool_)) else: expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean" expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object) @@ -415,7 +426,7 @@ def test_min_max(method, skipna, dtype, request): expected = "a" if method == "min" else "c" assert result == expected else: - assert result is pd.NA + assert result is na_val(arr.dtype) @pytest.mark.parametrize("method", ["min", "max"]) @@ -483,7 +494,7 @@ def test_arrow_roundtrip(dtype, string_storage2): expected = df.astype(f"string[{string_storage2}]") tm.assert_frame_equal(result, expected) # ensure the missing value is represented by NA and not np.nan or None - assert result.loc[2, "a"] is pd.NA + assert result.loc[2, "a"] is na_val(result["a"].dtype) def test_arrow_load_from_zero_chunks(dtype, string_storage2): @@ -581,7 +592,7 @@ def test_astype_from_float_dtype(float_dtype, dtype): def test_to_numpy_returns_pdna_default(dtype): arr = pd.array(["a", pd.NA, "b"], dtype=dtype) result = np.array(arr) - expected = np.array(["a", pd.NA, "b"], dtype=object) + expected = np.array(["a", na_val(dtype), "b"], dtype=object) tm.assert_numpy_array_equal(result, expected) @@ -621,7 +632,7 @@ def test_setitem_scalar_with_mask_validation(dtype): mask = np.array([False, True, False]) ser[mask] = None - assert ser.array[1] is pd.NA + assert ser.array[1] is na_val(ser.dtype) # for other non-string we should also raise an error ser = pd.Series(["a", "b", "c"], 
dtype=dtype) diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py index bf119f2721ed4..01b49b5e5b633 100644 --- a/pandas/tests/strings/__init__.py +++ b/pandas/tests/strings/__init__.py @@ -1,4 +1,4 @@ -# Needed for new arrow string dtype +import numpy as np import pandas as pd @@ -7,6 +7,9 @@ def _convert_na_value(ser, expected): if ser.dtype != object: - # GH#18463 - expected = expected.fillna(pd.NA) + if ser.dtype.storage == "pyarrow_numpy": + expected = expected.fillna(np.nan) + else: + # GH#18463 + expected = expected.fillna(pd.NA) return expected diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py index 7fabe238d2b86..0a7d409773dd6 100644 --- a/pandas/tests/strings/test_split_partition.py +++ b/pandas/tests/strings/test_split_partition.py @@ -12,7 +12,10 @@ Series, _testing as tm, ) -from pandas.tests.strings import _convert_na_value +from pandas.tests.strings import ( + _convert_na_value, + object_pyarrow_numpy, +) @pytest.mark.parametrize("method", ["split", "rsplit"]) @@ -113,8 +116,8 @@ def test_split_object_mixed(expand, method): def test_split_n(any_string_dtype, method, n): s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype) expected = Series([["a", "b"], pd.NA, ["b", "c"]]) - result = getattr(s.str, method)(" ", n=n) + expected = _convert_na_value(s, expected) tm.assert_series_equal(result, expected) @@ -381,7 +384,7 @@ def test_split_nan_expand(any_string_dtype): # check that these are actually np.nan/pd.NA and not None # TODO see GH 18463 # tm.assert_frame_equal does not differentiate - if any_string_dtype == "object": + if any_string_dtype in object_pyarrow_numpy: assert all(np.isnan(x) for x in result.iloc[1]) else: assert all(x is pd.NA for x in result.iloc[1])