From b05e31ad2d46be4173eaba8bb3fe72f6710c87d2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 26 Nov 2023 20:51:39 +0100 Subject: [PATCH 1/7] Adjust tests in array directory for new string option --- .../tests/arrays/boolean/test_arithmetic.py | 16 +++++++-- .../tests/arrays/categorical/test_astype.py | 2 +- .../arrays/categorical/test_constructors.py | 3 ++ .../arrays/categorical/test_operators.py | 4 +-- pandas/tests/arrays/categorical/test_repr.py | 30 ++++++++++++----- .../tests/arrays/floating/test_arithmetic.py | 21 +++++++++--- .../tests/arrays/integer/test_arithmetic.py | 33 ++++++++++++++----- pandas/tests/arrays/integer/test_reduction.py | 4 ++- pandas/tests/arrays/string_/test_string.py | 10 ++++++ pandas/tests/arrays/test_array.py | 2 +- 10 files changed, 95 insertions(+), 30 deletions(-) diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index 197e83121567e..0c4fcf149eb20 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -90,9 +90,16 @@ def test_op_int8(left_array, right_array, opname): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators): +def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): # invalid ops + if using_infer_string: + import pyarrow as pa + + err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + err = TypeError + op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) @@ -110,9 +117,10 @@ def test_error_invalid_values(data, all_arithmetic_operators): [ r"unsupported operand type\(s\) for", "Concatenation operation is not implemented for NumPy arrays", + "has no kernel", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes @@ -123,7 +131,9 @@ def test_error_invalid_values(data, all_arithmetic_operators): r"unsupported operand type\(s\) for", "can only concatenate str", "not all arguments converted during string formatting", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(err, match=msg): ops(pd.Series("foo", index=s.index)) diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py index 7fba150c9113f..a2a53af6ab1ad 100644 --- a/pandas/tests/arrays/categorical/test_astype.py +++ b/pandas/tests/arrays/categorical/test_astype.py @@ -89,7 +89,7 @@ def test_astype(self, ordered): expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = r"Cannot cast object dtype to float64" + msg = r"Cannot cast object|string dtype to float64" with pytest.raises(ValueError, match=msg): cat.astype(float) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index e25e31e2f2e9e..50aaa42e09f22 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -447,6 +449,7 @@ def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo") + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings") def test_constructor_np_strs(self): # GH#31499 Hashtable.map_locations needs to work on np.str_ objects cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index a1e50917fed98..9e658ec2a799a 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -92,7 +92,7 @@ def test_comparisons(self, factor): cat > cat_unordered # comparison (in both directions) with Series will raise - s = Series(["b", "b", "b"]) + s = Series(["b", "b", "b"], dtype=object) msg = ( "Cannot compare a Categorical for op __gt__ with type " r"" @@ -108,7 +108,7 @@ def test_comparisons(self, factor): # comparison with numpy.array will raise in both direction, but only on # newer numpy versions - a = np.array(["b", "b", "b"]) + a = np.array(["b", "b", "b"], dtype=object) with pytest.raises(TypeError, match=msg): cat > a with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index dca171bf81047..d6f93fbbd912f 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -1,9 +1,13 @@ import numpy as np +import pytest + +from pandas._config import using_pyarrow_string_dtype from pandas import ( Categorical, CategoricalDtype, CategoricalIndex, + Index, Series, date_range, option_context, @@ -13,11 +17,17 @@ class TestCategoricalReprWithFactor: - def test_print(self, factor): - expected = [ - "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", - "Categories (3, object): ['a' < 'b' < 'c']", - ] + def test_print(self, factor, using_infer_string): + if using_infer_string: + expected = [ + "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", + "Categories (3, string): [a < b < c]", + ] + else: + expected = [ + "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", + "Categories (3, object): ['a' < 'b' < 'c']", + ] expected = "\n".join(expected) actual = repr(factor) assert actual == expected @@ -26,7 +36,7 @@ def test_print(self, factor): class TestCategoricalRepr: def test_big_print(self): codes = np.array([0, 1, 2, 0, 1, 2] * 100) - dtype = CategoricalDtype(categories=["a", "b", "c"]) + dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object)) factor = Categorical.from_codes(codes, dtype=dtype) expected = [ "['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']", @@ -40,13 +50,13 @@ def test_big_print(self): assert actual == expected def test_empty_print(self): - factor = Categorical([], ["a", "b", "c"]) + factor = Categorical([], Index(["a", "b", "c"], dtype=object)) expected = "[], Categories (3, object): ['a', 'b', 'c']" actual = repr(factor) assert actual == expected assert expected == actual - factor = Categorical([], ["a", "b", "c"], ordered=True) + factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True) expected = "[], Categories (3, object): ['a' < 'b' < 'c']" actual = repr(factor) assert expected == actual @@ -66,6 +76,10 @@ def test_print_none_width(self): with option_context("display.width", None): assert exp == repr(a) + @pytest.mark.skipif( + using_pyarrow_string_dtype(), + reason="Change once infer_string is set to True by default", + ) def test_unicode_print(self): c = Categorical(["aaaaa", "bb", "cccc"] * 20) expected = """\ diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py index 056c22d8c1131..ba081bd01062a 100644 --- a/pandas/tests/arrays/floating/test_arithmetic.py +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -122,11 +122,18 @@ def test_arith_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators): +def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + errs = TypeError + # invalid scalars msg = "|".join( [ @@ -140,15 +147,17 @@ def test_error_invalid_values(data, all_arithmetic_operators): "ufunc '.*' not supported for the input types, and the inputs could not", "ufunc '.*' did not contain a loop with signature matching types", "Concatenation operation is not implemented for NumPy arrays", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops("foo") - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Series("foo", index=s.index)) msg = "|".join( @@ -167,9 +176,11 @@ def test_error_invalid_values(data, all_arithmetic_operators): ), r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)", "cannot subtract DatetimeArray from ndarray", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index ce6c245cd0f37..d979dd445a61a 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -172,11 +172,18 @@ def test_numpy_zero_dim_ndarray(other): # ----------------------------------------------------------------------------- -def test_error_invalid_values(data, all_arithmetic_operators): +def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string): op = all_arithmetic_operators s = pd.Series(data) ops = getattr(s, op) + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError) + else: + errs = TypeError + # invalid scalars msg = "|".join( [ @@ -188,20 +195,26 @@ def test_error_invalid_values(data, all_arithmetic_operators): "ufunc '.*' not supported for the input types, and the inputs could not", "ufunc '.*' did not contain a loop with signature matching types", "Addition/subtraction of integers and integer-arrays with Timestamp", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops("foo") - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Timestamp("20180101")) # invalid array-likes str_ser = pd.Series("foo", index=s.index) # with pytest.raises(TypeError, match=msg): - if all_arithmetic_operators in [ - "__mul__", - "__rmul__", - ]: # (data[~data.isna()] >= 0).all(): + if ( + all_arithmetic_operators + in [ + "__mul__", + "__rmul__", + ] + and not using_infer_string + ): # (data[~data.isna()] >= 0).all(): res = ops(str_ser) expected = pd.Series(["foo" * x for x in data], index=s.index) expected = expected.fillna(np.nan) @@ -210,7 +223,7 @@ def test_error_invalid_values(data, all_arithmetic_operators): # more-correct than np.nan here. tm.assert_series_equal(res, expected) else: - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(str_ser) msg = "|".join( @@ -223,9 +236,11 @@ def test_error_invalid_values(data, all_arithmetic_operators): r"can only concatenate str \(not \"int\"\) to str", "not all arguments converted during string", "cannot subtract DatetimeArray from ndarray", + "has no kernel", + "not implemented", ] ) - with pytest.raises(TypeError, match=msg): + with pytest.raises(errs, match=msg): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py index 1c91cd25ba69c..db04862e4ea07 100644 --- a/pandas/tests/arrays/integer/test_reduction.py +++ b/pandas/tests/arrays/integer/test_reduction.py @@ -102,7 +102,9 @@ def test_groupby_reductions(op, expected): ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")], ], ) -def test_mixed_reductions(op, expected): +def test_mixed_reductions(op, expected, using_infer_string): + if op in ["any", "all"] and using_infer_string: + expected = expected.astype("bool") df = DataFrame( { "A": ["a", "b", "b"], diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 524a6632e5544..ab551653f6c03 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat.pyarrow import pa_version_under12p0 from pandas.core.dtypes.common import is_dtype_equal @@ -488,6 +490,10 @@ def test_arrow_array(dtype): assert arr.equals(expected) +@pytest.mark.xfail( + using_pyarrow_string_dtype(), + reason="infer_string takes precedence over string storage", +) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_roundtrip(dtype, string_storage2): # roundtrip possible from arrow 1.0.0 @@ -506,6 +512,10 @@ def test_arrow_roundtrip(dtype, string_storage2): assert result.loc[2, "a"] is na_val(result["a"].dtype) +@pytest.mark.xfail( + using_pyarrow_string_dtype(), + reason="infer_string takes precedence over string storage", +) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") def test_arrow_load_from_zero_chunks(dtype, string_storage2): # GH-41040 diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index eb6e93b490574..2491b4c3de46b 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -440,7 +440,7 @@ def test_array_unboxes(index_or_series): def test_array_to_numpy_na(): # GH#40638 - arr = pd.array([pd.NA, 1], dtype="string") + arr = pd.array([pd.NA, 1], dtype="string[python]") result = arr.to_numpy(na_value=True, dtype=bool) expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) From f570a4ffb3cc4411849d12deffbd5cc3bd9da3a2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 26 Nov 2023 20:53:35 +0100 Subject: [PATCH 2/7] BUG: value_counts not preserving object dtype --- doc/source/whatsnew/v2.1.4.rst | 1 + pandas/core/algorithms.py | 4 +++- pandas/tests/series/methods/test_value_counts.py | 11 +++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 77ce303dc1bfe..41355645fca26 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -26,6 +26,7 @@ Bug fixes - Fixed bug in :meth:`DataFrame.__setitem__` casting :class:`Index` with object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Index.insert` casting object-dtype to PyArrow backed strings when ``infer_string`` option is set (:issue:`55638`) - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) +- Fixed bug in :meth:`Series.value_counts` not preserving object dtype when ``infer_string`` is set (:issue:`56187`) .. --------------------------------------------------------------------------- .. _whatsnew_214.other: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 82de8ae96160f..1d93845f2f2ed 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -871,6 +871,8 @@ def value_counts_internal( Series, ) + input_dtype = None if not isinstance(values, Series) else values.dtype + index_name = getattr(values, "name", None) name = "proportion" if normalize else "count" @@ -929,7 +931,7 @@ def value_counts_internal( # For backwards compatibility, we let Index do its normal type # inference, _except_ for if if infers from object to bool. - idx = Index(keys) + idx = Index(keys, dtype=input_dtype) if idx.dtype == bool and keys.dtype == object: idx = idx.astype(object) elif idx.dtype != keys.dtype: diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index 859010d9c79c6..422f0fac37f6d 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -269,3 +269,14 @@ def test_value_counts_masked(self): [2, 1, 1], index=Index([2, 1, 3], dtype=dtype), dtype=dtype, name="count" ) tm.assert_series_equal(result, expected) + + def test_value_counts_infer_string(self): + # GH#56187 + pytest.importorskip("pyarrow") + + ser = Series(["a", "b"], dtype=object) + + with pd.option_context("future.infer_string", True): + result = ser.value_counts() + expected = Series([1, 1], index=Index(["a", "b"], dtype=object), name="count") + tm.assert_series_equal(result, expected) From 0c6bf14d4e0f7e3744b97cde3440c8ca47f696bb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 26 Nov 2023 21:08:55 +0100 Subject: [PATCH 3/7] Adjust tests in array folder for new string option --- pandas/core/algorithms.py | 5 ++- .../arrays/categorical/test_operators.py | 2 +- pandas/tests/arrays/string_/test_string.py | 32 +++++++++++-------- .../tests/arrays/string_/test_string_arrow.py | 32 ++++++++++++------- 4 files changed, 44 insertions(+), 27 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1d93845f2f2ed..dc2eefc00e2b6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -934,7 +934,10 @@ def value_counts_internal( idx = Index(keys, dtype=input_dtype) if idx.dtype == bool and keys.dtype == object: idx = idx.astype(object) - elif idx.dtype != keys.dtype: + elif ( + idx.dtype != keys.dtype # noqa: PLR1714 + and idx.dtype != "string[pyarrow_numpy]" + ): warnings.warn( # GH#56161 "The behavior of value_counts with object-dtype is deprecated. " diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 9e658ec2a799a..16b941eab4830 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -248,7 +248,7 @@ def test_comparisons(self, data, reverse, base): cat_base = Series( Categorical(base, categories=cat.cat.categories, ordered=True) ) - s = Series(base) + s = Series(base, dtype=object if base == list("bbb") else None) a = np.array(base) # comparisons need to take categories ordering into account diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index ab551653f6c03..cf45b4cf6dcf9 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -5,8 +5,6 @@ import numpy as np import pytest -from pandas._config import using_pyarrow_string_dtype - from pandas.compat.pyarrow import pa_version_under12p0 from pandas.core.dtypes.common import is_dtype_equal @@ -196,7 +194,7 @@ def test_mul(dtype, request, arrow_string_storage): @pytest.mark.xfail(reason="GH-28527") def test_add_strings(dtype): arr = pd.array(["a", "b", "c", "d"], dtype=dtype) - df = pd.DataFrame([["t", "y", "v", "w"]]) + df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object) assert arr.__add__(df) is NotImplemented result = arr + df @@ -490,15 +488,18 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@pytest.mark.xfail( - using_pyarrow_string_dtype(), - reason="infer_string takes precedence over string storage", -) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_roundtrip(dtype, string_storage2): +def test_arrow_roundtrip(dtype, string_storage2, request, using_infer_string): # roundtrip possible from arrow 1.0.0 pa = pytest.importorskip("pyarrow") + if using_infer_string and string_storage2 != "pyarrow_numpy": + request.applymarker( + pytest.mark.xfail( + reason="infer_string takes precedence over string storage" + ) + ) + data = pd.array(["a", "b", None], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) @@ -512,15 +513,20 @@ def test_arrow_roundtrip(dtype, string_storage2): assert result.loc[2, "a"] is na_val(result["a"].dtype) -@pytest.mark.xfail( - using_pyarrow_string_dtype(), - reason="infer_string takes precedence over string storage", -) @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning") -def test_arrow_load_from_zero_chunks(dtype, string_storage2): +def test_arrow_load_from_zero_chunks( + dtype, string_storage2, request, using_infer_string +): # GH-41040 pa = pytest.importorskip("pyarrow") + if using_infer_string and string_storage2 != "pyarrow_numpy": + request.applymarker( + pytest.mark.xfail( + reason="infer_string takes precedence over string storage" + ) + ) + data = pd.array([], dtype=dtype) df = pd.DataFrame({"a": data}) table = pa.table(df) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index a801a845bc7be..a022dfffbdd2b 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -26,7 +26,9 @@ def test_eq_all_na(): tm.assert_extension_array_equal(result, expected) -def test_config(string_storage): +def test_config(string_storage, request, using_infer_string): + if using_infer_string and string_storage != "pyarrow_numpy": + request.applymarker(pytest.mark.xfail(reason="infer string takes precedence")) with pd.option_context("string_storage", string_storage): assert StringDtype().storage == string_storage result = pd.array(["a", "b"]) @@ -101,7 +103,7 @@ def test_constructor_from_list(): assert result.dtype.storage == "pyarrow" -def test_from_sequence_wrong_dtype_raises(): +def test_from_sequence_wrong_dtype_raises(using_infer_string): pytest.importorskip("pyarrow") with pd.option_context("string_storage", "python"): ArrowStringArray._from_sequence(["a", None, "c"], dtype="string") @@ -114,15 +116,19 @@ def test_from_sequence_wrong_dtype_raises(): ArrowStringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") - with pytest.raises(AssertionError, match=None): - with pd.option_context("string_storage", "python"): - ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "python"): + ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) with pd.option_context("string_storage", "pyarrow"): ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) - with pytest.raises(AssertionError, match=None): - ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + ArrowStringArray._from_sequence( + ["a", None, "c"], dtype=StringDtype("python") + ) ArrowStringArray._from_sequence(["a", None, "c"], dtype=StringDtype("pyarrow")) @@ -137,13 +143,15 @@ def test_from_sequence_wrong_dtype_raises(): with pytest.raises(AssertionError, match=None): StringArray._from_sequence(["a", None, "c"], dtype="string[pyarrow]") - with pd.option_context("string_storage", "python"): - StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) - - with pytest.raises(AssertionError, match=None): - with pd.option_context("string_storage", "pyarrow"): + if not using_infer_string: + with pd.option_context("string_storage", "python"): StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + if not using_infer_string: + with pytest.raises(AssertionError, match=None): + with pd.option_context("string_storage", "pyarrow"): + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype()) + StringArray._from_sequence(["a", None, "c"], dtype=StringDtype("python")) with pytest.raises(AssertionError, match=None): From 8efb8f20b2601a5d1832949b67db895bd4f8a035 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 26 Nov 2023 21:33:41 +0100 Subject: [PATCH 4/7] Fixup --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1d93845f2f2ed..8e1b42f531b06 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -931,7 +931,7 @@ def value_counts_internal( # For backwards compatibility, we let Index do its normal type # inference, _except_ for if if infers from object to bool. - idx = Index(keys, dtype=input_dtype) + idx = Index(keys, dtype=input_dtype if input_dtype != "float16" else None) if idx.dtype == bool and keys.dtype == object: idx = idx.astype(object) elif idx.dtype != keys.dtype: From 260433629eebd9e51bf59c180a6ef0b4715ade7a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 26 Nov 2023 22:13:30 +0100 Subject: [PATCH 5/7] Fix --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 277c0f189df7a..2f325b413c3ca 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -935,7 +935,7 @@ def value_counts_internal( if idx.dtype == bool and keys.dtype == object: idx = idx.astype(object) elif ( - idx.dtype != keys.dtype # noqa: PLR1714 + idx.dtype != keys.dtype # noqa: PLR1714 # noqa: R1714 and idx.dtype != "string[pyarrow_numpy]" ): warnings.warn( From 86ac45a4b7c477ab605e10c80fbb78fc882e11fe Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sun, 26 Nov 2023 23:24:50 +0100 Subject: [PATCH 6/7] Fix --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2f325b413c3ca..1c8f876dff408 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -935,7 +935,7 @@ def value_counts_internal( if idx.dtype == bool and keys.dtype == object: idx = idx.astype(object) elif ( - idx.dtype != keys.dtype # noqa: PLR1714 # noqa: R1714 + idx.dtype != keys.dtype # noqa: PLR1714 # # pylint: disable=R1714 and idx.dtype != "string[pyarrow_numpy]" ): warnings.warn( From c53c2b52590b9d39be510c8556a94b92ca838290 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 9 Dec 2023 00:31:57 +0100 Subject: [PATCH 7/7] Revert "BUG: value_counts not preserving object dtype" This reverts commit f570a4ff --- doc/source/whatsnew/v2.1.4.rst | 1 - pandas/core/algorithms.py | 4 +--- pandas/tests/series/methods/test_value_counts.py | 11 ----------- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v2.1.4.rst b/doc/source/whatsnew/v2.1.4.rst index 07197369abc59..9cc79b7090499 100644 --- a/doc/source/whatsnew/v2.1.4.rst +++ b/doc/source/whatsnew/v2.1.4.rst @@ -34,7 +34,6 @@ Bug fixes - Fixed bug in :meth:`Series.reset_index` not preserving object dtype when ``infer_string`` is set (:issue:`56160`) - Fixed bug in :meth:`Series.str.split` and :meth:`Series.str.rsplit` when ``pat=None`` for :class:`ArrowDtype` with ``pyarrow.string`` (:issue:`56271`) - Fixed bug in :meth:`Series.str.translate` losing object dtype when string option is set (:issue:`56152`) -- Fixed bug in :meth:`Series.value_counts` not preserving object dtype when ``infer_string`` is set (:issue:`56187`) .. --------------------------------------------------------------------------- .. _whatsnew_214.contributors: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1c8f876dff408..cfa41a4e1969b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -871,8 +871,6 @@ def value_counts_internal( Series, ) - input_dtype = None if not isinstance(values, Series) else values.dtype - index_name = getattr(values, "name", None) name = "proportion" if normalize else "count" @@ -931,7 +929,7 @@ def value_counts_internal( # For backwards compatibility, we let Index do its normal type # inference, _except_ for if if infers from object to bool. - idx = Index(keys, dtype=input_dtype if input_dtype != "float16" else None) + idx = Index(keys) if idx.dtype == bool and keys.dtype == object: idx = idx.astype(object) elif ( diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index 422f0fac37f6d..859010d9c79c6 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -269,14 +269,3 @@ def test_value_counts_masked(self): [2, 1, 1], index=Index([2, 1, 3], dtype=dtype), dtype=dtype, name="count" ) tm.assert_series_equal(result, expected) - - def test_value_counts_infer_string(self): - # GH#56187 - pytest.importorskip("pyarrow") - - ser = Series(["a", "b"], dtype=object) - - with pd.option_context("future.infer_string", True): - result = ser.value_counts() - expected = Series([1, 1], index=Index(["a", "b"], dtype=object), name="count") - tm.assert_series_equal(result, expected)