Merge branch 'main' into merge_asof

pandas-dev · Dec 11, 2023 · 56101e3 · 56101e3
2 parents d8ccef3 + b4c9df8
commit 56101e3
Show file tree

Hide file tree

Showing 16 changed files with 210 additions and 52 deletions.
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -218,6 +218,7 @@ Other enhancements
 
 - :meth:`~DataFrame.to_sql` with method parameter set to ``multi`` works with Oracle on the backend
 - :attr:`Series.attrs` / :attr:`DataFrame.attrs` now uses a deepcopy for propagating ``attrs`` (:issue:`54134`).
+- :func:`get_dummies` now returning  extension dtypes ``boolean`` or ``bool[pyarrow]`` that are compatible with the input dtype (:issue:`56273`)
 - :func:`read_csv` now supports ``on_bad_lines`` parameter with ``engine="pyarrow"``. (:issue:`54480`)
 - :func:`read_sas` returns ``datetime64`` dtypes with resolutions better matching those stored natively in SAS, and avoids returning object-dtype in cases that cannot be stored with ``datetime64[ns]`` dtype (:issue:`56127`)
 - :func:`read_spss` now returns a :class:`DataFrame` that stores the metadata in :attr:`DataFrame.attrs`. (:issue:`54264`)

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -932,7 +932,10 @@ def value_counts_internal(
             idx = Index(keys)
             if idx.dtype == bool and keys.dtype == object:
                 idx = idx.astype(object)
-            elif idx.dtype != keys.dtype:
+            elif (
+                idx.dtype != keys.dtype  # noqa: PLR1714  # # pylint: disable=R1714
+                and idx.dtype != "string[pyarrow_numpy]"
+            ):
                 warnings.warn(
                     # GH#56161
                     "The behavior of value_counts with object-dtype is deprecated. "

diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py
@@ -21,9 +21,14 @@
     is_object_dtype,
     pandas_dtype,
 )
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    CategoricalDtype,
+)
 
 from pandas.core.arrays import SparseArray
 from pandas.core.arrays.categorical import factorize_from_iterable
+from pandas.core.arrays.string_ import StringDtype
 from pandas.core.frame import DataFrame
 from pandas.core.indexes.api import (
     Index,
@@ -244,8 +249,25 @@ def _get_dummies_1d(
     # Series avoids inconsistent NaN handling
     codes, levels = factorize_from_iterable(Series(data, copy=False))
 
-    if dtype is None:
+    if dtype is None and hasattr(data, "dtype"):
+        input_dtype = data.dtype
+        if isinstance(input_dtype, CategoricalDtype):
+            input_dtype = input_dtype.categories.dtype
+
+        if isinstance(input_dtype, ArrowDtype):
+            import pyarrow as pa
+
+            dtype = ArrowDtype(pa.bool_())  # type: ignore[assignment]
+        elif (
+            isinstance(input_dtype, StringDtype)
+            and input_dtype.storage != "pyarrow_numpy"
+        ):
+            dtype = pandas_dtype("boolean")  # type: ignore[assignment]
+        else:
+            dtype = np.dtype(bool)
+    elif dtype is None:
         dtype = np.dtype(bool)
+
     _dtype = pandas_dtype(dtype)
 
     if is_object_dtype(_dtype):

diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py
@@ -90,9 +90,16 @@ def test_op_int8(left_array, right_array, opname):
 # -----------------------------------------------------------------------------
 
 
-def test_error_invalid_values(data, all_arithmetic_operators):
+def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
     # invalid ops
 
+    if using_infer_string:
+        import pyarrow as pa
+
+        err = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
+    else:
+        err = TypeError
+
     op = all_arithmetic_operators
     s = pd.Series(data)
     ops = getattr(s, op)
@@ -110,9 +117,10 @@ def test_error_invalid_values(data, all_arithmetic_operators):
         [
             r"unsupported operand type\(s\) for",
             "Concatenation operation is not implemented for NumPy arrays",
+            "has no kernel",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(err, match=msg):
         ops(pd.Timestamp("20180101"))
 
     # invalid array-likes
@@ -123,7 +131,9 @@ def test_error_invalid_values(data, all_arithmetic_operators):
                 r"unsupported operand type\(s\) for",
                 "can only concatenate str",
                 "not all arguments converted during string formatting",
+                "has no kernel",
+                "not implemented",
             ]
         )
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(err, match=msg):
             ops(pd.Series("foo", index=s.index))
diff --git a/pandas/tests/arrays/categorical/test_astype.py b/pandas/tests/arrays/categorical/test_astype.py
@@ -89,7 +89,7 @@ def test_astype(self, ordered):
         expected = np.array(cat)
         tm.assert_numpy_array_equal(result, expected)
 
-        msg = r"Cannot cast object dtype to float64"
+        msg = r"Cannot cast object|string dtype to float64"
         with pytest.raises(ValueError, match=msg):
             cat.astype(float)
 

diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.core.dtypes.common import (
     is_float_dtype,
     is_integer_dtype,
@@ -447,6 +449,7 @@ def test_constructor_str_unknown(self):
         with pytest.raises(ValueError, match="Unknown dtype"):
             Categorical([1, 2], dtype="foo")
 
+    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="Can't be NumPy strings")
     def test_constructor_np_strs(self):
         # GH#31499 Hashtable.map_locations needs to work on np.str_ objects
         cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])

diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py
@@ -92,7 +92,7 @@ def test_comparisons(self, factor):
             cat > cat_unordered
 
         # comparison (in both directions) with Series will raise
-        s = Series(["b", "b", "b"])
+        s = Series(["b", "b", "b"], dtype=object)
         msg = (
             "Cannot compare a Categorical for op __gt__ with type "
             r"<class 'numpy\.ndarray'>"
@@ -108,7 +108,7 @@ def test_comparisons(self, factor):
 
         # comparison with numpy.array will raise in both direction, but only on
         # newer numpy versions
-        a = np.array(["b", "b", "b"])
+        a = np.array(["b", "b", "b"], dtype=object)
         with pytest.raises(TypeError, match=msg):
             cat > a
         with pytest.raises(TypeError, match=msg):
@@ -248,7 +248,7 @@ def test_comparisons(self, data, reverse, base):
         cat_base = Series(
             Categorical(base, categories=cat.cat.categories, ordered=True)
         )
-        s = Series(base)
+        s = Series(base, dtype=object if base == list("bbb") else None)
         a = np.array(base)
 
         # comparisons need to take categories ordering into account

diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py
@@ -1,9 +1,13 @@
 import numpy as np
+import pytest
+
+from pandas._config import using_pyarrow_string_dtype
 
 from pandas import (
     Categorical,
     CategoricalDtype,
     CategoricalIndex,
+    Index,
     Series,
     date_range,
     option_context,
@@ -13,11 +17,17 @@
 
 
 class TestCategoricalReprWithFactor:
-    def test_print(self, factor):
-        expected = [
-            "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
-            "Categories (3, object): ['a' < 'b' < 'c']",
-        ]
+    def test_print(self, factor, using_infer_string):
+        if using_infer_string:
+            expected = [
+                "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
+                "Categories (3, string): [a < b < c]",
+            ]
+        else:
+            expected = [
+                "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']",
+                "Categories (3, object): ['a' < 'b' < 'c']",
+            ]
         expected = "\n".join(expected)
         actual = repr(factor)
         assert actual == expected
@@ -26,7 +36,7 @@ def test_print(self, factor):
 class TestCategoricalRepr:
     def test_big_print(self):
         codes = np.array([0, 1, 2, 0, 1, 2] * 100)
-        dtype = CategoricalDtype(categories=["a", "b", "c"])
+        dtype = CategoricalDtype(categories=Index(["a", "b", "c"], dtype=object))
         factor = Categorical.from_codes(codes, dtype=dtype)
         expected = [
             "['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']",
@@ -40,13 +50,13 @@ def test_big_print(self):
         assert actual == expected
 
     def test_empty_print(self):
-        factor = Categorical([], ["a", "b", "c"])
+        factor = Categorical([], Index(["a", "b", "c"], dtype=object))
         expected = "[], Categories (3, object): ['a', 'b', 'c']"
         actual = repr(factor)
         assert actual == expected
 
         assert expected == actual
-        factor = Categorical([], ["a", "b", "c"], ordered=True)
+        factor = Categorical([], Index(["a", "b", "c"], dtype=object), ordered=True)
         expected = "[], Categories (3, object): ['a' < 'b' < 'c']"
         actual = repr(factor)
         assert expected == actual
@@ -66,6 +76,10 @@ def test_print_none_width(self):
         with option_context("display.width", None):
             assert exp == repr(a)
 
+    @pytest.mark.skipif(
+        using_pyarrow_string_dtype(),
+        reason="Change once infer_string is set to True by default",
+    )
     def test_unicode_print(self):
         c = Categorical(["aaaaa", "bb", "cccc"] * 20)
         expected = """\

diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py
@@ -122,11 +122,18 @@ def test_arith_zero_dim_ndarray(other):
 # -----------------------------------------------------------------------------
 
 
-def test_error_invalid_values(data, all_arithmetic_operators):
+def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
     op = all_arithmetic_operators
     s = pd.Series(data)
     ops = getattr(s, op)
 
+    if using_infer_string:
+        import pyarrow as pa
+
+        errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
+    else:
+        errs = TypeError
+
     # invalid scalars
     msg = "|".join(
         [
@@ -140,15 +147,17 @@ def test_error_invalid_values(data, all_arithmetic_operators):
             "ufunc '.*' not supported for the input types, and the inputs could not",
             "ufunc '.*' did not contain a loop with signature matching types",
             "Concatenation operation is not implemented for NumPy arrays",
+            "has no kernel",
+            "not implemented",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops("foo")
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Timestamp("20180101"))
 
     # invalid array-likes
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Series("foo", index=s.index))
 
     msg = "|".join(
@@ -167,9 +176,11 @@ def test_error_invalid_values(data, all_arithmetic_operators):
             ),
             r"ufunc 'add' cannot use operands with types dtype\('float\d{2}'\)",
             "cannot subtract DatetimeArray from ndarray",
+            "has no kernel",
+            "not implemented",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Series(pd.date_range("20180101", periods=len(s))))
 
 

diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -172,11 +172,18 @@ def test_numpy_zero_dim_ndarray(other):
 # -----------------------------------------------------------------------------
 
 
-def test_error_invalid_values(data, all_arithmetic_operators):
+def test_error_invalid_values(data, all_arithmetic_operators, using_infer_string):
     op = all_arithmetic_operators
     s = pd.Series(data)
     ops = getattr(s, op)
 
+    if using_infer_string:
+        import pyarrow as pa
+
+        errs = (TypeError, pa.lib.ArrowNotImplementedError, NotImplementedError)
+    else:
+        errs = TypeError
+
     # invalid scalars
     msg = "|".join(
         [
@@ -188,20 +195,26 @@ def test_error_invalid_values(data, all_arithmetic_operators):
             "ufunc '.*' not supported for the input types, and the inputs could not",
             "ufunc '.*' did not contain a loop with signature matching types",
             "Addition/subtraction of integers and integer-arrays with Timestamp",
+            "has no kernel",
+            "not implemented",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops("foo")
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Timestamp("20180101"))
 
     # invalid array-likes
     str_ser = pd.Series("foo", index=s.index)
     # with pytest.raises(TypeError, match=msg):
-    if all_arithmetic_operators in [
-        "__mul__",
-        "__rmul__",
-    ]:  # (data[~data.isna()] >= 0).all():
+    if (
+        all_arithmetic_operators
+        in [
+            "__mul__",
+            "__rmul__",
+        ]
+        and not using_infer_string
+    ):  # (data[~data.isna()] >= 0).all():
         res = ops(str_ser)
         expected = pd.Series(["foo" * x for x in data], index=s.index)
         expected = expected.fillna(np.nan)
@@ -210,7 +223,7 @@ def test_error_invalid_values(data, all_arithmetic_operators):
         #  more-correct than np.nan here.
         tm.assert_series_equal(res, expected)
     else:
-        with pytest.raises(TypeError, match=msg):
+        with pytest.raises(errs, match=msg):
             ops(str_ser)
 
     msg = "|".join(
@@ -223,9 +236,11 @@ def test_error_invalid_values(data, all_arithmetic_operators):
             r"can only concatenate str \(not \"int\"\) to str",
             "not all arguments converted during string",
             "cannot subtract DatetimeArray from ndarray",
+            "has no kernel",
+            "not implemented",
         ]
     )
-    with pytest.raises(TypeError, match=msg):
+    with pytest.raises(errs, match=msg):
         ops(pd.Series(pd.date_range("20180101", periods=len(s))))
 
 

diff --git a/pandas/tests/arrays/integer/test_reduction.py b/pandas/tests/arrays/integer/test_reduction.py
@@ -102,7 +102,9 @@ def test_groupby_reductions(op, expected):
         ["all", Series([True, True, True], index=["A", "B", "C"], dtype="boolean")],
     ],
 )
-def test_mixed_reductions(op, expected):
+def test_mixed_reductions(op, expected, using_infer_string):
+    if op in ["any", "all"] and using_infer_string:
+        expected = expected.astype("bool")
     df = DataFrame(
         {
             "A": ["a", "b", "b"],