String dtype (2.3.x): avoid downcasting object to string in fillna/wh…

…ere/interpolate (pandas-dev#60183)
meeseeksmachine · Dec 17, 2024 · 3362822 · 3362822
1 parent 6d9a2b4
commit 3362822
Show file tree

Hide file tree

Showing 7 changed files with 57 additions and 58 deletions.
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
@@ -86,6 +86,7 @@ def maybe_convert_objects(
     safe: bool = ...,
     convert_numeric: bool = ...,
     convert_non_numeric: Literal[False] = ...,
+    convert_string: Literal[False] = ...,
     convert_to_nullable_dtype: Literal[False] = ...,
     dtype_if_all_nat: DtypeObj | None = ...,
 ) -> npt.NDArray[np.object_ | np.number]: ...
@@ -97,6 +98,7 @@ def maybe_convert_objects(
     safe: bool = ...,
     convert_numeric: bool = ...,
     convert_non_numeric: bool = ...,
+    convert_string: bool = ...,
     convert_to_nullable_dtype: Literal[True] = ...,
     dtype_if_all_nat: DtypeObj | None = ...,
 ) -> ArrayLike: ...
@@ -108,6 +110,7 @@ def maybe_convert_objects(
     safe: bool = ...,
     convert_numeric: bool = ...,
     convert_non_numeric: bool = ...,
+    convert_string: bool = ...,
     convert_to_nullable_dtype: bool = ...,
     dtype_if_all_nat: DtypeObj | None = ...,
 ) -> ArrayLike: ...

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects,
                           bint convert_numeric=True,  # NB: different default!
                           bint convert_to_nullable_dtype=False,
                           bint convert_non_numeric=False,
+                          bint convert_string=True,
                           object dtype_if_all_nat=None) -> "ArrayLike":
     """
     Type inference function-- convert object array to proper dtype
@@ -2747,7 +2748,11 @@ def maybe_convert_objects(ndarray[object] objects,
             dtype = StringDtype()
             return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
 
-        elif using_string_dtype() and is_string_array(objects, skipna=True):
+        elif (
+            convert_string
+            and using_string_dtype()
+            and is_string_array(objects, skipna=True)
+        ):
             from pandas.core.arrays.string_ import StringDtype
 
             dtype = StringDtype(na_value=np.nan)

diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -563,7 +563,12 @@ def _maybe_downcast(
                 return blocks
 
             nbs = extend_blocks(
-                [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks]
+                [
+                    blk.convert(
+                        using_cow=using_cow, copy=not using_cow, convert_string=False
+                    )
+                    for blk in blocks
+                ]
             )
             if caller == "fillna":
                 if len(nbs) != len(blocks) or not all(
@@ -636,6 +641,7 @@ def convert(
         *,
         copy: bool = True,
         using_cow: bool = False,
+        convert_string: bool = True,
     ) -> list[Block]:
         """
         Attempt to coerce any object types to better types. Return a copy
@@ -648,7 +654,10 @@ def convert(
 
         if self.ndim != 1 and self.shape[0] != 1:
             blocks = self.split_and_operate(
-                Block.convert, copy=copy, using_cow=using_cow
+                Block.convert,
+                copy=copy,
+                using_cow=using_cow,
+                convert_string=convert_string,
             )
             if all(blk.dtype.kind == "O" for blk in blocks):
                 # Avoid fragmenting the block if convert is a no-op
@@ -666,6 +675,7 @@ def convert(
         res_values = lib.maybe_convert_objects(
             values,  # type: ignore[arg-type]
             convert_non_numeric=True,
+            convert_string=convert_string,
         )
         refs = None
         if (
@@ -851,6 +861,7 @@ def replace(
         mask: npt.NDArray[np.bool_] | None = None,
         using_cow: bool = False,
         already_warned=None,
+        convert_string=None,
     ) -> list[Block]:
         """
         replace the to_replace value with value, possible to create new
@@ -915,7 +926,11 @@ def replace(
                 if get_option("future.no_silent_downcasting") is True:
                     blocks = [blk]
                 else:
-                    blocks = blk.convert(copy=False, using_cow=using_cow)
+                    blocks = blk.convert(
+                        copy=False,
+                        using_cow=using_cow,
+                        convert_string=convert_string or self.dtype != _dtype_obj,
+                    )
                     if len(blocks) > 1 or blocks[0].dtype != blk.dtype:
                         warnings.warn(
                             # GH#54710
@@ -944,6 +959,7 @@ def replace(
                 inplace=True,
                 mask=mask,
                 using_cow=using_cow,
+                convert_string=convert_string,
             )
 
         else:
@@ -958,6 +974,7 @@ def replace(
                         inplace=True,
                         mask=mask[i : i + 1],
                         using_cow=using_cow,
+                        convert_string=convert_string,
                     )
                 )
             return blocks
@@ -970,6 +987,7 @@ def _replace_regex(
         inplace: bool = False,
         mask=None,
         using_cow: bool = False,
+        convert_string: bool = True,
         already_warned=None,
     ) -> list[Block]:
         """
@@ -1029,7 +1047,9 @@ def _replace_regex(
                 )
                 already_warned.warned_already = True
 
-        nbs = block.convert(copy=False, using_cow=using_cow)
+        nbs = block.convert(
+            copy=False, using_cow=using_cow, convert_string=convert_string
+        )
         opt = get_option("future.no_silent_downcasting")
         if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt:
             warnings.warn(
@@ -1068,6 +1088,8 @@ def replace_list(
             values._replace(to_replace=src_list, value=dest_list, inplace=True)
             return [blk]
 
+        convert_string = self.dtype != _dtype_obj
+
         # Exclude anything that we know we won't contain
         pairs = [
             (x, y)
@@ -1152,6 +1174,7 @@ def replace_list(
                     inplace=inplace,
                     regex=regex,
                     using_cow=using_cow,
+                    convert_string=convert_string,
                 )
 
                 if using_cow and i != src_len:
@@ -1174,7 +1197,9 @@ def replace_list(
                     nbs = []
                     for res_blk in result:
                         converted = res_blk.convert(
-                            copy=True and not using_cow, using_cow=using_cow
+                            copy=True and not using_cow,
+                            using_cow=using_cow,
+                            convert_string=convert_string,
                         )
                         if len(converted) > 1 or converted[0].dtype != res_blk.dtype:
                             warnings.warn(
@@ -1204,6 +1229,7 @@ def _replace_coerce(
         inplace: bool = True,
         regex: bool = False,
         using_cow: bool = False,
+        convert_string: bool = True,
     ) -> list[Block]:
         """
         Replace value corresponding to the given boolean array with another
@@ -1233,6 +1259,7 @@ def _replace_coerce(
                 inplace=inplace,
                 mask=mask,
                 using_cow=using_cow,
+                convert_string=convert_string,
             )
         else:
             if value is None:
@@ -1256,6 +1283,7 @@ def _replace_coerce(
                 inplace=inplace,
                 mask=mask,
                 using_cow=using_cow,
+                convert_string=convert_string,
             )
 
     # ---------------------------------------------------------------------

diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py
@@ -132,21 +132,14 @@ def test_fillna_different_dtype(self, using_infer_string):
             [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
         )
 
-        if using_infer_string:
-            with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
-                result = df.fillna({2: "foo"})
-        else:
-            result = df.fillna({2: "foo"})
+        result = df.fillna({2: "foo"})
         expected = DataFrame(
             [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
         )
+        expected[2] = expected[2].astype("object")
         tm.assert_frame_equal(result, expected)
 
-        if using_infer_string:
-            with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
-                return_value = df.fillna({2: "foo"}, inplace=True)
-        else:
-            return_value = df.fillna({2: "foo"}, inplace=True)
+        return_value = df.fillna({2: "foo"}, inplace=True)
         tm.assert_frame_equal(df, expected)
         assert return_value is None
 
@@ -385,12 +378,8 @@ def test_fillna_dtype_conversion(self, using_infer_string):
 
         # empty block
         df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
-        if using_infer_string:
-            with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
-                result = df.fillna("nan")
-        else:
-            result = df.fillna("nan")
-        expected = DataFrame("nan", index=range(3), columns=["A", "B"])
+        result = df.fillna("nan")
+        expected = DataFrame("nan", index=range(3), columns=["A", "B"], dtype=object)
         tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0])

diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py
@@ -281,20 +281,12 @@ def test_regex_replace_dict_nested(self, mix_abc):
         tm.assert_frame_equal(res3, expec)
         tm.assert_frame_equal(res4, expec)
 
-    def test_regex_replace_dict_nested_non_first_character(
-        self, any_string_dtype, using_infer_string
-    ):
+    def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype):
         # GH 25259
         dtype = any_string_dtype
         df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype)
-        if using_infer_string and any_string_dtype == "object":
-            with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
-                result = df.replace({"a": "."}, regex=True)
-            expected = DataFrame({"first": [".bc", "bc.", "c.b"]})
-
-        else:
-            result = df.replace({"a": "."}, regex=True)
-            expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
+        result = df.replace({"a": "."}, regex=True)
+        expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
         tm.assert_frame_equal(result, expected)
 
     def test_regex_replace_dict_nested_gh4115(self):
@@ -429,31 +421,12 @@ def test_replace_regex_metachar(self, metachar):
         ],
     )
     def test_regex_replace_string_types(
-        self,
-        data,
-        to_replace,
-        expected,
-        frame_or_series,
-        any_string_dtype,
-        using_infer_string,
-        request,
+        self, data, to_replace, expected, frame_or_series, any_string_dtype
     ):
         # GH-41333, GH-35977
         dtype = any_string_dtype
         obj = frame_or_series(data, dtype=dtype)
-        if using_infer_string and any_string_dtype == "object":
-            if len(to_replace) > 1 and isinstance(obj, DataFrame):
-                request.node.add_marker(
-                    pytest.mark.xfail(
-                        reason="object input array that gets downcasted raises on "
-                        "second pass"
-                    )
-                )
-            with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
-                result = obj.replace(to_replace, regex=True)
-                dtype = "str"
-        else:
-            result = obj.replace(to_replace, regex=True)
+        result = obj.replace(to_replace, regex=True)
         expected = frame_or_series(expected, dtype=dtype)
 
         tm.assert_equal(result, expected)

diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py
@@ -831,7 +831,7 @@ def replacer(self, how, from_key, to_key):
             raise ValueError
         return replacer
 
-    def test_replace_series(self, how, to_key, from_key, replacer):
+    def test_replace_series(self, how, to_key, from_key, replacer, using_infer_string):
         index = pd.Index([3, 4], name="xxx")
         obj = pd.Series(self.rep[from_key], index=index, name="yyy")
         obj = obj.astype(from_key)
@@ -856,6 +856,10 @@ def test_replace_series(self, how, to_key, from_key, replacer):
         else:
             exp = pd.Series(self.rep[to_key], index=index, name="yyy")
 
+        if using_infer_string and exp.dtype == "string" and obj.dtype == object:
+            # with infer_string, we disable the deprecated downcasting behavior
+            exp = exp.astype(object)
+
         msg = "Downcasting behavior in `replace`"
         warn = FutureWarning
         if (

diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 import pandas._testing as tm
 from pandas.core.arrays import IntervalArray
@@ -768,7 +766,6 @@ def test_replace_value_none_dtype_numeric(self, val):
         expected = pd.Series([1, None], dtype=object)
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_replace_change_dtype_series(self):
         # GH#25797
         df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object)