From d95620bc5454e48fd1d88919f767330e1181f3c9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 4 Nov 2024 14:03:11 +0100 Subject: [PATCH 1/4] String dtype (2.3.x): avoid downcasting object to string in fillna/where/interpolate --- pandas/_libs/lib.pyx | 7 ++++++- pandas/core/internals/blocks.py | 19 +++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index c23f907aecfab..c03cc073da174 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_numeric=True, # NB: different default! bint convert_to_nullable_dtype=False, bint convert_non_numeric=False, + bint convert_string=True, object dtype_if_all_nat=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2741,7 +2742,11 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_string_dtype() and is_string_array(objects, skipna=True): + if ( + convert_string + and using_string_dtype() + and is_string_array(objects, skipna=True) + ): from pandas.core.arrays.string_ import StringDtype dtype = StringDtype(na_value=np.nan) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 917a65348b7a3..875af3b3df6ab 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -552,7 +552,12 @@ def _maybe_downcast( return blocks nbs = extend_blocks( - [blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks] + [ + blk.convert( + using_cow=using_cow, copy=not using_cow, convert_string=False + ) + for blk in blocks + ] ) if caller == "fillna": if len(nbs) != len(blocks) or not all( @@ -625,6 +630,7 @@ def convert( *, copy: bool = True, using_cow: bool = False, + convert_string: bool = True, ) -> list[Block]: """ Attempt to coerce any object types to better types. Return a copy @@ -655,6 +661,7 @@ def convert( res_values = lib.maybe_convert_objects( values, # type: ignore[arg-type] convert_non_numeric=True, + convert_string=convert_string, ) refs = None if ( @@ -904,7 +911,9 @@ def replace( if get_option("future.no_silent_downcasting") is True: blocks = [blk] else: - blocks = blk.convert(copy=False, using_cow=using_cow) + blocks = blk.convert( + copy=False, using_cow=using_cow, convert_string=False + ) if len(blocks) > 1 or blocks[0].dtype != blk.dtype: warnings.warn( # GH#54710 @@ -1007,7 +1016,7 @@ def _replace_regex( ) already_warned.warned_already = True - nbs = block.convert(copy=False, using_cow=using_cow) + nbs = block.convert(copy=False, using_cow=using_cow, convert_string=False) opt = get_option("future.no_silent_downcasting") if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: warnings.warn( @@ -1150,7 +1159,9 @@ def replace_list( nbs = [] for res_blk in result: converted = res_blk.convert( - copy=True and not using_cow, using_cow=using_cow + copy=True and not using_cow, + using_cow=using_cow, + convert_string=False, ) if len(converted) > 1 or converted[0].dtype != res_blk.dtype: warnings.warn( From dd0a6a26364c72e247b2c805d06b24c9baa427cf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2024 09:54:41 +0100 Subject: [PATCH 2/4] still raise a warning when it would cast from numeric to string --- pandas/core/internals/blocks.py | 23 ++++++++++--- pandas/tests/frame/methods/test_fillna.py | 21 +++--------- pandas/tests/frame/methods/test_replace.py | 37 +++------------------ pandas/tests/indexing/test_coercion.py | 6 +++- pandas/tests/series/methods/test_replace.py | 8 ++--- 5 files changed, 38 insertions(+), 57 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 875af3b3df6ab..98d4e5b5bc3a8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -643,7 +643,10 @@ def convert( if self.ndim != 1 and self.shape[0] != 1: blocks = self.split_and_operate( - Block.convert, copy=copy, using_cow=using_cow + Block.convert, + copy=copy, + using_cow=using_cow, + convert_string=convert_string, ) if all(blk.dtype.kind == "O" for blk in blocks): # Avoid fragmenting the block if convert is a no-op @@ -847,6 +850,7 @@ def replace( mask: npt.NDArray[np.bool_] | None = None, using_cow: bool = False, already_warned=None, + convert_string=None, ) -> list[Block]: """ replace the to_replace value with value, possible to create new @@ -912,7 +916,9 @@ def replace( blocks = [blk] else: blocks = blk.convert( - copy=False, using_cow=using_cow, convert_string=False + copy=False, + using_cow=using_cow, + convert_string=convert_string or self.dtype != _dtype_obj, ) if len(blocks) > 1 or blocks[0].dtype != blk.dtype: warnings.warn( @@ -941,6 +947,7 @@ def replace( value=value, inplace=True, mask=mask, + convert_string=convert_string, ) else: @@ -955,6 +962,7 @@ def replace( inplace=True, mask=mask[i : i + 1], using_cow=using_cow, + convert_string=convert_string, ) ) return blocks @@ -1016,7 +1024,9 @@ def _replace_regex( ) already_warned.warned_already = True - nbs = block.convert(copy=False, using_cow=using_cow, convert_string=False) + nbs = block.convert( + copy=False, using_cow=using_cow, convert_string=self.dtype != _dtype_obj + ) opt = get_option("future.no_silent_downcasting") if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: warnings.warn( @@ -1047,6 +1057,8 @@ def replace_list( """ values = self.values + convert_string = self.dtype != _dtype_obj + if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 @@ -1137,6 +1149,7 @@ def replace_list( inplace=inplace, regex=regex, using_cow=using_cow, + convert_string=convert_string, ) if using_cow and i != src_len: @@ -1161,7 +1174,7 @@ def replace_list( converted = res_blk.convert( copy=True and not using_cow, using_cow=using_cow, - convert_string=False, + convert_string=convert_string, ) if len(converted) > 1 or converted[0].dtype != res_blk.dtype: warnings.warn( @@ -1191,6 +1204,7 @@ def _replace_coerce( inplace: bool = True, regex: bool = False, using_cow: bool = False, + convert_string: bool = True, ) -> list[Block]: """ Replace value corresponding to the given boolean array with another @@ -1243,6 +1257,7 @@ def _replace_coerce( inplace=inplace, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) # --------------------------------------------------------------------- diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index e2baa2567f5b4..9844122dc4b2d 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -132,21 +132,14 @@ def test_fillna_different_dtype(self, using_infer_string): [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna({2: "foo"}) - else: - result = df.fillna({2: "foo"}) + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) + expected[2] = expected[2].astype("object") tm.assert_frame_equal(result, expected) - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - return_value = df.fillna({2: "foo"}, inplace=True) - else: - return_value = df.fillna({2: "foo"}, inplace=True) + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -385,12 +378,8 @@ def test_fillna_dtype_conversion(self, using_infer_string): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - if using_infer_string: - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.fillna("nan") - else: - result = df.fillna("nan") - expected = DataFrame("nan", index=range(3), columns=["A", "B"]) + result = df.fillna("nan") + expected = DataFrame("nan", index=range(3), columns=["A", "B"], dtype=object) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index ccee7ca24bd3d..1c07b0db54fc4 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -281,20 +281,12 @@ def test_regex_replace_dict_nested(self, mix_abc): tm.assert_frame_equal(res3, expec) tm.assert_frame_equal(res4, expec) - def test_regex_replace_dict_nested_non_first_character( - self, any_string_dtype, using_infer_string - ): + def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): # GH 25259 dtype = any_string_dtype df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) - if using_infer_string and any_string_dtype == "object": - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = df.replace({"a": "."}, regex=True) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) - - else: - result = df.replace({"a": "."}, regex=True) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) @pytest.mark.xfail(using_string_dtype(), reason="can't set float into string") @@ -430,31 +422,12 @@ def test_replace_regex_metachar(self, metachar): ], ) def test_regex_replace_string_types( - self, - data, - to_replace, - expected, - frame_or_series, - any_string_dtype, - using_infer_string, - request, + self, data, to_replace, expected, frame_or_series, any_string_dtype ): # GH-41333, GH-35977 dtype = any_string_dtype obj = frame_or_series(data, dtype=dtype) - if using_infer_string and any_string_dtype == "object": - if len(to_replace) > 1 and isinstance(obj, DataFrame): - request.node.add_marker( - pytest.mark.xfail( - reason="object input array that gets downcasted raises on " - "second pass" - ) - ) - with tm.assert_produces_warning(FutureWarning, match="Downcasting"): - result = obj.replace(to_replace, regex=True) - dtype = "str" - else: - result = obj.replace(to_replace, regex=True) + result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index ac3bfe3a13a44..4e1697eabf734 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -831,7 +831,7 @@ def replacer(self, how, from_key, to_key): raise ValueError return replacer - def test_replace_series(self, how, to_key, from_key, replacer): + def test_replace_series(self, how, to_key, from_key, replacer, using_infer_string): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") obj = obj.astype(from_key) @@ -856,6 +856,10 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") + if using_infer_string and exp.dtype == "string" and obj.dtype == object: + # with infer_string, we disable the deprecated downcasting behavior + exp = exp.astype(object) + msg = "Downcasting behavior in `replace`" warn = FutureWarning if ( diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index c59dbc4ed95d7..f50499d30c198 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -764,14 +764,14 @@ def test_replace_value_none_dtype_numeric(self, val): def test_replace_change_dtype_series(self, using_infer_string): # GH#25797 df = pd.DataFrame.from_dict({"Test": ["0.5", True, "0.6"]}) - warn = FutureWarning if using_infer_string else None - with tm.assert_produces_warning(warn, match="Downcasting"): - df["Test"] = df["Test"].replace([True], [np.nan]) - expected = pd.DataFrame.from_dict({"Test": ["0.5", np.nan, "0.6"]}) + df["Test"] = df["Test"].replace([True], [np.nan]) + expected = pd.DataFrame({"Test": ["0.5", np.nan, "0.6"]}, dtype=object) tm.assert_frame_equal(df, expected) df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) df["Test"] = df["Test"].replace([None], [np.nan]) + if using_infer_string: + expected = expected.astype("str") tm.assert_frame_equal(df, expected) df = pd.DataFrame.from_dict({"Test": ["0.5", None, "0.6"]}) From 850573d0fde6b6a66051394d6e9294a79aaba467 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2024 09:57:06 +0100 Subject: [PATCH 3/4] update typing --- pandas/_libs/lib.pyi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index b9fd970e68f5b..71a4d3ae2575f 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -86,6 +86,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: Literal[False] = ..., + convert_string: Literal[False] = ..., convert_to_nullable_dtype: Literal[False] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> npt.NDArray[np.object_ | np.number]: ... @@ -97,6 +98,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: bool = ..., + convert_string: bool = ..., convert_to_nullable_dtype: Literal[True] = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @@ -108,6 +110,7 @@ def maybe_convert_objects( safe: bool = ..., convert_numeric: bool = ..., convert_non_numeric: bool = ..., + convert_string: bool = ..., convert_to_nullable_dtype: bool = ..., dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... From 60ff8f9433788426763c26ae8dacc5fc79fb95ac Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 5 Nov 2024 11:28:29 +0100 Subject: [PATCH 4/4] try fix categorical case --- pandas/core/internals/blocks.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 98d4e5b5bc3a8..f891df68d86b9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -975,6 +975,7 @@ def _replace_regex( inplace: bool = False, mask=None, using_cow: bool = False, + convert_string: bool = True, already_warned=None, ) -> list[Block]: """ @@ -1025,7 +1026,7 @@ def _replace_regex( already_warned.warned_already = True nbs = block.convert( - copy=False, using_cow=using_cow, convert_string=self.dtype != _dtype_obj + copy=False, using_cow=using_cow, convert_string=convert_string ) opt = get_option("future.no_silent_downcasting") if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt: @@ -1057,8 +1058,6 @@ def replace_list( """ values = self.values - convert_string = self.dtype != _dtype_obj - if isinstance(values, Categorical): # TODO: avoid special-casing # GH49404 @@ -1067,6 +1066,8 @@ def replace_list( values._replace(to_replace=src_list, value=dest_list, inplace=True) return [blk] + convert_string = self.dtype != _dtype_obj + # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) @@ -1234,6 +1235,7 @@ def _replace_coerce( inplace=inplace, mask=mask, using_cow=using_cow, + convert_string=convert_string, ) else: if value is None: