Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

String dtype (2.3.x): avoid downcasting object to string in fillna/where/interpolate #60183

3 changes: 3 additions & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: Literal[False] = ...,
convert_string: Literal[False] = ...,
convert_to_nullable_dtype: Literal[False] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> npt.NDArray[np.object_ | np.number]: ...
Expand All @@ -97,6 +98,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: bool = ...,
convert_string: bool = ...,
convert_to_nullable_dtype: Literal[True] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
Expand All @@ -108,6 +110,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: bool = ...,
convert_string: bool = ...,
convert_to_nullable_dtype: bool = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
Expand Down
7 changes: 6 additions & 1 deletion pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects,
bint convert_numeric=True, # NB: different default!
bint convert_to_nullable_dtype=False,
bint convert_non_numeric=False,
bint convert_string=True,
object dtype_if_all_nat=None) -> "ArrayLike":
"""
Type inference function-- convert object array to proper dtype
Expand Down Expand Up @@ -2747,7 +2748,11 @@ def maybe_convert_objects(ndarray[object] objects,
dtype = StringDtype()
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

elif using_string_dtype() and is_string_array(objects, skipna=True):
elif (
convert_string
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In what cases would we have convert_string be False and using_string_dtype be True? Not saying this implementation is wrong, just hard to grok at first glance

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This combination of convert_string=False with using_string_dtype() being True is exactly what I am using inside the replace/fillna implementation, to pass down to maybe_convert_objects that I don't want to cast object to string if I know that we started with object dtype, and that this dtype should be preserved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But I fully agree it is all a bit confusing, and the implementation is not very clear (I mostly got to the current state just by getting all our tests to pass, but it is difficult to assess whether it is complete. However, given that this is about "not raising a useless warning", I think it is not the worst thing if we would have missed a case where we would still raise the warning)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah OK thanks that is helpful. As a nit, maybe preserve_object_dtype would be a better keyword, but not something I think is a blocker. This all needs a good cleanup past once we get through the 2.3 push anyway

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a nit, maybe preserve_object_dtype would be a better keyword

It's only specifically not converting object to string here (all other inferred types are still used), so preserve_object_dtype would be a bit too generic I think

and using_string_dtype()
and is_string_array(objects, skipna=True)
):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(na_value=np.nan)
Expand Down
38 changes: 33 additions & 5 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,12 @@ def _maybe_downcast(
return blocks

nbs = extend_blocks(
[blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks]
[
blk.convert(
using_cow=using_cow, copy=not using_cow, convert_string=False
)
for blk in blocks
]
)
if caller == "fillna":
if len(nbs) != len(blocks) or not all(
Expand Down Expand Up @@ -636,6 +641,7 @@ def convert(
*,
copy: bool = True,
using_cow: bool = False,
convert_string: bool = True,
) -> list[Block]:
"""
Attempt to coerce any object types to better types. Return a copy
Expand All @@ -648,7 +654,10 @@ def convert(

if self.ndim != 1 and self.shape[0] != 1:
blocks = self.split_and_operate(
Block.convert, copy=copy, using_cow=using_cow
Block.convert,
copy=copy,
using_cow=using_cow,
convert_string=convert_string,
)
if all(blk.dtype.kind == "O" for blk in blocks):
# Avoid fragmenting the block if convert is a no-op
Expand All @@ -666,6 +675,7 @@ def convert(
res_values = lib.maybe_convert_objects(
values, # type: ignore[arg-type]
convert_non_numeric=True,
convert_string=convert_string,
)
refs = None
if (
Expand Down Expand Up @@ -851,6 +861,7 @@ def replace(
mask: npt.NDArray[np.bool_] | None = None,
using_cow: bool = False,
already_warned=None,
convert_string=None,
) -> list[Block]:
"""
replace the to_replace value with value, possible to create new
Expand Down Expand Up @@ -915,7 +926,11 @@ def replace(
if get_option("future.no_silent_downcasting") is True:
blocks = [blk]
else:
blocks = blk.convert(copy=False, using_cow=using_cow)
blocks = blk.convert(
copy=False,
using_cow=using_cow,
convert_string=convert_string or self.dtype != _dtype_obj,
)
if len(blocks) > 1 or blocks[0].dtype != blk.dtype:
warnings.warn(
# GH#54710
Expand Down Expand Up @@ -944,6 +959,7 @@ def replace(
inplace=True,
mask=mask,
using_cow=using_cow,
convert_string=convert_string,
)

else:
Expand All @@ -958,6 +974,7 @@ def replace(
inplace=True,
mask=mask[i : i + 1],
using_cow=using_cow,
convert_string=convert_string,
)
)
return blocks
Expand All @@ -970,6 +987,7 @@ def _replace_regex(
inplace: bool = False,
mask=None,
using_cow: bool = False,
convert_string: bool = True,
already_warned=None,
) -> list[Block]:
"""
Expand Down Expand Up @@ -1029,7 +1047,9 @@ def _replace_regex(
)
already_warned.warned_already = True

nbs = block.convert(copy=False, using_cow=using_cow)
nbs = block.convert(
copy=False, using_cow=using_cow, convert_string=convert_string
)
opt = get_option("future.no_silent_downcasting")
if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt:
warnings.warn(
Expand Down Expand Up @@ -1068,6 +1088,8 @@ def replace_list(
values._replace(to_replace=src_list, value=dest_list, inplace=True)
return [blk]

convert_string = self.dtype != _dtype_obj

# Exclude anything that we know we won't contain
pairs = [
(x, y)
Expand Down Expand Up @@ -1152,6 +1174,7 @@ def replace_list(
inplace=inplace,
regex=regex,
using_cow=using_cow,
convert_string=convert_string,
)

if using_cow and i != src_len:
Expand All @@ -1174,7 +1197,9 @@ def replace_list(
nbs = []
for res_blk in result:
converted = res_blk.convert(
copy=True and not using_cow, using_cow=using_cow
copy=True and not using_cow,
using_cow=using_cow,
convert_string=convert_string,
)
if len(converted) > 1 or converted[0].dtype != res_blk.dtype:
warnings.warn(
Expand Down Expand Up @@ -1204,6 +1229,7 @@ def _replace_coerce(
inplace: bool = True,
regex: bool = False,
using_cow: bool = False,
convert_string: bool = True,
) -> list[Block]:
"""
Replace value corresponding to the given boolean array with another
Expand Down Expand Up @@ -1233,6 +1259,7 @@ def _replace_coerce(
inplace=inplace,
mask=mask,
using_cow=using_cow,
convert_string=convert_string,
)
else:
if value is None:
Expand All @@ -1256,6 +1283,7 @@ def _replace_coerce(
inplace=inplace,
mask=mask,
using_cow=using_cow,
convert_string=convert_string,
)

# ---------------------------------------------------------------------
Expand Down
21 changes: 5 additions & 16 deletions pandas/tests/frame/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,21 +132,14 @@ def test_fillna_different_dtype(self, using_infer_string):
[["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
)

if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = df.fillna({2: "foo"})
else:
result = df.fillna({2: "foo"})
result = df.fillna({2: "foo"})
expected = DataFrame(
[["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
)
expected[2] = expected[2].astype("object")
tm.assert_frame_equal(result, expected)

if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
return_value = df.fillna({2: "foo"}, inplace=True)
else:
return_value = df.fillna({2: "foo"}, inplace=True)
return_value = df.fillna({2: "foo"}, inplace=True)
tm.assert_frame_equal(df, expected)
assert return_value is None

Expand Down Expand Up @@ -385,12 +378,8 @@ def test_fillna_dtype_conversion(self, using_infer_string):

# empty block
df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = df.fillna("nan")
else:
result = df.fillna("nan")
expected = DataFrame("nan", index=range(3), columns=["A", "B"])
result = df.fillna("nan")
expected = DataFrame("nan", index=range(3), columns=["A", "B"], dtype=object)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("val", ["", 1, np.nan, 1.0])
Expand Down
37 changes: 5 additions & 32 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,20 +281,12 @@ def test_regex_replace_dict_nested(self, mix_abc):
tm.assert_frame_equal(res3, expec)
tm.assert_frame_equal(res4, expec)

def test_regex_replace_dict_nested_non_first_character(
self, any_string_dtype, using_infer_string
):
def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype):
# GH 25259
dtype = any_string_dtype
df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype)
if using_infer_string and any_string_dtype == "object":
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = df.replace({"a": "."}, regex=True)
expected = DataFrame({"first": [".bc", "bc.", "c.b"]})

else:
result = df.replace({"a": "."}, regex=True)
expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
result = df.replace({"a": "."}, regex=True)
expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
tm.assert_frame_equal(result, expected)

def test_regex_replace_dict_nested_gh4115(self):
Expand Down Expand Up @@ -429,31 +421,12 @@ def test_replace_regex_metachar(self, metachar):
],
)
def test_regex_replace_string_types(
self,
data,
to_replace,
expected,
frame_or_series,
any_string_dtype,
using_infer_string,
request,
self, data, to_replace, expected, frame_or_series, any_string_dtype
):
# GH-41333, GH-35977
dtype = any_string_dtype
obj = frame_or_series(data, dtype=dtype)
if using_infer_string and any_string_dtype == "object":
if len(to_replace) > 1 and isinstance(obj, DataFrame):
request.node.add_marker(
pytest.mark.xfail(
reason="object input array that gets downcasted raises on "
"second pass"
)
)
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = obj.replace(to_replace, regex=True)
dtype = "str"
else:
result = obj.replace(to_replace, regex=True)
result = obj.replace(to_replace, regex=True)
expected = frame_or_series(expected, dtype=dtype)

tm.assert_equal(result, expected)
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,7 +831,7 @@ def replacer(self, how, from_key, to_key):
raise ValueError
return replacer

def test_replace_series(self, how, to_key, from_key, replacer):
def test_replace_series(self, how, to_key, from_key, replacer, using_infer_string):
index = pd.Index([3, 4], name="xxx")
obj = pd.Series(self.rep[from_key], index=index, name="yyy")
obj = obj.astype(from_key)
Expand All @@ -856,6 +856,10 @@ def test_replace_series(self, how, to_key, from_key, replacer):
else:
exp = pd.Series(self.rep[to_key], index=index, name="yyy")

if using_infer_string and exp.dtype == "string" and obj.dtype == object:
# with infer_string, we disable the deprecated downcasting behavior
exp = exp.astype(object)

msg = "Downcasting behavior in `replace`"
warn = FutureWarning
if (
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/series/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import IntervalArray
Expand Down Expand Up @@ -768,7 +766,6 @@ def test_replace_value_none_dtype_numeric(self, val):
expected = pd.Series([1, None], dtype=object)
tm.assert_series_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_replace_change_dtype_series(self):
# GH#25797
df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object)
Expand Down
Loading