Skip to content

Commit

Permalink
String dtype (2.3.x): avoid downcasting object to string in fillna/wh…
Browse files Browse the repository at this point in the history
…ere/interpolate (pandas-dev#60183)
  • Loading branch information
jorisvandenbossche authored Dec 17, 2024
1 parent 6d9a2b4 commit 3362822
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 58 deletions.
3 changes: 3 additions & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: Literal[False] = ...,
convert_string: Literal[False] = ...,
convert_to_nullable_dtype: Literal[False] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> npt.NDArray[np.object_ | np.number]: ...
Expand All @@ -97,6 +98,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: bool = ...,
convert_string: bool = ...,
convert_to_nullable_dtype: Literal[True] = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
Expand All @@ -108,6 +110,7 @@ def maybe_convert_objects(
safe: bool = ...,
convert_numeric: bool = ...,
convert_non_numeric: bool = ...,
convert_string: bool = ...,
convert_to_nullable_dtype: bool = ...,
dtype_if_all_nat: DtypeObj | None = ...,
) -> ArrayLike: ...
Expand Down
7 changes: 6 additions & 1 deletion pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects,
bint convert_numeric=True, # NB: different default!
bint convert_to_nullable_dtype=False,
bint convert_non_numeric=False,
bint convert_string=True,
object dtype_if_all_nat=None) -> "ArrayLike":
"""
Type inference function-- convert object array to proper dtype
Expand Down Expand Up @@ -2747,7 +2748,11 @@ def maybe_convert_objects(ndarray[object] objects,
dtype = StringDtype()
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)

elif using_string_dtype() and is_string_array(objects, skipna=True):
elif (
convert_string
and using_string_dtype()
and is_string_array(objects, skipna=True)
):
from pandas.core.arrays.string_ import StringDtype

dtype = StringDtype(na_value=np.nan)
Expand Down
38 changes: 33 additions & 5 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,7 +563,12 @@ def _maybe_downcast(
return blocks

nbs = extend_blocks(
[blk.convert(using_cow=using_cow, copy=not using_cow) for blk in blocks]
[
blk.convert(
using_cow=using_cow, copy=not using_cow, convert_string=False
)
for blk in blocks
]
)
if caller == "fillna":
if len(nbs) != len(blocks) or not all(
Expand Down Expand Up @@ -636,6 +641,7 @@ def convert(
*,
copy: bool = True,
using_cow: bool = False,
convert_string: bool = True,
) -> list[Block]:
"""
Attempt to coerce any object types to better types. Return a copy
Expand All @@ -648,7 +654,10 @@ def convert(

if self.ndim != 1 and self.shape[0] != 1:
blocks = self.split_and_operate(
Block.convert, copy=copy, using_cow=using_cow
Block.convert,
copy=copy,
using_cow=using_cow,
convert_string=convert_string,
)
if all(blk.dtype.kind == "O" for blk in blocks):
# Avoid fragmenting the block if convert is a no-op
Expand All @@ -666,6 +675,7 @@ def convert(
res_values = lib.maybe_convert_objects(
values, # type: ignore[arg-type]
convert_non_numeric=True,
convert_string=convert_string,
)
refs = None
if (
Expand Down Expand Up @@ -851,6 +861,7 @@ def replace(
mask: npt.NDArray[np.bool_] | None = None,
using_cow: bool = False,
already_warned=None,
convert_string=None,
) -> list[Block]:
"""
replace the to_replace value with value, possible to create new
Expand Down Expand Up @@ -915,7 +926,11 @@ def replace(
if get_option("future.no_silent_downcasting") is True:
blocks = [blk]
else:
blocks = blk.convert(copy=False, using_cow=using_cow)
blocks = blk.convert(
copy=False,
using_cow=using_cow,
convert_string=convert_string or self.dtype != _dtype_obj,
)
if len(blocks) > 1 or blocks[0].dtype != blk.dtype:
warnings.warn(
# GH#54710
Expand Down Expand Up @@ -944,6 +959,7 @@ def replace(
inplace=True,
mask=mask,
using_cow=using_cow,
convert_string=convert_string,
)

else:
Expand All @@ -958,6 +974,7 @@ def replace(
inplace=True,
mask=mask[i : i + 1],
using_cow=using_cow,
convert_string=convert_string,
)
)
return blocks
Expand All @@ -970,6 +987,7 @@ def _replace_regex(
inplace: bool = False,
mask=None,
using_cow: bool = False,
convert_string: bool = True,
already_warned=None,
) -> list[Block]:
"""
Expand Down Expand Up @@ -1029,7 +1047,9 @@ def _replace_regex(
)
already_warned.warned_already = True

nbs = block.convert(copy=False, using_cow=using_cow)
nbs = block.convert(
copy=False, using_cow=using_cow, convert_string=convert_string
)
opt = get_option("future.no_silent_downcasting")
if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt:
warnings.warn(
Expand Down Expand Up @@ -1068,6 +1088,8 @@ def replace_list(
values._replace(to_replace=src_list, value=dest_list, inplace=True)
return [blk]

convert_string = self.dtype != _dtype_obj

# Exclude anything that we know we won't contain
pairs = [
(x, y)
Expand Down Expand Up @@ -1152,6 +1174,7 @@ def replace_list(
inplace=inplace,
regex=regex,
using_cow=using_cow,
convert_string=convert_string,
)

if using_cow and i != src_len:
Expand All @@ -1174,7 +1197,9 @@ def replace_list(
nbs = []
for res_blk in result:
converted = res_blk.convert(
copy=True and not using_cow, using_cow=using_cow
copy=True and not using_cow,
using_cow=using_cow,
convert_string=convert_string,
)
if len(converted) > 1 or converted[0].dtype != res_blk.dtype:
warnings.warn(
Expand Down Expand Up @@ -1204,6 +1229,7 @@ def _replace_coerce(
inplace: bool = True,
regex: bool = False,
using_cow: bool = False,
convert_string: bool = True,
) -> list[Block]:
"""
Replace value corresponding to the given boolean array with another
Expand Down Expand Up @@ -1233,6 +1259,7 @@ def _replace_coerce(
inplace=inplace,
mask=mask,
using_cow=using_cow,
convert_string=convert_string,
)
else:
if value is None:
Expand All @@ -1256,6 +1283,7 @@ def _replace_coerce(
inplace=inplace,
mask=mask,
using_cow=using_cow,
convert_string=convert_string,
)

# ---------------------------------------------------------------------
Expand Down
21 changes: 5 additions & 16 deletions pandas/tests/frame/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,21 +132,14 @@ def test_fillna_different_dtype(self, using_infer_string):
[["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
)

if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = df.fillna({2: "foo"})
else:
result = df.fillna({2: "foo"})
result = df.fillna({2: "foo"})
expected = DataFrame(
[["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
)
expected[2] = expected[2].astype("object")
tm.assert_frame_equal(result, expected)

if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
return_value = df.fillna({2: "foo"}, inplace=True)
else:
return_value = df.fillna({2: "foo"}, inplace=True)
return_value = df.fillna({2: "foo"}, inplace=True)
tm.assert_frame_equal(df, expected)
assert return_value is None

Expand Down Expand Up @@ -385,12 +378,8 @@ def test_fillna_dtype_conversion(self, using_infer_string):

# empty block
df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64")
if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = df.fillna("nan")
else:
result = df.fillna("nan")
expected = DataFrame("nan", index=range(3), columns=["A", "B"])
result = df.fillna("nan")
expected = DataFrame("nan", index=range(3), columns=["A", "B"], dtype=object)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("val", ["", 1, np.nan, 1.0])
Expand Down
37 changes: 5 additions & 32 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,20 +281,12 @@ def test_regex_replace_dict_nested(self, mix_abc):
tm.assert_frame_equal(res3, expec)
tm.assert_frame_equal(res4, expec)

def test_regex_replace_dict_nested_non_first_character(
self, any_string_dtype, using_infer_string
):
def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype):
# GH 25259
dtype = any_string_dtype
df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype)
if using_infer_string and any_string_dtype == "object":
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = df.replace({"a": "."}, regex=True)
expected = DataFrame({"first": [".bc", "bc.", "c.b"]})

else:
result = df.replace({"a": "."}, regex=True)
expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
result = df.replace({"a": "."}, regex=True)
expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
tm.assert_frame_equal(result, expected)

def test_regex_replace_dict_nested_gh4115(self):
Expand Down Expand Up @@ -429,31 +421,12 @@ def test_replace_regex_metachar(self, metachar):
],
)
def test_regex_replace_string_types(
self,
data,
to_replace,
expected,
frame_or_series,
any_string_dtype,
using_infer_string,
request,
self, data, to_replace, expected, frame_or_series, any_string_dtype
):
# GH-41333, GH-35977
dtype = any_string_dtype
obj = frame_or_series(data, dtype=dtype)
if using_infer_string and any_string_dtype == "object":
if len(to_replace) > 1 and isinstance(obj, DataFrame):
request.node.add_marker(
pytest.mark.xfail(
reason="object input array that gets downcasted raises on "
"second pass"
)
)
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = obj.replace(to_replace, regex=True)
dtype = "str"
else:
result = obj.replace(to_replace, regex=True)
result = obj.replace(to_replace, regex=True)
expected = frame_or_series(expected, dtype=dtype)

tm.assert_equal(result, expected)
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -831,7 +831,7 @@ def replacer(self, how, from_key, to_key):
raise ValueError
return replacer

def test_replace_series(self, how, to_key, from_key, replacer):
def test_replace_series(self, how, to_key, from_key, replacer, using_infer_string):
index = pd.Index([3, 4], name="xxx")
obj = pd.Series(self.rep[from_key], index=index, name="yyy")
obj = obj.astype(from_key)
Expand All @@ -856,6 +856,10 @@ def test_replace_series(self, how, to_key, from_key, replacer):
else:
exp = pd.Series(self.rep[to_key], index=index, name="yyy")

if using_infer_string and exp.dtype == "string" and obj.dtype == object:
# with infer_string, we disable the deprecated downcasting behavior
exp = exp.astype(object)

msg = "Downcasting behavior in `replace`"
warn = FutureWarning
if (
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/series/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
import pandas._testing as tm
from pandas.core.arrays import IntervalArray
Expand Down Expand Up @@ -768,7 +766,6 @@ def test_replace_value_none_dtype_numeric(self, val):
expected = pd.Series([1, None], dtype=object)
tm.assert_series_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_replace_change_dtype_series(self):
# GH#25797
df = pd.DataFrame({"Test": ["0.5", True, "0.6"]}, dtype=object)
Expand Down

0 comments on commit 3362822

Please sign in to comment.