Skip to content

Commit

Permalink
[backport 2.3.x] TST (string dtype): resolve xfails for frame fillna and replace tests + fix bug in replace for string (#60295) (#60331)
Browse files Browse the repository at this point in the history

* TST (string dtype): resolve xfails for frame fillna and replace tests + fix bug in replace for string (#60295)

(cherry picked from commit fae3e80)

* fix tests for default mode

* fixes

* cleanup

* update indexing tests
  • Loading branch information
jorisvandenbossche authored Dec 26, 2024
1 parent 99ae39e commit e350f10
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 63 deletions.
2 changes: 2 additions & 0 deletions pandas/core/array_algos/replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,6 @@ def re_replacer(s):
if mask is None:
values[:] = f(values)
else:
if values.ndim != mask.ndim:
mask = np.broadcast_to(mask, values.shape)
values[mask] = f(values[mask])
25 changes: 20 additions & 5 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -929,7 +929,7 @@ def replace(
blocks = blk.convert(
copy=False,
using_cow=using_cow,
convert_string=convert_string or self.dtype != _dtype_obj,
convert_string=convert_string or self.dtype == "string",
)
if len(blocks) > 1 or blocks[0].dtype != blk.dtype:
warnings.warn(
Expand Down Expand Up @@ -987,7 +987,7 @@ def _replace_regex(
inplace: bool = False,
mask=None,
using_cow: bool = False,
convert_string: bool = True,
convert_string=None,
already_warned=None,
) -> list[Block]:
"""
Expand Down Expand Up @@ -1048,10 +1048,18 @@ def _replace_regex(
already_warned.warned_already = True

nbs = block.convert(
copy=False, using_cow=using_cow, convert_string=convert_string
copy=False,
using_cow=using_cow,
convert_string=convert_string or self.dtype == "string",
)
opt = get_option("future.no_silent_downcasting")
if (len(nbs) > 1 or nbs[0].dtype != block.dtype) and not opt:
if (
len(nbs) > 1
or (
nbs[0].dtype != block.dtype
and not (self.dtype == "string" and nbs[0].dtype == "string")
)
) and not opt:
warnings.warn(
# GH#54710
"Downcasting behavior in `replace` is deprecated and "
Expand Down Expand Up @@ -1088,7 +1096,7 @@ def replace_list(
values._replace(to_replace=src_list, value=dest_list, inplace=True)
return [blk]

convert_string = self.dtype != _dtype_obj
convert_string = self.dtype == "string"

# Exclude anything that we know we won't contain
pairs = [
Expand Down Expand Up @@ -2167,6 +2175,13 @@ def where(
if isinstance(self.dtype, (IntervalDtype, StringDtype)):
# TestSetitemFloatIntervalWithIntIntervalValues
blk = self.coerce_to_target_dtype(orig_other)
if (
self.ndim == 2
and isinstance(orig_cond, np.ndarray)
and orig_cond.ndim == 1
and not is_1d_only_ea_dtype(blk.dtype)
):
orig_cond = orig_cond[:, None]
nbs = blk.where(orig_other, orig_cond, using_cow=using_cow)
return self._maybe_downcast(
nbs, downcast=_downcast, using_cow=using_cow, caller="where"
Expand Down
23 changes: 6 additions & 17 deletions pandas/tests/frame/methods/test_fillna.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas.util._test_decorators as td

from pandas import (
Expand Down Expand Up @@ -91,8 +89,6 @@ def test_fillna_datetime(self, datetime_frame):
with pytest.raises(ValueError, match=msg):
datetime_frame.fillna(5, method="ffill")

# TODO(infer_string) test as actual error instead of xfail
@pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string")
def test_fillna_mixed_type(self, float_string_frame):
mf = float_string_frame
mf.loc[mf.index[5:20], "foo"] = np.nan
Expand Down Expand Up @@ -126,7 +122,7 @@ def test_fillna_empty(self, using_copy_on_write):
df.x.fillna(method=m, inplace=True)
df.x.fillna(method=m)

def test_fillna_different_dtype(self, using_infer_string):
def test_fillna_different_dtype(self):
# with different dtype (GH#3386)
df = DataFrame(
[["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]]
Expand All @@ -136,6 +132,7 @@ def test_fillna_different_dtype(self, using_infer_string):
expected = DataFrame(
[["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]]
)
# column is originally float (all-NaN) -> filling with string gives object dtype
expected[2] = expected[2].astype("object")
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -654,18 +651,10 @@ def test_fillna_col_reordering(self):
filled = df.fillna(method="ffill")
assert df.columns.tolist() == filled.columns.tolist()

# TODO(infer_string) test as actual error instead of xfail
@pytest.mark.xfail(using_string_dtype(), reason="can't fill 0 in string")
def test_fill_corner(self, float_frame, float_string_frame):
mf = float_string_frame
mf.loc[mf.index[5:20], "foo"] = np.nan
mf.loc[mf.index[-10:], "A"] = np.nan

filled = float_string_frame.fillna(value=0)
assert (filled.loc[filled.index[5:20], "foo"] == 0).all()
del float_string_frame["foo"]

float_frame.reindex(columns=[]).fillna(value=0)
def test_fill_empty(self, float_frame):
df = float_frame.reindex(columns=[])
result = df.fillna(value=0)
tm.assert_frame_equal(result, df)

def test_fillna_downcast_dict(self):
# GH#40809
Expand Down
81 changes: 43 additions & 38 deletions pandas/tests/frame/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand All @@ -30,7 +28,6 @@ def mix_abc() -> dict[str, list[float | str]]:


class TestDataFrameReplace:
@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_inplace(self, datetime_frame, float_string_frame):
datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan
Expand All @@ -46,7 +43,9 @@ def test_replace_inplace(self, datetime_frame, float_string_frame):
mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan

result = float_string_frame.replace(np.nan, 0)
expected = float_string_frame.fillna(value=0)
expected = float_string_frame.copy()
expected["foo"] = expected["foo"].astype(object)
expected = expected.fillna(value=0)
tm.assert_frame_equal(result, expected)

tsframe = datetime_frame.copy()
Expand Down Expand Up @@ -290,34 +289,39 @@ def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype):
tm.assert_frame_equal(result, expected)

def test_regex_replace_dict_nested_gh4115(self):
df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2})
df = DataFrame(
{"Type": Series(["Q", "T", "Q", "Q", "T"], dtype=object), "tmp": 2}
)
expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2})
msg = "Downcasting behavior in `replace`"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.replace({"Type": {"Q": 0, "T": 1}})

tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_regex_replace_list_to_scalar(self, mix_abc):
def test_regex_replace_list_to_scalar(self, mix_abc, using_infer_string):
df = DataFrame(mix_abc)
expec = DataFrame(
{
"a": mix_abc["a"],
"b": np.array([np.nan] * 4),
"b": [np.nan] * 4,
"c": [np.nan, np.nan, np.nan, "d"],
}
)
if using_infer_string:
expec["b"] = expec["b"].astype("str")
msg = "Downcasting behavior in `replace`"
with tm.assert_produces_warning(FutureWarning, match=msg):
warn = None if using_infer_string else FutureWarning
with tm.assert_produces_warning(warn, match=msg):
res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True)
res2 = df.copy()
res3 = df.copy()
with tm.assert_produces_warning(FutureWarning, match=msg):
with tm.assert_produces_warning(warn, match=msg):
return_value = res2.replace(
[r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True
)
assert return_value is None
with tm.assert_produces_warning(FutureWarning, match=msg):
with tm.assert_produces_warning(warn, match=msg):
return_value = res3.replace(
regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True
)
Expand All @@ -326,7 +330,6 @@ def test_regex_replace_list_to_scalar(self, mix_abc):
tm.assert_frame_equal(res2, expec)
tm.assert_frame_equal(res3, expec)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_regex_replace_str_to_numeric(self, mix_abc):
# what happens when you try to replace a numeric value with a regex?
df = DataFrame(mix_abc)
Expand All @@ -342,7 +345,6 @@ def test_regex_replace_str_to_numeric(self, mix_abc):
tm.assert_frame_equal(res2, expec)
tm.assert_frame_equal(res3, expec)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_regex_replace_regex_list_to_numeric(self, mix_abc):
df = DataFrame(mix_abc)
res = df.replace([r"\s*\.\s*", "b"], 0, regex=True)
Expand Down Expand Up @@ -539,21 +541,28 @@ def test_replace_convert(self):
res = rep.dtypes
tm.assert_series_equal(expec, res)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_mixed(self, float_string_frame):
mf = float_string_frame
mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan
mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan

result = float_string_frame.replace(np.nan, -18)
expected = float_string_frame.fillna(value=-18)
expected = float_string_frame.copy()
expected["foo"] = expected["foo"].astype(object)
expected = expected.fillna(value=-18)
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame)
expected2 = float_string_frame.copy()
expected2["foo"] = expected2["foo"].astype(object)
tm.assert_frame_equal(result.replace(-18, np.nan), expected2)

result = float_string_frame.replace(np.nan, -1e8)
expected = float_string_frame.fillna(value=-1e8)
expected = float_string_frame.copy()
expected["foo"] = expected["foo"].astype(object)
expected = expected.fillna(value=-1e8)
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame)
expected2 = float_string_frame.copy()
expected2["foo"] = expected2["foo"].astype(object)
tm.assert_frame_equal(result.replace(-1e8, np.nan), expected2)

def test_replace_mixed_int_block_upcasting(self):
# int block upcasting
Expand Down Expand Up @@ -614,15 +623,11 @@ def test_replace_mixed2(self, using_infer_string):

expected = DataFrame(
{
"A": Series(["foo", "bar"]),
"A": Series(["foo", "bar"], dtype="object"),
"B": Series([0, "foo"], dtype="object"),
}
)
if using_infer_string:
with tm.assert_produces_warning(FutureWarning, match="Downcasting"):
result = df.replace([1, 2], ["foo", "bar"])
else:
result = df.replace([1, 2], ["foo", "bar"])
result = df.replace([1, 2], ["foo", "bar"])
tm.assert_frame_equal(result, expected)

def test_replace_mixed3(self):
Expand Down Expand Up @@ -931,15 +936,16 @@ def test_replace_limit(self):
# TODO
pass

def test_replace_dict_no_regex(self):
def test_replace_dict_no_regex(self, any_string_dtype):
answer = Series(
{
0: "Strongly Agree",
1: "Agree",
2: "Neutral",
3: "Disagree",
4: "Strongly Disagree",
}
},
dtype=any_string_dtype,
)
weights = {
"Agree": 4,
Expand All @@ -954,15 +960,16 @@ def test_replace_dict_no_regex(self):
result = answer.replace(weights)
tm.assert_series_equal(result, expected)

def test_replace_series_no_regex(self):
def test_replace_series_no_regex(self, any_string_dtype):
answer = Series(
{
0: "Strongly Agree",
1: "Agree",
2: "Neutral",
3: "Disagree",
4: "Strongly Disagree",
}
},
dtype=any_string_dtype,
)
weights = Series(
{
Expand Down Expand Up @@ -1060,16 +1067,15 @@ def test_nested_dict_overlapping_keys_replace_str(self):
expected = df.replace({"a": dict(zip(astr, bstr))})
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="can't set float into string")
def test_replace_swapping_bug(self, using_infer_string):
def test_replace_swapping_bug(self):
df = DataFrame({"a": [True, False, True]})
res = df.replace({"a": {True: "Y", False: "N"}})
expect = DataFrame({"a": ["Y", "N", "Y"]})
expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object)
tm.assert_frame_equal(res, expect)

df = DataFrame({"a": [0, 1, 0]})
res = df.replace({"a": {0: "Y", 1: "N"}})
expect = DataFrame({"a": ["Y", "N", "Y"]})
expect = DataFrame({"a": ["Y", "N", "Y"]}, dtype=object)
tm.assert_frame_equal(res, expect)

def test_replace_period(self):
Expand Down Expand Up @@ -1345,7 +1351,7 @@ def test_replace_commutative(self, df, to_replace, exp):
)
def test_replace_replacer_dtype(self, replacer):
# GH26632
df = DataFrame(["a"])
df = DataFrame(["a"], dtype=object)
msg = "Downcasting behavior in `replace` "
with tm.assert_produces_warning(FutureWarning, match=msg):
result = df.replace({"a": replacer, "b": replacer})
Expand Down Expand Up @@ -1462,6 +1468,7 @@ def test_replace_value_category_type(self):
input_df = input_df.replace("obj1", "obj9")
result = input_df.replace("cat2", "catX")

result = result.astype({"col1": "int64", "col3": "float64", "col5": "str"})
tm.assert_frame_equal(result, expected)

def test_replace_dict_category_type(self):
Expand Down Expand Up @@ -1503,13 +1510,11 @@ def test_replace_with_compiled_regex(self):
expected = DataFrame(["z", "b", "c"])
tm.assert_frame_equal(result, expected)

def test_replace_intervals(self, using_infer_string):
def test_replace_intervals(self):
# https://github.com/pandas-dev/pandas/issues/35931
df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]})
warning = FutureWarning if using_infer_string else None
with tm.assert_produces_warning(warning, match="Downcasting"):
result = df.replace({"a": {pd.Interval(0, 1): "x"}})
expected = DataFrame({"a": ["x", "x"]})
result = df.replace({"a": {pd.Interval(0, 1): "x"}})
expected = DataFrame({"a": ["x", "x"]}, dtype=object)
tm.assert_frame_equal(result, expected)

def test_replace_unicode(self):
Expand Down
7 changes: 4 additions & 3 deletions pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,7 +856,7 @@ def test_replace_series(self, how, to_key, from_key, replacer, using_infer_strin
else:
exp = pd.Series(self.rep[to_key], index=index, name="yyy")

if using_infer_string and exp.dtype == "string" and obj.dtype == object:
if using_infer_string and exp.dtype == "string":
# with infer_string, we disable the deprecated downcasting behavior
exp = exp.astype(object)

Expand Down Expand Up @@ -889,8 +889,9 @@ def test_replace_series_datetime_tz(
assert obj.dtype == from_key

exp = pd.Series(self.rep[to_key], index=index, name="yyy")
if using_infer_string and to_key == "object":
assert exp.dtype == "string"
if using_infer_string and exp.dtype == "string":
# with infer_string, we disable the deprecated downcasting behavior
exp = exp.astype(object)
else:
assert exp.dtype == to_key

Expand Down

0 comments on commit e350f10

Please sign in to comment.