Skip to content

Commit

Permalink
Fix new string dtype tests for frame folder (#55409)
Browse files (browse the repository at this point in the history)
* Start fixing string tests

* BUG: interpolate raising wrong error for ea

* Fix more tests

* REGR: join segfaulting for arrow string with nulls

* Fix more tests

* Fix more tests

* BUG: rank raising for arrow string dtypes

* BUG: eq not implemented for categorical and arrow backed strings

* More tests

* BUG: ndim of string block incorrect with string inference

* Fix test

* Fix tests

* Fix tests

* Fix more indexing tests

* BUG: Index.insert raising when inserting None into new string dtype

* Fix tests

* BUG: Inserting ndim=0 array does not infer string dtype

* Fix tests

* Fix tests

* Fix more tests

* Fix more tests

* BUG: idxmax raising for arrow strings

* Fix

* Fix more tests

* Fix more tests

* Fix more tests

* Fix remaining tests

* Fix remaining tests

* Change default

* BUG: Groupby not keeping string dtype for empty objects

* Start fixing gb tests

* Fix tests

* Merge main

* Update config_init.py

* Fixup

* Update
  • Loading branch information
phofl authored Dec 9, 2023
1 parent 71a3e3c commit ce4169a
Show file tree
Hide file tree
Showing 45 changed files with 477 additions and 170 deletions.
5 changes: 5 additions & 0 deletions pandas/tests/frame/constructors/test_from_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import numpy as np
import pytest

from pandas._config import using_pyarrow_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -42,6 +44,9 @@ def test_constructor_single_row(self):
)
tm.assert_frame_equal(result, expected)

@pytest.mark.skipif(
using_pyarrow_string_dtype(), reason="columns inferring logic broken"
)
def test_constructor_list_of_series(self):
data = [
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),
Expand Down
5 changes: 5 additions & 0 deletions pandas/tests/frame/constructors/test_from_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import pytest
import pytz

from pandas._config import using_pyarrow_string_dtype

from pandas.compat import is_platform_little_endian

from pandas import (
Expand Down Expand Up @@ -56,6 +58,9 @@ def test_from_records_with_datetimes(self):
expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]")
tm.assert_frame_equal(result, expected)

@pytest.mark.skipif(
using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work"
)
def test_from_records_sequencelike(self):
df = DataFrame(
{
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def test_getitem_list_duplicates(self):

def test_getitem_dupe_cols(self):
df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\""
msg = "\"None of [Index(['baf'], dtype="
with pytest.raises(KeyError, match=re.escape(msg)):
df[["baf"]]

Expand Down
35 changes: 25 additions & 10 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,9 @@ def test_setattr_column(self):
df.foobar = 5
assert (df.foobar == 5).all()

def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write):
def test_setitem(
self, float_frame, using_copy_on_write, warn_copy_on_write, using_infer_string
):
# not sure what else to do here
series = float_frame["A"][::2]
float_frame["col5"] = series
Expand Down Expand Up @@ -331,7 +333,10 @@ def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write):
with pytest.raises(SettingWithCopyError, match=msg):
smaller["col10"] = ["1", "2"]

assert smaller["col10"].dtype == np.object_
if using_infer_string:
assert smaller["col10"].dtype == "string"
else:
assert smaller["col10"].dtype == np.object_
assert (smaller["col10"] == ["1", "2"]).all()

def test_setitem2(self):
Expand Down Expand Up @@ -426,7 +431,7 @@ def test_setitem_cast(self, float_frame):
float_frame["something"] = 2.5
assert float_frame["something"].dtype == np.float64

def test_setitem_corner(self, float_frame):
def test_setitem_corner(self, float_frame, using_infer_string):
# corner case
df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3))
del df["B"]
Expand Down Expand Up @@ -463,10 +468,16 @@ def test_setitem_corner(self, float_frame):
dm["foo"] = "bar"
del dm["foo"]
dm["foo"] = "bar"
assert dm["foo"].dtype == np.object_
if using_infer_string:
assert dm["foo"].dtype == "string"
else:
assert dm["foo"].dtype == np.object_

dm["coercible"] = ["1", "2", "3"]
assert dm["coercible"].dtype == np.object_
if using_infer_string:
assert dm["coercible"].dtype == "string"
else:
assert dm["coercible"].dtype == np.object_

def test_setitem_corner2(self):
data = {
Expand All @@ -483,7 +494,7 @@ def test_setitem_corner2(self):
assert df.loc[1, "title"] == "foobar"
assert df.loc[1, "cruft"] == 0

def test_setitem_ambig(self):
def test_setitem_ambig(self, using_infer_string):
# Difficulties with mixed-type data
# Created as float type
dm = DataFrame(index=range(3), columns=range(3))
Expand All @@ -499,18 +510,22 @@ def test_setitem_ambig(self):

dm[2] = uncoercable_series
assert len(dm.columns) == 3
assert dm[2].dtype == np.object_
if using_infer_string:
assert dm[2].dtype == "string"
else:
assert dm[2].dtype == np.object_

def test_setitem_None(self, float_frame):
def test_setitem_None(self, float_frame, using_infer_string):
# GH #766
float_frame[None] = float_frame["A"]
key = None if not using_infer_string else np.nan
tm.assert_series_equal(
float_frame.iloc[:, -1], float_frame["A"], check_names=False
)
tm.assert_series_equal(
float_frame.loc[:, None], float_frame["A"], check_names=False
float_frame.loc[:, key], float_frame["A"], check_names=False
)
tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False)
tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False)

def test_loc_setitem_boolean_mask_allfalse(self):
# GH 9596
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/frame/indexing/test_set_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def test_set_value(self, float_frame):
float_frame._set_value(idx, col, 1)
assert float_frame[col][idx] == 1

def test_set_value_resize(self, float_frame):
def test_set_value_resize(self, float_frame, using_infer_string):
res = float_frame._set_value("foobar", "B", 0)
assert res is None
assert float_frame.index[-1] == "foobar"
Expand All @@ -27,8 +27,10 @@ def test_set_value_resize(self, float_frame):

res = float_frame.copy()
res._set_value("foobar", "baz", "sam")
assert res["baz"].dtype == np.object_

if using_infer_string:
assert res["baz"].dtype == "string"
else:
assert res["baz"].dtype == np.object_
res = float_frame.copy()
with tm.assert_produces_warning(
FutureWarning, match="Setting an item of incompatible dtype"
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -1319,7 +1319,7 @@ def test_setitem_column_frame_as_category(self):
df["col2"] = Series([1, 2, 3], dtype="category")

expected_types = Series(
["int64", "category", "category"], index=[0, "col1", "col2"]
["int64", "category", "category"], index=[0, "col1", "col2"], dtype=object
)
tm.assert_series_equal(df.dtypes, expected_types)

Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/frame/indexing/test_where.py
Original file line number Diff line number Diff line change
Expand Up @@ -1077,9 +1077,13 @@ def test_where_producing_ea_cond_for_np_dtype():
@pytest.mark.parametrize(
"replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)]
)
def test_where_int_overflow(replacement):
def test_where_int_overflow(replacement, using_infer_string, request):
# GH 31687
df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]])
if using_infer_string and replacement not in (None, "snake"):
request.node.add_marker(
pytest.mark.xfail(reason="Can't set non-string into string column")
)
result = df.where(pd.notnull(df), replacement)
expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]])

Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_align_float(self, float_frame, using_copy_on_write):
af, bf = float_frame.align(
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None
)
tm.assert_index_equal(bf.index, Index([]))
tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))

msg = (
"The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align "
Expand All @@ -117,7 +117,7 @@ def test_align_float(self, float_frame, using_copy_on_write):
af, bf = float_frame.align(
other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0
)
tm.assert_index_equal(bf.index, Index([]))
tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype))

# Try to align DataFrame to Series along bad axis
msg = "No axis named 2 for object type DataFrame"
Expand Down
22 changes: 15 additions & 7 deletions pandas/tests/frame/methods/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,21 +166,22 @@ def test_astype_str(self):
"c": [Timedelta(x)._repr_base() for x in c._values],
"d": list(map(str, d._values)),
"e": list(map(str, e._values)),
}
},
dtype="object",
)

tm.assert_frame_equal(result, expected)

def test_astype_str_float(self):
# see GH#11302
result = DataFrame([np.nan]).astype(str)
expected = DataFrame(["nan"])
expected = DataFrame(["nan"], dtype="object")

tm.assert_frame_equal(result, expected)
result = DataFrame([1.12345678901234567890]).astype(str)

val = "1.1234567890123457"
expected = DataFrame([val])
expected = DataFrame([val], dtype="object")
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("dtype_class", [dict, Series])
Expand All @@ -199,7 +200,7 @@ def test_astype_dict_like(self, dtype_class):
expected = DataFrame(
{
"a": a,
"b": Series(["0", "1", "2", "3", "4"]),
"b": Series(["0", "1", "2", "3", "4"], dtype="object"),
"c": c,
"d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"),
}
Expand Down Expand Up @@ -282,7 +283,7 @@ def test_astype_duplicate_col_series_arg(self):
result = df.astype(dtypes)
expected = DataFrame(
{
0: vals[:, 0].astype(str),
0: Series(vals[:, 0].astype(str), dtype=object),
1: vals[:, 1],
2: pd.array(vals[:, 2], dtype="Float64"),
3: vals[:, 3],
Expand Down Expand Up @@ -620,6 +621,7 @@ def test_astype_arg_for_errors_dictlist(self):
{"a": 2.2, "b": "15.3", "c": "another_test"},
]
)
expected["c"] = expected["c"].astype("object")
type_dict = {"a": "float64", "b": "float64", "c": "object"}

result = df.astype(dtype=type_dict, errors="ignore")
Expand Down Expand Up @@ -680,6 +682,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame):
],
],
columns=timezone_frame.columns,
dtype="object",
)
tm.assert_frame_equal(result, expected)

Expand Down Expand Up @@ -754,7 +757,9 @@ def test_astype_tz_object_conversion(self, tz):
result = result.astype({"tz": "datetime64[ns, Europe/London]"})
tm.assert_frame_equal(result, expected)

def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
def test_astype_dt64_to_string(
self, frame_or_series, tz_naive_fixture, using_infer_string
):
# GH#41409
tz = tz_naive_fixture

Expand All @@ -772,7 +777,10 @@ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture):
item = result.iloc[0]
if frame_or_series is DataFrame:
item = item.iloc[0]
assert item is pd.NA
if using_infer_string:
assert item is np.nan
else:
assert item is pd.NA

# For non-NA values, we should match what we get for non-EA str
alt = obj.astype(str)
Expand Down
6 changes: 4 additions & 2 deletions pandas/tests/frame/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def test_combine_first_mixed(self):
combined = f.combine_first(g)
tm.assert_frame_equal(combined, exp)

def test_combine_first(self, float_frame):
def test_combine_first(self, float_frame, using_infer_string):
# disjoint
head, tail = float_frame[:5], float_frame[5:]

Expand Down Expand Up @@ -76,7 +76,9 @@ def test_combine_first(self, float_frame):
tm.assert_series_equal(combined["A"].reindex(g.index), g["A"])

# corner cases
comb = float_frame.combine_first(DataFrame())
warning = FutureWarning if using_infer_string else None
with tm.assert_produces_warning(warning, match="empty entries"):
comb = float_frame.combine_first(DataFrame())
tm.assert_frame_equal(comb, float_frame)

comb = DataFrame().combine_first(float_frame)
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/frame/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,13 @@ class TestConvertDtypes:
@pytest.mark.parametrize(
"convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")]
)
def test_convert_dtypes(self, convert_integer, expected, string_storage):
def test_convert_dtypes(
self, convert_integer, expected, string_storage, using_infer_string
):
# Specific types are tested in tests/series/test_dtypes.py
# Just check that it works for DataFrame here
if using_infer_string:
string_storage = "pyarrow_numpy"
df = pd.DataFrame(
{
"a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
Expand Down
12 changes: 9 additions & 3 deletions pandas/tests/frame/methods/test_cov_corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ def test_corrwith(self, datetime_frame, dtype):
for row in index[:4]:
tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row]))

def test_corrwith_with_objects(self):
def test_corrwith_with_objects(self, using_infer_string):
df1 = DataFrame(
np.random.default_rng(2).standard_normal((10, 4)),
columns=Index(list("ABCD"), dtype=object),
Expand All @@ -338,8 +338,14 @@ def test_corrwith_with_objects(self):
df1["obj"] = "foo"
df2["obj"] = "bar"

with pytest.raises(TypeError, match="Could not convert"):
df1.corrwith(df2)
if using_infer_string:
import pyarrow as pa

with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"):
df1.corrwith(df2)
else:
with pytest.raises(TypeError, match="Could not convert"):
df1.corrwith(df2)
result = df1.corrwith(df2, numeric_only=True)
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
tm.assert_series_equal(result, expected)
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_drop.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,7 @@ def test_drop_with_duplicate_columns2(self):

def test_drop_inplace_no_leftover_column_reference(self):
# GH 13934
df = DataFrame({"a": [1, 2, 3]})
df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object"))
a = df.a
df.drop(["a"], axis=1, inplace=True)
tm.assert_index_equal(df.columns, Index([], dtype="object"))
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_drop_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def test_drop_duplicates_with_misspelled_column_name(subset):
# GH 19730
df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
msg = re.escape("Index(['a'], dtype='object')")
msg = re.escape("Index(['a'], dtype=")

with pytest.raises(KeyError, match=msg):
df.drop_duplicates(subset)
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/frame/methods/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,12 @@ def test_dtypes_timedeltas(self):
)
tm.assert_series_equal(result, expected)

def test_frame_apply_np_array_return_type(self):
def test_frame_apply_np_array_return_type(self, using_infer_string):
# GH 35517
df = DataFrame([["foo"]])
result = df.apply(lambda col: np.array("bar"))
expected = Series(["bar"])
if using_infer_string:
expected = Series([np.array(["bar"])])
else:
expected = Series(["bar"])
tm.assert_series_equal(result, expected)
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_duplicated.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
def test_duplicated_with_misspelled_column_name(subset):
# GH 19730
df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
msg = re.escape("Index(['a'], dtype='object')")
msg = re.escape("Index(['a'], dtype=")

with pytest.raises(KeyError, match=msg):
df.duplicated(subset)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/frame/methods/test_equals.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ def test_dataframe_not_equal(self):
df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]})
assert df1.equals(df2) is False

def test_equals_different_blocks(self, using_array_manager):
def test_equals_different_blocks(self, using_array_manager, using_infer_string):
# GH#9330
df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]})
df1 = df0.reset_index()[["A", "B", "C"]]
if not using_array_manager:
if not using_array_manager and not using_infer_string:
# this assert verifies that the above operations have
# induced a block rearrangement
assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype
Expand Down
Loading

0 comments on commit ce4169a

Please sign in to comment.