From ce4169ac51d20786864157912072b68ae331dc52 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 9 Dec 2023 20:16:53 +0100 Subject: [PATCH] Fix new string dtype tests for frame folder (#55409) * Start fixing string tests * BUG: interpolate raising wrong error for ea * Fix more tests * REGR: join segfaulting for arrow string with nulls * Fix more tests * Fix more tests * BUG: rank raising for arrow string dtypes * BUG: eq not implemented for categorical and arrow backed strings * More tests * BUG: ndim of string block incorrect with string inference * Fix test * Fix tests * Fix tests * Fix more indexing tests * BUG: Index.insert raising when inserting None into new string dtype * Fix tests * BUG: Inserting ndim=0 array does not infer string dtype * Fix tests * Fix tests * Fix more tests * Fix more tests * BUG: idxmax raising for arrow strings * Fix * Fix more tests * Fix more tests * Fix more tests * Fix remaining tests * Fix remaining tests * Change default * BUG: Groupby not keeping string dtype for empty objects * Start fixing gb tests * Fix tests * Merge main * Update config_init.py * Fixup * Update --- .../frame/constructors/test_from_dict.py | 5 + .../frame/constructors/test_from_records.py | 5 + pandas/tests/frame/indexing/test_getitem.py | 2 +- pandas/tests/frame/indexing/test_indexing.py | 35 ++++-- pandas/tests/frame/indexing/test_set_value.py | 8 +- pandas/tests/frame/indexing/test_setitem.py | 2 +- pandas/tests/frame/indexing/test_where.py | 6 +- pandas/tests/frame/methods/test_align.py | 4 +- pandas/tests/frame/methods/test_astype.py | 22 ++-- .../tests/frame/methods/test_combine_first.py | 6 +- .../frame/methods/test_convert_dtypes.py | 6 +- pandas/tests/frame/methods/test_cov_corr.py | 12 +- pandas/tests/frame/methods/test_drop.py | 2 +- .../frame/methods/test_drop_duplicates.py | 2 +- pandas/tests/frame/methods/test_dtypes.py | 7 +- pandas/tests/frame/methods/test_duplicated.py | 2 +- 
pandas/tests/frame/methods/test_equals.py | 4 +- pandas/tests/frame/methods/test_explode.py | 2 +- pandas/tests/frame/methods/test_fillna.py | 26 +++- .../frame/methods/test_get_numeric_data.py | 6 +- .../tests/frame/methods/test_interpolate.py | 13 +- .../methods/test_is_homogeneous_dtype.py | 3 +- pandas/tests/frame/methods/test_nlargest.py | 2 +- pandas/tests/frame/methods/test_rank.py | 14 ++- pandas/tests/frame/methods/test_reindex.py | 6 +- pandas/tests/frame/methods/test_replace.py | 114 +++++++++++++++--- .../tests/frame/methods/test_reset_index.py | 12 +- .../tests/frame/methods/test_select_dtypes.py | 12 +- pandas/tests/frame/methods/test_to_csv.py | 7 +- pandas/tests/frame/methods/test_update.py | 14 ++- pandas/tests/frame/test_api.py | 2 + pandas/tests/frame/test_arithmetic.py | 16 ++- pandas/tests/frame/test_block_internals.py | 4 +- pandas/tests/frame/test_constructors.py | 39 +++--- pandas/tests/frame/test_logical_ops.py | 12 +- pandas/tests/frame/test_query_eval.py | 19 ++- pandas/tests/frame/test_reductions.py | 64 +++++++--- pandas/tests/frame/test_repr.py | 3 + pandas/tests/frame/test_stack_unstack.py | 23 ++-- pandas/tests/frame/test_unary.py | 20 ++- pandas/tests/groupby/test_apply.py | 29 +++-- pandas/tests/groupby/test_categorical.py | 5 +- pandas/tests/groupby/test_groupby.py | 35 ++++-- pandas/tests/groupby/test_numeric_only.py | 13 +- pandas/tests/groupby/test_raises.py | 2 +- 45 files changed, 477 insertions(+), 170 deletions(-) diff --git a/pandas/tests/frame/constructors/test_from_dict.py b/pandas/tests/frame/constructors/test_from_dict.py index 845174bbf600e..60a8e688b3b8a 100644 --- a/pandas/tests/frame/constructors/test_from_dict.py +++ b/pandas/tests/frame/constructors/test_from_dict.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas import ( DataFrame, Index, @@ -42,6 +44,9 @@ def test_constructor_single_row(self): ) tm.assert_frame_equal(result, expected) + 
@pytest.mark.skipif( + using_pyarrow_string_dtype(), reason="columns inferring logic broken" + ) def test_constructor_list_of_series(self): data = [ OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]), diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index edb21fb92f6a2..3622571f1365d 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -6,6 +6,8 @@ import pytest import pytz +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import is_platform_little_endian from pandas import ( @@ -56,6 +58,9 @@ def test_from_records_with_datetimes(self): expected["EXPIRY"] = expected["EXPIRY"].astype("M8[s]") tm.assert_frame_equal(result, expected) + @pytest.mark.skipif( + using_pyarrow_string_dtype(), reason="dtype checking logic doesn't work" + ) def test_from_records_sequencelike(self): df = DataFrame( { diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 8502f98df5962..a36b0c0e850b3 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -103,7 +103,7 @@ def test_getitem_list_duplicates(self): def test_getitem_dupe_cols(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) - msg = "\"None of [Index(['baf'], dtype='object')] are in the [columns]\"" + msg = "\"None of [Index(['baf'], dtype=" with pytest.raises(KeyError, match=re.escape(msg)): df[["baf"]] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 40c6b8e180c5b..4be5be77b015c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -288,7 +288,9 @@ def test_setattr_column(self): df.foobar = 5 assert (df.foobar == 5).all() - def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write): + def 
test_setitem( + self, float_frame, using_copy_on_write, warn_copy_on_write, using_infer_string + ): # not sure what else to do here series = float_frame["A"][::2] float_frame["col5"] = series @@ -331,7 +333,10 @@ def test_setitem(self, float_frame, using_copy_on_write, warn_copy_on_write): with pytest.raises(SettingWithCopyError, match=msg): smaller["col10"] = ["1", "2"] - assert smaller["col10"].dtype == np.object_ + if using_infer_string: + assert smaller["col10"].dtype == "string" + else: + assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() def test_setitem2(self): @@ -426,7 +431,7 @@ def test_setitem_cast(self, float_frame): float_frame["something"] = 2.5 assert float_frame["something"].dtype == np.float64 - def test_setitem_corner(self, float_frame): + def test_setitem_corner(self, float_frame, using_infer_string): # corner case df = DataFrame({"B": [1.0, 2.0, 3.0], "C": ["a", "b", "c"]}, index=np.arange(3)) del df["B"] @@ -463,10 +468,16 @@ def test_setitem_corner(self, float_frame): dm["foo"] = "bar" del dm["foo"] dm["foo"] = "bar" - assert dm["foo"].dtype == np.object_ + if using_infer_string: + assert dm["foo"].dtype == "string" + else: + assert dm["foo"].dtype == np.object_ dm["coercible"] = ["1", "2", "3"] - assert dm["coercible"].dtype == np.object_ + if using_infer_string: + assert dm["coercible"].dtype == "string" + else: + assert dm["coercible"].dtype == np.object_ def test_setitem_corner2(self): data = { @@ -483,7 +494,7 @@ def test_setitem_corner2(self): assert df.loc[1, "title"] == "foobar" assert df.loc[1, "cruft"] == 0 - def test_setitem_ambig(self): + def test_setitem_ambig(self, using_infer_string): # Difficulties with mixed-type data # Created as float type dm = DataFrame(index=range(3), columns=range(3)) @@ -499,18 +510,22 @@ def test_setitem_ambig(self): dm[2] = uncoercable_series assert len(dm.columns) == 3 - assert dm[2].dtype == np.object_ + if using_infer_string: + assert dm[2].dtype == "string" + 
else: + assert dm[2].dtype == np.object_ - def test_setitem_None(self, float_frame): + def test_setitem_None(self, float_frame, using_infer_string): # GH #766 float_frame[None] = float_frame["A"] + key = None if not using_infer_string else np.nan tm.assert_series_equal( float_frame.iloc[:, -1], float_frame["A"], check_names=False ) tm.assert_series_equal( - float_frame.loc[:, None], float_frame["A"], check_names=False + float_frame.loc[:, key], float_frame["A"], check_names=False ) - tm.assert_series_equal(float_frame[None], float_frame["A"], check_names=False) + tm.assert_series_equal(float_frame[key], float_frame["A"], check_names=False) def test_loc_setitem_boolean_mask_allfalse(self): # GH 9596 diff --git a/pandas/tests/frame/indexing/test_set_value.py b/pandas/tests/frame/indexing/test_set_value.py index 32312868adacb..1e3c793c8449f 100644 --- a/pandas/tests/frame/indexing/test_set_value.py +++ b/pandas/tests/frame/indexing/test_set_value.py @@ -16,7 +16,7 @@ def test_set_value(self, float_frame): float_frame._set_value(idx, col, 1) assert float_frame[col][idx] == 1 - def test_set_value_resize(self, float_frame): + def test_set_value_resize(self, float_frame, using_infer_string): res = float_frame._set_value("foobar", "B", 0) assert res is None assert float_frame.index[-1] == "foobar" @@ -27,8 +27,10 @@ def test_set_value_resize(self, float_frame): res = float_frame.copy() res._set_value("foobar", "baz", "sam") - assert res["baz"].dtype == np.object_ - + if using_infer_string: + assert res["baz"].dtype == "string" + else: + assert res["baz"].dtype == np.object_ res = float_frame.copy() with tm.assert_produces_warning( FutureWarning, match="Setting an item of incompatible dtype" diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index c0ba2f245efed..d0caaa3756170 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1319,7 +1319,7 @@ def 
test_setitem_column_frame_as_category(self): df["col2"] = Series([1, 2, 3], dtype="category") expected_types = Series( - ["int64", "category", "category"], index=[0, "col1", "col2"] + ["int64", "category", "category"], index=[0, "col1", "col2"], dtype=object ) tm.assert_series_equal(df.dtypes, expected_types) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 103ec67951a01..3d36d0471f02f 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -1077,9 +1077,13 @@ def test_where_producing_ea_cond_for_np_dtype(): @pytest.mark.parametrize( "replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)] ) -def test_where_int_overflow(replacement): +def test_where_int_overflow(replacement, using_infer_string, request): # GH 31687 df = DataFrame([[1.0, 2e25, "nine"], [np.nan, 0.1, None]]) + if using_infer_string and replacement not in (None, "snake"): + request.node.add_marker( + pytest.mark.xfail(reason="Can't set non-string into string column") + ) result = df.where(pd.notnull(df), replacement) expected = DataFrame([[1.0, 2e25, "nine"], [replacement, 0.1, replacement]]) diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 312d6f6d37dde..5a9c47866dae8 100644 --- a/pandas/tests/frame/methods/test_align.py +++ b/pandas/tests/frame/methods/test_align.py @@ -107,7 +107,7 @@ def test_align_float(self, float_frame, using_copy_on_write): af, bf = float_frame.align( other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=None ) - tm.assert_index_equal(bf.index, Index([])) + tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype)) msg = ( "The 'method', 'limit', and 'fill_axis' keywords in DataFrame.align " @@ -117,7 +117,7 @@ def test_align_float(self, float_frame, using_copy_on_write): af, bf = float_frame.align( other.iloc[:, 0], join="inner", axis=1, method=None, fill_value=0 ) - 
tm.assert_index_equal(bf.index, Index([])) + tm.assert_index_equal(bf.index, Index([]).astype(bf.index.dtype)) # Try to align DataFrame to Series along bad axis msg = "No axis named 2 for object type DataFrame" diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 2578dfb622fbf..5a1e3cd786f84 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -166,7 +166,8 @@ def test_astype_str(self): "c": [Timedelta(x)._repr_base() for x in c._values], "d": list(map(str, d._values)), "e": list(map(str, e._values)), - } + }, + dtype="object", ) tm.assert_frame_equal(result, expected) @@ -174,13 +175,13 @@ def test_astype_str(self): def test_astype_str_float(self): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"]) + expected = DataFrame(["nan"], dtype="object") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = "1.1234567890123457" - expected = DataFrame([val]) + expected = DataFrame([val], dtype="object") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) @@ -199,7 +200,7 @@ def test_astype_dict_like(self, dtype_class): expected = DataFrame( { "a": a, - "b": Series(["0", "1", "2", "3", "4"]), + "b": Series(["0", "1", "2", "3", "4"], dtype="object"), "c": c, "d": Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype="float32"), } @@ -282,7 +283,7 @@ def test_astype_duplicate_col_series_arg(self): result = df.astype(dtypes) expected = DataFrame( { - 0: vals[:, 0].astype(str), + 0: Series(vals[:, 0].astype(str), dtype=object), 1: vals[:, 1], 2: pd.array(vals[:, 2], dtype="Float64"), 3: vals[:, 3], @@ -620,6 +621,7 @@ def test_astype_arg_for_errors_dictlist(self): {"a": 2.2, "b": "15.3", "c": "another_test"}, ] ) + expected["c"] = expected["c"].astype("object") type_dict = {"a": "float64", "b": "float64", "c": "object"} result = df.astype(dtype=type_dict, 
errors="ignore") @@ -680,6 +682,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): ], ], columns=timezone_frame.columns, + dtype="object", ) tm.assert_frame_equal(result, expected) @@ -754,7 +757,9 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) - def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture): + def test_astype_dt64_to_string( + self, frame_or_series, tz_naive_fixture, using_infer_string + ): # GH#41409 tz = tz_naive_fixture @@ -772,7 +777,10 @@ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture): item = result.iloc[0] if frame_or_series is DataFrame: item = item.iloc[0] - assert item is pd.NA + if using_infer_string: + assert item is np.nan + else: + assert item is pd.NA # For non-NA values, we should match what we get for non-EA str alt = obj.astype(str) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 0335279b3a123..941e4c03464ea 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -30,7 +30,7 @@ def test_combine_first_mixed(self): combined = f.combine_first(g) tm.assert_frame_equal(combined, exp) - def test_combine_first(self, float_frame): + def test_combine_first(self, float_frame, using_infer_string): # disjoint head, tail = float_frame[:5], float_frame[5:] @@ -76,7 +76,9 @@ def test_combine_first(self, float_frame): tm.assert_series_equal(combined["A"].reindex(g.index), g["A"]) # corner cases - comb = float_frame.combine_first(DataFrame()) + warning = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warning, match="empty entries"): + comb = float_frame.combine_first(DataFrame()) tm.assert_frame_equal(comb, float_frame) comb = DataFrame().combine_first(float_frame) diff --git a/pandas/tests/frame/methods/test_convert_dtypes.py 
b/pandas/tests/frame/methods/test_convert_dtypes.py index 4c371afcc4e00..a181a271181ca 100644 --- a/pandas/tests/frame/methods/test_convert_dtypes.py +++ b/pandas/tests/frame/methods/test_convert_dtypes.py @@ -11,9 +11,13 @@ class TestConvertDtypes: @pytest.mark.parametrize( "convert_integer, expected", [(False, np.dtype("int32")), (True, "Int32")] ) - def test_convert_dtypes(self, convert_integer, expected, string_storage): + def test_convert_dtypes( + self, convert_integer, expected, string_storage, using_infer_string + ): # Specific types are tested in tests/series/test_dtypes.py # Just check that it works for DataFrame here + if using_infer_string: + string_storage = "pyarrow_numpy" df = pd.DataFrame( { "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")), diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 108816697ef3e..04a08c8b9bc52 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -326,7 +326,7 @@ def test_corrwith(self, datetime_frame, dtype): for row in index[:4]: tm.assert_almost_equal(correls[row], df1.loc[row].corr(df2.loc[row])) - def test_corrwith_with_objects(self): + def test_corrwith_with_objects(self, using_infer_string): df1 = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -338,8 +338,14 @@ def test_corrwith_with_objects(self): df1["obj"] = "foo" df2["obj"] = "bar" - with pytest.raises(TypeError, match="Could not convert"): - df1.corrwith(df2) + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="has no kernel"): + df1.corrwith(df2) + else: + with pytest.raises(TypeError, match="Could not convert"): + df1.corrwith(df2) result = df1.corrwith(df2, numeric_only=True) expected = df1.loc[:, cols].corrwith(df2.loc[:, cols]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_drop.py 
b/pandas/tests/frame/methods/test_drop.py index f72c0594fa1f7..06cd51b43a0aa 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -510,7 +510,7 @@ def test_drop_with_duplicate_columns2(self): def test_drop_inplace_no_leftover_column_reference(self): # GH 13934 - df = DataFrame({"a": [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}, columns=Index(["a"], dtype="object")) a = df.a df.drop(["a"], axis=1, inplace=True) tm.assert_index_equal(df.columns, Index([], dtype="object")) diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index df12139258a6d..6bea97b2cf189 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -16,7 +16,7 @@ def test_drop_duplicates_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) - msg = re.escape("Index(['a'], dtype='object')") + msg = re.escape("Index(['a'], dtype=") with pytest.raises(KeyError, match=msg): df.drop_duplicates(subset) diff --git a/pandas/tests/frame/methods/test_dtypes.py b/pandas/tests/frame/methods/test_dtypes.py index 4bdf16977dae6..ab632ac17318e 100644 --- a/pandas/tests/frame/methods/test_dtypes.py +++ b/pandas/tests/frame/methods/test_dtypes.py @@ -142,9 +142,12 @@ def test_dtypes_timedeltas(self): ) tm.assert_series_equal(result, expected) - def test_frame_apply_np_array_return_type(self): + def test_frame_apply_np_array_return_type(self, using_infer_string): # GH 35517 df = DataFrame([["foo"]]) result = df.apply(lambda col: np.array("bar")) - expected = Series(["bar"]) + if using_infer_string: + expected = Series([np.array(["bar"])]) + else: + expected = Series(["bar"]) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_duplicated.py b/pandas/tests/frame/methods/test_duplicated.py index 788aede805110..6052b61ea8db5 100644 --- 
a/pandas/tests/frame/methods/test_duplicated.py +++ b/pandas/tests/frame/methods/test_duplicated.py @@ -16,7 +16,7 @@ def test_duplicated_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) - msg = re.escape("Index(['a'], dtype='object')") + msg = re.escape("Index(['a'], dtype=") with pytest.raises(KeyError, match=msg): df.duplicated(subset) diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py index 6fcf670f96ef0..d0b9d96cafa0d 100644 --- a/pandas/tests/frame/methods/test_equals.py +++ b/pandas/tests/frame/methods/test_equals.py @@ -14,11 +14,11 @@ def test_dataframe_not_equal(self): df2 = DataFrame({"a": ["s", "d"], "b": [1, 2]}) assert df1.equals(df2) is False - def test_equals_different_blocks(self, using_array_manager): + def test_equals_different_blocks(self, using_array_manager, using_infer_string): # GH#9330 df0 = DataFrame({"A": ["x", "y"], "B": [1, 2], "C": ["w", "z"]}) df1 = df0.reset_index()[["A", "B", "C"]] - if not using_array_manager: + if not using_array_manager and not using_infer_string: # this assert verifies that the above operations have # induced a block rearrangement assert df0._mgr.blocks[0].dtype != df1._mgr.blocks[0].dtype diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index d1e4a603c5710..5cd54db62d783 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -203,7 +203,7 @@ def test_usecase(): ) def test_duplicate_index(input_dict, input_index, expected_dict, expected_index): # GH 28005 - df = pd.DataFrame(input_dict, index=input_index) + df = pd.DataFrame(input_dict, index=input_index, dtype=object) result = df.explode("col1") expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_fillna.py 
b/pandas/tests/frame/methods/test_fillna.py index 960f05a6457a4..1403a45a5cccd 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -89,6 +91,7 @@ def test_fillna_datetime(self, datetime_frame): with pytest.raises(ValueError, match=msg): datetime_frame.fillna(5, method="ffill") + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") def test_fillna_mixed_type(self, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan @@ -122,19 +125,27 @@ def test_fillna_empty(self, using_copy_on_write): df.x.fillna(method=m, inplace=True) df.x.fillna(method=m) - def test_fillna_different_dtype(self): + def test_fillna_different_dtype(self, using_infer_string): # with different dtype (GH#3386) df = DataFrame( [["a", "a", np.nan, "a"], ["b", "b", np.nan, "b"], ["c", "c", np.nan, "c"]] ) - result = df.fillna({2: "foo"}) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.fillna({2: "foo"}) + else: + result = df.fillna({2: "foo"}) expected = DataFrame( [["a", "a", "foo", "a"], ["b", "b", "foo", "b"], ["c", "c", "foo", "c"]] ) tm.assert_frame_equal(result, expected) - return_value = df.fillna({2: "foo"}, inplace=True) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + return_value = df.fillna({2: "foo"}, inplace=True) + else: + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) assert return_value is None @@ -358,7 +369,7 @@ def test_fillna_dictlike_value_duplicate_colnames(self, columns): expected["A"] = 0.0 tm.assert_frame_equal(result, expected) - def test_fillna_dtype_conversion(self): + def test_fillna_dtype_conversion(self, using_infer_string): # make sure that 
fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) result = df.dtypes @@ -373,7 +384,11 @@ def test_fillna_dtype_conversion(self): # empty block df = DataFrame(index=range(3), columns=["A", "B"], dtype="float64") - result = df.fillna("nan") + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.fillna("nan") + else: + result = df.fillna("nan") expected = DataFrame("nan", index=range(3), columns=["A", "B"]) tm.assert_frame_equal(result, expected) @@ -649,6 +664,7 @@ def test_fillna_col_reordering(self): filled = df.fillna(method="ffill") assert df.columns.tolist() == filled.columns.tolist() + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't fill 0 in string") def test_fill_corner(self, float_frame, float_string_frame): mf = float_string_frame mf.loc[mf.index[5:20], "foo"] = np.nan diff --git a/pandas/tests/frame/methods/test_get_numeric_data.py b/pandas/tests/frame/methods/test_get_numeric_data.py index ec1c768603a59..c5d32d56d03c1 100644 --- a/pandas/tests/frame/methods/test_get_numeric_data.py +++ b/pandas/tests/frame/methods/test_get_numeric_data.py @@ -15,12 +15,12 @@ class TestGetNumericData: def test_get_numeric_data_preserve_dtype(self): # get the numeric data - obj = DataFrame({"A": [1, "2", 3.0]}) + obj = DataFrame({"A": [1, "2", 3.0]}, columns=Index(["A"], dtype="object")) result = obj._get_numeric_data() expected = DataFrame(dtype=object, index=pd.RangeIndex(3), columns=[]) tm.assert_frame_equal(result, expected) - def test_get_numeric_data(self): + def test_get_numeric_data(self, using_infer_string): datetime64name = np.dtype("M8[s]").name objectname = np.dtype(np.object_).name @@ -33,7 +33,7 @@ def test_get_numeric_data(self): [ np.dtype("float64"), np.dtype("int64"), - np.dtype(objectname), + np.dtype(objectname) if not using_infer_string else "string", np.dtype(datetime64name), ], index=["a", "b", "c", "f"], diff --git 
a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 5f37ed6d9e18a..e0641fcb65bd3 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.errors import ChainedAssignmentError import pandas.util._test_decorators as td @@ -67,6 +69,9 @@ def test_interpolate_inplace(self, frame_or_series, using_array_manager, request assert np.shares_memory(orig, obj.values) assert orig.squeeze()[1] == 1.5 + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + ) def test_interp_basic(self, using_copy_on_write): df = DataFrame( { @@ -108,7 +113,10 @@ def test_interp_basic(self, using_copy_on_write): assert np.shares_memory(df["C"]._values, cvalues) assert np.shares_memory(df["D"]._values, dvalues) - def test_interp_basic_with_non_range_index(self): + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="interpolate doesn't work for string" + ) + def test_interp_basic_with_non_range_index(self, using_infer_string): df = DataFrame( { "A": [1, 2, np.nan, 4], @@ -119,7 +127,8 @@ def test_interp_basic_with_non_range_index(self): ) msg = "DataFrame.interpolate with object dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): + warning = FutureWarning if not using_infer_string else None + with tm.assert_produces_warning(warning, match=msg): result = df.set_index("C").interpolate() expected = df.set_index("C") expected.loc[3, "A"] = 3 diff --git a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py index a5f285d31301b..1fe28cb8eb856 100644 --- a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py +++ b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py @@ -25,7 +25,8 @@ { "A": np.array([1, 2], dtype=object), "B": np.array(["a", "b"], 
dtype=object), - } + }, + dtype="object", ), True, ), diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 1196f8cd3886a..3ba893501914a 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -86,7 +86,7 @@ def test_nlargest_n(self, df_strings, nselect_method, n, order): df = df_strings if "b" in order: error_msg = ( - f"Column 'b' has dtype object, " + f"Column 'b' has dtype (object|string), " f"cannot use method '{nselect_method}' with this dtype" ) with pytest.raises(TypeError, match=error_msg): diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index b5b5e42691e59..8d7a0b373f5f8 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -13,6 +13,7 @@ from pandas import ( DataFrame, + Index, Series, ) import pandas._testing as tm @@ -469,21 +470,28 @@ def test_rank_inf_nans_na_option( ("top", False, [2.0, 3.0, 1.0, 4.0]), ], ) - def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): + def test_rank_object_first( + self, frame_or_series, na_option, ascending, expected, using_infer_string + ): obj = frame_or_series(["foo", "foo", None, "foo"]) result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) + if using_infer_string and isinstance(obj, Series): + expected = expected.astype("uint64") tm.assert_equal(result, expected) @pytest.mark.parametrize( "data,expected", [ - ({"a": [1, 2, "a"], "b": [4, 5, 6]}, DataFrame({"b": [1.0, 2.0, 3.0]})), + ( + {"a": [1, 2, "a"], "b": [4, 5, 6]}, + DataFrame({"b": [1.0, 2.0, 3.0]}, columns=Index(["b"], dtype=object)), + ), ({"a": [1, 2, "a"]}, DataFrame(index=range(3), columns=[])), ], ) def test_rank_mixed_axis_zero(self, data, expected): - df = DataFrame(data) + df = DataFrame(data, columns=Index(list(data.keys()), dtype=object)) with 
pytest.raises(TypeError, match="'<' not supported between instances of"): df.rank() result = df.rank(numeric_only=True) diff --git a/pandas/tests/frame/methods/test_reindex.py b/pandas/tests/frame/methods/test_reindex.py index d0d971e29204a..d862e14ce86cb 100644 --- a/pandas/tests/frame/methods/test_reindex.py +++ b/pandas/tests/frame/methods/test_reindex.py @@ -120,7 +120,7 @@ def test_reindex_timestamp_with_fold(self, timezone, year, month, day, hour): exp = DataFrame({"index": ["1", "2"], "vals": [np.nan, np.nan]}).set_index( "index" ) - exp = exp.astype(object) + exp = exp.astype(df.vals.dtype) tm.assert_frame_equal( df, exp, @@ -840,8 +840,8 @@ def test_reindex_fill_value(self): # other dtypes df["foo"] = "foo" - result = df.reindex(range(15), fill_value=0) - expected = df.reindex(range(15)).fillna(0) + result = df.reindex(range(15), fill_value="0") + expected = df.reindex(range(15)).fillna("0") tm.assert_frame_equal(result, expected) def test_reindex_uint_dtypes_fill_value(self, any_unsigned_int_numpy_dtype): diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 53c45a5f4b5c6..8bfa98042eb07 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas as pd from pandas import ( DataFrame, @@ -28,6 +30,9 @@ def mix_abc() -> dict[str, list[float | str]]: class TestDataFrameReplace: + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan @@ -278,14 +283,25 @@ def test_regex_replace_dict_nested(self, mix_abc): tm.assert_frame_equal(res3, expec) tm.assert_frame_equal(res4, expec) - def 
test_regex_replace_dict_nested_non_first_character(self, any_string_dtype): + def test_regex_replace_dict_nested_non_first_character( + self, any_string_dtype, using_infer_string + ): # GH 25259 dtype = any_string_dtype df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype) - expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) - result = df.replace({"a": "."}, regex=True) + if using_infer_string and any_string_dtype == "object": + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}) + + else: + result = df.replace({"a": "."}, regex=True) + expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_dict_nested_gh4115(self): df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2}) expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2}) @@ -294,6 +310,9 @@ def test_regex_replace_dict_nested_gh4115(self): result = df.replace({"Type": {"Q": 0, "T": 1}}) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_list_to_scalar(self, mix_abc): df = DataFrame(mix_abc) expec = DataFrame( @@ -322,6 +341,9 @@ def test_regex_replace_list_to_scalar(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_str_to_numeric(self, mix_abc): # what happens when you try to replace a numeric value with a regex? 
df = DataFrame(mix_abc) @@ -337,6 +359,9 @@ def test_regex_replace_str_to_numeric(self, mix_abc): tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) @@ -415,12 +440,31 @@ def test_replace_regex_metachar(self, metachar): ], ) def test_regex_replace_string_types( - self, data, to_replace, expected, frame_or_series, any_string_dtype + self, + data, + to_replace, + expected, + frame_or_series, + any_string_dtype, + using_infer_string, + request, ): # GH-41333, GH-35977 dtype = any_string_dtype obj = frame_or_series(data, dtype=dtype) - result = obj.replace(to_replace, regex=True) + if using_infer_string and any_string_dtype == "object": + if len(to_replace) > 1 and isinstance(obj, DataFrame): + request.node.add_marker( + pytest.mark.xfail( + reason="object input array that gets downcasted raises on " + "second pass" + ) + ) + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = obj.replace(to_replace, regex=True) + dtype = "string[pyarrow_numpy]" + else: + result = obj.replace(to_replace, regex=True) expected = frame_or_series(expected, dtype=dtype) tm.assert_equal(result, expected) @@ -522,6 +566,9 @@ def test_replace_series_dict(self): result = df.replace(s, df.mean()) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_convert(self): # gh 3907 df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]]) @@ -533,6 +580,9 @@ def test_replace_convert(self): res = rep.dtypes tm.assert_series_equal(expec, res) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_mixed(self, float_string_frame): mf = float_string_frame mf.iloc[5:20, 
mf.columns.get_loc("foo")] = np.nan @@ -588,7 +638,7 @@ def test_replace_mixed_int_block_splitting(self): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) - def test_replace_mixed2(self): + def test_replace_mixed2(self, using_infer_string): # to object block upcasting df = DataFrame( { @@ -607,11 +657,15 @@ def test_replace_mixed2(self): expected = DataFrame( { - "A": Series(["foo", "bar"], dtype="object"), + "A": Series(["foo", "bar"]), "B": Series([0, "foo"], dtype="object"), } ) - result = df.replace([1, 2], ["foo", "bar"]) + if using_infer_string: + with tm.assert_produces_warning(FutureWarning, match="Downcasting"): + result = df.replace([1, 2], ["foo", "bar"]) + else: + result = df.replace([1, 2], ["foo", "bar"]) tm.assert_frame_equal(result, expected) def test_replace_mixed3(self): @@ -892,6 +946,9 @@ def test_replace_input_formats_listlike(self): with pytest.raises(ValueError, match=msg): df.replace(to_rep, values[1:]) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_input_formats_scalar(self): df = DataFrame( {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]} @@ -920,6 +977,9 @@ def test_replace_limit(self): # TODO pass + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_dict_no_regex(self): answer = Series( { @@ -943,6 +1003,9 @@ def test_replace_dict_no_regex(self): result = answer.replace(weights) tm.assert_series_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_series_no_regex(self): answer = Series( { @@ -1049,7 +1112,10 @@ def test_nested_dict_overlapping_keys_replace_str(self): expected = df.replace({"a": dict(zip(astr, bstr))}) tm.assert_frame_equal(result, expected) - def test_replace_swapping_bug(self): + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) + def 
test_replace_swapping_bug(self, using_infer_string): df = DataFrame({"a": [True, False, True]}) res = df.replace({"a": {True: "Y", False: "N"}}) expect = DataFrame({"a": ["Y", "N", "Y"]}) @@ -1060,6 +1126,9 @@ def test_replace_swapping_bug(self): expect = DataFrame({"a": ["Y", "N", "Y"]}) tm.assert_frame_equal(res, expect) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_period(self): d = { "fname": { @@ -1096,6 +1165,9 @@ def test_replace_period(self): result = df.replace(d) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) def test_replace_datetime(self): d = { "fname": { @@ -1321,6 +1393,9 @@ def test_replace_commutative(self, df, to_replace, exp): result = df.replace(to_replace) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) @pytest.mark.parametrize( "replacer", [ @@ -1491,10 +1566,12 @@ def test_replace_with_compiled_regex(self): expected = DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) - def test_replace_intervals(self): + def test_replace_intervals(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/35931 df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) - result = df.replace({"a": {pd.Interval(0, 1): "x"}}) + warning = FutureWarning if using_infer_string else None + with tm.assert_produces_warning(warning, match="Downcasting"): + result = df.replace({"a": {pd.Interval(0, 1): "x"}}) expected = DataFrame({"a": ["x", "x"]}) tm.assert_frame_equal(result, expected) @@ -1595,6 +1672,9 @@ def test_regex_replace_scalar( expected.loc[expected["a"] == ".", "a"] = expected_replace_val tm.assert_frame_equal(result, expected) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't set float into string" + ) @pytest.mark.parametrize("regex", [False, True]) def 
test_replace_regex_dtype_frame(self, regex): # GH-48644 @@ -1632,9 +1712,15 @@ def test_replace_categorical_no_replacement(self): result = df.replace(to_replace=[".", "def"], value=["_", None]) tm.assert_frame_equal(result, expected) - def test_replace_object_splitting(self): + def test_replace_object_splitting(self, using_infer_string): # GH#53977 df = DataFrame({"a": ["a"], "b": "b"}) - assert len(df._mgr.blocks) == 1 + if using_infer_string: + assert len(df._mgr.blocks) == 2 + else: + assert len(df._mgr.blocks) == 1 df.replace(to_replace=r"^\s*$", value="", inplace=True, regex=True) - assert len(df._mgr.blocks) == 1 + if using_infer_string: + assert len(df._mgr.blocks) == 2 + else: + assert len(df._mgr.blocks) == 1 diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 20f0dcc816408..fbf36dbc4fb02 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -654,10 +654,14 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes): ), ], ) -def test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype): +def test_reset_index_dtypes_on_empty_frame_with_multiindex( + array, dtype, using_infer_string +): # GH 19602 - Preserve dtype on empty DataFrame with MultiIndex idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = DataFrame(index=idx)[:0].reset_index().dtypes + if using_infer_string and dtype == object: + dtype = "string" expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) @@ -676,7 +680,9 @@ def test_reset_index_empty_frame_with_datetime64_multiindex(): tm.assert_frame_equal(result, expected) -def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): +def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby( + using_infer_string, +): # https://github.com/pandas-dev/pandas/issues/35657 dti = 
pd.DatetimeIndex(["2020-01-01"], dtype="M8[ns]") df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": dti}) @@ -687,6 +693,8 @@ def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): ) expected["c3"] = expected["c3"].astype("datetime64[ns]") expected["c1"] = expected["c1"].astype("float64") + if using_infer_string: + expected["c2"] = expected["c2"].astype("string[pyarrow_numpy]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index e2759c5d5b7b7..47c479faed1ef 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -282,7 +282,7 @@ def test_select_dtypes_duplicate_columns(self): result = df.select_dtypes(include=[np.number], exclude=["floating"]) tm.assert_frame_equal(result, expected) - def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): + def test_select_dtypes_not_an_attr_but_still_valid_dtype(self, using_infer_string): df = DataFrame( { "a": list("abc"), @@ -296,11 +296,17 @@ def test_select_dtypes_not_an_attr_but_still_valid_dtype(self): df["g"] = df.f.diff() assert not hasattr(np, "u8") r = df.select_dtypes(include=["i8", "O"], exclude=["timedelta"]) - e = df[["a", "b"]] + if using_infer_string: + e = df[["b"]] + else: + e = df[["a", "b"]] tm.assert_frame_equal(r, e) r = df.select_dtypes(include=["i8", "O", "timedelta64[ns]"]) - e = df[["a", "b", "g"]] + if using_infer_string: + e = df[["b", "g"]] + else: + e = df[["a", "b", "g"]] tm.assert_frame_equal(r, e) def test_select_dtypes_empty(self): diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 97fbe597d1dab..250567eafc670 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -682,7 +682,7 @@ def _make_frame(names=None): tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0 - def 
test_to_csv_interval_index(self): + def test_to_csv_interval_index(self, using_infer_string): # GH 28210 df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3)) @@ -692,7 +692,10 @@ def test_to_csv_interval_index(self): # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) expected = df.copy() - expected.index = expected.index.astype(str) + if using_infer_string: + expected.index = expected.index.astype("string[pyarrow_numpy]") + else: + expected.index = expected.index.astype(str) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index c79a37b5b30f0..7c7a0d23ff75f 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -166,11 +166,19 @@ def test_update_with_different_dtype(self, using_copy_on_write): with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"): df["c"].update(Series(["foo"], index=[0])) - expected = DataFrame({"a": [1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]}) + expected = DataFrame( + { + "a": [1, 3], + "b": [np.nan, 2], + "c": Series(["foo", np.nan], dtype="object"), + } + ) tm.assert_frame_equal(df, expected) @td.skip_array_manager_invalid_test - def test_update_modify_view(self, using_copy_on_write, warn_copy_on_write): + def test_update_modify_view( + self, using_copy_on_write, warn_copy_on_write, using_infer_string + ): # GH#47188 df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]}) @@ -181,7 +189,7 @@ def test_update_modify_view(self, using_copy_on_write, warn_copy_on_write): df2.update(df) expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]}) tm.assert_frame_equal(df2, expected) - if using_copy_on_write: + if using_copy_on_write or using_infer_string: tm.assert_frame_equal(result_view, df2_orig) else: tm.assert_frame_equal(result_view, expected) diff --git 
a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 2b392ddcfb44d..c7b444045a0f2 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype from pandas._config.config import option_context import pandas as pd @@ -112,6 +113,7 @@ def test_not_hashable(self): with pytest.raises(TypeError, match=msg): hash(empty_frame) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="surrogates not allowed") def test_column_name_contains_unicode_surrogate(self): # GH 25509 colname = "\ud83d" diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index a4825c80ee815..ec3222efab5a8 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,6 +11,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + import pandas.util._test_decorators as td import pandas as pd @@ -251,6 +253,9 @@ def test_timestamp_compare(self, left, right): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("nat"), df) + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="can't compare string and int" + ) def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, # not raise TypeError @@ -432,8 +437,8 @@ def test_bool_flex_frame_complex_dtype(self): def test_bool_flex_frame_object_dtype(self): # corner, dtype=object - df1 = DataFrame({"col": ["foo", np.nan, "bar"]}) - df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}) + df1 = DataFrame({"col": ["foo", np.nan, "bar"]}, dtype=object) + df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}, dtype=object) result = df1.ne(df2) exp = DataFrame({"col": [False, True, False]}) tm.assert_frame_equal(result, exp) @@ -1976,7 +1981,12 @@ def test_dataframe_blockwise_slicelike(): "df, col_dtype", [ (DataFrame([[1.0, 2.0], [4.0, 5.0]], 
columns=list("ab")), "float64"), - (DataFrame([[1.0, "b"], [4.0, "b"]], columns=list("ab")), "object"), + ( + DataFrame([[1.0, "b"], [4.0, "b"]], columns=list("ab")).astype( + {"b": object} + ), + "object", + ), ], ) def test_dataframe_operation_with_non_numeric_types(df, col_dtype): diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index b132f136e9741..712494ef15f97 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -183,7 +183,7 @@ def test_constructor_with_convert(self): ) tm.assert_series_equal(result, expected) - def test_construction_with_mixed(self, float_string_frame): + def test_construction_with_mixed(self, float_string_frame, using_infer_string): # test construction edge cases with mixed types # f7u12, this does not work without extensive workaround @@ -206,7 +206,7 @@ def test_construction_with_mixed(self, float_string_frame): expected = Series( [np.dtype("float64")] * 4 + [ - np.dtype("object"), + np.dtype("object") if not using_infer_string else "string", np.dtype("datetime64[us]"), np.dtype("timedelta64[us]"), ], diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c6fe3a154905c..e1abd0344e356 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -21,6 +21,8 @@ import pytest import pytz +from pandas._config import using_pyarrow_string_dtype + from pandas._libs import lib from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td @@ -79,7 +81,7 @@ def test_constructor_from_ndarray_with_str_dtype(self): # with an array of strings each of which is e.g. 
"[0 1 2]" arr = np.arange(12).reshape(4, 3) df = DataFrame(arr, dtype=str) - expected = DataFrame(arr.astype(str)) + expected = DataFrame(arr.astype(str), dtype=object) tm.assert_frame_equal(df, expected) def test_constructor_from_2d_datetimearray(self, using_array_manager): @@ -261,8 +263,9 @@ def test_emptylike_constructor(self, emptylike, expected_index, expected_columns result = DataFrame(emptylike) tm.assert_frame_equal(result, expected) - def test_constructor_mixed(self, float_string_frame): - assert float_string_frame["foo"].dtype == np.object_ + def test_constructor_mixed(self, float_string_frame, using_infer_string): + dtype = "string" if using_infer_string else np.object_ + assert float_string_frame["foo"].dtype == dtype def test_constructor_cast_failure(self): # as of 2.0, we raise if we can't respect "dtype", previously we @@ -323,6 +326,7 @@ def test_constructor_dtype_nocast_view_2d_array( assert df2._mgr.arrays[0].flags.c_contiguous @td.skip_array_manager_invalid_test + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") @@ -330,6 +334,7 @@ def test_1d_object_array_does_not_copy(self): assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="conversion copies") def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") @@ -773,7 +778,7 @@ def test_constructor_dict_block(self): ) tm.assert_numpy_array_equal(df.values, expected) - def test_constructor_dict_cast(self): + def test_constructor_dict_cast(self, using_infer_string): # cast float tests test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}} frame = DataFrame(test_data, dtype=float) @@ -783,7 +788,7 @@ def test_constructor_dict_cast(self): 
frame = DataFrame(test_data) assert len(frame) == 3 - assert frame["B"].dtype == np.object_ + assert frame["B"].dtype == np.object_ if not using_infer_string else "string" assert frame["A"].dtype == np.float64 def test_constructor_dict_cast2(self): @@ -1195,7 +1200,7 @@ def test_constructor_dtype_nullable_extension_arrays( df = DataFrame({"a": data}, dtype=input_dtype) assert df["a"].dtype == expected_dtype() - def test_constructor_scalar_inference(self): + def test_constructor_scalar_inference(self, using_infer_string): data = {"int": 1, "bool": True, "float": 3.0, "complex": 4j, "object": "foo"} df = DataFrame(data, index=np.arange(10)) @@ -1203,7 +1208,7 @@ def test_constructor_scalar_inference(self): assert df["bool"].dtype == np.bool_ assert df["float"].dtype == np.float64 assert df["complex"].dtype == np.complex128 - assert df["object"].dtype == np.object_ + assert df["object"].dtype == np.object_ if not using_infer_string else "string" def test_constructor_arrays_and_scalars(self): df = DataFrame({"a": np.random.default_rng(2).standard_normal(10), "b": True}) @@ -1282,11 +1287,11 @@ def empty_gen(): df = DataFrame(empty_gen(), columns=["A", "B"]) tm.assert_frame_equal(df, expected) - def test_constructor_list_of_lists(self): + def test_constructor_list_of_lists(self, using_infer_string): # GH #484 df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"]) assert is_integer_dtype(df["num"]) - assert df["str"].dtype == np.object_ + assert df["str"].dtype == np.object_ if not using_infer_string else "string" # GH 4851 # list of 0-dim ndarrays @@ -1835,7 +1840,7 @@ def test_constructor_single_value(self): with pytest.raises(TypeError, match=msg): DataFrame("a", [1, 2], ["a", "c"], float) - def test_constructor_with_datetimes(self): + def test_constructor_with_datetimes(self, using_infer_string): intname = np.dtype(int).name floatname = np.dtype(np.float64).name objectname = np.dtype(np.object_).name @@ -1854,7 +1859,7 @@ def 
test_constructor_with_datetimes(self): result = df.dtypes expected = Series( [np.dtype("int64")] - + [np.dtype(objectname)] * 2 + + [np.dtype(objectname) if not using_infer_string else "string"] * 2 + [np.dtype("M8[s]"), np.dtype("M8[us]")], index=list("ABCDE"), ) @@ -1876,7 +1881,7 @@ def test_constructor_with_datetimes(self): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object")] + + [np.dtype("object") if not using_infer_string else "string"] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1898,7 +1903,7 @@ def test_constructor_with_datetimes(self): expected = Series( [np.dtype("float64")] + [np.dtype("int64")] - + [np.dtype("object")] + + [np.dtype("object") if not using_infer_string else "string"] + [np.dtype("float64")] + [np.dtype(intname)], index=["a", "b", "c", floatname, intname], @@ -1935,13 +1940,13 @@ def test_constructor_with_datetimes3(self): df = DataFrame({"End Date": dt}, index=[0]) assert df.iat[0, 0] == dt tm.assert_series_equal( - df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}) + df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}, dtype=object) ) df = DataFrame([{"End Date": dt}]) assert df.iat[0, 0] == dt tm.assert_series_equal( - df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}) + df.dtypes, Series({"End Date": "datetime64[ns, US/Eastern]"}, dtype=object) ) def test_constructor_with_datetimes4(self): @@ -2066,7 +2071,7 @@ def test_constructor_timedelta_non_ns(self, order, unit): # dtype=exp_dtype. 
tm.assert_frame_equal(df, expected) - def test_constructor_for_list_with_dtypes(self): + def test_constructor_for_list_with_dtypes(self, using_infer_string): # test list of lists/ndarrays df = DataFrame([np.arange(5) for x in range(5)]) result = df.dtypes @@ -2117,7 +2122,7 @@ def test_constructor_for_list_with_dtypes(self): [ np.dtype("int64"), np.dtype("float64"), - np.dtype("object"), + np.dtype("object") if not using_infer_string else "string", np.dtype("datetime64[ns]"), np.dtype("float64"), ], diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py index a15d7d7f93f01..16ca3a202f1e0 100644 --- a/pandas/tests/frame/test_logical_ops.py +++ b/pandas/tests/frame/test_logical_ops.py @@ -96,7 +96,7 @@ def test_logical_ops_int_frame(self): res_ser = df1a_int["A"] | df1a_bool["A"] tm.assert_series_equal(res_ser, df1a_bool["A"]) - def test_logical_ops_invalid(self): + def test_logical_ops_invalid(self, using_infer_string): # GH#5808 df1 = DataFrame(1.0, index=[1], columns=["A"]) @@ -108,8 +108,14 @@ def test_logical_ops_invalid(self): df1 = DataFrame("foo", index=[1], columns=["A"]) df2 = DataFrame(True, index=[1], columns=["A"]) msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'") - with pytest.raises(TypeError, match=msg): - df1 | df2 + if using_infer_string: + import pyarrow as pa + + with pytest.raises(pa.lib.ArrowNotImplementedError, match="|has no kernel"): + df1 | df2 + else: + with pytest.raises(TypeError, match=msg): + df1 | df2 def test_logical_operators(self): def _check_bin_op(op): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 6353546648156..a498296e09c52 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1035,7 +1035,7 @@ def test_query_with_string_columns(self, parser, engine): with pytest.raises(NotImplementedError, match=msg): df.query("a in b and c < d", parser=parser, engine=engine) - def 
test_object_array_eq_ne(self, parser, engine): + def test_object_array_eq_ne(self, parser, engine, using_infer_string): df = DataFrame( { "a": list("aaaabbbbcccc"), @@ -1044,11 +1044,14 @@ def test_object_array_eq_ne(self, parser, engine): "d": np.random.default_rng(2).integers(9, size=12), } ) - res = df.query("a == b", parser=parser, engine=engine) + warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None + with tm.assert_produces_warning(warning): + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] tm.assert_frame_equal(res, exp) - res = df.query("a != b", parser=parser, engine=engine) + with tm.assert_produces_warning(warning): + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] tm.assert_frame_equal(res, exp) @@ -1087,12 +1090,16 @@ def test_query_with_nested_special_character(self, parser, engine): [">=", operator.ge], ], ) - def test_query_lex_compare_strings(self, parser, engine, op, func): + def test_query_lex_compare_strings( + self, parser, engine, op, func, using_infer_string + ): a = Series(np.random.default_rng(2).choice(list("abcde"), 20)) b = Series(np.arange(a.size)) df = DataFrame({"X": a, "Y": b}) - res = df.query(f'X {op} "d"', engine=engine, parser=parser) + warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None + with tm.assert_produces_warning(warning): + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) @@ -1166,7 +1173,7 @@ def test_bool_arith_expr(self, frame, parser, engine): @pytest.mark.parametrize("op", ["+", "-", "*", "/"]) def test_invalid_type_for_operator_raises(self, parser, engine, op): df = DataFrame({"a": [1, 2], "b": ["c", "d"]}) - msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'" + msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'|Cannot" with pytest.raises(TypeError, match=msg): df.eval(f"a {op} b", engine=engine, 
parser=parser) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 3b1a751a738f9..66145c32c18d7 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -6,6 +6,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.compat import ( IS64, is_platform_windows, @@ -243,11 +245,17 @@ class TestDataFrameAnalytics: pytest.param("kurt", marks=td.skip_if_no("scipy")), ], ) - def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): - if (opname in ("sum", "min", "max") and axis == 0) or opname in ( - "count", - "nunique", - ): + def test_stat_op_api_float_string_frame( + self, float_string_frame, axis, opname, using_infer_string + ): + if ( + (opname in ("sum", "min", "max") and axis == 0) + or opname + in ( + "count", + "nunique", + ) + ) and not (using_infer_string and opname == "sum"): getattr(float_string_frame, opname)(axis=axis) else: if opname in ["var", "std", "sem", "skew", "kurt"]: @@ -273,7 +281,11 @@ def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname): elif opname in ["min", "max"]: msg = "'[><]=' not supported between instances of 'float' and 'str'" elif opname == "median": - msg = re.compile(r"Cannot convert \[.*\] to numeric", flags=re.S) + msg = re.compile( + r"Cannot convert \[.*\] to numeric|does not support", flags=re.S + ) + if not isinstance(msg, re.Pattern): + msg = msg + "|does not support" with pytest.raises(TypeError, match=msg): getattr(float_string_frame, opname)(axis=axis) if opname != "nunique": @@ -434,6 +446,7 @@ def test_mixed_ops(self, op): "Could not convert", "could not convert", "can't multiply sequence by non-int", + "does not support", ] ) with pytest.raises(TypeError, match=msg): @@ -445,11 +458,15 @@ def test_mixed_ops(self, op): "Could not convert", "could not convert", "can't multiply sequence by non-int", + "does not support", ] ) with 
pytest.raises(TypeError, match=msg): getattr(df, op)() + @pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="sum doesn't work for arrow strings" + ) def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame( @@ -516,7 +533,9 @@ def test_mean_mixed_string_decimal(self): df = DataFrame(d) - with pytest.raises(TypeError, match="unsupported operand type"): + with pytest.raises( + TypeError, match="unsupported operand type|does not support" + ): df.mean() result = df[["A", "C"]].mean() expected = Series([2.7, 681.6], index=["A", "C"], dtype=object) @@ -652,7 +671,7 @@ def test_mode_dropna(self, dropna, expected): "A": [12, 12, 19, 11], "B": [10, 10, np.nan, 3], "C": [1, np.nan, np.nan, np.nan], - "D": [np.nan, np.nan, "a", np.nan], + "D": Series([np.nan, np.nan, "a", np.nan], dtype=object), "E": Categorical([np.nan, np.nan, "a", np.nan]), "F": DatetimeIndex(["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"), "G": to_timedelta(["1 days", "nan", "nan", "nan"]), @@ -672,14 +691,15 @@ def test_mode_dropna(self, dropna, expected): expected = DataFrame(expected) tm.assert_frame_equal(result, expected) - def test_mode_sortwarning(self): + def test_mode_sortwarning(self, using_infer_string): # Check for the warning that is raised when the mode # results cannot be sorted df = DataFrame({"A": [np.nan, np.nan, "a", "a"]}) expected = DataFrame({"A": ["a", np.nan]}) - with tm.assert_produces_warning(UserWarning): + warning = None if using_infer_string else UserWarning + with tm.assert_produces_warning(warning): result = df.mode(dropna=False) result = result.sort_values(by="A").reset_index(drop=True) @@ -969,7 +989,8 @@ def test_sum_mixed_datetime(self): def test_mean_corner(self, float_frame, float_string_frame): # unit test when have object data - with pytest.raises(TypeError, match="Could not convert"): + msg = "Could not convert|does not support" + with pytest.raises(TypeError, match=msg): float_string_frame.mean(axis=0) # xs sum mixed type, just want to know it 
works... @@ -1341,7 +1362,9 @@ def test_any_all_extra(self): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("bool_agg_func", ["any", "all"]) @pytest.mark.parametrize("skipna", [True, False]) - def test_any_all_object_dtype(self, axis, bool_agg_func, skipna): + def test_any_all_object_dtype( + self, axis, bool_agg_func, skipna, using_infer_string + ): # GH#35450 df = DataFrame( data=[ @@ -1351,8 +1374,13 @@ def test_any_all_object_dtype(self, axis, bool_agg_func, skipna): [np.nan, np.nan, "5", np.nan], ] ) + if using_infer_string: + # na in object is True while in string pyarrow numpy it's false + val = not axis == 0 and not skipna and bool_agg_func == "all" + else: + val = True result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna) - expected = Series([True, True, True, True]) + expected = Series([True, True, val, True]) tm.assert_series_equal(result, expected) # GH#50947 deprecates this but it is not emitting a warning in some builds. @@ -1378,7 +1406,8 @@ def test_any_datetime(self): def test_any_all_bool_only(self): # GH 25101 df = DataFrame( - {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]} + {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]}, + columns=Index(["col1", "col2", "col3"], dtype=object), ) result = df.all(bool_only=True) @@ -1931,6 +1960,9 @@ def test_sum_timedelta64_skipna_false(using_array_manager, request): tm.assert_series_equal(result, expected) +@pytest.mark.xfail( + using_pyarrow_string_dtype(), reason="sum doesn't work with arrow strings" +) def test_mixed_frame_with_integer_sum(): # https://github.com/pandas-dev/pandas/issues/34520 df = DataFrame([["a", 1]], columns=list("ab")) @@ -1951,7 +1983,7 @@ def test_minmax_extensionarray(method, numeric_only): expected = Series( [getattr(int64_info, method)], dtype="Int64", - index=Index(["Int64"], dtype="object"), + index=Index(["Int64"]), ) tm.assert_series_equal(result, expected) @@ -1969,7 +2001,7 @@ def 
test_prod_sum_min_count_mixed_object(): df = DataFrame([1, "a", True]) result = df.prod(axis=0, min_count=1, numeric_only=False) - expected = Series(["a"]) + expected = Series(["a"], dtype=object) tm.assert_series_equal(result, expected) msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'") diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py index 6184e791cab5d..776007fb9691d 100644 --- a/pandas/tests/frame/test_repr.py +++ b/pandas/tests/frame/test_repr.py @@ -7,6 +7,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas import ( NA, Categorical, @@ -174,6 +176,7 @@ def test_repr_mixed_big(self): repr(biggie) + @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="/r in") def test_repr(self): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py index 2e7e8eba270c0..554a9d4ce2d5d 100644 --- a/pandas/tests/frame/test_stack_unstack.py +++ b/pandas/tests/frame/test_stack_unstack.py @@ -619,7 +619,7 @@ def test_unstack_to_series(self, float_frame): data = data.unstack() tm.assert_frame_equal(old_data, data) - def test_unstack_dtypes(self): + def test_unstack_dtypes(self, using_infer_string): # GH 2929 rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]] @@ -655,8 +655,9 @@ def test_unstack_dtypes(self): df2["D"] = "foo" df3 = df2.unstack("B") result = df3.dtypes + dtype = "string" if using_infer_string else np.dtype("object") expected = Series( - [np.dtype("float64")] * 2 + [np.dtype("object")] * 2, + [np.dtype("float64")] * 2 + [dtype] * 2, index=MultiIndex.from_arrays( [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B") ), @@ -1359,14 +1360,16 @@ def test_unstack_fill_frame_object(): # By default missing values will be NaN result = data.unstack() expected = DataFrame( - {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, index=list("xyz") + {"a": 
["a", np.nan, "a"], "b": ["b", "c", np.nan]}, + index=list("xyz"), + dtype=object, ) tm.assert_frame_equal(result, expected) # Fill with any value replaces missing values as expected result = data.unstack(fill_value="d") expected = DataFrame( - {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz") + {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz"), dtype=object ) tm.assert_frame_equal(result, expected) @@ -2083,7 +2086,7 @@ def test_stack_multiple_bug(self, future_stack): multi = df.set_index(["DATE", "ID"]) multi.columns.name = "Params" unst = multi.unstack("ID") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): unst.resample("W-THU").mean() down = unst.resample("W-THU").mean(numeric_only=True) @@ -2298,7 +2301,7 @@ def test_stack_unstack_unordered_multiindex(self, future_stack): tm.assert_frame_equal(result, expected) def test_unstack_preserve_types( - self, multiindex_year_month_day_dataframe_random_data + self, multiindex_year_month_day_dataframe_random_data, using_infer_string ): # GH#403 ymd = multiindex_year_month_day_dataframe_random_data @@ -2307,7 +2310,11 @@ def test_unstack_preserve_types( unstacked = ymd.unstack("month") assert unstacked["A", 1].dtype == np.float64 - assert unstacked["E", 1].dtype == np.object_ + assert unstacked["E", 1].dtype == ( + np.object_ + if not using_infer_string + else "string" + ) assert unstacked["F", 1].dtype == np.float64 def test_unstack_group_index_overflow(self, future_stack): @@ -2367,7 +2374,7 @@ def test_unstack_with_missing_int_cast_to_float(self, using_array_manager): expected = DataFrame( [[10.0, 10.0, 1.0, 1.0], [np.nan, 10.0, 0.0, 1.0]], - index=Index(["A", "B"], dtype="object", name="a"), + index=Index(["A", "B"], name="a"), columns=MultiIndex.from_tuples( [("v", "ca"), ("v", "cb"), ("is_", "ca"), ("is_", "cb")], names=[None, "b"], diff --git
a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py index 5e29d3c868983..850c92013694f 100644 --- a/pandas/tests/frame/test_unary.py +++ b/pandas/tests/frame/test_unary.py @@ -48,15 +48,25 @@ def test_neg_object(self, df, expected): pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])}), ], ) - def test_neg_raises(self, df): + def test_neg_raises(self, df, using_infer_string): msg = ( "bad operand type for unary -: 'str'|" r"bad operand type for unary -: 'DatetimeArray'" ) - with pytest.raises(TypeError, match=msg): - (-df) - with pytest.raises(TypeError, match=msg): - (-df["a"]) + if using_infer_string and df.dtypes.iloc[0] == "string": + import pyarrow as pa + + msg = "has no kernel" + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + (-df) + with pytest.raises(pa.lib.ArrowNotImplementedError, match=msg): + (-df["a"]) + + else: + with pytest.raises(TypeError, match=msg): + (-df) + with pytest.raises(TypeError, match=msg): + (-df["a"]) def test_invert(self, float_frame): df = float_frame diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 60b386adb664a..34b6e7c4cde5f 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -37,7 +37,7 @@ def store(group): tm.assert_frame_equal(groups[0], expected_value) -def test_apply_index_date(): +def test_apply_index_date(using_infer_string): # GH 5788 ts = [ "2011-05-16 00:00", @@ -77,7 +77,7 @@ def test_apply_index_date(): tm.assert_frame_equal(result, expected) -def test_apply_index_date_object(): +def test_apply_index_date_object(using_infer_string): # GH 5789 # don't auto coerce dates ts = [ @@ -109,8 +109,9 @@ def test_apply_index_date_object(): 1.40750, 1.40649, ] + dtype = "string[pyarrow_numpy]" if using_infer_string else object exp_idx = Index( - ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=object, name="date" + ["2011-05-16", "2011-05-17", "2011-05-18"], dtype=dtype, name="date" ) expected = 
Series(["00:00", "02:00", "02:00"], index=exp_idx) msg = "DataFrameGroupBy.apply operated on the grouping columns" @@ -121,14 +122,15 @@ def test_apply_index_date_object(): tm.assert_series_equal(result, expected) -def test_apply_trivial(): +def test_apply_trivial(using_infer_string): # GH 20066 # trivial apply: ignore input and return a constant dataframe. df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", "object"]) + dtype = "string" if using_infer_string else "object" + expected = pd.concat([df.iloc[1:], df.iloc[1:]], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): @@ -138,13 +140,14 @@ def test_apply_trivial(): tm.assert_frame_equal(result, expected) -def test_apply_trivial_fail(): +def test_apply_trivial_fail(using_infer_string): # GH 20066 df = DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], ) - expected = pd.concat([df, df], axis=1, keys=["float64", "object"]) + dtype = "string" if using_infer_string else "object" + expected = pd.concat([df, df], axis=1, keys=["float64", dtype]) msg = "DataFrame.groupby with axis=1 is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): gb = df.groupby([str(x) for x in df.dtypes], axis=1, group_keys=True) @@ -941,7 +944,7 @@ def test_func_returns_object(): "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], ) -def test_apply_datetime_issue(group_column_dtlike): +def test_apply_datetime_issue(group_column_dtlike, using_infer_string): # GH-28247 # groupby-apply throws an error if one of the columns in the DataFrame # is a datetime object and the column labels are different from @@ -952,9 +955,8 @@ def test_apply_datetime_issue(group_column_dtlike): with 
tm.assert_produces_warning(FutureWarning, match=msg): result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42])) - expected = DataFrame( - ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] - ) + dtype = "string" if using_infer_string else "object" + expected = DataFrame(["spam"], Index(["foo"], dtype=dtype, name="a"), columns=[42]) tm.assert_frame_equal(result, expected) @@ -1021,7 +1023,7 @@ def test_apply_multi_level_name(category): assert df.index.names == ["A", "B"] -def test_groupby_apply_datetime_result_dtypes(): +def test_groupby_apply_datetime_result_dtypes(using_infer_string): # GH 14849 data = DataFrame.from_records( [ @@ -1035,8 +1037,9 @@ def test_groupby_apply_datetime_result_dtypes(): msg = "DataFrameGroupBy.apply operated on the grouping columns" with tm.assert_produces_warning(FutureWarning, match=msg): result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes + dtype = "string" if using_infer_string else object expected = Series( - [np.dtype("datetime64[ns]"), object, object, np.int64, object], + [np.dtype("datetime64[ns]"), dtype, dtype, np.int64, dtype], index=["observation", "color", "mood", "intensity", "score"], ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index d4ccbe4c1c376..7a91601bf688f 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -82,7 +82,7 @@ def get_stats(group): assert result.index.names[0] == "C" -def test_basic(): # TODO: split this test +def test_basic(using_infer_string): # TODO: split this test cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], @@ -129,7 +129,8 @@ def f(x): result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") - expected["person_name"] = expected["person_name"].astype("object") + dtype = "string[pyarrow_numpy]" if using_infer_string 
else object + expected["person_name"] = expected["person_name"].astype(dtype) tm.assert_frame_equal(result, expected) # GH 9921 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 62347ec1d3d6a..802cae9ff65f0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -11,6 +11,8 @@ ) import pandas.util._test_decorators as td +from pandas.core.dtypes.common import is_string_dtype + import pandas as pd from pandas import ( Categorical, @@ -687,7 +689,7 @@ def test_frame_multi_key_function_list_partial_failure(): grouped = data.groupby(["A", "B"]) funcs = ["mean", "std"] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg(funcs) @@ -980,7 +982,7 @@ def test_groupby_multi_corner(df): def test_raises_on_nuisance(df): grouped = df.groupby("A") - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ -1036,7 +1038,7 @@ def test_omit_nuisance_agg(df, agg_function, numeric_only): msg = "could not convert string to float: 'one'" else: klass = TypeError - msg = re.escape(f"agg function failed [how->{agg_function},dtype->object]") + msg = re.escape(f"agg function failed [how->{agg_function},dtype->") with pytest.raises(klass, match=msg): getattr(grouped, agg_function)(numeric_only=numeric_only) else: @@ -1061,7 +1063,7 @@ def test_raise_on_nuisance_python_single(df): def test_raise_on_nuisance_python_multiple(three_group): grouped = three_group.groupby(["A", "B"]) - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): grouped.agg("mean") with pytest.raises(TypeError, match=msg): @@ 
-1104,7 +1106,7 @@ def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data): df["baz", "two"] = "peekaboo" keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] - msg = re.escape("agg function failed [how->mean,dtype->object]") + msg = re.escape("agg function failed [how->mean,dtype->") with pytest.raises(TypeError, match=msg): df.groupby(keys).agg("mean") agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean") @@ -1193,7 +1195,7 @@ def test_groupby_complex(): tm.assert_series_equal(result, expected) -def test_groupby_complex_numbers(): +def test_groupby_complex_numbers(using_infer_string): # GH 17927 df = DataFrame( [ @@ -1202,10 +1204,11 @@ def test_groupby_complex_numbers(): {"a": 4, "b": 1}, ] ) + dtype = "string[pyarrow_numpy]" if using_infer_string else object expected = DataFrame( np.array([1, 1, 1], dtype=np.int64), index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"), - columns=Index(["a"], dtype="object"), + columns=Index(["a"], dtype=dtype), ) result = df.groupby("b", sort=False).count() tm.assert_frame_equal(result, expected) @@ -1720,14 +1723,18 @@ def g(group): @pytest.mark.parametrize("grouper", ["A", ["A", "B"]]) -def test_set_group_name(df, grouper): +def test_set_group_name(df, grouper, using_infer_string): def f(group): assert group.name is not None return group def freduce(group): assert group.name is not None - return group.sum() + if using_infer_string and grouper == "A" and is_string_dtype(group.dtype): + with pytest.raises(TypeError, match="does not support"): + group.sum() + else: + return group.sum() def freducex(x): return freduce(x) @@ -2024,7 +2031,9 @@ def test_pivot_table_values_key_error(): @pytest.mark.parametrize( "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"] ) -def test_empty_groupby(columns, keys, values, method, op, using_array_manager, dropna): +def test_empty_groupby( + columns, keys, values, method, op, using_array_manager, dropna, using_infer_string +): # GH8093 & GH26411 
override_dtype = None @@ -2065,7 +2074,11 @@ def get_categorical_invalid_expected(): # Categorical is special without 'observed=True' idx = Index(lev, name=keys[0]) - expected = DataFrame([], columns=[], index=idx) + if using_infer_string: + columns = Index([], dtype="string[pyarrow_numpy]") + else: + columns = [] + expected = DataFrame([], columns=columns, index=idx) return expected is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype) diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py index 0141adf44c86b..ff4685b1e412d 100644 --- a/pandas/tests/groupby/test_numeric_only.py +++ b/pandas/tests/groupby/test_numeric_only.py @@ -180,6 +180,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): [ "category type does not support sum operations", re.escape(f"agg function failed [how->{method},dtype->object]"), + re.escape(f"agg function failed [how->{method},dtype->string]"), ] ) with pytest.raises(exception, match=msg): @@ -196,6 +197,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): "function is not implemented for this dtype", f"Cannot perform {method} with non-ordered Categorical", re.escape(f"agg function failed [how->{method},dtype->object]"), + re.escape(f"agg function failed [how->{method},dtype->string]"), ] ) with pytest.raises(exception, match=msg): @@ -206,7 +208,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): @pytest.mark.parametrize("numeric_only", [True, False, None]) -def test_axis1_numeric_only(request, groupby_func, numeric_only): +def test_axis1_numeric_only(request, groupby_func, numeric_only, using_infer_string): if groupby_func in ("idxmax", "idxmin"): pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1") if groupby_func in ("corrwith", "skew"): @@ -268,8 +270,15 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only): "can't multiply sequence by non-int of type 'float'", # cumsum, diff, 
pct_change "unsupported operand type", + "has no kernel", ) - with pytest.raises(TypeError, match=f"({'|'.join(msgs)})"): + if using_infer_string: + import pyarrow as pa + + errs = (TypeError, pa.lib.ArrowNotImplementedError) + else: + errs = TypeError + with pytest.raises(errs, match=f"({'|'.join(msgs)})"): with tm.assert_produces_warning(FutureWarning, match=warn_msg): method(*args, **kwargs) else: diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 2800f08b5fd90..0b451ce73db89 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -189,7 +189,7 @@ def test_groupby_raises_string( "sum": (None, ""), "var": ( TypeError, - re.escape("agg function failed [how->var,dtype->object]"), + re.escape("agg function failed [how->var,dtype->"), ), }[groupby_func]