From 2be559acbc0a50695716df61c75f41a9f8891449 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 30 Dec 2024 08:54:00 -0500 Subject: [PATCH 1/3] TST(string dtype): Resolve HDF5 xfails in test_put.py --- pandas/io/pytables.py | 16 ++++++++--- pandas/tests/io/pytables/test_put.py | 43 +++++++++++++++++++--------- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b75dc6c3a43b4..8e70f50ce9934 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -86,6 +86,7 @@ PeriodArray, ) from pandas.core.arrays.datetimes import tz_to_dtype +from pandas.core.arrays.string_ import BaseStringArray import pandas.core.common as com from pandas.core.computation.pytables import ( PyTablesExpr, @@ -3185,6 +3186,8 @@ def write_array( # both self._filters and EA value = extract_array(obj, extract_numpy=True) + if isinstance(value, BaseStringArray): + value = value.to_numpy() if key in self.group: self._handle.remove_node(self.group, key) @@ -3363,7 +3366,11 @@ def read( columns = items[items.get_indexer(blk_items)] df = DataFrame(values.T, columns=columns, index=axes[1], copy=False) - if using_string_dtype() and is_string_array(values, skipna=True): + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) + ): df = df.astype(StringDtype(na_value=np.nan)) dfs.append(df) @@ -4737,9 +4744,10 @@ def read( df = DataFrame._from_arrays([values], columns=cols_, index=index_) if not (using_string_dtype() and values.dtype.kind == "O"): assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype) - if using_string_dtype() and is_string_array( - values, # type: ignore[arg-type] - skipna=True, + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and is_string_array(values, skipna=True) ): df = df.astype(StringDtype(na_value=np.nan)) frames.append(df) diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py index a4257b54dd6db..4b7548192196e 100644 --- a/pandas/tests/io/pytables/test_put.py +++ b/pandas/tests/io/pytables/test_put.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp import pandas as pd @@ -26,7 +24,6 @@ pytestmark = [ pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] @@ -99,7 +96,7 @@ def test_api_default_format(tmp_path, setup_path): assert store.get_storer("df4").is_table -def test_put(setup_path): +def test_put(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: ts = Series( np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10) @@ -133,7 +130,11 @@ def test_put(setup_path): # overwrite table store.put("c", df[:10], format="table", append=False) - tm.assert_frame_equal(df[:10], store["c"]) + expected = df[:10] + if using_infer_string: + expected.columns = expected.columns.astype("str") + result = store["c"] + tm.assert_frame_equal(result, expected) def test_put_string_index(setup_path): @@ -162,7 +163,7 @@ def test_put_string_index(setup_path): tm.assert_frame_equal(store["b"], df) -def test_put_compression(setup_path): +def test_put_compression(setup_path, using_infer_string): with ensure_clean_store(setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), @@ -171,7 +172,11 @@ def test_put_compression(setup_path): ) store.put("c", df, format="table", complib="zlib") - tm.assert_frame_equal(store["c"], df) + expected = df + if using_infer_string: + expected.columns = expected.columns.astype("str") + result = store["c"] + tm.assert_frame_equal(result, expected) # can't compress if format='fixed' msg = "Compression not supported on Fixed format stores" @@ -180,7 +185,7 @@ def test_put_compression(setup_path): @td.skip_if_windows -def test_put_compression_blosc(setup_path): +def test_put_compression_blosc(setup_path, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -194,10 +199,14 @@ def test_put_compression_blosc(setup_path): store.put("b", df, format="fixed", complib="blosc") store.put("c", df, format="table", complib="blosc") - tm.assert_frame_equal(store["c"], df) + expected = df + if using_infer_string: + expected.columns = expected.columns.astype("str") + result = store["c"] + tm.assert_frame_equal(result, expected) -def test_put_mixed_type(setup_path, performance_warning): +def test_put_mixed_type(setup_path, performance_warning, using_infer_string): df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -223,8 +232,11 @@ def test_put_mixed_type(setup_path, performance_warning): with tm.assert_produces_warning(performance_warning): store.put("df", df) - expected = store.get("df") - tm.assert_frame_equal(expected, df) + expected = df + if using_infer_string: + expected.columns = expected.columns.astype("str") + result = store.get("df") + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("format", ["table", "fixed"]) @@ -253,7 +265,7 @@ def test_store_index_types(setup_path, format, index): tm.assert_frame_equal(df, store["df"]) -def test_column_multiindex(setup_path): +def test_column_multiindex(setup_path, using_infer_string): # GH 4710 # recreate multi-indexes properly @@ -264,6 +276,11 @@ def test_column_multiindex(setup_path): expected = df.set_axis(df.index.to_numpy()) with ensure_clean_store(setup_path) as store: + if using_infer_string: + msg = "Saving a MultiIndex with an extension dtype is not supported." + with pytest.raises(NotImplementedError, match=msg): + store.put("df", df) + return store.put("df", df) tm.assert_frame_equal( store["df"], expected, check_index_type=True, check_column_type=True From ad6ed76425fa596b5ac8374c388322b2ed22a876 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 30 Dec 2024 09:19:16 -0500 Subject: [PATCH 2/3] TST(string dtype): Resolve HDF5 xfails in test_round_trip.py --- pandas/tests/io/pytables/test_round_trip.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 6b98a720e4299..cf9a866130fec 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp from pandas.compat import is_platform_windows @@ -28,11 +26,10 @@ pytestmark = [ pytest.mark.single_cpu, - pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False), ] -def test_conv_read_write(): +def test_conv_read_write(using_infer_string): with tm.ensure_clean() as path: def roundtrip(key, obj, **kwargs): @@ -52,13 +49,21 @@ def roundtrip(key, obj, **kwargs): columns=Index(list("ABCD"), dtype=object), index=Index([f"i-{i}" for i in range(30)], dtype=object), ) - tm.assert_frame_equal(o, roundtrip("frame", o)) + expected = o + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + result = roundtrip("frame", o) + tm.assert_frame_equal(result, expected) # table df = DataFrame({"A": range(5), "B": range(5)}) df.to_hdf(path, key="table", append=True) + expected = df[df.index > 2] + if using_infer_string: + expected.columns = expected.columns.astype("str") result = read_hdf(path, "table", where=["index>2"]) - tm.assert_frame_equal(df[df.index > 2], result) + tm.assert_frame_equal(result, expected) def test_long_strings(setup_path): From 7db8debeaef0b24cb59bb86d6bc223c2afc7816e Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Mon, 30 Dec 2024 10:00:45 -0500 Subject: [PATCH 3/3] TST(string dtype): Resolve HDF5 xfails in test_round_trip.py --- pandas/io/pytables.py | 10 +- pandas/tests/io/pytables/test_round_trip.py | 105 +++++++++++++++++--- 2 files changed, 99 insertions(+), 16 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8e70f50ce9934..47dc1740fa9c9 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1950,6 +1950,7 @@ def _write_to_group( def _read_group(self, group: Node): s = self._create_storer(group) s.infer_axes() + print(type(s), s) return s.read() def _identify_group(self, key: str, append: bool) -> Node: @@ -3297,7 +3298,12 @@ def read( index = self.read_index("index", start=start, stop=stop) values = self.read_array("values", start=start, stop=stop) result = Series(values, index=index, name=self.name, copy=False) - if using_string_dtype() and is_string_array(values, skipna=True): + if ( + using_string_dtype() + and isinstance(values, np.ndarray) + and len(values) > 0 + and is_string_array(values, skipna=True) + ): result = result.astype(StringDtype(na_value=np.nan)) return result @@ -3369,6 +3375,7 @@ def read( if ( using_string_dtype() and isinstance(values, np.ndarray) + and len(df) > 0 and is_string_array(values, skipna=True) ): df = df.astype(StringDtype(na_value=np.nan)) @@ -4747,6 +4754,7 @@ def read( if ( using_string_dtype() and isinstance(values, np.ndarray) + and len(df) > 0 and is_string_array(values, skipna=True) ): df = df.astype(StringDtype(na_value=np.nan)) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index cf9a866130fec..3217f275b24f6 100644 --- a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + from pandas._libs.tslibs import Timestamp from pandas.compat import is_platform_windows @@ -66,6 +68,7 @@ def roundtrip(key, obj, **kwargs): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_long_strings(setup_path): # GH6166 data = ["a" * 50] * 10 @@ -206,6 +209,7 @@ def test_put_integer(setup_path): _check_roundtrip(df, tm.assert_frame_equal, setup_path) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_table_values_dtypes_roundtrip(setup_path): with ensure_clean_store(setup_path) as store: df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8") @@ -375,7 +379,7 @@ def test_timeseries_preepoch(setup_path, request): @pytest.mark.parametrize( "compression", [False, pytest.param(True, marks=td.skip_if_windows)] ) -def test_frame(compression, setup_path): +def test_frame(compression, setup_path, using_infer_string): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), columns=Index(list("ABCD"), dtype=object), @@ -386,11 +390,24 @@ def test_frame(compression, setup_path): df.iloc[0, 0] = np.nan df.iloc[5, 3] = np.nan + expected = df.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + _check_roundtrip_table( - df, tm.assert_frame_equal, path=setup_path, compression=compression + df, + tm.assert_frame_equal, + path=setup_path, + compression=compression, + expected=expected, ) _check_roundtrip( - df, tm.assert_frame_equal, path=setup_path, compression=compression + df, + tm.assert_frame_equal, + path=setup_path, + compression=compression, + expected=expected, ) tdf = DataFrame( @@ -398,8 +415,15 @@ def test_frame(compression, setup_path): columns=Index(list("ABCD"), dtype=object), index=date_range("2000-01-01", periods=10, freq="B"), ) + expected = tdf.copy() + if using_infer_string: + expected.columns = expected.columns.astype("str") _check_roundtrip( - tdf, tm.assert_frame_equal, path=setup_path, compression=compression + tdf, + tm.assert_frame_equal, + path=setup_path, + compression=compression, + expected=expected, ) with ensure_clean_store(setup_path) as store: @@ -410,7 +434,10 @@ def test_frame(compression, setup_path): assert recons._mgr.is_consolidated() # empty - _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) + expected = df[:0] + if using_infer_string: + expected.columns = expected.columns.astype("str") + _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path, expected=expected) def test_empty_series_frame(setup_path): @@ -442,9 +469,21 @@ def test_can_serialize_dates(setup_path): _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) -def test_store_hierarchical(setup_path, multiindex_dataframe_random_data): +def test_store_hierarchical( + setup_path, multiindex_dataframe_random_data, using_infer_string +): frame = multiindex_dataframe_random_data + if using_infer_string: + msg = "Saving a MultiIndex with an extension dtype is not supported." + with pytest.raises(NotImplementedError, match=msg): + _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) + with pytest.raises(NotImplementedError, match=msg): + _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path) + with pytest.raises(NotImplementedError, match=msg): + _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path) + return + _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path) _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path) @@ -459,7 +498,7 @@ def test_store_hierarchical(setup_path, multiindex_dataframe_random_data): @pytest.mark.parametrize( "compression", [False, pytest.param(True, marks=td.skip_if_windows)] ) -def test_store_mixed(compression, setup_path): +def test_store_mixed(compression, setup_path, using_infer_string): def _make_one(): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -477,57 +516,91 @@ def _make_one(): df1 = _make_one() df2 = _make_one() - _check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) - _check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) + expected = df1.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + _check_roundtrip(df1, tm.assert_frame_equal, path=setup_path, expected=expected) + + expected = df2.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + _check_roundtrip(df2, tm.assert_frame_equal, path=setup_path, expected=expected) with ensure_clean_store(setup_path) as store: store["obj"] = df1 - tm.assert_frame_equal(store["obj"], df1) + expected = df1.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(store["obj"], expected) + store["obj"] = df2 - tm.assert_frame_equal(store["obj"], df2) + expected = df2.copy() + if using_infer_string: + expected.index = expected.index.astype("str") + expected.columns = expected.columns.astype("str") + tm.assert_frame_equal(store["obj"], expected) # check that can store Series of all of these types + expected = df1["obj1"] + if using_infer_string: + expected.index = expected.index.astype("str") _check_roundtrip( df1["obj1"], tm.assert_series_equal, path=setup_path, compression=compression, + expected=expected, ) + expected = df1["bool1"] + if using_infer_string: + expected.index = expected.index.astype("str") _check_roundtrip( df1["bool1"], tm.assert_series_equal, path=setup_path, compression=compression, + expected=expected, ) + expected = df1["int1"] + if using_infer_string: + expected.index = expected.index.astype("str") _check_roundtrip( df1["int1"], tm.assert_series_equal, path=setup_path, compression=compression, + expected=expected, ) -def _check_roundtrip(obj, comparator, path, compression=False, **kwargs): +def _check_roundtrip(obj, comparator, path, compression=False, expected=None, **kwargs): options = {} if compression: options["complib"] = "blosc" + if expected is None: + expected = obj with ensure_clean_store(path, "w", **options) as store: store["obj"] = obj retrieved = store["obj"] - comparator(retrieved, obj, **kwargs) + comparator(retrieved, expected, **kwargs) -def _check_roundtrip_table(obj, comparator, path, compression=False): +def _check_roundtrip_table(obj, comparator, path, compression=False, expected=None): options = {} if compression: options["complib"] = "blosc" + if expected is None: + expected = obj with ensure_clean_store(path, "w", **options) as store: store.put("obj", obj, format="table") retrieved = store["obj"] - comparator(retrieved, obj) + comparator(retrieved, expected) def test_unicode_index(setup_path): @@ -540,6 +613,7 @@ def test_unicode_index(setup_path): _check_roundtrip(s, tm.assert_series_equal, path=setup_path) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_unicode_longer_encoded(setup_path): # GH 11234 char = "\u0394" @@ -565,6 +639,7 @@ def test_store_datetime_mixed(setup_path): _check_roundtrip(df, tm.assert_frame_equal, path=setup_path) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_round_trip_equals(tmp_path, setup_path): # GH 9330 df = DataFrame({"B": [1, 2], "A": ["x", "y"]})