From 2be559acbc0a50695716df61c75f41a9f8891449 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Mon, 30 Dec 2024 08:54:00 -0500
Subject: [PATCH 1/3] TST(string dtype): Resolve HDF5 xfails in test_put.py

---
 pandas/io/pytables.py                | 16 ++++++++---
 pandas/tests/io/pytables/test_put.py | 43 +++++++++++++++++++---------
 2 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index b75dc6c3a43b4..8e70f50ce9934 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -86,6 +86,7 @@
     PeriodArray,
 )
 from pandas.core.arrays.datetimes import tz_to_dtype
+from pandas.core.arrays.string_ import BaseStringArray
 import pandas.core.common as com
 from pandas.core.computation.pytables import (
     PyTablesExpr,
@@ -3185,6 +3186,8 @@ def write_array(
         #  both self._filters and EA
 
         value = extract_array(obj, extract_numpy=True)
+        if isinstance(value, BaseStringArray):
+            value = value.to_numpy()
 
         if key in self.group:
             self._handle.remove_node(self.group, key)
@@ -3363,7 +3366,11 @@ def read(
 
             columns = items[items.get_indexer(blk_items)]
             df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
-            if using_string_dtype() and is_string_array(values, skipna=True):
+            if (
+                using_string_dtype()
+                and isinstance(values, np.ndarray)
+                and is_string_array(values, skipna=True)
+            ):
                 df = df.astype(StringDtype(na_value=np.nan))
             dfs.append(df)
 
@@ -4737,9 +4744,10 @@ def read(
                 df = DataFrame._from_arrays([values], columns=cols_, index=index_)
             if not (using_string_dtype() and values.dtype.kind == "O"):
                 assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
-            if using_string_dtype() and is_string_array(
-                values,  # type: ignore[arg-type]
-                skipna=True,
+            if (
+                using_string_dtype()
+                and isinstance(values, np.ndarray)
+                and is_string_array(values, skipna=True)
             ):
                 df = df.astype(StringDtype(na_value=np.nan))
             frames.append(df)
diff --git a/pandas/tests/io/pytables/test_put.py b/pandas/tests/io/pytables/test_put.py
index a4257b54dd6db..4b7548192196e 100644
--- a/pandas/tests/io/pytables/test_put.py
+++ b/pandas/tests/io/pytables/test_put.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import Timestamp
 
 import pandas as pd
@@ -26,7 +24,6 @@
 
 pytestmark = [
     pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
 ]
 
 
@@ -99,7 +96,7 @@ def test_api_default_format(tmp_path, setup_path):
             assert store.get_storer("df4").is_table
 
 
-def test_put(setup_path):
+def test_put(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         ts = Series(
             np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
@@ -133,7 +130,11 @@ def test_put(setup_path):
 
         # overwrite table
         store.put("c", df[:10], format="table", append=False)
-        tm.assert_frame_equal(df[:10], store["c"])
+        expected = df[:10]
+        if using_infer_string:
+            expected.columns = expected.columns.astype("str")
+        result = store["c"]
+        tm.assert_frame_equal(result, expected)
 
 
 def test_put_string_index(setup_path):
@@ -162,7 +163,7 @@ def test_put_string_index(setup_path):
         tm.assert_frame_equal(store["b"], df)
 
 
-def test_put_compression(setup_path):
+def test_put_compression(setup_path, using_infer_string):
     with ensure_clean_store(setup_path) as store:
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 4)),
@@ -171,7 +172,11 @@ def test_put_compression(setup_path):
         )
 
         store.put("c", df, format="table", complib="zlib")
-        tm.assert_frame_equal(store["c"], df)
+        expected = df
+        if using_infer_string:
+            expected.columns = expected.columns.astype("str")
+        result = store["c"]
+        tm.assert_frame_equal(result, expected)
 
         # can't compress if format='fixed'
         msg = "Compression not supported on Fixed format stores"
@@ -180,7 +185,7 @@ def test_put_compression(setup_path):
 
 
 @td.skip_if_windows
-def test_put_compression_blosc(setup_path):
+def test_put_compression_blosc(setup_path, using_infer_string):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
         columns=Index(list("ABCD"), dtype=object),
@@ -194,10 +199,14 @@ def test_put_compression_blosc(setup_path):
             store.put("b", df, format="fixed", complib="blosc")
 
         store.put("c", df, format="table", complib="blosc")
-        tm.assert_frame_equal(store["c"], df)
+        expected = df
+        if using_infer_string:
+            expected.columns = expected.columns.astype("str")
+        result = store["c"]
+        tm.assert_frame_equal(result, expected)
 
 
-def test_put_mixed_type(setup_path, performance_warning):
+def test_put_mixed_type(setup_path, performance_warning, using_infer_string):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
         columns=Index(list("ABCD"), dtype=object),
@@ -223,8 +232,11 @@ def test_put_mixed_type(setup_path, performance_warning):
         with tm.assert_produces_warning(performance_warning):
             store.put("df", df)
 
-        expected = store.get("df")
-        tm.assert_frame_equal(expected, df)
+        expected = df
+        if using_infer_string:
+            expected.columns = expected.columns.astype("str")
+        result = store.get("df")
+        tm.assert_frame_equal(result, expected)
 
 
 @pytest.mark.parametrize("format", ["table", "fixed"])
@@ -253,7 +265,7 @@ def test_store_index_types(setup_path, format, index):
         tm.assert_frame_equal(df, store["df"])
 
 
-def test_column_multiindex(setup_path):
+def test_column_multiindex(setup_path, using_infer_string):
     # GH 4710
     # recreate multi-indexes properly
 
@@ -264,6 +276,11 @@ def test_column_multiindex(setup_path):
     expected = df.set_axis(df.index.to_numpy())
 
     with ensure_clean_store(setup_path) as store:
+        if using_infer_string:
+            msg = "Saving a MultiIndex with an extension dtype is not supported."
+            with pytest.raises(NotImplementedError, match=msg):
+                store.put("df", df)
+            return
         store.put("df", df)
         tm.assert_frame_equal(
             store["df"], expected, check_index_type=True, check_column_type=True

From ad6ed76425fa596b5ac8374c388322b2ed22a876 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Mon, 30 Dec 2024 09:19:16 -0500
Subject: [PATCH 2/3] TST(string dtype): Resolve HDF5 xfails in
 test_round_trip.py

---
 pandas/tests/io/pytables/test_round_trip.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
index 6b98a720e4299..cf9a866130fec 100644
--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import Timestamp
 from pandas.compat import is_platform_windows
 
@@ -28,11 +26,10 @@
 
 pytestmark = [
     pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
 ]
 
 
-def test_conv_read_write():
+def test_conv_read_write(using_infer_string):
     with tm.ensure_clean() as path:
 
         def roundtrip(key, obj, **kwargs):
@@ -52,13 +49,21 @@ def roundtrip(key, obj, **kwargs):
             columns=Index(list("ABCD"), dtype=object),
             index=Index([f"i-{i}" for i in range(30)], dtype=object),
         )
-        tm.assert_frame_equal(o, roundtrip("frame", o))
+        expected = o.copy()  # don't mutate o: it is written to HDF5 below
+        if using_infer_string:
+            expected.index = expected.index.astype("str")
+            expected.columns = expected.columns.astype("str")
+        result = roundtrip("frame", o)
+        tm.assert_frame_equal(result, expected)
 
         # table
         df = DataFrame({"A": range(5), "B": range(5)})
         df.to_hdf(path, key="table", append=True)
+        expected = df[df.index > 2]
+        if using_infer_string:
+            expected.columns = expected.columns.astype("str")
         result = read_hdf(path, "table", where=["index>2"])
-        tm.assert_frame_equal(df[df.index > 2], result)
+        tm.assert_frame_equal(result, expected)
 
 
 def test_long_strings(setup_path):

From 7db8debeaef0b24cb59bb86d6bc223c2afc7816e Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Mon, 30 Dec 2024 10:00:45 -0500
Subject: [PATCH 3/3] TST(string dtype): Resolve HDF5 xfails in
 test_round_trip.py

---
 pandas/io/pytables.py                       |   9 +-
 pandas/tests/io/pytables/test_round_trip.py | 105 +++++++++++++++++---
 2 files changed, 98 insertions(+), 16 deletions(-)

Note: the stray debugging print() that this patch added to _read_group has
been dropped. The remaining hunk offsets and the diffstat are adjusted to
match; the post-image blob id on the pandas/io/pytables.py index line was
not regenerated.

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 8e70f50ce9934..47dc1740fa9c9 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -3297,7 +3297,12 @@ def read(
         index = self.read_index("index", start=start, stop=stop)
         values = self.read_array("values", start=start, stop=stop)
         result = Series(values, index=index, name=self.name, copy=False)
-        if using_string_dtype() and is_string_array(values, skipna=True):
+        if (
+            using_string_dtype()
+            and isinstance(values, np.ndarray)
+            and len(values) > 0
+            and is_string_array(values, skipna=True)
+        ):
             result = result.astype(StringDtype(na_value=np.nan))
         return result
 
@@ -3369,6 +3374,7 @@ def read(
             if (
                 using_string_dtype()
                 and isinstance(values, np.ndarray)
+                and len(df) > 0
                 and is_string_array(values, skipna=True)
             ):
                 df = df.astype(StringDtype(na_value=np.nan))
@@ -4747,6 +4753,7 @@ def read(
             if (
                 using_string_dtype()
                 and isinstance(values, np.ndarray)
+                and len(df) > 0
                 and is_string_array(values, skipna=True)
             ):
                 df = df.astype(StringDtype(na_value=np.nan))
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
index cf9a866130fec..3217f275b24f6 100644
--- a/pandas/tests/io/pytables/test_round_trip.py
+++ b/pandas/tests/io/pytables/test_round_trip.py
@@ -4,6 +4,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_string_dtype
+
 from pandas._libs.tslibs import Timestamp
 from pandas.compat import is_platform_windows
 
@@ -66,6 +68,7 @@ def roundtrip(key, obj, **kwargs):
         tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_long_strings(setup_path):
     # GH6166
     data = ["a" * 50] * 10
@@ -206,6 +209,7 @@ def test_put_integer(setup_path):
     _check_roundtrip(df, tm.assert_frame_equal, setup_path)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_table_values_dtypes_roundtrip(setup_path):
     with ensure_clean_store(setup_path) as store:
         df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8")
@@ -375,7 +379,7 @@ def test_timeseries_preepoch(setup_path, request):
 @pytest.mark.parametrize(
     "compression", [False, pytest.param(True, marks=td.skip_if_windows)]
 )
-def test_frame(compression, setup_path):
+def test_frame(compression, setup_path, using_infer_string):
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
         columns=Index(list("ABCD"), dtype=object),
@@ -386,11 +390,24 @@ def test_frame(compression, setup_path):
     df.iloc[0, 0] = np.nan
     df.iloc[5, 3] = np.nan
 
+    expected = df.copy()
+    if using_infer_string:
+        expected.index = expected.index.astype("str")
+        expected.columns = expected.columns.astype("str")
+
     _check_roundtrip_table(
-        df, tm.assert_frame_equal, path=setup_path, compression=compression
+        df,
+        tm.assert_frame_equal,
+        path=setup_path,
+        compression=compression,
+        expected=expected,
     )
     _check_roundtrip(
-        df, tm.assert_frame_equal, path=setup_path, compression=compression
+        df,
+        tm.assert_frame_equal,
+        path=setup_path,
+        compression=compression,
+        expected=expected,
     )
 
     tdf = DataFrame(
@@ -398,8 +415,15 @@ def test_frame(compression, setup_path):
         columns=Index(list("ABCD"), dtype=object),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
+    expected = tdf.copy()
+    if using_infer_string:
+        expected.columns = expected.columns.astype("str")
     _check_roundtrip(
-        tdf, tm.assert_frame_equal, path=setup_path, compression=compression
+        tdf,
+        tm.assert_frame_equal,
+        path=setup_path,
+        compression=compression,
+        expected=expected,
     )
 
     with ensure_clean_store(setup_path) as store:
@@ -410,7 +434,10 @@ def test_frame(compression, setup_path):
         assert recons._mgr.is_consolidated()
 
     # empty
-    _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path)
+    expected = df[:0]
+    if using_infer_string:
+        expected.columns = expected.columns.astype("str")
+    _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path, expected=expected)
 
 
 def test_empty_series_frame(setup_path):
@@ -442,9 +469,21 @@ def test_can_serialize_dates(setup_path):
     _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
 
 
-def test_store_hierarchical(setup_path, multiindex_dataframe_random_data):
+def test_store_hierarchical(
+    setup_path, multiindex_dataframe_random_data, using_infer_string
+):
     frame = multiindex_dataframe_random_data
 
+    if using_infer_string:
+        msg = "Saving a MultiIndex with an extension dtype is not supported."
+        with pytest.raises(NotImplementedError, match=msg):
+            _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
+        with pytest.raises(NotImplementedError, match=msg):
+            _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path)
+        with pytest.raises(NotImplementedError, match=msg):
+            _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path)
+        return
+
     _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
     _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path)
     _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path)
@@ -459,7 +498,7 @@ def test_store_hierarchical(setup_path, multiindex_dataframe_random_data):
 @pytest.mark.parametrize(
     "compression", [False, pytest.param(True, marks=td.skip_if_windows)]
 )
-def test_store_mixed(compression, setup_path):
+def test_store_mixed(compression, setup_path, using_infer_string):
     def _make_one():
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
@@ -477,57 +516,91 @@ def _make_one():
     df1 = _make_one()
     df2 = _make_one()
 
-    _check_roundtrip(df1, tm.assert_frame_equal, path=setup_path)
-    _check_roundtrip(df2, tm.assert_frame_equal, path=setup_path)
+    expected = df1.copy()
+    if using_infer_string:
+        expected.index = expected.index.astype("str")
+        expected.columns = expected.columns.astype("str")
+    _check_roundtrip(df1, tm.assert_frame_equal, path=setup_path, expected=expected)
+
+    expected = df2.copy()
+    if using_infer_string:
+        expected.index = expected.index.astype("str")
+        expected.columns = expected.columns.astype("str")
+    _check_roundtrip(df2, tm.assert_frame_equal, path=setup_path, expected=expected)
 
     with ensure_clean_store(setup_path) as store:
         store["obj"] = df1
-        tm.assert_frame_equal(store["obj"], df1)
+        expected = df1.copy()
+        if using_infer_string:
+            expected.index = expected.index.astype("str")
+            expected.columns = expected.columns.astype("str")
+        tm.assert_frame_equal(store["obj"], expected)
+
         store["obj"] = df2
-        tm.assert_frame_equal(store["obj"], df2)
+        expected = df2.copy()
+        if using_infer_string:
+            expected.index = expected.index.astype("str")
+            expected.columns = expected.columns.astype("str")
+        tm.assert_frame_equal(store["obj"], expected)
 
     # check that can store Series of all of these types
+    expected = df1["obj1"]
+    if using_infer_string:
+        expected.index = expected.index.astype("str")
     _check_roundtrip(
         df1["obj1"],
         tm.assert_series_equal,
         path=setup_path,
         compression=compression,
+        expected=expected,
     )
+    expected = df1["bool1"]
+    if using_infer_string:
+        expected.index = expected.index.astype("str")
     _check_roundtrip(
         df1["bool1"],
         tm.assert_series_equal,
         path=setup_path,
         compression=compression,
+        expected=expected,
     )
+    expected = df1["int1"]
+    if using_infer_string:
+        expected.index = expected.index.astype("str")
     _check_roundtrip(
         df1["int1"],
         tm.assert_series_equal,
         path=setup_path,
         compression=compression,
+        expected=expected,
     )
 
 
-def _check_roundtrip(obj, comparator, path, compression=False, **kwargs):
+def _check_roundtrip(obj, comparator, path, compression=False, expected=None, **kwargs):
     options = {}
     if compression:
         options["complib"] = "blosc"
+    if expected is None:
+        expected = obj
 
     with ensure_clean_store(path, "w", **options) as store:
         store["obj"] = obj
         retrieved = store["obj"]
-        comparator(retrieved, obj, **kwargs)
+        comparator(retrieved, expected, **kwargs)
 
 
-def _check_roundtrip_table(obj, comparator, path, compression=False):
+def _check_roundtrip_table(obj, comparator, path, compression=False, expected=None):
     options = {}
     if compression:
         options["complib"] = "blosc"
+    if expected is None:
+        expected = obj
 
     with ensure_clean_store(path, "w", **options) as store:
         store.put("obj", obj, format="table")
         retrieved = store["obj"]
 
-        comparator(retrieved, obj)
+        comparator(retrieved, expected)
 
 
 def test_unicode_index(setup_path):
@@ -540,6 +613,7 @@ def test_unicode_index(setup_path):
     _check_roundtrip(s, tm.assert_series_equal, path=setup_path)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_unicode_longer_encoded(setup_path):
     # GH 11234
     char = "\u0394"
@@ -565,6 +639,7 @@ def test_store_datetime_mixed(setup_path):
     _check_roundtrip(df, tm.assert_frame_equal, path=setup_path)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_round_trip_equals(tmp_path, setup_path):
     # GH 9330
     df = DataFrame({"B": [1, 2], "A": ["x", "y"]})