Merge branch 'main' into issue-37210-to-sql-truncate

pandas-dev · Dec 29, 2024 · 7a06949 · 7a06949
2 parents 2eb19e7 + d81882b
commit 7a06949
Show file tree

Hide file tree

Showing 9 changed files with 99 additions and 72 deletions.
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
@@ -1644,7 +1644,7 @@ def _update_ctx_header(self, attrs: DataFrame, axis: AxisInt) -> None:
         for j in attrs.columns:
             ser = attrs[j]
             for i, c in ser.items():
-                if not c:
+                if not c or pd.isna(c):
                     continue
                 css_list = maybe_convert_css_to_tuples(c)
                 if axis == 0:

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -5297,6 +5297,8 @@ def _dtype_to_kind(dtype_str: str) -> str:
         kind = "integer"
     elif dtype_str == "object":
         kind = "object"
+    elif dtype_str == "str":
+        kind = "str"
     else:
         raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
 

diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -255,19 +255,19 @@ def test_apply_with_mixed_dtype():
             "foo2": ["one", "two", "two", "three", "one", "two"],
         }
     )
-    result = df.apply(lambda x: x, axis=1).dtypes
-    expected = df.dtypes
-    tm.assert_series_equal(result, expected)
+    result = df.apply(lambda x: x, axis=1)
+    expected = df
+    tm.assert_frame_equal(result, expected)
 
     # GH 3610 incorrect dtype conversion with as_index=False
     df = DataFrame({"c1": [1, 2, 6, 6, 8]})
     df["c2"] = df.c1 / 2.0
-    result1 = df.groupby("c2").mean().reset_index().c2
-    result2 = df.groupby("c2", as_index=False).mean().c2
-    tm.assert_series_equal(result1, result2)
+    result1 = df.groupby("c2").mean().reset_index()
+    result2 = df.groupby("c2", as_index=False).mean()
+    tm.assert_frame_equal(result1, result2)
 
 
-def test_groupby_as_index_apply():
+def test_groupby_as_index_apply(as_index):
     # GH #4648 and #3417
     df = DataFrame(
         {
@@ -276,27 +276,35 @@ def test_groupby_as_index_apply():
             "time": range(6),
         }
     )
+    gb = df.groupby("user_id", as_index=as_index)
 
-    g_as = df.groupby("user_id", as_index=True)
-    g_not_as = df.groupby("user_id", as_index=False)
-
-    res_as = g_as.head(2).index
-    res_not_as = g_not_as.head(2).index
-    exp = Index([0, 1, 2, 4])
-    tm.assert_index_equal(res_as, exp)
-    tm.assert_index_equal(res_not_as, exp)
-
-    res_as_apply = g_as.apply(lambda x: x.head(2)).index
-    res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
+    expected = DataFrame(
+        {
+            "item_id": ["b", "b", "a", "a"],
+            "user_id": [1, 2, 1, 3],
+            "time": [0, 1, 2, 4],
+        },
+        index=[0, 1, 2, 4],
+    )
+    result = gb.head(2)
+    tm.assert_frame_equal(result, expected)
 
     # apply doesn't maintain the original ordering
     # changed in GH5610 as the as_index=False returns a MI here
-    exp_not_as_apply = Index([0, 2, 1, 4])
-    tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
-    exp_as_apply = MultiIndex.from_tuples(tp, names=["user_id", None])
-
-    tm.assert_index_equal(res_as_apply, exp_as_apply)
-    tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)
+    if as_index:
+        tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
+        index = MultiIndex.from_tuples(tp, names=["user_id", None])
+    else:
+        index = Index([0, 2, 1, 4])
+    expected = DataFrame(
+        {
+            "item_id": list("baba"),
+            "time": [0, 2, 1, 4],
+        },
+        index=index,
+    )
+    result = gb.apply(lambda x: x.head(2))
+    tm.assert_frame_equal(result, expected)
 
 
 def test_groupby_as_index_apply_str():

diff --git a/pandas/tests/groupby/test_apply_mutate.py b/pandas/tests/groupby/test_apply_mutate.py
@@ -38,18 +38,20 @@ def test_mutate_groups():
         }
     )
 
-    def f_copy(x):
+    def f(x):
         x = x.copy()
         x["rank"] = x.val.rank(method="min")
         return x.groupby("cat2")["rank"].min()
 
-    def f_no_copy(x):
-        x["rank"] = x.val.rank(method="min")
-        return x.groupby("cat2")["rank"].min()
-
-    grpby_copy = df.groupby("cat1").apply(f_copy)
-    grpby_no_copy = df.groupby("cat1").apply(f_no_copy)
-    tm.assert_series_equal(grpby_copy, grpby_no_copy)
+    expected = pd.DataFrame(
+        {
+            "cat1": list("aaaabbb"),
+            "cat2": list("cdefcde"),
+            "rank": [3.0, 2.0, 5.0, 1.0, 2.0, 4.0, 1.0],
+        }
+    ).set_index(["cat1", "cat2"])["rank"]
+    result = df.groupby("cat1").apply(f)
+    tm.assert_series_equal(result, expected)
 
 
 def test_no_mutate_but_looks_like():
@@ -61,22 +63,3 @@ def test_no_mutate_but_looks_like():
     result1 = df.groupby("key", group_keys=True).apply(lambda x: x[:].value)
     result2 = df.groupby("key", group_keys=True).apply(lambda x: x.value)
     tm.assert_series_equal(result1, result2)
-
-
-def test_apply_function_with_indexing():
-    # GH: 33058
-    df = pd.DataFrame(
-        {"col1": ["A", "A", "A", "B", "B", "B"], "col2": [1, 2, 3, 4, 5, 6]}
-    )
-
-    def fn(x):
-        x.loc[x.index[-1], "col2"] = 0
-        return x.col2
-
-    result = df.groupby(["col1"], as_index=False).apply(fn)
-    expected = pd.Series(
-        [1, 2, 0, 4, 5, 0],
-        index=range(6),
-        name="col2",
-    )
-    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
@@ -3297,3 +3297,23 @@ def test_loc_reindexing_of_empty_index(self):
         df.loc[Series([False] * 4, index=df.index, name=0), 0] = df[0]
         expected = DataFrame(index=[1, 1, 2, 2], data=["1", "1", "2", "2"])
         tm.assert_frame_equal(df, expected)
+
+    def test_loc_setitem_matching_index(self):
+        # GH 25548
+        s = Series(0.0, index=list("abcd"))
+        s1 = Series(1.0, index=list("ab"))
+        s2 = Series(2.0, index=list("xy"))
+
+        # Test matching indices
+        s.loc[["a", "b"]] = s1
+
+        result = s[["a", "b"]]
+        expected = s1
+        tm.assert_series_equal(result, expected)
+
+        # Test unmatched indices
+        s.loc[["a", "b"]] = s2
+
+        result = s[["a", "b"]]
+        expected = Series([np.nan, np.nan], index=["a", "b"])
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/io/formats/style/test_to_latex.py b/pandas/tests/io/formats/style/test_to_latex.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     MultiIndex,
@@ -731,7 +729,6 @@ def test_longtable_caption_label(styler, caption, cap_exp, label, lab_exp):
     )
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("index", [True, False])
 @pytest.mark.parametrize(
     "columns, siunitx",

diff --git a/pandas/tests/io/pytables/test_file_handling.py b/pandas/tests/io/pytables/test_file_handling.py
@@ -37,12 +37,11 @@
 
 pytestmark = [
     pytest.mark.single_cpu,
-    pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False),
 ]
 
 
 @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
-def test_mode(setup_path, tmp_path, mode):
+def test_mode(setup_path, tmp_path, mode, using_infer_string):
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
         columns=Index(list("ABCD"), dtype=object),
@@ -91,10 +90,12 @@ def test_mode(setup_path, tmp_path, mode):
             read_hdf(path, "df", mode=mode)
     else:
         result = read_hdf(path, "df", mode=mode)
+        if using_infer_string:
+            df.columns = df.columns.astype("str")
         tm.assert_frame_equal(result, df)
 
 
-def test_default_mode(tmp_path, setup_path):
+def test_default_mode(tmp_path, setup_path, using_infer_string):
     # read_hdf uses default mode
     df = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
@@ -104,7 +105,10 @@ def test_default_mode(tmp_path, setup_path):
     path = tmp_path / setup_path
     df.to_hdf(path, key="df", mode="w")
     result = read_hdf(path, "df")
-    tm.assert_frame_equal(result, df)
+    expected = df.copy()
+    if using_infer_string:
+        expected.columns = expected.columns.astype("str")
+    tm.assert_frame_equal(result, expected)
 
 
 def test_reopen_handle(tmp_path, setup_path):
@@ -163,7 +167,7 @@ def test_reopen_handle(tmp_path, setup_path):
     assert not store.is_open
 
 
-def test_open_args(setup_path):
+def test_open_args(setup_path, using_infer_string):
     with tm.ensure_clean(setup_path) as path:
         df = DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
@@ -178,8 +182,13 @@ def test_open_args(setup_path):
         store["df"] = df
         store.append("df2", df)
 
-        tm.assert_frame_equal(store["df"], df)
-        tm.assert_frame_equal(store["df2"], df)
+        expected = df.copy()
+        if using_infer_string:
+            expected.index = expected.index.astype("str")
+            expected.columns = expected.columns.astype("str")
+
+        tm.assert_frame_equal(store["df"], expected)
+        tm.assert_frame_equal(store["df2"], expected)
 
         store.close()
 
@@ -194,7 +203,7 @@ def test_flush(setup_path):
         store.flush(fsync=True)
 
 
-def test_complibs_default_settings(tmp_path, setup_path):
+def test_complibs_default_settings(tmp_path, setup_path, using_infer_string):
     # GH15943
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
@@ -207,7 +216,11 @@ def test_complibs_default_settings(tmp_path, setup_path):
     tmpfile = tmp_path / setup_path
     df.to_hdf(tmpfile, key="df", complevel=9)
     result = read_hdf(tmpfile, "df")
-    tm.assert_frame_equal(result, df)
+    expected = df.copy()
+    if using_infer_string:
+        expected.index = expected.index.astype("str")
+        expected.columns = expected.columns.astype("str")
+    tm.assert_frame_equal(result, expected)
 
     with tables.open_file(tmpfile, mode="r") as h5file:
         for node in h5file.walk_nodes(where="/df", classname="Leaf"):
@@ -218,7 +231,11 @@ def test_complibs_default_settings(tmp_path, setup_path):
     tmpfile = tmp_path / setup_path
     df.to_hdf(tmpfile, key="df", complib="zlib")
     result = read_hdf(tmpfile, "df")
-    tm.assert_frame_equal(result, df)
+    expected = df.copy()
+    if using_infer_string:
+        expected.index = expected.index.astype("str")
+        expected.columns = expected.columns.astype("str")
+    tm.assert_frame_equal(result, expected)
 
     with tables.open_file(tmpfile, mode="r") as h5file:
         for node in h5file.walk_nodes(where="/df", classname="Leaf"):
@@ -229,7 +246,11 @@ def test_complibs_default_settings(tmp_path, setup_path):
     tmpfile = tmp_path / setup_path
     df.to_hdf(tmpfile, key="df")
     result = read_hdf(tmpfile, "df")
-    tm.assert_frame_equal(result, df)
+    expected = df.copy()
+    if using_infer_string:
+        expected.index = expected.index.astype("str")
+        expected.columns = expected.columns.astype("str")
+    tm.assert_frame_equal(result, expected)
 
     with tables.open_file(tmpfile, mode="r") as h5file:
         for node in h5file.walk_nodes(where="/df", classname="Leaf"):
@@ -308,6 +329,7 @@ def test_complibs(tmp_path, lvl, lib, request):
                 assert node.filters.complib == lib
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.skipif(
     not is_platform_little_endian(), reason="reason platform is not little endian"
 )
@@ -325,6 +347,7 @@ def test_encoding(setup_path):
         tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "val",
     [

diff --git a/pandas/tests/io/pytables/test_subclass.py b/pandas/tests/io/pytables/test_subclass.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas import (
     DataFrame,
     Series,
@@ -19,7 +17,6 @@
 
 class TestHDFStoreSubclass:
     # GH 33748
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_supported_for_subclass_dataframe(self, tmp_path):
         data = {"a": [1, 2], "b": [3, 4]}
         sdf = tm.SubclassedDataFrame(data, dtype=np.intp)

diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
@@ -19,8 +19,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import (
     WASM,
     is_platform_windows,
@@ -365,7 +363,6 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module):
                     expected = f_path.read()
                     assert result == expected
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support")
     def test_write_fspath_hdf5(self):
         # Same test as write_fspath_all, except HDF5 files aren't
         # necessarily byte-for-byte identical for a given dataframe, so we'll