diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 46c27849356b5..64220f1d3d5b4 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -9,8 +9,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import SpecificationError
 
 from pandas.core.dtypes.common import is_integer_dtype
@@ -296,12 +294,11 @@ def aggfun_1(ser):
     assert len(result) == 0
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_wrap_agg_out(three_group):
     grouped = three_group.groupby(["A", "B"])
 
     def func(ser):
-        if ser.dtype == object:
+        if ser.dtype == object or ser.dtype == "string":
             raise TypeError("Test error message")
         return ser.sum()
 
@@ -1117,7 +1114,6 @@ def test_lambda_named_agg(func):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_aggregate_mixed_types():
     # GH 16916
     df = DataFrame(
@@ -1129,7 +1125,7 @@ def test_aggregate_mixed_types():
     expected = DataFrame(
         expected_data,
         index=Index([2, "group 1"], dtype="object", name="grouping"),
-        columns=Index(["X", "Y", "Z"], dtype="object"),
+        columns=Index(["X", "Y", "Z"]),
     )
     tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index b937e7dcc8136..a706ea795a0e2 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -5,8 +5,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.core.dtypes.common import (
     is_float_dtype,
     is_integer_dtype,
@@ -92,7 +90,6 @@ def test_cython_agg_boolean():
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_cython_agg_nothing_to_agg():
     frame = DataFrame(
         {"a": np.random.default_rng(2).integers(0, 5, 50), "b": ["foo", "bar"] * 25}
@@ -108,7 +105,9 @@ def test_cython_agg_nothing_to_agg():
 
     result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
     expected = DataFrame(
-        [], index=frame["a"].sort_values().drop_duplicates(), columns=[]
+        [],
+        index=frame["a"].sort_values().drop_duplicates(),
+        columns=Index([], dtype="str"),
     )
     tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index 835cad0d13078..ce78b58e5d8f4 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import SpecificationError
 
 import pandas as pd
@@ -308,7 +306,6 @@ def test_series_agg_multikey():
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_series_agg_multi_pure_python():
     data = DataFrame(
         {
@@ -358,7 +355,8 @@ def test_series_agg_multi_pure_python():
     )
 
     def bad(x):
-        assert len(x.values.base) > 0
+        if isinstance(x.values, np.ndarray):
+            assert len(x.values.base) > 0
         return "foo"
 
     result = data.groupby(["A", "B"]).agg(bad)
diff --git a/pandas/tests/groupby/methods/test_quantile.py b/pandas/tests/groupby/methods/test_quantile.py
index 4a8ad65200caa..28cb25b515ed2 100644
--- a/pandas/tests/groupby/methods/test_quantile.py
+++ b/pandas/tests/groupby/methods/test_quantile.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -158,11 +156,10 @@ def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby,
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_quantile_raises():
     df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
 
-    msg = "dtype 'object' does not support operation 'quantile'"
+    msg = "dtype '(object|str)' does not support operation 'quantile'"
     with pytest.raises(TypeError, match=msg):
         df.groupby("key").quantile()
 
diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py
index 91200f53e36bd..2dc89bc75746f 100644
--- a/pandas/tests/groupby/methods/test_size.py
+++ b/pandas/tests/groupby/methods/test_size.py
@@ -76,6 +76,8 @@ def test_size_series_masked_type_returns_Int64(dtype):
     tm.assert_series_equal(result, expected)
 
 
+# TODO(infer_string) in case the column is object dtype, it should preserve that dtype
+# for the result's index
 @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 def test_size_strings(any_string_dtype):
     # GH#55627
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 1e86b5401ee09..6d84dae1d25d8 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     Categorical,
@@ -322,8 +320,7 @@ def test_apply(ordered):
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
-def test_observed(observed):
+def test_observed(request, using_infer_string, observed):
     # multiple groupers, don't re-expand the output space
     # of the grouper
     # gh-14942 (implement)
@@ -331,6 +328,10 @@ def test_observed(observed):
     # gh-8138 (back-compat)
     # gh-8869
 
+    if using_infer_string and not observed:
+        # TODO(infer_string) this fails with filling the string column with 0
+        request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
+
     cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
     cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
     df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 3305b48a4dcdc..702bbfef2be3b 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -1281,7 +1281,6 @@ def test_groupby_two_group_keys_all_nan():
     assert result == {}
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_groupby_2d_malformed():
     d = DataFrame(index=range(2))
     d["group"] = ["g1", "g2"]
@@ -1290,7 +1289,7 @@ def test_groupby_2d_malformed():
     d["label"] = ["l1", "l2"]
     tmp = d.groupby(["group"]).mean(numeric_only=True)
     res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
-    tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
+    tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object))
     tm.assert_numpy_array_equal(tmp.values, res_values)
 
 
@@ -2345,7 +2344,6 @@ def test_groupby_all_nan_groups_drop():
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize("numeric_only", [True, False])
 def test_groupby_empty_multi_column(as_index, numeric_only):
     # GH 15106 & GH 41998
@@ -2354,7 +2352,7 @@ def test_groupby_empty_multi_column(as_index, numeric_only):
     result = gb.sum(numeric_only=numeric_only)
     if as_index:
         index = MultiIndex([[], []], [[], []], names=["A", "B"])
-        columns = ["C"] if not numeric_only else []
+        columns = ["C"] if not numeric_only else Index([], dtype="str")
     else:
         index = RangeIndex(0)
         columns = ["A", "B", "C"] if not numeric_only else ["A", "B"]
@@ -2362,7 +2360,6 @@ def test_groupby_empty_multi_column(as_index, numeric_only):
     tm.assert_frame_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_groupby_aggregation_non_numeric_dtype():
     # GH #43108
     df = DataFrame(
@@ -2373,7 +2370,7 @@ def test_groupby_aggregation_non_numeric_dtype():
         {
             "v": [[1, 1], [10, 20]],
         },
-        index=Index(["M", "W"], dtype="object", name="MW"),
+        index=Index(["M", "W"], name="MW"),
     )
 
     gb = df.groupby(by=["MW"])
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index d42aa06d6bbfe..060a8b7fd3824 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -1,8 +1,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat.pyarrow import pa_version_under10p1
 
 from pandas.core.dtypes.missing import na_value_for_dtype
@@ -99,7 +97,6 @@ def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
     tm.assert_frame_equal(grouped, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
 @pytest.mark.parametrize(
     "dropna, idx, outputs",
     [
@@ -126,7 +123,7 @@ def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
     df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
     grouped = df.groupby("a", dropna=dropna).sum()
 
-    expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a"))
+    expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a"))
 
     tm.assert_frame_equal(grouped, expected)
 
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index 6bb2eaf89b5d7..366eb59ee226a 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -10,8 +10,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import SpecificationError
 
 import pandas as pd
@@ -807,7 +805,6 @@ def test_groupby_empty(self):
         expected = ["name"]
         assert result == expected
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_groupby_level_index_value_all_na(self):
         # issue 20519
         df = DataFrame(
@@ -817,7 +814,7 @@ def test_groupby_level_index_value_all_na(self):
         expected = DataFrame(
             data=[],
             index=MultiIndex(
-                levels=[Index(["x"], dtype="object"), Index([], dtype="float64")],
+                levels=[Index(["x"], dtype="str"), Index([], dtype="float64")],
                 codes=[[], []],
                 names=["A", "B"],
             ),
@@ -981,12 +978,13 @@ def test_groupby_with_empty(self):
         grouped = series.groupby(grouper)
         assert next(iter(grouped), None) is None
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_groupby_with_single_column(self):
         df = DataFrame({"a": list("abssbab")})
         tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
         # GH 13530
-        exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[])
+        exp = DataFrame(
+            index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str")
+        )
         tm.assert_frame_equal(df.groupby("a").count(), exp)
         tm.assert_frame_equal(df.groupby("a").sum(), exp)
 
diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py
index 1044c83e3e56b..ee59a93695bcf 100644
--- a/pandas/tests/groupby/test_pipe.py
+++ b/pandas/tests/groupby/test_pipe.py
@@ -1,7 +1,4 @@
 import numpy as np
-import pytest
-
-from pandas._config import using_string_dtype
 
 import pandas as pd
 from pandas import (
@@ -11,7 +8,6 @@
 import pandas._testing as tm
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_pipe():
     # Test the pipe method of DataFrameGroupBy.
     # Issue #17871
@@ -39,7 +35,7 @@ def square(srs):
     # NDFrame.pipe methods
     result = df.groupby("A").pipe(f).pipe(square)
 
-    index = Index(["bar", "foo"], dtype="object", name="A")
+    index = Index(["bar", "foo"], name="A")
     expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index)
 
     tm.assert_series_equal(expected, result)
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
index a6ea1502103c5..51c7eab2bfa82 100644
--- a/pandas/tests/groupby/test_reductions.py
+++ b/pandas/tests/groupby/test_reductions.py
@@ -5,8 +5,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import iNaT
 
 from pandas.core.dtypes.common import pandas_dtype
@@ -470,8 +468,7 @@ def test_max_min_non_numeric():
     assert "ss" in result
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-def test_max_min_object_multiple_columns():
+def test_max_min_object_multiple_columns(using_infer_string):
     # GH#41111 case where the aggregation is valid for some columns but not
     # others; we split object blocks column-wise, consistent with
     # DataFrame._reduce
@@ -484,7 +481,7 @@ def test_max_min_object_multiple_columns():
         }
     )
     df._consolidate_inplace()  # should already be consolidate, but double-check
-    assert len(df._mgr.blocks) == 2
+    assert len(df._mgr.blocks) == 3 if using_infer_string else 2
 
     gb = df.groupby("A")
 
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index ee4973cbf18af..a7712d9dc6586 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -76,6 +76,8 @@ def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
 
 
 class TestGroupBy:
+    # TODO(infer_string) resample sum introduces 0's
+    # https://github.com/pandas-dev/pandas/issues/60229
     @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_groupby_with_timegrouper(self):
         # GH 4161
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index 5b8fa96291c9f..022d3d51ded4e 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs import lib
 
 from pandas.core.dtypes.common import ensure_platform_int
@@ -1034,20 +1032,19 @@ def test_groupby_transform_with_datetimes(func, values):
     tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_groupby_transform_dtype():
     # GH 22243
     df = DataFrame({"a": [1], "val": [1.35]})
 
     result = df["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
-    expected1 = Series(["+1.35"], name="val", dtype="object")
+    expected1 = Series(["+1.35"], name="val")
     tm.assert_series_equal(result, expected1)
 
     result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+{y}"))
     tm.assert_series_equal(result, expected1)
 
     result = df.groupby("a")["val"].transform(lambda x: x.map(lambda y: f"+({y})"))
-    expected2 = Series(["+(1.35)"], name="val", dtype="object")
+    expected2 = Series(["+(1.35)"], name="val")
     tm.assert_series_equal(result, expected2)
 
     df["val"] = df["val"].astype(object)