TST: de-xfail chunksize pyarrow tests (#56041)
jbrockmendel authored Nov 18, 2023
1 parent 4ac5cf6 commit 47a596e
Showing 2 changed files with 105 additions and 21 deletions.
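
The change replaces the blanket xfail_pyarrow markers with explicit branches on parser.engine, asserting the ValueError that the pyarrow engine raises for unsupported options such as chunksize and nrows. Below is a minimal, self-contained sketch of that pattern; it uses plain pandas.read_csv with an engine parameter as a stand-in for the suite's all_parsers fixture, and the test name is illustrative only.

from io import StringIO

import pytest

import pandas as pd


@pytest.mark.parametrize("engine", ["c", "python", "pyarrow"])
def test_chunksize_per_engine(engine):
    data = "a,b,c\n1,2,3\n4,5,6\n7,8,9\n"

    if engine == "pyarrow":
        # Instead of xfail-ing, assert the documented error directly.
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            pd.read_csv(StringIO(data), engine=engine, chunksize=2)
        return

    # The c and python engines return an iterable reader; three data rows
    # with chunksize=2 yield two chunks.
    with pd.read_csv(StringIO(data), engine=engine, chunksize=2) as reader:
        chunks = list(reader)
    assert len(chunks) == 2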
120 changes: 100 additions & 20 deletions pandas/tests/io/parser/common/test_chunksize.py
@@ -16,13 +16,11 @@
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@xfail_pyarrow # The 'chunksize' option is not supported
@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
parser = all_parsers
@@ -48,14 +46,20 @@ def test_read_chunksize_with_index(all_parsers, index_col):
)
expected = expected.set_index("index")

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
list(reader)
return

with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
chunks = list(reader)
tm.assert_frame_equal(chunks[0], expected[:2])
tm.assert_frame_equal(chunks[1], expected[2:4])
tm.assert_frame_equal(chunks[2], expected[4:])


@xfail_pyarrow # AssertionError: Regex pattern did not match
@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
data = """index,A,B,C,D
@@ -68,13 +72,14 @@ def test_read_chunksize_bad(all_parsers, chunksize):
"""
parser = all_parsers
msg = r"'chunksize' must be an integer >=1"
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"

with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
pass


@xfail_pyarrow # The 'nrows' option is not supported
@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
# see gh-15755
@@ -89,12 +94,17 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize):
parser = all_parsers
kwargs = {"index_col": 0, "nrows": 5}

if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return

expected = parser.read_csv(StringIO(data), **kwargs)
with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
tm.assert_frame_equal(concat(reader), expected)


@xfail_pyarrow # The 'chunksize' option is not supported
def test_read_chunksize_and_nrows_changing_size(all_parsers):
data = """index,A,B,C,D
foo,2,3,4,5
@@ -107,6 +117,12 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers):
parser = all_parsers
kwargs = {"index_col": 0, "nrows": 5}

if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return

expected = parser.read_csv(StringIO(data), **kwargs)
with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
@@ -116,7 +132,6 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers):
reader.get_chunk(size=3)


@xfail_pyarrow # The 'chunksize' option is not supported
def test_get_chunk_passed_chunksize(all_parsers):
parser = all_parsers
data = """A,B,C
@@ -125,14 +140,20 @@ def test_get_chunk_passed_chunksize(all_parsers):
7,8,9
1,2,3"""

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=2) as reader:
reader.get_chunk()
return

with parser.read_csv(StringIO(data), chunksize=2) as reader:
result = reader.get_chunk()

expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # The 'chunksize' option is not supported
@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
def test_read_chunksize_compat(all_parsers, kwargs):
# see gh-12185
@@ -146,17 +167,35 @@ def test_read_chunksize_compat(all_parsers, kwargs):
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
concat(reader)
return

with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
tm.assert_frame_equal(concat(reader), result)
via_reader = concat(reader)
tm.assert_frame_equal(via_reader, result)


@xfail_pyarrow # The 'chunksize' option is not supported
def test_read_chunksize_jagged_names(all_parsers):
# see gh-23509
parser = all_parsers
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])

expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(
StringIO(data), names=range(10), chunksize=4
) as reader:
concat(reader)
return

with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
result = concat(reader)
tm.assert_frame_equal(result, expected)
@@ -194,7 +233,6 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
assert result.a.dtype == float


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_warn_if_chunks_have_mismatched_type(all_parsers):
warning_type = None
parser = all_parsers
@@ -212,17 +250,24 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):

buf = StringIO(data)

df = parser.read_csv_check_warnings(
warning_type,
r"Columns \(0\) have mixed types. "
"Specify dtype option on import or set low_memory=False.",
buf,
)
if parser.engine == "pyarrow":
df = parser.read_csv_check_warnings(
DeprecationWarning,
"Passing a BlockManager to DataFrame is deprecated",
buf,
check_stacklevel=False,
)
else:
df = parser.read_csv_check_warnings(
warning_type,
r"Columns \(0\) have mixed types. "
"Specify dtype option on import or set low_memory=False.",
buf,
)

assert df.a.dtype == object


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
# see gh-9535
@@ -232,6 +277,18 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator):
nrows = 10
data = StringIO("foo,bar\n")

if parser.engine == "pyarrow":
msg = (
"The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine"
)
with pytest.raises(ValueError, match=msg):
if iterator:
with parser.read_csv(data, chunksize=nrows) as reader:
next(iter(reader))
else:
parser.read_csv(data, nrows=nrows)
return

if iterator:
with parser.read_csv(data, chunksize=nrows) as reader:
result = next(iter(reader))
Expand All @@ -241,7 +298,6 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_read_csv_memory_growth_chunksize(all_parsers):
# see gh-24805
#
@@ -254,12 +310,19 @@ def test_read_csv_memory_growth_chunksize(all_parsers):
for i in range(1000):
f.write(str(i) + "\n")

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(path, chunksize=20) as result:
for _ in result:
pass
return

with parser.read_csv(path, chunksize=20) as result:
for _ in result:
pass


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_chunksize_with_usecols_second_block_shorter(all_parsers):
# GH#21211
parser = all_parsers
@@ -268,6 +331,18 @@ def test_chunksize_with_usecols_second_block_shorter(all_parsers):
9,10,11
"""

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
names=["a", "b"],
chunksize=2,
usecols=[0, 1],
header=None,
)
return

result_chunks = parser.read_csv(
StringIO(data),
names=["a", "b"],
Expand All @@ -285,7 +360,6 @@ def test_chunksize_with_usecols_second_block_shorter(all_parsers):
tm.assert_frame_equal(result, expected_frames[i])


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_chunksize_second_block_shorter(all_parsers):
# GH#21211
parser = all_parsers
@@ -295,6 +369,12 @@ def test_chunksize_second_block_shorter(all_parsers):
9,10,11
"""

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), chunksize=2)
return

result_chunks = parser.read_csv(StringIO(data), chunksize=2)

expected_frames = [
6 changes: 5 additions & 1 deletion pandas/tests/io/parser/conftest.py
@@ -34,14 +34,18 @@ def read_csv_check_warnings(
warn_msg: str,
*args,
raise_on_extra_warnings=True,
check_stacklevel: bool = True,
**kwargs,
):
# We need to check the stacklevel here instead of in the tests
# since this is where read_csv is called and where the warning
# should point to.
kwargs = self.update_kwargs(kwargs)
with tm.assert_produces_warning(
warn_type, match=warn_msg, raise_on_extra_warnings=raise_on_extra_warnings
warn_type,
match=warn_msg,
raise_on_extra_warnings=raise_on_extra_warnings,
check_stacklevel=check_stacklevel,
):
return read_csv(*args, **kwargs)

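For reference, the conftest.py change threads a new check_stacklevel flag through the read_csv_check_warnings helper so individual tests (such as the pyarrow branch of test_warn_if_chunks_have_mismatched_type above) can opt out of the call-site check for warnings emitted from inside the engine. A stripped-down sketch of the same forwarding pattern, assuming only the public read_csv function and pandas' private tm.assert_produces_warning helper:

from io import StringIO

import pandas._testing as tm
from pandas import read_csv


def read_csv_check_warnings(
    warn_type,
    warn_msg,
    *args,
    raise_on_extra_warnings=True,
    check_stacklevel=True,
    **kwargs,
):
    # Forward the flag so a caller can disable the check that the warning
    # points at this read_csv call site.
    with tm.assert_produces_warning(
        warn_type,
        match=warn_msg,
        raise_on_extra_warnings=raise_on_extra_warnings,
        check_stacklevel=check_stacklevel,
    ):
        return read_csv(*args, **kwargs)


# Plain parse with no warning expected, so warn_type is None.
df = read_csv_check_warnings(None, "", StringIO("a,b\n1,2\n"))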
