Fix GH 55677:

Added support in the python parser for using the skiprows and chunksize options at the same time, so that the API contract is met. Added a regression test so that this bug is caught quickly if it reappears. Signed-off-by: Flytre <[email protected]>
pandas-dev · Nov 29, 2023 · d654823 · d654823
1 parent b2d9ec1
commit d654823
Show file tree

Hide file tree

Showing 2 changed files with 51 additions and 26 deletions.
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -1117,18 +1117,15 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
                 new_rows = []
                 try:
                     if rows is not None:
-                        rows_to_skip = 0
-                        if self.skiprows is not None and self.pos is not None:
-                            # Only read additional rows if pos is in skiprows
-                            rows_to_skip = len(
-                                set(self.skiprows) - set(range(self.pos))
-                            )
-
-                        for _ in range(rows + rows_to_skip):
-                            # assert for mypy, data is Iterator[str] or None, would
-                            # error in next
-                            assert self.data is not None
-                            new_rows.append(next(self.data))
+                        row_index = 0
+                        row_ct = 0
+                        offset = self.pos if self.pos is not None else 0
+                        while row_ct < rows:
+                            new_row = next(self.data)
+                            if not self.skipfunc(offset + row_index):
+                                row_ct += 1
+                            row_index += 1
+                            new_rows.append(new_row)
 
                         len_new_rows = len(new_rows)
                         new_rows = self._remove_skipped_rows(new_rows)

diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py
@@ -99,11 +99,11 @@ def test_skip_rows_blank(all_parsers):
     [
         (
             """id,text,num_lines
-1,"line 11
-line 12",2
-2,"line 21
-line 22",2
-3,"line 31",1""",
+    1,"line 11
+    line 12",2
+    2,"line 21
+    line 22",2
+    3,"line 31",1""",
             {"skiprows": [1]},
             DataFrame(
                 [[2, "line 21\nline 22", 2], [3, "line 31", 1]],
@@ -156,23 +156,23 @@ def test_skip_row_with_quote(all_parsers):
     [
         (
             """id,text,num_lines
-1,"line \n'11' line 12",2
-2,"line \n'21' line 22",2
-3,"line \n'31' line 32",1""",
+    1,"line \n'11' line 12",2
+    2,"line \n'21' line 22",2
+    3,"line \n'31' line 32",1""",
             [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
         ),
         (
             """id,text,num_lines
-1,"line '11\n' line 12",2
-2,"line '21\n' line 22",2
-3,"line '31\n' line 32",1""",
+    1,"line '11\n' line 12",2
+    2,"line '21\n' line 22",2
+    3,"line '31\n' line 32",1""",
             [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
         ),
         (
             """id,text,num_lines
-1,"line '11\n' \r\tline 12",2
-2,"line '21\n' \r\tline 22",2
-3,"line '31\n' \r\tline 32",1""",
+    1,"line '11\n' \r\tline 12",2
+    2,"line '21\n' \r\tline 22",2
+    3,"line '31\n' \r\tline 32",1""",
             [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
         ),
     ],
@@ -301,3 +301,31 @@ def test_skip_rows_and_n_rows(all_parsers):
     result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
     expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
     tm.assert_frame_equal(result, expected)
+
+
+@xfail_pyarrow
+def test_skip_rows_with_chunks(all_parsers):
+    # GH 55677
+    data = """col_a
+10
+20
+30
+40
+50
+60
+70
+80
+90
+100
+"""
+    parser = all_parsers
+    reader = parser.read_csv(
+        StringIO(data), engine=parser, skiprows=lambda x: x in [1, 4, 5], chunksize=4
+    )
+    df1 = next(reader)
+    df2 = next(reader)
+
+    tm.assert_frame_equal(
+        df1, DataFrame({"col_a": [20, 30, 60, 70]}, index=[0, 1, 2, 3])
+    )
+    tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))