pandas-dev · mroeschke · Dec 5, 2023 · Nov 30, 2023 · Nov 30, 2023 · Nov 30, 2023
diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
@@ -566,6 +566,8 @@ MultiIndex
 
 I/O
 ^^^
+- Bug in :func:`read_csv` where ``engine="python"`` did not respect the ``chunksize`` argument when ``skiprows`` was specified (:issue:`56323`)
+- Bug in :func:`read_csv` where ``engine="python"`` raised a ``TypeError`` when a callable ``skiprows`` and ``chunksize`` were both specified (:issue:`55677`)
 - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
 - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`)
 - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)

diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
@@ -1117,18 +1117,18 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
                 new_rows = []
                 try:
                     if rows is not None:
-                        rows_to_skip = 0
-                        if self.skiprows is not None and self.pos is not None:
-                            # Only read additional rows if pos is in skiprows
-                            rows_to_skip = len(
-                                set(self.skiprows) - set(range(self.pos))
-                            )
-
-                        for _ in range(rows + rows_to_skip):
+                        row_index = 0
+                        row_ct = 0
+                        offset = self.pos if self.pos is not None else 0
+                        while row_ct < rows:
                             # assert for mypy, data is Iterator[str] or None, would
                             # error in next
                             assert self.data is not None
-                            new_rows.append(next(self.data))
+                            new_row = next(self.data)
+                            if not self.skipfunc(offset + row_index):
+                                row_ct += 1
+                            row_index += 1
+                            new_rows.append(new_row)
 
                         len_new_rows = len(new_rows)
                         new_rows = self._remove_skipped_rows(new_rows)
@@ -1137,11 +1137,11 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
                         rows = 0
 
                         while True:
-                            new_row = self._next_iter_line(row_num=self.pos + rows + 1)
+                            next_row = self._next_iter_line(row_num=self.pos + rows + 1)
                             rows += 1
 
-                            if new_row is not None:
-                                new_rows.append(new_row)
+                            if next_row is not None:
+                                new_rows.append(next_row)
                         len_new_rows = len(new_rows)
 
                 except StopIteration:

diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
@@ -898,7 +898,7 @@ def test_skip_rows_and_n_rows():
 
 
 def test_skiprows_with_iterator():
-    # GH#10261
+    # GH#10261, GH#56323
     data = """0
 1
 2
@@ -920,8 +920,8 @@ def test_skiprows_with_iterator():
     )
     expected_frames = [
         DataFrame({"a": [3, 4]}),
-        DataFrame({"a": [5, 7, 8]}, index=[2, 3, 4]),
-        DataFrame({"a": []}, dtype="object"),
+        DataFrame({"a": [5, 7]}, index=[2, 3]),
+        DataFrame({"a": [8]}, index=[4]),
     ]
     for i, result in enumerate(df_iter):
         tm.assert_frame_equal(result, expected_frames[i])

diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py
@@ -301,3 +301,29 @@ def test_skip_rows_and_n_rows(all_parsers):
     result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
     expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
     tm.assert_frame_equal(result, expected)
+
+
+@xfail_pyarrow
+def test_skip_rows_with_chunks(all_parsers):
+    # GH 55677: rows dropped by a callable skiprows must not count toward chunksize
+    data = """col_a
+10
+20
+30
+40
+50
+60
+70
+80
+90
+100
+"""
+    parser = all_parsers
+    reader = parser.read_csv(
+        StringIO(data), skiprows=lambda x: x in [1, 4, 5], chunksize=4
+    )
+    df1 = next(reader)
+    df2 = next(reader)
+
+    tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]}))
+    tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))