diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index fae3293414b02..742d9f6defc76 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -1117,18 +1117,21 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
                 new_rows = []
                 try:
                     if rows is not None:
-                        rows_to_skip = 0
-                        if self.skiprows is not None and self.pos is not None:
-                            # Only read additional rows if pos is in skiprows
-                            rows_to_skip = len(
-                                set(self.skiprows) - set(range(self.pos))
-                            )
-
-                        for _ in range(rows + rows_to_skip):
-                            # assert for mypy, data is Iterator[str] or None, would
-                            # error in next
-                            assert self.data is not None
-                            new_rows.append(next(self.data))
+                        # Keep reading until ``rows`` rows that skipfunc does not
+                        # flag have been consumed. Every row read is collected;
+                        # new_rows is then passed to _remove_skipped_rows below.
+                        row_index = 0
+                        row_ct = 0
+                        offset = self.pos if self.pos is not None else 0
+                        while row_ct < rows:
+                            # assert for mypy, data is Iterator[str] or None, would
+                            # error in next
+                            assert self.data is not None
+                            new_row = next(self.data)
+                            if not self.skipfunc(offset + row_index):
+                                row_ct += 1
+                            row_index += 1
+                            new_rows.append(new_row)
 
                         len_new_rows = len(new_rows)
                         new_rows = self._remove_skipped_rows(new_rows)
diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py
index 9146af3f969e6..6ef66d2567fe1 100644
--- a/pandas/tests/io/parser/test_skiprows.py
+++ b/pandas/tests/io/parser/test_skiprows.py
@@ -301,3 +301,33 @@ def test_skip_rows_and_n_rows(all_parsers):
     result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
     expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
     tm.assert_frame_equal(result, expected)
+
+
+@xfail_pyarrow
+def test_skip_rows_with_chunks(all_parsers):
+    # GH 55677
+    # File rows 1, 4 and 5 (values 10, 40, 50) are skipped; the first chunk
+    # must still contain ``chunksize`` data rows.
+    data = """col_a
+10
+20
+30
+40
+50
+60
+70
+80
+90
+100
+"""
+    parser = all_parsers
+    reader = parser.read_csv(
+        StringIO(data), skiprows=lambda x: x in [1, 4, 5], chunksize=4
+    )
+    df1 = next(reader)
+    df2 = next(reader)
+
+    tm.assert_frame_equal(
+        df1, DataFrame({"col_a": [20, 30, 60, 70]}, index=[0, 1, 2, 3])
+    )
+    tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))