From d654823bf3d6b5246467968b588eaa6b4efe1218 Mon Sep 17 00:00:00 2001 From: Flytre Date: Wed, 29 Nov 2023 10:55:38 -0500 Subject: [PATCH 1/2] Fix -GH 55677: Added support for the python parser to handle using skiprows and chunk_size options at the same time to ensure API contract is met. Added a regression test to ensure this bug can be quickly caught in the future if it reappears. Signed-off-by: Flytre --- pandas/io/parsers/python_parser.py | 21 ++++------ pandas/tests/io/parser/test_skiprows.py | 56 ++++++++++++++++++------- 2 files changed, 51 insertions(+), 26 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index fae3293414b02..742d9f6defc76 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1117,18 +1117,15 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: new_rows = [] try: if rows is not None: - rows_to_skip = 0 - if self.skiprows is not None and self.pos is not None: - # Only read additional rows if pos is in skiprows - rows_to_skip = len( - set(self.skiprows) - set(range(self.pos)) - ) - - for _ in range(rows + rows_to_skip): - # assert for mypy, data is Iterator[str] or None, would - # error in next - assert self.data is not None - new_rows.append(next(self.data)) + row_index = 0 + row_ct = 0 + offset = self.pos if self.pos is not None else 0 + while row_ct < rows: + new_row = next(self.data) + if not self.skipfunc(offset + row_index): + row_ct += 1 + row_index += 1 + new_rows.append(new_row) len_new_rows = len(new_rows) new_rows = self._remove_skipped_rows(new_rows) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 9146af3f969e6..6ef66d2567fe1 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -99,11 +99,11 @@ def test_skip_rows_blank(all_parsers): [ ( """id,text,num_lines -1,"line 11 -line 12",2 -2,"line 21 -line 22",2 -3,"line 31",1""", + 1,"line 11 + line 12",2 + 2,"line 21 + line 22",2 + 3,"line 31",1""", {"skiprows": [1]}, DataFrame( [[2, "line 21\nline 22", 2], [3, "line 31", 1]], @@ -156,23 +156,23 @@ def test_skip_row_with_quote(all_parsers): [ ( """id,text,num_lines -1,"line \n'11' line 12",2 -2,"line \n'21' line 22",2 -3,"line \n'31' line 32",1""", + 1,"line \n'11' line 12",2 + 2,"line \n'21' line 22",2 + 3,"line \n'31' line 32",1""", [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]], ), ( """id,text,num_lines -1,"line '11\n' line 12",2 -2,"line '21\n' line 22",2 -3,"line '31\n' line 32",1""", + 1,"line '11\n' line 12",2 + 2,"line '21\n' line 22",2 + 3,"line '31\n' line 32",1""", [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]], ), ( """id,text,num_lines -1,"line '11\n' \r\tline 12",2 -2,"line '21\n' \r\tline 22",2 -3,"line '31\n' \r\tline 32",1""", + 1,"line '11\n' \r\tline 12",2 + 2,"line '21\n' \r\tline 22",2 + 3,"line '31\n' \r\tline 32",1""", [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]], ), ], @@ -301,3 +301,31 @@ def test_skip_rows_and_n_rows(all_parsers): result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6]) expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]}) tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +def test_skip_rows_with_chunks(all_parsers): + # GH 55677 + data = """col_a +10 +20 +30 +40 +50 +60 +70 +80 +90 +100 +""" + parser = all_parsers + reader = parser.read_csv( + StringIO(data), engine=parser, skiprows=lambda x: x in [1, 4, 5], chunksize=4 + ) + df1 = next(reader) + df2 = next(reader) + + tm.assert_frame_equal( + df1, DataFrame({"col_a": [20, 30, 60, 70]}, index=[0, 1, 2, 3]) + ) + tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6])) From f11d7a824499bd70283ad369eba183a5d76ad8dd Mon Sep 17 00:00:00 2001 From: Flytre Date: Wed, 29 Nov 2023 11:14:40 -0500 Subject: [PATCH 2/2] Fix -GH 55677: Added support for the python parser to handle using skiprows and chunk_size options at the same time to ensure API contract is met. Added a regression test to ensure this bug can be quickly caught in the future if it reappears. Signed-off-by: Flytre --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index efa4a52993a90..08fa594632259 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -407,6 +407,7 @@ MultiIndex I/O ^^^ +- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable skiprows and a chunk size was specified. (:issue:`55677`) - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)