diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index 5ee2bb1778cb1..c20e667774b3c 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -566,6 +566,8 @@ MultiIndex

 I/O
 ^^^
+- Bug in :func:`read_csv` where ``engine="python"`` did not respect the ``chunksize`` argument when ``skiprows`` was specified (:issue:`56323`)
+- Bug in :func:`read_csv` where ``engine="python"`` raised a ``TypeError`` when a callable ``skiprows`` and ``chunksize`` were specified (:issue:`55677`)
 - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
 - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`)
 - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index fae3293414b02..79e7554a5744c 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -1117,18 +1117,18 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
             new_rows = []
             try:
                 if rows is not None:
-                    rows_to_skip = 0
-                    if self.skiprows is not None and self.pos is not None:
-                        # Only read additional rows if pos is in skiprows
-                        rows_to_skip = len(
-                            set(self.skiprows) - set(range(self.pos))
-                        )
-
-                    for _ in range(rows + rows_to_skip):
+                    row_index = 0
+                    row_ct = 0
+                    offset = self.pos if self.pos is not None else 0
+                    while row_ct < rows:
                         # assert for mypy, data is Iterator[str] or None, would
                         # error in next
                         assert self.data is not None
-                        new_rows.append(next(self.data))
+                        new_row = next(self.data)
+                        if not self.skipfunc(offset + row_index):
+                            row_ct += 1
+                        row_index += 1
+                        new_rows.append(new_row)

                     len_new_rows = len(new_rows)
                     new_rows = self._remove_skipped_rows(new_rows)
@@ -1137,11 +1137,11 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
                     rows = 0

                     while True:
-                        new_row = self._next_iter_line(row_num=self.pos + rows + 1)
+                        next_row = self._next_iter_line(row_num=self.pos + rows + 1)
                         rows += 1

-                        if new_row is not None:
-                            new_rows.append(new_row)
+                        if next_row is not None:
+                            new_rows.append(next_row)

                     len_new_rows = len(new_rows)
             except StopIteration:
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
index 34cae289c0f22..480d579f7f400 100644
--- a/pandas/tests/io/parser/test_read_fwf.py
+++ b/pandas/tests/io/parser/test_read_fwf.py
@@ -898,7 +898,7 @@ def test_skip_rows_and_n_rows():


 def test_skiprows_with_iterator():
-    # GH#10261
+    # GH#10261, GH#56323
     data = """0
 1
 2
@@ -920,8 +920,8 @@ def test_skiprows_with_iterator():
     )
     expected_frames = [
         DataFrame({"a": [3, 4]}),
-        DataFrame({"a": [5, 7, 8]}, index=[2, 3, 4]),
-        DataFrame({"a": []}, dtype="object"),
+        DataFrame({"a": [5, 7]}, index=[2, 3]),
+        DataFrame({"a": [8]}, index=[4]),
     ]
     for i, result in enumerate(df_iter):
         tm.assert_frame_equal(result, expected_frames[i])
diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py
index 47c3739c979a3..749bd47d5c1a3 100644
--- a/pandas/tests/io/parser/test_skiprows.py
+++ b/pandas/tests/io/parser/test_skiprows.py
@@ -301,3 +301,29 @@ def test_skip_rows_and_n_rows(all_parsers):
     result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
     expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
     tm.assert_frame_equal(result, expected)
+
+
+@xfail_pyarrow
+def test_skip_rows_with_chunks(all_parsers):
+    # GH 55677
+    data = """col_a
+10
+20
+30
+40
+50
+60
+70
+80
+90
+100
+"""
+    parser = all_parsers
+    reader = parser.read_csv(
+        StringIO(data), skiprows=lambda x: x in [1, 4, 5], chunksize=4
+    )
+    df1 = next(reader)
+    df2 = next(reader)
+
+    tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]}))
+    tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))