From 6cd018803bee93fb53abcc339665557d05ec12b0 Mon Sep 17 00:00:00 2001 From: Aaron Rahman Date: Wed, 29 Nov 2023 21:09:46 -0500 Subject: [PATCH 1/8] Fix -GH 55677: Added support for the python parser to handle using skiprows and chunk_size options at the same time to ensure API contract is met. Added a regression test to ensure this bug can be quickly caught in the future if it reappears. --- TODO.txt | 5 ++++ doc/source/whatsnew/v2.2.0.rst | 1 + pandas/io/parsers/python_parser.py | 40 ++++++++++++++++--------- pandas/tests/io/parser/test_skiprows.py | 27 +++++++++++++++++ 4 files changed, 59 insertions(+), 14 deletions(-) create mode 100644 TODO.txt diff --git a/TODO.txt b/TODO.txt new file mode 100644 index 0000000000000..f15a7d11be21d --- /dev/null +++ b/TODO.txt @@ -0,0 +1,5 @@ +-run tests and style guide +-add test to prevent regression +-add documentation +-submit PR under the fork + diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 38f39270d12c7..4e61f1be5f8e9 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -420,6 +420,7 @@ MultiIndex I/O ^^^ +- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable skiprows and a chunk size was specified. (:issue:`55677`) - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index fae3293414b02..10fcbd70f269b 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1114,21 +1114,33 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: self.pos = new_pos else: - new_rows = [] + new_rows: list[list[Scalar]] = [] try: if rows is not None: - rows_to_skip = 0 - if self.skiprows is not None and self.pos is not None: - # Only read additional rows if pos is in skiprows - rows_to_skip = len( - set(self.skiprows) - set(range(self.pos)) - ) - - for _ in range(rows + rows_to_skip): - # assert for mypy, data is Iterator[str] or None, would - # error in next - assert self.data is not None - new_rows.append(next(self.data)) + if callable(self.skiprows): + row_index = 0 + row_ct = 0 + offset = self.pos if self.pos is not None else 0 + while row_ct < rows: + new_row = next(self.data) + if not self.skipfunc(offset + row_index): + row_ct += 1 + row_index += 1 + new_rows.append(new_row) + else: + # Maintain legacy chunking behavior pre-2.2.0 to ensure backwards compatibility + rows_to_skip = 0 + if self.skiprows is not None and self.pos is not None: + # Only read additional rows if pos is in skiprows + rows_to_skip = len( + set(self.skiprows) - set(range(self.pos)) + ) + + for _ in range(rows + rows_to_skip): + # assert for mypy, data is Iterator[str] or None, would + # error in next + assert self.data is not None + new_rows.append(next(self.data)) len_new_rows = len(new_rows) new_rows = self._remove_skipped_rows(new_rows) @@ -1137,7 +1149,7 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: rows = 0 while True: - new_row = self._next_iter_line(row_num=self.pos + rows + 1) + new_row: list[Scalar] = self._next_iter_line(row_num=self.pos + rows + 1) rows += 1 if new_row is not None: diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 9146af3f969e6..9297b754cf44e 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -301,3 +301,30 @@ def test_skip_rows_and_n_rows(all_parsers): result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6]) expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]}) tm.assert_frame_equal(result, expected) + +@xfail_pyarrow +def test_skip_rows_with_chunks(all_parsers): + # GH 55677 + data = """col_a +10 +20 +30 +40 +50 +60 +70 +80 +90 +100 +""" + parser = all_parsers + reader = parser.read_csv( + StringIO(data), engine=parser, skiprows=lambda x: x in [1, 4, 5], chunksize=4 + ) + df1 = next(reader) + df2 = next(reader) + + tm.assert_frame_equal( + df1, DataFrame({"col_a": [20, 30, 60, 70]}, index=[0, 1, 2, 3]) + ) + tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6])) \ No newline at end of file From 9233273a37ef6dc5e4d5bf1e44d91f65b7b3dd0a Mon Sep 17 00:00:00 2001 From: Flytre Date: Wed, 29 Nov 2023 21:10:17 -0500 Subject: [PATCH 2/8] Fix -GH 55677: Added support for the python parser to handle using skiprows and chunk_size options at the same time to ensure API contract is met. Added a regression test to ensure this bug can be quickly caught in the future if it reappears. --- TODO.txt | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 TODO.txt diff --git a/TODO.txt b/TODO.txt deleted file mode 100644 index f15a7d11be21d..0000000000000 --- a/TODO.txt +++ /dev/null @@ -1,5 +0,0 @@ --run tests and style guide --add test to prevent regression --add documentation --submit PR under the fork - From 35a6929f8306f67ed7cf33ad336768500421a373 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 30 Nov 2023 02:34:34 +0000 Subject: [PATCH 3/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/io/parsers/python_parser.py | 4 +++- pandas/tests/io/parser/test_skiprows.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 10fcbd70f269b..c3b31de3bd1e8 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1149,7 +1149,9 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: rows = 0 while True: - new_row: list[Scalar] = self._next_iter_line(row_num=self.pos + rows + 1) + new_row: list[Scalar] = self._next_iter_line( + row_num=self.pos + rows + 1 + ) rows += 1 if new_row is not None: diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 9297b754cf44e..ecc768bdd02de 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -302,6 +302,7 @@ def test_skip_rows_and_n_rows(all_parsers): expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]}) tm.assert_frame_equal(result, expected) + @xfail_pyarrow def test_skip_rows_with_chunks(all_parsers): # GH 55677 @@ -327,4 +328,4 @@ def test_skip_rows_with_chunks(all_parsers): tm.assert_frame_equal( df1, DataFrame({"col_a": [20, 30, 60, 70]}, index=[0, 1, 2, 3]) ) - tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6])) \ No newline at end of file + tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6])) From 5469a47d24c422cd4e7ae88798a92e0ddbe92571 Mon Sep 17 00:00:00 2001 From: Flytre Date: Wed, 29 Nov 2023 21:36:04 -0500 Subject: [PATCH 4/8] Fix -GH 55677: Added support for the python parser to handle using skiprows and chunk_size options at the same time to ensure API contract is met. Added a regression test to ensure this bug can be quickly caught in the future if it reappears. --- pandas/io/parsers/python_parser.py | 6 ++++-- pandas/tests/io/parser/test_skiprows.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 10fcbd70f269b..55151bb3bcd5b 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1128,7 +1128,7 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: row_index += 1 new_rows.append(new_row) else: - # Maintain legacy chunking behavior pre-2.2.0 to ensure backwards compatibility + # Maintain legacy chunking behavior rows_to_skip = 0 if self.skiprows is not None and self.pos is not None: # Only read additional rows if pos is in skiprows @@ -1149,7 +1149,9 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: rows = 0 while True: - new_row: list[Scalar] = self._next_iter_line(row_num=self.pos + rows + 1) + new_row: list[Scalar] = self._next_iter_line( + row_num=self.pos + rows + 1 + ) rows += 1 if new_row is not None: diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 9297b754cf44e..ecc768bdd02de 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -302,6 +302,7 @@ def test_skip_rows_and_n_rows(all_parsers): expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]}) tm.assert_frame_equal(result, expected) + @xfail_pyarrow def test_skip_rows_with_chunks(all_parsers): # GH 55677 @@ -327,4 +328,4 @@ def test_skip_rows_with_chunks(all_parsers): tm.assert_frame_equal( df1, DataFrame({"col_a": [20, 30, 60, 70]}, index=[0, 1, 2, 3]) ) - tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6])) \ No newline at end of file + tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6])) From 7d0b1657ba1c6f079bdedd99e966c919b0181ccd Mon Sep 17 00:00:00 2001 From: Aaron Rahman Date: Thu, 30 Nov 2023 11:05:20 -0500 Subject: [PATCH 5/8] Fix -GH 55677: Made changes consistment with mypy checking --- pandas/io/parsers/python_parser.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 55151bb3bcd5b..630232bda724e 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1114,7 +1114,7 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: self.pos = new_pos else: - new_rows: list[list[Scalar]] = [] + new_rows = [] try: if rows is not None: if callable(self.skiprows): @@ -1122,6 +1122,9 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: row_ct = 0 offset = self.pos if self.pos is not None else 0 while row_ct < rows: + # assert for mypy, data is Iterator[str] or None, would + # error in next + assert self.data is not None new_row = next(self.data) if not self.skipfunc(offset + row_index): row_ct += 1 @@ -1149,13 +1152,13 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: rows = 0 while True: - new_row: list[Scalar] = self._next_iter_line( + next_row = self._next_iter_line( row_num=self.pos + rows + 1 ) rows += 1 - if new_row is not None: - new_rows.append(new_row) + if next_row is not None: + new_rows.append(next_row) len_new_rows = len(new_rows) except StopIteration: From 08084581bcb642bd6a13f34317bf841d281c2169 Mon Sep 17 00:00:00 2001 From: Aaron Rahman Date: Thu, 30 Nov 2023 11:38:47 -0500 Subject: [PATCH 6/8] Fix -GH 55677: Made changes consistment with mypy checking and pre-commit --- pandas/io/parsers/python_parser.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 630232bda724e..972e2ef88dfce 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1152,9 +1152,7 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: rows = 0 while True: - next_row = self._next_iter_line( - row_num=self.pos + rows + 1 - ) + next_row = self._next_iter_line(row_num=self.pos + rows + 1) rows += 1 if next_row is not None: From c3b330e2996cf0ef0b50ef3d4058e6d607c69815 Mon Sep 17 00:00:00 2001 From: Aaron Rahman Date: Mon, 4 Dec 2023 11:20:41 -0500 Subject: [PATCH 7/8] Fix -GH 55677 & 56323: This commit: -Fixes GH 56323 by replacing the python engine chunksize logic -Fixes formatting on the added test_skiprows test case -Fixes incorrect test in read_fwf that expected an output chunk of size 3 when chunksize=2 was specified. --- doc/source/whatsnew/v2.2.0.rst | 3 +- pandas/io/parsers/python_parser.py | 39 ++++++++----------------- pandas/tests/io/parser/test_read_fwf.py | 6 ++-- pandas/tests/io/parser/test_skiprows.py | 4 +-- 4 files changed, 18 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 4e61f1be5f8e9..f6e7e624d76ff 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -420,7 +420,8 @@ MultiIndex I/O ^^^ -- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable skiprows and a chunk size was specified. (:issue:`55677`) +- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified. (:issue:`56323`) +- Bug in :func:`read_csv` where ``engine="python"`` was causing a ``TypeError`` when a callable ``skiprows`` and a chunk size was specified. (:issue:`55677`) - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 972e2ef88dfce..79e7554a5744c 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -1117,33 +1117,18 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]: new_rows = [] try: if rows is not None: - if callable(self.skiprows): - row_index = 0 - row_ct = 0 - offset = self.pos if self.pos is not None else 0 - while row_ct < rows: - # assert for mypy, data is Iterator[str] or None, would - # error in next - assert self.data is not None - new_row = next(self.data) - if not self.skipfunc(offset + row_index): - row_ct += 1 - row_index += 1 - new_rows.append(new_row) - else: - # Maintain legacy chunking behavior - rows_to_skip = 0 - if self.skiprows is not None and self.pos is not None: - # Only read additional rows if pos is in skiprows - rows_to_skip = len( - set(self.skiprows) - set(range(self.pos)) - ) - - for _ in range(rows + rows_to_skip): - # assert for mypy, data is Iterator[str] or None, would - # error in next - assert self.data is not None - new_rows.append(next(self.data)) + row_index = 0 + row_ct = 0 + offset = self.pos if self.pos is not None else 0 + while row_ct < rows: + # assert for mypy, data is Iterator[str] or None, would + # error in next + assert self.data is not None + new_row = next(self.data) + if not self.skipfunc(offset + row_index): + row_ct += 1 + row_index += 1 + new_rows.append(new_row) len_new_rows = len(new_rows) new_rows = self._remove_skipped_rows(new_rows) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 34cae289c0f22..480d579f7f400 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -898,7 +898,7 @@ def test_skip_rows_and_n_rows(): def test_skiprows_with_iterator(): - # GH#10261 + # GH#10261, GH#56323 data = """0 1 2 @@ -920,8 +920,8 @@ def test_skiprows_with_iterator(): ) expected_frames = [ DataFrame({"a": [3, 4]}), - DataFrame({"a": [5, 7, 8]}, index=[2, 3, 4]), - DataFrame({"a": []}, dtype="object"), + DataFrame({"a": [5, 7]}, index=[2, 3]), + DataFrame({"a": [8]}, index=[4]), ] for i, result in enumerate(df_iter): tm.assert_frame_equal(result, expected_frames[i]) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index ecc768bdd02de..061d7a07b4d8d 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -325,7 +325,5 @@ def test_skip_rows_with_chunks(all_parsers): df1 = next(reader) df2 = next(reader) - tm.assert_frame_equal( - df1, DataFrame({"col_a": [20, 30, 60, 70]}, index=[0, 1, 2, 3]) - ) + tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]})) tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6])) From 265bd1ea01271ec79a068db9fb4c8c73f85d973e Mon Sep 17 00:00:00 2001 From: Flytre Date: Mon, 4 Dec 2023 12:51:09 -0500 Subject: [PATCH 8/8] Trigger CI