Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Read CSV on python engine fails with callable skiprows and chunk size specified (#55677) #56238

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,7 @@ MultiIndex

I/O
^^^
- Bug in :func:`read_csv` where ``engine="python"`` raised a ``TypeError`` when a callable ``skiprows`` and a ``chunksize`` were both specified (:issue:`55677`)
- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`)
- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)
Expand Down
21 changes: 9 additions & 12 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1117,18 +1117,15 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
new_rows = []
try:
if rows is not None:
rows_to_skip = 0
if self.skiprows is not None and self.pos is not None:
# Only read additional rows if pos is in skiprows
rows_to_skip = len(
set(self.skiprows) - set(range(self.pos))
)

for _ in range(rows + rows_to_skip):
# assert for mypy, data is Iterator[str] or None, would
# error in next
assert self.data is not None
new_rows.append(next(self.data))
row_index = 0
row_ct = 0
offset = self.pos if self.pos is not None else 0
while row_ct < rows:
new_row = next(self.data)
if not self.skipfunc(offset + row_index):
row_ct += 1
row_index += 1
new_rows.append(new_row)

len_new_rows = len(new_rows)
new_rows = self._remove_skipped_rows(new_rows)
Expand Down
56 changes: 42 additions & 14 deletions pandas/tests/io/parser/test_skiprows.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,11 @@ def test_skip_rows_blank(all_parsers):
[
(
"""id,text,num_lines
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1""",
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1""",
{"skiprows": [1]},
DataFrame(
[[2, "line 21\nline 22", 2], [3, "line 31", 1]],
Expand Down Expand Up @@ -156,23 +156,23 @@ def test_skip_row_with_quote(all_parsers):
[
(
"""id,text,num_lines
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1""",
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1""",
[[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
),
(
"""id,text,num_lines
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1""",
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1""",
[[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
),
(
"""id,text,num_lines
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1""",
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1""",
[[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
),
],
Expand Down Expand Up @@ -301,3 +301,31 @@ def test_skip_rows_and_n_rows(all_parsers):
result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_skip_rows_with_chunks(all_parsers):
    """Check that a callable ``skiprows`` works together with ``chunksize``.

    The file is read in chunks of four rows while the callable drops file
    rows 1, 4 and 5.  Skipped rows must not count towards the chunk size,
    and the index must keep running across chunks.
    """
    # GH 55677
    data = """col_a
10
20
30
40
50
60
70
80
90
100
"""
    parser = all_parsers
    # Row 0 is the header, so the callable removes the values 10, 40 and 50.
    # No ``engine`` keyword is passed: ``parser`` is the fixture object, not
    # an engine name, and the fixture is expected to select the engine itself.
    # The reader is used as a context manager so it is closed even if one of
    # the ``next`` calls raises.
    with parser.read_csv(
        StringIO(data), skiprows=lambda x: x in [1, 4, 5], chunksize=4
    ) as reader:
        df1 = next(reader)
        df2 = next(reader)

    tm.assert_frame_equal(
        df1, DataFrame({"col_a": [20, 30, 60, 70]}, index=[0, 1, 2, 3])
    )
    # The second chunk holds the remaining three rows; its index continues
    # from where the first chunk stopped.
    tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))
Loading