Skip to content

Commit

Permalink
Fix GH 55677:
Browse files Browse the repository at this point in the history
Added support for the python parser to handle using the skiprows and chunksize options at the same time, ensuring the API contract is met.

Added a regression test to ensure this bug can be quickly caught in the future if it reappears.

Signed-off-by: Flytre <[email protected]>
  • Loading branch information
Flytre committed Nov 29, 2023
1 parent b2d9ec1 commit d654823
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 26 deletions.
21 changes: 9 additions & 12 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1117,18 +1117,15 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
new_rows = []
try:
if rows is not None:
rows_to_skip = 0
if self.skiprows is not None and self.pos is not None:
# Only read additional rows if pos is in skiprows
rows_to_skip = len(
set(self.skiprows) - set(range(self.pos))
)

for _ in range(rows + rows_to_skip):
# assert for mypy, data is Iterator[str] or None, would
# error in next
assert self.data is not None
new_rows.append(next(self.data))
row_index = 0
row_ct = 0
offset = self.pos if self.pos is not None else 0
while row_ct < rows:
new_row = next(self.data)
if not self.skipfunc(offset + row_index):
row_ct += 1
row_index += 1
new_rows.append(new_row)

len_new_rows = len(new_rows)
new_rows = self._remove_skipped_rows(new_rows)
Expand Down
56 changes: 42 additions & 14 deletions pandas/tests/io/parser/test_skiprows.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,11 @@ def test_skip_rows_blank(all_parsers):
[
(
"""id,text,num_lines
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1""",
1,"line 11
line 12",2
2,"line 21
line 22",2
3,"line 31",1""",
{"skiprows": [1]},
DataFrame(
[[2, "line 21\nline 22", 2], [3, "line 31", 1]],
Expand Down Expand Up @@ -156,23 +156,23 @@ def test_skip_row_with_quote(all_parsers):
[
(
"""id,text,num_lines
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1""",
1,"line \n'11' line 12",2
2,"line \n'21' line 22",2
3,"line \n'31' line 32",1""",
[[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
),
(
"""id,text,num_lines
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1""",
1,"line '11\n' line 12",2
2,"line '21\n' line 22",2
3,"line '31\n' line 32",1""",
[[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
),
(
"""id,text,num_lines
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1""",
1,"line '11\n' \r\tline 12",2
2,"line '21\n' \r\tline 22",2
3,"line '31\n' \r\tline 32",1""",
[[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
),
],
Expand Down Expand Up @@ -301,3 +301,31 @@ def test_skip_rows_and_n_rows(all_parsers):
result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_skip_rows_with_chunks(all_parsers):
    """Check that a callable ``skiprows`` is honoured across ``chunksize`` chunks.

    Regression test for GH 55677. Rows 1, 4 and 5 of the input (row 0 being
    the header) are skipped, so the seven remaining data rows must be
    delivered as one full chunk of four rows followed by a chunk of three,
    with the index continuing from the first chunk into the second.
    """
    # GH 55677
    data = """col_a
10
20
30
40
50
60
70
80
90
100
"""
    parser = all_parsers
    # Use the reader as a context manager so it is closed even if pulling a
    # chunk raises.
    with parser.read_csv(
        StringIO(data), skiprows=lambda x: x in [1, 4, 5], chunksize=4
    ) as reader:
        df1 = next(reader)
        df2 = next(reader)

    tm.assert_frame_equal(
        df1, DataFrame({"col_a": [20, 30, 60, 70]}, index=[0, 1, 2, 3])
    )
    tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))

0 comments on commit d654823

Please sign in to comment.