Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Read CSV on python engine fails when skiprows and chunk size are specified (#55677, #56323) #56250

Merged
merged 11 commits into from
Dec 5, 2023
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,8 @@ MultiIndex

I/O
^^^
- Bug in :func:`read_csv` where ``engine="python"`` did not respect ``chunksize`` arg when ``skiprows`` was specified. (:issue:`56323`)
- Bug in :func:`read_csv` where ``engine="python"`` raised a ``TypeError`` when both a callable ``skiprows`` and a ``chunksize`` were specified. (:issue:`55677`)
- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`)
- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)
Expand Down
24 changes: 12 additions & 12 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -1117,18 +1117,18 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
new_rows = []
try:
if rows is not None:
rows_to_skip = 0
if self.skiprows is not None and self.pos is not None:
# Only read additional rows if pos is in skiprows
rows_to_skip = len(
set(self.skiprows) - set(range(self.pos))
)

for _ in range(rows + rows_to_skip):
row_index = 0
row_ct = 0
offset = self.pos if self.pos is not None else 0
while row_ct < rows:
# assert for mypy, data is Iterator[str] or None, would
# error in next
assert self.data is not None
new_rows.append(next(self.data))
new_row = next(self.data)
if not self.skipfunc(offset + row_index):
row_ct += 1
row_index += 1
new_rows.append(new_row)

len_new_rows = len(new_rows)
new_rows = self._remove_skipped_rows(new_rows)
Expand All @@ -1137,11 +1137,11 @@ def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
rows = 0

while True:
new_row = self._next_iter_line(row_num=self.pos + rows + 1)
next_row = self._next_iter_line(row_num=self.pos + rows + 1)
rows += 1

if new_row is not None:
new_rows.append(new_row)
if next_row is not None:
new_rows.append(next_row)
len_new_rows = len(new_rows)

except StopIteration:
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/io/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -898,7 +898,7 @@ def test_skip_rows_and_n_rows():


def test_skiprows_with_iterator():
# GH#10261
# GH#10261, GH#56323
data = """0
1
2
Expand All @@ -920,8 +920,8 @@ def test_skiprows_with_iterator():
)
expected_frames = [
DataFrame({"a": [3, 4]}),
DataFrame({"a": [5, 7, 8]}, index=[2, 3, 4]),
DataFrame({"a": []}, dtype="object"),
DataFrame({"a": [5, 7]}, index=[2, 3]),
DataFrame({"a": [8]}, index=[4]),
]
for i, result in enumerate(df_iter):
tm.assert_frame_equal(result, expected_frames[i])
Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/io/parser/test_skiprows.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,3 +301,29 @@ def test_skip_rows_and_n_rows(all_parsers):
result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_skip_rows_with_chunks(all_parsers):
    """Check that a callable ``skiprows`` works together with ``chunksize``.

    The callable skips file rows 1, 4 and 5 (row 0 is the ``col_a`` header),
    i.e. the values 10, 40 and 50. With ``chunksize=4`` the first chunk must
    therefore hold the next four kept values and the second chunk the
    remaining three, with the index continuing from the first chunk.
    """
    # GH 55677
    data = """col_a
10
20
30
40
50
60
70
80
90
100
"""
    parser = all_parsers
    # NOTE(review): ``engine`` is intentionally not passed here. The previous
    # ``engine=parser`` handed the fixture object (not an engine name) to
    # ``read_csv``; presumably the fixture selects the engine itself, as the
    # other tests in this file call ``parser.read_csv`` without it - confirm
    # against the fixture definition.
    with parser.read_csv(
        StringIO(data), skiprows=lambda x: x in [1, 4, 5], chunksize=4
    ) as reader:
        # The reader is used as a context manager so it is closed
        # deterministically once both chunks have been pulled.
        df1 = next(reader)
        df2 = next(reader)

    tm.assert_frame_equal(df1, DataFrame({"col_a": [20, 30, 60, 70]}))
    tm.assert_frame_equal(df2, DataFrame({"col_a": [80, 90, 100]}, index=[4, 5, 6]))
Loading