Skip to content

Commit

Permalink
BUG: read_csv(on_bad_lines='warn') did not raise a Python warning (#5…
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored Sep 8, 2023
1 parent 711fea0 commit 5cf6e74
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 60 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ MultiIndex

I/O
^^^
- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)

Period
Expand Down
13 changes: 9 additions & 4 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ from csv import (
QUOTE_NONE,
QUOTE_NONNUMERIC,
)
import sys
import time
import warnings

Expand Down Expand Up @@ -880,9 +879,15 @@ cdef class TextReader:

cdef _check_tokenize_status(self, int status):
if self.parser.warn_msg != NULL:
print(PyUnicode_DecodeUTF8(
self.parser.warn_msg, strlen(self.parser.warn_msg),
self.encoding_errors), file=sys.stderr)
warnings.warn(
PyUnicode_DecodeUTF8(
self.parser.warn_msg,
strlen(self.parser.warn_msg),
self.encoding_errors
),
ParserWarning,
stacklevel=find_stack_level()
)
free(self.parser.warn_msg)
self.parser.warn_msg = NULL

Expand Down
11 changes: 8 additions & 3 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,25 @@
import csv
from io import StringIO
import re
import sys
from typing import (
IO,
TYPE_CHECKING,
DefaultDict,
Literal,
cast,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas.errors import (
EmptyDataError,
ParserError,
ParserWarning,
)
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
is_bool_dtype,
Expand Down Expand Up @@ -778,8 +780,11 @@ def _alert_malformed(self, msg: str, row_num: int) -> None:
if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
raise ParserError(msg)
if self.on_bad_lines == self.BadLineHandleMethod.WARN:
base = f"Skipping line {row_num}: "
sys.stderr.write(base + msg + "\n")
warnings.warn(
f"Skipping line {row_num}: {msg}\n",
ParserWarning,
stacklevel=find_stack_level(),
)

def _next_iter_line(self, row_num: int) -> list[Scalar] | None:
"""
Expand Down
39 changes: 14 additions & 25 deletions pandas/tests/io/parser/common/test_read_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from pandas.errors import (
EmptyDataError,
ParserError,
ParserWarning,
)

from pandas import DataFrame
Expand Down Expand Up @@ -129,18 +130,16 @@ def test_unexpected_keyword_parameter_exception(all_parsers):
parser.read_table("foo.tsv", foo=1)


def test_suppress_error_output(all_parsers, capsys):
def test_suppress_error_output(all_parsers):
# see gh-15925
parser = all_parsers
data = "a\n1\n1,2,3\n4\n5,6,7"
expected = DataFrame({"a": [1, 4]})

result = parser.read_csv(StringIO(data), on_bad_lines="skip")
with tm.assert_produces_warning(None):
result = parser.read_csv(StringIO(data), on_bad_lines="skip")
tm.assert_frame_equal(result, expected)

captured = capsys.readouterr()
assert captured.err == ""


def test_error_bad_lines(all_parsers):
# see gh-15925
Expand All @@ -152,19 +151,18 @@ def test_error_bad_lines(all_parsers):
parser.read_csv(StringIO(data), on_bad_lines="error")


def test_warn_bad_lines(all_parsers, capsys):
def test_warn_bad_lines(all_parsers):
# see gh-15925
parser = all_parsers
data = "a\n1\n1,2,3\n4\n5,6,7"
expected = DataFrame({"a": [1, 4]})

result = parser.read_csv(StringIO(data), on_bad_lines="warn")
with tm.assert_produces_warning(
ParserWarning, match="Skipping line", check_stacklevel=False
):
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
tm.assert_frame_equal(result, expected)

captured = capsys.readouterr()
assert "Skipping line 3" in captured.err
assert "Skipping line 5" in captured.err


def test_read_csv_wrong_num_columns(all_parsers):
# Too few columns.
Expand Down Expand Up @@ -245,7 +243,7 @@ def test_bad_header_uniform_error(all_parsers):
parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error")


def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys):
def test_on_bad_lines_warn_correct_formatting(all_parsers):
# see gh-15925
parser = all_parsers
data = """1,2
Expand All @@ -256,17 +254,8 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys):
"""
expected = DataFrame({"1": "a", "2": ["b"] * 2})

result = parser.read_csv(StringIO(data), on_bad_lines="warn")
with tm.assert_produces_warning(
ParserWarning, match="Skipping line", check_stacklevel=False
):
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
tm.assert_frame_equal(result, expected)

captured = capsys.readouterr()
if parser.engine == "c":
warn = """Skipping line 3: expected 2 fields, saw 3
Skipping line 4: expected 2 fields, saw 3
"""
else:
warn = """Skipping line 3: Expected 2 fields in line 3, saw 3
Skipping line 4: Expected 2 fields in line 4, saw 3
"""
assert captured.err == warn
30 changes: 16 additions & 14 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@

from pandas.compat import is_ci_environment
from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import ParserError
from pandas.errors import (
ParserError,
ParserWarning,
)
import pandas.util._test_decorators as td

from pandas import (
Expand Down Expand Up @@ -461,7 +464,7 @@ def test_data_after_quote(c_parser_only):
tm.assert_frame_equal(result, expected)


def test_comment_whitespace_delimited(c_parser_only, capsys):
def test_comment_whitespace_delimited(c_parser_only):
parser = c_parser_only
test_input = """\
1 2
Expand All @@ -474,18 +477,17 @@ def test_comment_whitespace_delimited(c_parser_only, capsys):
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
df = parser.read_csv(
StringIO(test_input),
comment="#",
header=None,
delimiter="\\s+",
skiprows=0,
on_bad_lines="warn",
)
captured = capsys.readouterr()
# skipped lines 2, 3, 4, 9
for line_num in (2, 3, 4, 9):
assert f"Skipping line {line_num}" in captured.err
with tm.assert_produces_warning(
ParserWarning, match="Skipping line", check_stacklevel=False
):
df = parser.read_csv(
StringIO(test_input),
comment="#",
header=None,
delimiter="\\s+",
skiprows=0,
on_bad_lines="warn",
)
expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]])
tm.assert_frame_equal(df, expected)

Expand Down
12 changes: 7 additions & 5 deletions pandas/tests/io/parser/test_python_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def test_multi_char_sep_quotes(python_parser_only, quoting):
parser.read_csv(StringIO(data), quoting=quoting, **kwargs)


def test_none_delimiter(python_parser_only, capsys):
def test_none_delimiter(python_parser_only):
# see gh-13374 and gh-17465
parser = python_parser_only
data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
Expand All @@ -283,12 +283,14 @@ def test_none_delimiter(python_parser_only, capsys):
# We expect the third line in the data to be
# skipped because it is malformed, but we do
# not expect any errors to occur.
result = parser.read_csv(StringIO(data), header=0, sep=None, on_bad_lines="warn")
with tm.assert_produces_warning(
ParserWarning, match="Skipping line 3", check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), header=0, sep=None, on_bad_lines="warn"
)
tm.assert_frame_equal(result, expected)

captured = capsys.readouterr()
assert "Skipping line 3" in captured.err


@pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
@pytest.mark.parametrize("skipfooter", [0, 1])
Expand Down
16 changes: 7 additions & 9 deletions pandas/tests/io/parser/test_textreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import pandas._libs.parsers as parser
from pandas._libs.parsers import TextReader
from pandas.errors import ParserWarning

from pandas import DataFrame
import pandas._testing as tm
Expand Down Expand Up @@ -125,7 +126,7 @@ def test_integer_thousands_alt(self):
expected = DataFrame([123456, 12500])
tm.assert_frame_equal(result, expected)

def test_skip_bad_lines(self, capsys):
def test_skip_bad_lines(self):
# too many lines, see #2430 for why
data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"

Expand All @@ -145,14 +146,11 @@ def test_skip_bad_lines(self, capsys):
}
assert_array_dicts_equal(result, expected)

reader = TextReader(
StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn
)
reader.read()
captured = capsys.readouterr()

assert "Skipping line 4" in captured.err
assert "Skipping line 6" in captured.err
with tm.assert_produces_warning(ParserWarning, match="Skipping line"):
reader = TextReader(
StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn
)
reader.read()

def test_header_not_enough_lines(self):
data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"
Expand Down

0 comments on commit 5cf6e74

Please sign in to comment.