From 5cf6e74e198ba34c8f64bed32082fddc2292986d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Sep 2023 10:32:20 -1000 Subject: [PATCH] BUG: read_csv(on_bad_lines='warn') did not raise a Python warning (#55071) --- doc/source/whatsnew/v2.2.0.rst | 1 + pandas/_libs/parsers.pyx | 13 +++++-- pandas/io/parsers/python_parser.py | 11 ++++-- .../io/parser/common/test_read_errors.py | 39 +++++++------------ pandas/tests/io/parser/test_c_parser_only.py | 30 +++++++------- .../io/parser/test_python_parser_only.py | 12 +++--- pandas/tests/io/parser/test_textreader.py | 16 ++++---- 7 files changed, 62 insertions(+), 60 deletions(-) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 7bb4aaec0dd7c..a795514aa31f8 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -227,6 +227,7 @@ MultiIndex I/O ^^^ +- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raising a Python warning. 
This now yields a :class:`.errors.ParserWarning` (:issue:`54296`) - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`) Period diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6d66e21ce49f5..5f51f48b43ca9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -6,7 +6,6 @@ from csv import ( QUOTE_NONE, QUOTE_NONNUMERIC, ) -import sys import time import warnings @@ -880,9 +879,15 @@ cdef class TextReader: cdef _check_tokenize_status(self, int status): if self.parser.warn_msg != NULL: - print(PyUnicode_DecodeUTF8( - self.parser.warn_msg, strlen(self.parser.warn_msg), - self.encoding_errors), file=sys.stderr) + warnings.warn( + PyUnicode_DecodeUTF8( + self.parser.warn_msg, + strlen(self.parser.warn_msg), + self.encoding_errors + ), + ParserWarning, + stacklevel=find_stack_level() + ) free(self.parser.warn_msg) self.parser.warn_msg = NULL diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 6846ea2b196b8..43fb4ec3b55fc 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -13,7 +13,6 @@ import csv from io import StringIO import re -import sys from typing import ( IO, TYPE_CHECKING, @@ -21,6 +20,7 @@ Literal, cast, ) +import warnings import numpy as np @@ -28,8 +28,10 @@ from pandas.errors import ( EmptyDataError, ParserError, + ParserWarning, ) from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_bool_dtype, @@ -778,8 +780,11 @@ def _alert_malformed(self, msg: str, row_num: int) -> None: if self.on_bad_lines == self.BadLineHandleMethod.ERROR: raise ParserError(msg) if self.on_bad_lines == self.BadLineHandleMethod.WARN: - base = f"Skipping line {row_num}: " - sys.stderr.write(base + msg + "\n") + warnings.warn( + f"Skipping line {row_num}: {msg}\n", + ParserWarning, + 
stacklevel=find_stack_level(), + ) def _next_iter_line(self, row_num: int) -> list[Scalar] | None: """ diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 492b4d5ec058e..0c5a2e0d04e5a 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -15,6 +15,7 @@ from pandas.errors import ( EmptyDataError, ParserError, + ParserWarning, ) from pandas import DataFrame @@ -129,18 +130,16 @@ def test_unexpected_keyword_parameter_exception(all_parsers): parser.read_table("foo.tsv", foo=1) -def test_suppress_error_output(all_parsers, capsys): +def test_suppress_error_output(all_parsers): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), on_bad_lines="skip") + with tm.assert_produces_warning(None): + result = parser.read_csv(StringIO(data), on_bad_lines="skip") tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert captured.err == "" - def test_error_bad_lines(all_parsers): # see gh-15925 @@ -152,19 +151,18 @@ def test_error_bad_lines(all_parsers): parser.read_csv(StringIO(data), on_bad_lines="error") -def test_warn_bad_lines(all_parsers, capsys): +def test_warn_bad_lines(all_parsers): # see gh-15925 parser = all_parsers data = "a\n1\n1,2,3\n4\n5,6,7" expected = DataFrame({"a": [1, 4]}) - result = parser.read_csv(StringIO(data), on_bad_lines="warn") + with tm.assert_produces_warning( + ParserWarning, match="Skipping line", check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), on_bad_lines="warn") tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - assert "Skipping line 5" in captured.err - def test_read_csv_wrong_num_columns(all_parsers): # Too few columns. 
@@ -245,7 +243,7 @@ def test_bad_header_uniform_error(all_parsers): parser.read_csv(StringIO(data), index_col=0, on_bad_lines="error") -def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys): +def test_on_bad_lines_warn_correct_formatting(all_parsers): # see gh-15925 parser = all_parsers data = """1,2 @@ -256,17 +254,8 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers, capsys): """ expected = DataFrame({"1": "a", "2": ["b"] * 2}) - result = parser.read_csv(StringIO(data), on_bad_lines="warn") + with tm.assert_produces_warning( + ParserWarning, match="Skipping line", check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), on_bad_lines="warn") tm.assert_frame_equal(result, expected) - - captured = capsys.readouterr() - if parser.engine == "c": - warn = """Skipping line 3: expected 2 fields, saw 3 -Skipping line 4: expected 2 fields, saw 3 - -""" - else: - warn = """Skipping line 3: Expected 2 fields in line 3, saw 3 -Skipping line 4: Expected 2 fields in line 4, saw 3 -""" - assert captured.err == warn diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 32a010b3aeb34..18eee01f87621 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -19,7 +19,10 @@ from pandas.compat import is_ci_environment from pandas.compat.numpy import np_version_gte1p24 -from pandas.errors import ParserError +from pandas.errors import ( + ParserError, + ParserWarning, +) import pandas.util._test_decorators as td from pandas import ( @@ -461,7 +464,7 @@ def test_data_after_quote(c_parser_only): tm.assert_frame_equal(result, expected) -def test_comment_whitespace_delimited(c_parser_only, capsys): +def test_comment_whitespace_delimited(c_parser_only): parser = c_parser_only test_input = """\ 1 2 @@ -474,18 +477,17 @@ def test_comment_whitespace_delimited(c_parser_only, capsys): 8# 1 field, NaN 9 2 3 # skipped line # comment""" - df = 
parser.read_csv( - StringIO(test_input), - comment="#", - header=None, - delimiter="\\s+", - skiprows=0, - on_bad_lines="warn", - ) - captured = capsys.readouterr() - # skipped lines 2, 3, 4, 9 - for line_num in (2, 3, 4, 9): - assert f"Skipping line {line_num}" in captured.err + with tm.assert_produces_warning( + ParserWarning, match="Skipping line", check_stacklevel=False + ): + df = parser.read_csv( + StringIO(test_input), + comment="#", + header=None, + delimiter="\\s+", + skiprows=0, + on_bad_lines="warn", + ) expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]]) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 959b988e208c1..dbd474c6ae0b9 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -274,7 +274,7 @@ def test_multi_char_sep_quotes(python_parser_only, quoting): parser.read_csv(StringIO(data), quoting=quoting, **kwargs) -def test_none_delimiter(python_parser_only, capsys): +def test_none_delimiter(python_parser_only): # see gh-13374 and gh-17465 parser = python_parser_only data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9" @@ -283,12 +283,14 @@ def test_none_delimiter(python_parser_only, capsys): # We expect the third line in the data to be # skipped because it is malformed, but we do # not expect any errors to occur. 
- result = parser.read_csv(StringIO(data), header=0, sep=None, on_bad_lines="warn") + with tm.assert_produces_warning( + ParserWarning, match="Skipping line 3", check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), header=0, sep=None, on_bad_lines="warn" + ) tm.assert_frame_equal(result, expected) - captured = capsys.readouterr() - assert "Skipping line 3" in captured.err - @pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz']) @pytest.mark.parametrize("skipfooter", [0, 1]) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index f150ed3903443..e2d785a38eb51 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -12,6 +12,7 @@ import pandas._libs.parsers as parser from pandas._libs.parsers import TextReader +from pandas.errors import ParserWarning from pandas import DataFrame import pandas._testing as tm @@ -125,7 +126,7 @@ def test_integer_thousands_alt(self): expected = DataFrame([123456, 12500]) tm.assert_frame_equal(result, expected) - def test_skip_bad_lines(self, capsys): + def test_skip_bad_lines(self): # too many lines, see #2430 for why data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r" @@ -145,14 +146,11 @@ def test_skip_bad_lines(self, capsys): } assert_array_dicts_equal(result, expected) - reader = TextReader( - StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn - ) - reader.read() - captured = capsys.readouterr() - - assert "Skipping line 4" in captured.err - assert "Skipping line 6" in captured.err + with tm.assert_produces_warning(ParserWarning, match="Skipping line"): + reader = TextReader( + StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn + ) + reader.read() def test_header_not_enough_lines(self): data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"