diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 843144c3453c2c..6148086452d546 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -81,6 +81,9 @@ delim_whitespace : boolean, default False If this option is set to ``True``, nothing should be passed in for the ``delimiter`` parameter. + .. deprecated: 2.2.0 + Use ``sep="\\s+" instead. + Column and index locations and names ++++++++++++++++++++++++++++++++++++ diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 80d86805ded496..f7017aba1d9964 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -484,6 +484,7 @@ Other Deprecations - Deprecated support for combining parsed datetime columns in :func:`read_csv` along with the ``keep_date_col`` keyword (:issue:`55569`) - Deprecated the :attr:`.DataFrameGroupBy.grouper` and :attr:`SeriesGroupBy.grouper`; these attributes will be removed in a future version of pandas (:issue:`56521`) - Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`) +- Deprecated the ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep="\\s+"`` instead (:issue:`55569`) - Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`) - Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`) - Deprecated the ``kind`` keyword in :meth:`Series.resample` and :meth:`DataFrame.resample`, explicitly cast the object's ``index`` instead (:issue:`55895`) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 4e19f401af229d..a9b41b45aba2f3 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -403,6 +403,9 @@ used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option is set to ``True``, nothing should be passed in for the ``delimiter`` parameter. + + .. deprecated:: 2.2.0 + Use ``sep="\\s+"`` instead. low_memory : bool, default True Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. To ensure no mixed @@ -670,7 +673,7 @@ def read_csv( encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., on_bad_lines=..., - delim_whitespace: bool = ..., + delim_whitespace: bool | lib.NoDefault = ..., low_memory: bool = ..., memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., @@ -730,7 +733,7 @@ def read_csv( encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., on_bad_lines=..., - delim_whitespace: bool = ..., + delim_whitespace: bool | lib.NoDefault = ..., low_memory: bool = ..., memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., @@ -790,7 +793,7 @@ def read_csv( encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., on_bad_lines=..., - delim_whitespace: bool = ..., + delim_whitespace: bool | lib.NoDefault = ..., low_memory: bool = ..., memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., @@ -850,7 +853,7 @@ def read_csv( encoding_errors: str | None = ..., dialect: str | csv.Dialect | None = ..., on_bad_lines=..., - delim_whitespace: bool = ..., + delim_whitespace: bool | lib.NoDefault = ..., low_memory: bool = ..., memory_map: bool = ..., float_precision: Literal["high", "legacy"] | None = ..., @@ -928,7 +931,7 @@ def read_csv( # Error Handling on_bad_lines: str = "error", # Internal - delim_whitespace: bool = False, + delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, float_precision: Literal["high", "legacy"] | None = None, @@ -978,6 +981,17 @@ def read_csv( stacklevel=find_stack_level(), ) + if delim_whitespace is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'delim_whitespace' keyword in pd.read_csv is deprecated and " + "will be removed in a future version. Use ``sep='\\s+'`` instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + delim_whitespace = False + if verbose is not lib.no_default: # GH#55569 warnings.warn( @@ -1305,7 +1319,7 @@ def read_table( # Error Handling on_bad_lines: str = "error", # Internal - delim_whitespace: bool = False, + delim_whitespace: bool | lib.NoDefault = lib.no_default, low_memory: bool = _c_parser_defaults["low_memory"], memory_map: bool = False, float_precision: str | None = None, @@ -1346,6 +1360,17 @@ def read_table( stacklevel=find_stack_level(), ) + if delim_whitespace is not lib.no_default: + # GH#55569 + warnings.warn( + "The 'delim_whitespace' keyword in pd.read_table is deprecated and " + "will be removed in a future version. Use ``sep='\\s+'`` instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + delim_whitespace = False + if verbose is not lib.no_default: # GH#55569 warnings.warn( @@ -2131,6 +2156,9 @@ def _refine_defaults_read( used as the sep. Equivalent to setting ``sep='\\s+'``. If this option is set to True, nothing should be passed in for the ``delimiter`` parameter. + + .. deprecated:: 2.2.0 + Use ``sep="\\s+"`` instead. engine : {{'c', 'python'}} Parser engine to use. The C engine is faster while the python engine is currently more feature-complete. diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 558fdb76321023..7ffc49e941c14f 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -500,13 +500,21 @@ def test_trailing_spaces(all_parsers, kwargs, expected): data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501 parser = all_parsers + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + if parser.engine == "pyarrow": msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) return - result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs) tm.assert_frame_equal(result, expected) @@ -515,8 +523,12 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers): data = "a b c\n1 2 3" parser = all_parsers + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" with pytest.raises(ValueError, match="you can only specify one"): - parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True) def test_read_filepath_or_buffer(all_parsers): @@ -539,18 +551,27 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace): b\n""" expected = DataFrame({"MyColumn": list("abab")}) + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" if parser.engine == "pyarrow": msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): - parser.read_csv( - StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace - ) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv( + StringIO(data), + skipinitialspace=True, + delim_whitespace=delim_whitespace, + ) return - result = parser.read_csv( - StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace - ) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace + ) tm.assert_frame_equal(result, expected) @@ -798,12 +819,20 @@ def test_read_table_delim_whitespace_default_sep(all_parsers): f = StringIO("a b c\n1 -2 -3\n4 5 6") parser = all_parsers + depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" + if parser.engine == "pyarrow": msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_table(f, delim_whitespace=True) return - result = parser.read_table(f, delim_whitespace=True) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_table(f, delim_whitespace=True) expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) tm.assert_frame_equal(result, expected) @@ -817,11 +846,15 @@ def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter): "Specified a delimiter with both sep and " "delim_whitespace=True; you can only specify one." ) - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, sep=delimiter) + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, sep=delimiter) - with pytest.raises(ValueError, match=msg): - parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) + with pytest.raises(ValueError, match=msg): + parser.read_csv(f, delim_whitespace=True, delimiter=delimiter) def test_read_csv_delimiter_and_sep_no_default(all_parsers): @@ -858,11 +891,15 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): "Specified a delimiter with both sep and " "delim_whitespace=True; you can only specify one." ) - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, sep=delimiter) + depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, sep=delimiter) - with pytest.raises(ValueError, match=msg): - parser.read_table(f, delim_whitespace=True, delimiter=delimiter) + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, delimiter=delimiter) @skip_pyarrow diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 500863dce84ee5..27d7bc0bb6c07a 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -51,7 +51,11 @@ def test_delim_whitespace_custom_terminator(c_parser_only): data = "a b c~1 2 3~4 5 6~7 8 9" parser = c_parser_only - df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True) expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index 4af03e5f3b1ecc..abaeeb86476da1 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -41,8 +41,12 @@ def test_line_comment(all_parsers, read_kwargs, request): #ignore this line 5.,NaN,10.0 """ + warn = None + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + if read_kwargs.get("delim_whitespace"): data = data.replace(",", " ") + warn = FutureWarning elif read_kwargs.get("lineterminator"): data = data.replace("\n", read_kwargs.get("lineterminator")) @@ -55,15 +59,22 @@ def test_line_comment(all_parsers, read_kwargs, request): else: msg = "The 'comment' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), **read_kwargs) + with tm.assert_produces_warning( + warn, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), **read_kwargs) return elif parser.engine == "python" and read_kwargs.get("lineterminator"): msg = r"Custom line terminators not supported in python parser \(yet\)" with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), **read_kwargs) + with tm.assert_produces_warning( + warn, match=depr_msg, check_stacklevel=False + ): + parser.read_csv(StringIO(data), **read_kwargs) return - result = parser.read_csv(StringIO(data), **read_kwargs) + with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): + result = parser.read_csv(StringIO(data), **read_kwargs) expected = DataFrame( [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"] diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index aaac416d8f464a..0dbd4e3569ad6d 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -706,7 +706,11 @@ def test_header_delim_whitespace(all_parsers): 3,4 """ - result = parser.read_csv(StringIO(data), delim_whitespace=True) + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv(StringIO(data), delim_whitespace=True) expected = DataFrame({"a,b": ["1,2", "3,4"]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 480d579f7f400e..8566b87ef42921 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -602,7 +602,10 @@ def test_skiprows_inference(): 101.6 956.1 """.strip() skiprows = 2 - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) @@ -617,7 +620,10 @@ def test_skiprows_by_index_inference(): 456 78 9 456 """.strip() skiprows = [0, 2] - expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True) result = read_fwf(StringIO(data), skiprows=skiprows) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 749bd47d5c1a38..2d50916228f148 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -216,12 +216,17 @@ def test_skiprows_lineterminator(all_parsers, lineterminator, request): request.applymarker(mark) data = data.replace("\n", lineterminator) - result = parser.read_csv( - StringIO(data), - skiprows=1, - delim_whitespace=True, - names=["date", "time", "var", "flag", "oflag"], - ) + + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), + skiprows=1, + delim_whitespace=True, + names=["date", "time", "var", "flag", "oflag"], + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 7a9e5b14e5eea9..f8790bdb5fa426 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -43,9 +43,12 @@ def test_c_engine(self): data = "a b c\n1 2 3" msg = "does not support" + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + # specify C engine with unsupported options (raise) with pytest.raises(ValueError, match=msg): - read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False) with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="c", sep=r"\s") with pytest.raises(ValueError, match=msg): @@ -54,7 +57,7 @@ def test_c_engine(self): read_csv(StringIO(data), engine="c", skipfooter=1) # specify C-unsupported options without python-unsupported options - with tm.assert_produces_warning(parsers.ParserWarning): + with tm.assert_produces_warning((parsers.ParserWarning, FutureWarning)): read_csv(StringIO(data), sep=None, delim_whitespace=False) with tm.assert_produces_warning(parsers.ParserWarning): read_csv(StringIO(data), sep=r"\s") @@ -153,10 +156,15 @@ def test_pyarrow_engine(self): elif default == "on_bad_lines": kwargs[default] = "warn" - depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated" warn = None + depr_msg = None + if "delim_whitespace" in kwargs: + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + warn = FutureWarning if "verbose" in kwargs: + depr_msg = "The 'verbose' keyword in pd.read_csv is deprecated" warn = FutureWarning + with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(warn, match=depr_msg): read_csv(StringIO(data), engine="pyarrow", **kwargs) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index 92db98bc4ec287..767fba666e4176 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -257,13 +257,25 @@ def test_usecols_with_whitespace(all_parsers): parser = all_parsers data = "a b c\n4 apple bat 5.7\n8 orange cow 10" + depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated" + if parser.engine == "pyarrow": msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine" with pytest.raises(ValueError, match=msg): - parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + parser.read_csv( + StringIO(data), delim_whitespace=True, usecols=("a", "b") + ) return - result = parser.read_csv(StringIO(data), delim_whitespace=True, usecols=("a", "b")) + with tm.assert_produces_warning( + FutureWarning, match=depr_msg, check_stacklevel=False + ): + result = parser.read_csv( + StringIO(data), delim_whitespace=True, usecols=("a", "b") + ) expected = DataFrame({"a": ["apple", "orange"], "b": ["bat", "cow"]}, index=[4, 8]) tm.assert_frame_equal(result, expected)