Skip to content

Commit

Permalink
DEPR: delim_whitespace kwd in read_csv (pandas-dev#56557)
Browse files Browse the repository at this point in the history
* DEPR: delim_whitespace kwd in read_csv

* Update doc/source/whatsnew/v2.2.0.rst

Co-authored-by: Matthew Roeschke <[email protected]>

---------

Co-authored-by: Matthew Roeschke <[email protected]>
  • Loading branch information
2 people authored and cbpygit committed Jan 2, 2024
1 parent 57a5a15 commit 35bc2d3
Show file tree
Hide file tree
Showing 11 changed files with 162 additions and 43 deletions.
3 changes: 3 additions & 0 deletions doc/source/user_guide/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,9 @@ delim_whitespace : boolean, default False
If this option is set to ``True``, nothing should be passed in for the
``delimiter`` parameter.

.. deprecated:: 2.2.0
Use ``sep="\\s+"`` instead.
Column and index locations and names
++++++++++++++++++++++++++++++++++++

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,7 @@ Other Deprecations
- Deprecated support for combining parsed datetime columns in :func:`read_csv` along with the ``keep_date_col`` keyword (:issue:`55569`)
- Deprecated the :attr:`.DataFrameGroupBy.grouper` and :attr:`SeriesGroupBy.grouper`; these attributes will be removed in a future version of pandas (:issue:`56521`)
- Deprecated the :class:`.Grouping` attributes ``group_index``, ``result_index``, and ``group_arraylike``; these will be removed in a future version of pandas (:issue:`56148`)
- Deprecated the ``delim_whitespace`` keyword in :func:`read_csv` and :func:`read_table`, use ``sep="\\s+"`` instead (:issue:`55569`)
- Deprecated the ``errors="ignore"`` option in :func:`to_datetime`, :func:`to_timedelta`, and :func:`to_numeric`; explicitly catch exceptions instead (:issue:`54467`)
- Deprecated the ``fastpath`` keyword in the :class:`Series` constructor (:issue:`20110`)
- Deprecated the ``kind`` keyword in :meth:`Series.resample` and :meth:`DataFrame.resample`, explicitly cast the object's ``index`` instead (:issue:`55895`)
Expand Down
40 changes: 34 additions & 6 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,9 @@
used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option
is set to ``True``, nothing should be passed in for the ``delimiter``
parameter.
.. deprecated:: 2.2.0
Use ``sep="\\s+"`` instead.
low_memory : bool, default True
Internally process the file in chunks, resulting in lower memory use
while parsing, but possibly mixed type inference. To ensure no mixed
Expand Down Expand Up @@ -670,7 +673,7 @@ def read_csv(
encoding_errors: str | None = ...,
dialect: str | csv.Dialect | None = ...,
on_bad_lines=...,
delim_whitespace: bool = ...,
delim_whitespace: bool | lib.NoDefault = ...,
low_memory: bool = ...,
memory_map: bool = ...,
float_precision: Literal["high", "legacy"] | None = ...,
Expand Down Expand Up @@ -730,7 +733,7 @@ def read_csv(
encoding_errors: str | None = ...,
dialect: str | csv.Dialect | None = ...,
on_bad_lines=...,
delim_whitespace: bool = ...,
delim_whitespace: bool | lib.NoDefault = ...,
low_memory: bool = ...,
memory_map: bool = ...,
float_precision: Literal["high", "legacy"] | None = ...,
Expand Down Expand Up @@ -790,7 +793,7 @@ def read_csv(
encoding_errors: str | None = ...,
dialect: str | csv.Dialect | None = ...,
on_bad_lines=...,
delim_whitespace: bool = ...,
delim_whitespace: bool | lib.NoDefault = ...,
low_memory: bool = ...,
memory_map: bool = ...,
float_precision: Literal["high", "legacy"] | None = ...,
Expand Down Expand Up @@ -850,7 +853,7 @@ def read_csv(
encoding_errors: str | None = ...,
dialect: str | csv.Dialect | None = ...,
on_bad_lines=...,
delim_whitespace: bool = ...,
delim_whitespace: bool | lib.NoDefault = ...,
low_memory: bool = ...,
memory_map: bool = ...,
float_precision: Literal["high", "legacy"] | None = ...,
Expand Down Expand Up @@ -928,7 +931,7 @@ def read_csv(
# Error Handling
on_bad_lines: str = "error",
# Internal
delim_whitespace: bool = False,
delim_whitespace: bool | lib.NoDefault = lib.no_default,
low_memory: bool = _c_parser_defaults["low_memory"],
memory_map: bool = False,
float_precision: Literal["high", "legacy"] | None = None,
Expand Down Expand Up @@ -978,6 +981,17 @@ def read_csv(
stacklevel=find_stack_level(),
)

if delim_whitespace is not lib.no_default:
# GH#55569
warnings.warn(
"The 'delim_whitespace' keyword in pd.read_csv is deprecated and "
"will be removed in a future version. Use ``sep='\\s+'`` instead",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
delim_whitespace = False

if verbose is not lib.no_default:
# GH#55569
warnings.warn(
Expand Down Expand Up @@ -1305,7 +1319,7 @@ def read_table(
# Error Handling
on_bad_lines: str = "error",
# Internal
delim_whitespace: bool = False,
delim_whitespace: bool | lib.NoDefault = lib.no_default,
low_memory: bool = _c_parser_defaults["low_memory"],
memory_map: bool = False,
float_precision: str | None = None,
Expand Down Expand Up @@ -1346,6 +1360,17 @@ def read_table(
stacklevel=find_stack_level(),
)

if delim_whitespace is not lib.no_default:
# GH#55569
warnings.warn(
"The 'delim_whitespace' keyword in pd.read_table is deprecated and "
"will be removed in a future version. Use ``sep='\\s+'`` instead",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
delim_whitespace = False

if verbose is not lib.no_default:
# GH#55569
warnings.warn(
Expand Down Expand Up @@ -2131,6 +2156,9 @@ def _refine_defaults_read(
used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
is set to True, nothing should be passed in for the ``delimiter``
parameter.
.. deprecated:: 2.2.0
Use ``sep="\\s+"`` instead.
engine : {{'c', 'python'}}
Parser engine to use. The C engine is faster while the python engine is
currently more feature-complete.
Expand Down
75 changes: 56 additions & 19 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,13 +500,21 @@ def test_trailing_spaces(all_parsers, kwargs, expected):
data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" # noqa: E501
parser = all_parsers

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
return

result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
tm.assert_frame_equal(result, expected)


Expand All @@ -515,8 +523,12 @@ def test_raise_on_sep_with_delim_whitespace(all_parsers):
data = "a b c\n1 2 3"
parser = all_parsers

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with pytest.raises(ValueError, match="you can only specify one"):
parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)


def test_read_filepath_or_buffer(all_parsers):
Expand All @@ -539,18 +551,27 @@ def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
b\n"""

expected = DataFrame({"MyColumn": list("abab")})
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

if parser.engine == "pyarrow":
msg = "The 'skipinitialspace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_csv(
StringIO(data),
skipinitialspace=True,
delim_whitespace=delim_whitespace,
)
return

result = parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
)
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -798,12 +819,20 @@ def test_read_table_delim_whitespace_default_sep(all_parsers):
f = StringIO("a b c\n1 -2 -3\n4 5 6")
parser = all_parsers

depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"

if parser.engine == "pyarrow":
msg = "The 'delim_whitespace' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
parser.read_table(f, delim_whitespace=True)
return
result = parser.read_table(f, delim_whitespace=True)
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_table(f, delim_whitespace=True)
expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
tm.assert_frame_equal(result, expected)

Expand All @@ -817,11 +846,15 @@ def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, sep=delimiter)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, sep=delimiter)

with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)
with pytest.raises(ValueError, match=msg):
parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)


def test_read_csv_delimiter_and_sep_no_default(all_parsers):
Expand Down Expand Up @@ -858,11 +891,15 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
"Specified a delimiter with both sep and "
"delim_whitespace=True; you can only specify one."
)
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, sep=delimiter)
depr_msg = "The 'delim_whitespace' keyword in pd.read_table is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, sep=delimiter)

with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)
with pytest.raises(ValueError, match=msg):
parser.read_table(f, delim_whitespace=True, delimiter=delimiter)


@skip_pyarrow
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,11 @@ def test_delim_whitespace_custom_terminator(c_parser_only):
data = "a b c~1 2 3~4 5 6~7 8 9"
parser = c_parser_only

df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
tm.assert_frame_equal(df, expected)

Expand Down
17 changes: 14 additions & 3 deletions pandas/tests/io/parser/test_comment.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,12 @@ def test_line_comment(all_parsers, read_kwargs, request):
#ignore this line
5.,NaN,10.0
"""
warn = None
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"

if read_kwargs.get("delim_whitespace"):
data = data.replace(",", " ")
warn = FutureWarning
elif read_kwargs.get("lineterminator"):
data = data.replace("\n", read_kwargs.get("lineterminator"))

Expand All @@ -55,15 +59,22 @@ def test_line_comment(all_parsers, read_kwargs, request):
else:
msg = "The 'comment' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **read_kwargs)
with tm.assert_produces_warning(
warn, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), **read_kwargs)
return
elif parser.engine == "python" and read_kwargs.get("lineterminator"):
msg = r"Custom line terminators not supported in python parser \(yet\)"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **read_kwargs)
with tm.assert_produces_warning(
warn, match=depr_msg, check_stacklevel=False
):
parser.read_csv(StringIO(data), **read_kwargs)
return

result = parser.read_csv(StringIO(data), **read_kwargs)
with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False):
result = parser.read_csv(StringIO(data), **read_kwargs)

expected = DataFrame(
[[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"]
Expand Down
6 changes: 5 additions & 1 deletion pandas/tests/io/parser/test_header.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,11 @@ def test_header_delim_whitespace(all_parsers):
3,4
"""

result = parser.read_csv(StringIO(data), delim_whitespace=True)
depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(StringIO(data), delim_whitespace=True)
expected = DataFrame({"a,b": ["1,2", "3,4"]})
tm.assert_frame_equal(result, expected)

Expand Down
10 changes: 8 additions & 2 deletions pandas/tests/io/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,10 @@ def test_skiprows_inference():
101.6 956.1
""".strip()
skiprows = 2
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

result = read_fwf(StringIO(data), skiprows=skiprows)
tm.assert_frame_equal(result, expected)
Expand All @@ -617,7 +620,10 @@ def test_skiprows_by_index_inference():
456 78 9 456
""".strip()
skiprows = [0, 2]
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
expected = read_csv(StringIO(data), skiprows=skiprows, delim_whitespace=True)

result = read_fwf(StringIO(data), skiprows=skiprows)
tm.assert_frame_equal(result, expected)
Expand Down
17 changes: 11 additions & 6 deletions pandas/tests/io/parser/test_skiprows.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,12 +216,17 @@ def test_skiprows_lineterminator(all_parsers, lineterminator, request):
request.applymarker(mark)

data = data.replace("\n", lineterminator)
result = parser.read_csv(
StringIO(data),
skiprows=1,
delim_whitespace=True,
names=["date", "time", "var", "flag", "oflag"],
)

depr_msg = "The 'delim_whitespace' keyword in pd.read_csv is deprecated"
with tm.assert_produces_warning(
FutureWarning, match=depr_msg, check_stacklevel=False
):
result = parser.read_csv(
StringIO(data),
skiprows=1,
delim_whitespace=True,
names=["date", "time", "var", "flag", "oflag"],
)
tm.assert_frame_equal(result, expected)


Expand Down
Loading

0 comments on commit 35bc2d3

Please sign in to comment.