Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST (string dtype): resolve all xfails in IO parser tests #60321

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions pandas/tests/io/parser/common/test_chunksize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs import parsers as libparsers
from pandas.errors import DtypeWarning

Expand Down Expand Up @@ -231,8 +229,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
assert result.a.dtype == float


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_warn_if_chunks_have_mismatched_type(all_parsers):
def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
warning_type = None
parser = all_parsers
size = 10000
Expand Down Expand Up @@ -260,8 +257,12 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
"Specify dtype option on import or set low_memory=False.",
buf,
)

assert df.a.dtype == object
if parser.engine == "c" and parser.low_memory:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't low_memory still be using the proper data type? Or why would that stick to object?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not super familiar with the parser code, but I think that with the low-memory parser, parsing is done in chunks. So if the inference changes later on, you end up with chunks of different types, and get object dtype as a result.

In the test here, we have a column with mostly integers and only a few strings in the middle. With the default parser, it will decide based on the values in the full column that the dtype should be string. But when parsing chunk by chunk, you get some chunks as integer and some as string.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see... yeah, that's a weird one.

assert df.a.dtype == object
elif using_infer_string:
assert df.a.dtype == "str"
else:
assert df.a.dtype == object


@pytest.mark.parametrize("iterator", [True, False])
Expand Down
7 changes: 2 additions & 5 deletions pandas/tests/io/parser/common/test_file_buffer_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import WASM
from pandas.errors import (
EmptyDataError,
Expand Down Expand Up @@ -71,14 +69,13 @@ def test_local_file(all_parsers, csv_dir_path):
pytest.skip("Failing on: " + " ".join(platform.uname()))


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@xfail_pyarrow # AssertionError: DataFrame.index are different
def test_path_path_lib(all_parsers):
parser = all_parsers
df = DataFrame(
1.1 * np.arange(120).reshape((30, 4)),
columns=Index(list("ABCD"), dtype=object),
index=Index([f"i-{i}" for i in range(30)], dtype=object),
columns=Index(list("ABCD")),
index=Index([f"i-{i}" for i in range(30)]),
)
result = tm.round_trip_pathlib(df.to_csv, lambda p: parser.read_csv(p, index_col=0))
tm.assert_frame_equal(df, result)
Expand Down
10 changes: 6 additions & 4 deletions pandas/tests/io/parser/common/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@

import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -88,9 +86,13 @@ def test_pass_names_with_index(all_parsers, data, kwargs, expected):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("index_col", [[0, 1], [1, 0]])
def test_multi_index_no_level_names(all_parsers, index_col):
def test_multi_index_no_level_names(
request, all_parsers, index_col, using_infer_string
):
if using_infer_string and all_parsers.engine == "pyarrow":
# result should have string columns instead of object dtype
request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)"))
data = """index1,index2,A,B,C,D
foo,one,2,3,4,5
foo,two,7,8,9,10
Expand Down
4 changes: 0 additions & 4 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.errors import ParserWarning

import pandas as pd
Expand Down Expand Up @@ -57,7 +55,6 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@pytest.mark.usefixtures("pyarrow_xfail")
def test_dtype_per_column(all_parsers):
parser = all_parsers
Expand All @@ -71,7 +68,6 @@ def test_dtype_per_column(all_parsers):
[[1, "2.5"], [2, "3.5"], [3, "4.5"], [4, "5.5"]], columns=["one", "two"]
)
expected["one"] = expected["one"].astype(np.float64)
expected["two"] = expected["two"].astype(object)

result = parser.read_csv(StringIO(data), dtype={"one": np.float64, 1: str})
tm.assert_frame_equal(result, expected)
Expand Down
13 changes: 7 additions & 6 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas.compat import WASM
from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import (
Expand Down Expand Up @@ -184,8 +182,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
assert max(precise_errors) <= max(normal_errors)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_usecols_dtypes(c_parser_only):
def test_usecols_dtypes(c_parser_only, using_infer_string):
parser = c_parser_only
data = """\
1,2,3
Expand All @@ -210,8 +207,12 @@ def test_usecols_dtypes(c_parser_only):
dtype={"b": int, "c": float},
)

assert (result.dtypes == [object, int, float]).all()
assert (result2.dtypes == [object, float]).all()
if using_infer_string:
assert (result.dtypes == ["string", int, float]).all()
assert (result2.dtypes == ["string", float]).all()
else:
assert (result.dtypes == [object, int, float]).all()
assert (result2.dtypes == [object, float]).all()


def test_disable_bool_parsing(c_parser_only):
Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/io/parser/test_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -188,7 +186,6 @@ def convert_score(x):
tm.assert_frame_equal(results[0], results[1])


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("conv_f", [lambda x: x, str])
def test_converter_index_col_bug(all_parsers, conv_f):
# see gh-1835 , GH#40589
Expand All @@ -207,7 +204,7 @@ def test_converter_index_col_bug(all_parsers, conv_f):
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
)

xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A"))
tm.assert_frame_equal(rs, xp)


Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/io/parser/test_index_col.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas import (
DataFrame,
Index,
Expand Down Expand Up @@ -345,7 +343,6 @@ def test_infer_types_boolean_sum(all_parsers):
tm.assert_frame_equal(result, expected, check_index_type=False)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("dtype, val", [(object, "01"), ("int64", 1)])
def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
# GH#9435
Expand All @@ -356,7 +353,7 @@ def test_specify_dtype_for_index_col(all_parsers, dtype, val, request):
pytest.mark.xfail(reason="Cannot disable type-inference for pyarrow engine")
)
result = parser.read_csv(StringIO(data), index_col="a", dtype={"a": dtype})
expected = DataFrame({"b": [2]}, index=Index([val], name="a"))
expected = DataFrame({"b": [2]}, index=Index([val], name="a", dtype=dtype))
tm.assert_frame_equal(result, expected)


Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/io/parser/test_mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@

import pytest

from pandas._config import using_string_dtype

from pandas import DataFrame
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
Expand Down Expand Up @@ -121,7 +122,6 @@ def test_thorough_mangle_names(all_parsers, data, names, expected):
parser.read_csv(StringIO(data), names=names)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@xfail_pyarrow # AssertionError: DataFrame.columns are different
def test_mangled_unnamed_placeholders(all_parsers):
# xref gh-13017
Expand All @@ -133,7 +133,7 @@ def test_mangled_unnamed_placeholders(all_parsers):

# This test recursively updates `df`.
for i in range(3):
expected = DataFrame()
expected = DataFrame(columns=Index([], dtype="str"))

for j in range(i + 1):
col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
Expand Down
31 changes: 17 additions & 14 deletions pandas/tests/io/parser/test_na_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs.parsers import STR_NA_VALUES

from pandas import (
Expand Down Expand Up @@ -261,7 +259,6 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize(
"kwargs,expected",
[
Expand Down Expand Up @@ -299,7 +296,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
),
],
)
def test_na_values_keep_default(all_parsers, kwargs, expected, request):
def test_na_values_keep_default(
all_parsers, kwargs, expected, request, using_infer_string
):
data = """\
A,B,C
a,1,one
Expand All @@ -317,8 +316,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request):
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return
mark = pytest.mark.xfail()
request.applymarker(mark)
if not using_infer_string or "na_values" in kwargs:
mark = pytest.mark.xfail()
request.applymarker(mark)

result = parser.read_csv(StringIO(data), **kwargs)
expected = DataFrame(expected)
Expand Down Expand Up @@ -429,23 +429,28 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case
@pytest.mark.parametrize(
"na_filter,row_data",
[
(True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
(False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
],
)
def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
def test_na_values_na_filter_override(
request, all_parsers, na_filter, row_data, using_infer_string
):
parser = all_parsers
if parser.engine == "pyarrow":
# mismatched dtypes in both cases, FutureWarning in the True case
if not (using_infer_string and na_filter):
mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
request.applymarker(mark)
data = """\
A,B
1,A
nan,B
3,C
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)

expected = DataFrame(row_data, columns=["A", "B"])
Expand Down Expand Up @@ -536,7 +541,6 @@ def test_na_values_dict_aliasing(all_parsers):
tm.assert_dict_equal(na_values, na_values_copy)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_na_values_dict_null_column_name(all_parsers):
# see gh-57547
parser = all_parsers
Expand All @@ -560,11 +564,10 @@ def test_na_values_dict_null_column_name(all_parsers):
return

expected = DataFrame(
{None: ["MA", "NA", "OA"], "x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]}
{"x": [1.0, 2.0, np.nan], "y": [2.0, 1.0, 3.0]},
index=Index(["MA", "NA", "OA"], dtype=object),
)

expected = expected.set_index(None)

result = parser.read_csv(
StringIO(data),
index_col=0,
Expand Down
11 changes: 3 additions & 8 deletions pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -421,15 +419,14 @@ def test_parse_timezone(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
@skip_pyarrow # pandas.errors.ParserError: CSV parse error
@pytest.mark.parametrize(
"date_string",
["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
)
def test_invalid_parse_delimited_date(all_parsers, date_string):
parser = all_parsers
expected = DataFrame({0: [date_string]}, dtype="object")
expected = DataFrame({0: [date_string]}, dtype="str")
result = parser.read_csv(
StringIO(date_string),
header=None,
Expand Down Expand Up @@ -609,7 +606,6 @@ def test_date_parser_usecols_thousands(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
def test_dayfirst_warnings():
# GH 12585

Expand Down Expand Up @@ -642,7 +638,7 @@ def test_dayfirst_warnings():

# first in DD/MM/YYYY, second in MM/DD/YYYY
input = "date\n31/12/2014\n03/30/2011"
expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date")
expected = Index(["31/12/2014", "03/30/2011"], dtype="str", name="date")

# A. use dayfirst=True
res5 = read_csv(
Expand Down Expand Up @@ -752,7 +748,6 @@ def test_parse_dates_and_string_dtype(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_parse_dot_separated_dates(all_parsers):
# https://github.com/pandas-dev/pandas/issues/2586
parser = all_parsers
Expand All @@ -762,7 +757,7 @@ def test_parse_dot_separated_dates(all_parsers):
if parser.engine == "pyarrow":
expected_index = Index(
["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"],
dtype="object",
dtype="str",
name="a",
)
warn = None
Expand Down
3 changes: 0 additions & 3 deletions pandas/tests/io/parser/test_upcast.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs.parsers import (
_maybe_upcast,
na_values,
Expand Down Expand Up @@ -86,7 +84,6 @@ def test_maybe_upcaste_all_nan():
tm.assert_extension_array_equal(result, expected)


@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
def test_maybe_upcast_object(val, string_storage):
# GH#36712
Expand Down