Commit 9ee4201

Adjust tests in parsers folder for new string option
phofl committed Nov 26, 2023
1 parent 15cc6ea commit 9ee4201
Showing 12 changed files with 74 additions and 27 deletions.
pandas/core/config_init.py (2 changes: 1 addition & 1 deletion)

@@ -905,7 +905,7 @@ def register_converter_cb(key) -> None:
 with cf.config_prefix("future"):
     cf.register_option(
         "infer_string",
-        True,
+        False,
         "Whether to infer sequence of str objects as pyarrow string "
         "dtype, which will be the default in pandas 3.0 "
         "(at which point this option will be deprecated).",
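
Note: the option registered here controls whether columns of Python str values are inferred as the pyarrow-backed string dtype rather than NumPy object dtype; every test adjustment below branches on it. A minimal sketch of the behaviour, assuming pandas 2.1+ with pyarrow installed:

    import io

    import pandas as pd

    data = "a,b\nx,1\ny,2\n"

    # Default behaviour: str columns come back as NumPy object dtype.
    df_default = pd.read_csv(io.StringIO(data))
    assert df_default["a"].dtype == object

    # With the future option enabled, the same column is inferred as the
    # pyarrow-backed string dtype that the adjusted tests check for.
    with pd.option_context("future.infer_string", True):
        df_infer = pd.read_csv(io.StringIO(data))
    assert df_infer["a"].dtype == "string[pyarrow_numpy]"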
pandas/tests/io/parser/common/test_chunksize.py (4 changes: 2 additions & 2 deletions)

@@ -233,7 +233,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
     assert result.a.dtype == float


-def test_warn_if_chunks_have_mismatched_type(all_parsers):
+def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
     warning_type = None
     parser = all_parsers
     size = 10000
@@ -265,7 +265,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
         buf,
     )

-    assert df.a.dtype == object
+    assert df.a.dtype == object if not using_infer_string else "string[pyarrow_numpy]"


 @pytest.mark.parametrize("iterator", [True, False])
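
Note: the using_infer_string argument threaded through these tests is a pytest fixture from pandas' shared conftest; it simply reports whether the future.infer_string option is active for the current test run. Conceptually it looks roughly like this (a sketch, not the exact pandas implementation):

    import pandas as pd
    import pytest

    @pytest.fixture
    def using_infer_string() -> bool:
        # True when the suite runs with the new string option switched on,
        # e.g. after pd.set_option("future.infer_string", True).
        return pd.get_option("future.infer_string") is True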
pandas/tests/io/parser/common/test_common_basic.py (3 changes: 3 additions & 0 deletions)

@@ -12,6 +12,8 @@
 import numpy as np
 import pytest

+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.errors import (
     EmptyDataError,
     ParserError,
@@ -878,6 +880,7 @@ def test_dict_keys_as_names(all_parsers):
     tm.assert_frame_equal(result, expected)


+@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't decode")
 @xfail_pyarrow  # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
 def test_encoding_surrogatepass(all_parsers):
     # GH39017
pandas/tests/io/parser/common/test_index.py (2 changes: 1 addition & 1 deletion)

@@ -106,7 +106,7 @@ def test_multi_index_no_level_names(all_parsers, index_col):

     # No index names in headless data.
     expected.index.names = [None] * 2
-    tm.assert_frame_equal(result, expected)
+    tm.assert_frame_equal(result, expected, check_column_type=False)


 @skip_pyarrow
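
Note: check_column_type=False relaxes only the class/dtype check on the frame's columns Index, which is what differs once column labels are inferred as pyarrow strings; the data, the row index and the labels themselves are still compared. A rough illustration, assuming pyarrow is installed:

    import pandas as pd
    import pandas._testing as tm

    result = pd.DataFrame({"a": [1], "b": [2]})
    result.columns = result.columns.astype("string[pyarrow_numpy]")  # as produced under infer_string
    expected = pd.DataFrame({"a": [1], "b": [2]})  # default object-dtype column labels

    # A strict comparison would raise on the columns dtype; relaxing it keeps
    # the assertion focused on the values and the labels.
    tm.assert_frame_equal(result, expected, check_column_type=False)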
pandas/tests/io/parser/dtypes/test_dtypes_basic.py (12 changes: 10 additions & 2 deletions)

@@ -21,6 +21,7 @@
     IntegerArray,
     StringArray,
 )
+from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

 pytestmark = pytest.mark.filterwarnings(
     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
@@ -458,7 +459,7 @@ def test_dtype_backend_and_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)


-def test_dtype_backend_string(all_parsers, string_storage):
+def test_dtype_backend_string(all_parsers, string_storage, using_infer_string):
     # GH#36712
     pa = pytest.importorskip("pyarrow")

@@ -471,7 +472,14 @@ def test_dtype_backend_string(all_parsers, string_storage):
 """
     result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")

-    if string_storage == "python":
+    if using_infer_string:
+        expected = DataFrame(
+            {
+                "a": ArrowStringArrayNumpySemantics(pa.array(["a", "b"])),
+                "b": ArrowStringArrayNumpySemantics(pa.array(["x", None])),
+            }
+        )
+    elif string_storage == "python":
         expected = DataFrame(
             {
                 "a": StringArray(np.array(["a", "b"], dtype=np.object_)),
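
Note: ArrowStringArrayNumpySemantics is the array type behind the new string[pyarrow_numpy] dtype. Unlike the nullable StringArray / ArrowStringArray used for the existing string_storage branches, it follows NumPy semantics and reports missing values as np.nan instead of pd.NA. A rough illustration of the three flavours, assuming pyarrow is installed:

    import numpy as np
    import pandas as pd
    import pyarrow as pa

    from pandas.core.arrays import ArrowStringArray, StringArray
    from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

    # mode.string_storage == "python": object-backed nullable strings (missing -> pd.NA)
    python_arr = StringArray(np.array(["x", pd.NA], dtype=np.object_))

    # mode.string_storage == "pyarrow": pyarrow-backed nullable strings (missing -> pd.NA)
    pyarrow_arr = ArrowStringArray(pa.array(["x", None]))

    # future.infer_string: pyarrow-backed strings with NumPy semantics (missing -> np.nan)
    infer_arr = ArrowStringArrayNumpySemantics(pa.array(["x", None]))

    assert python_arr[1] is pd.NA
    assert np.isnan(infer_arr[1])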
pandas/tests/io/parser/test_c_parser_only.py (10 changes: 7 additions & 3 deletions)

@@ -179,7 +179,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
     assert max(precise_errors) <= max(normal_errors)


-def test_usecols_dtypes(c_parser_only):
+def test_usecols_dtypes(c_parser_only, using_infer_string):
     parser = c_parser_only
     data = """\
 1,2,3
@@ -204,8 +204,12 @@ def test_usecols_dtypes(c_parser_only):
         dtype={"b": int, "c": float},
     )

-    assert (result.dtypes == [object, int, float]).all()
-    assert (result2.dtypes == [object, float]).all()
+    if using_infer_string:
+        assert (result.dtypes == ["string", int, float]).all()
+        assert (result2.dtypes == ["string", float]).all()
+    else:
+        assert (result.dtypes == [object, int, float]).all()
+        assert (result2.dtypes == [object, float]).all()


 def test_disable_bool_parsing(c_parser_only):
pandas/tests/io/parser/test_converters.py (11 changes: 9 additions & 2 deletions)

@@ -185,7 +185,7 @@ def convert_score(x):


 @pytest.mark.parametrize("conv_f", [lambda x: x, str])
-def test_converter_index_col_bug(all_parsers, conv_f):
+def test_converter_index_col_bug(all_parsers, conv_f, using_infer_string):
     # see gh-1835 , GH#40589
     parser = all_parsers
     data = "A;B\n1;2\n3;4"
@@ -202,7 +202,14 @@ def test_converter_index_col_bug(all_parsers, conv_f):
         StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
     )

-    xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
+    xp = DataFrame(
+        {"B": [2, 4]},
+        index=Index(
+            ["1", "3"],
+            name="A",
+            dtype="object" if not using_infer_string else "string[pyarrow_numpy]",
+        ),
+    )
     tm.assert_frame_equal(rs, xp)
pandas/tests/io/parser/test_mangle_dupes.py (2 changes: 1 addition & 1 deletion)

@@ -138,7 +138,7 @@ def test_mangled_unnamed_placeholders(all_parsers):
         expected[orig_key] = orig_value
         df = parser.read_csv(StringIO(df.to_csv()))

-        tm.assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df, expected, check_column_type=False)


 @xfail_pyarrow  # ValueError: Found non-unique column index
pandas/tests/io/parser/test_na_values.py (19 changes: 14 additions & 5 deletions)

@@ -303,7 +303,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
         ),
     ],
 )
-def test_na_values_keep_default(all_parsers, kwargs, expected, request):
+def test_na_values_keep_default(
+    all_parsers, kwargs, expected, request, using_infer_string
+):
     data = """\
 A,B,C
 a,1,one
@@ -321,8 +323,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request):
             with pytest.raises(ValueError, match=msg):
                 parser.read_csv(StringIO(data), **kwargs)
             return
-        mark = pytest.mark.xfail()
-        request.applymarker(mark)
+        if not using_infer_string or len(kwargs) > 0:
+            mark = pytest.mark.xfail()
+            request.applymarker(mark)

     result = parser.read_csv(StringIO(data), **kwargs)
     tm.assert_frame_equal(result, expected)
@@ -432,22 +435,28 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_values):
     tm.assert_frame_equal(result, expected)


-@xfail_pyarrow  # mismatched dtypes in both cases, FutureWarning in the True case
 @pytest.mark.parametrize(
     "na_filter,row_data",
     [
         (True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
         (False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
     ],
 )
-def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
+def test_na_values_na_filter_override(
+    all_parsers, na_filter, row_data, request, using_infer_string
+):
     data = """\
 A,B
 1,A
 nan,B
 3,C
 """
     parser = all_parsers
+    if parser.engine == "pyarrow":
+        if not using_infer_string or not na_filter:
+            mark = pytest.mark.xfail()
+            request.applymarker(mark)
+
     result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)

     expected = DataFrame(row_data, columns=["A", "B"])
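
Note: instead of xfail-ing these tests wholesale with a decorator, the marker is now applied at runtime only for the parameter combinations that are still expected to fail. The pattern, as an illustrative sketch rather than a verbatim copy of the suite:

    import pytest

    def test_something(all_parsers, request, using_infer_string):
        parser = all_parsers
        # Only mark the known-bad combination; every other combination must pass.
        if parser.engine == "pyarrow" and not using_infer_string:
            request.applymarker(pytest.mark.xfail(reason="pyarrow engine returns mismatched dtypes"))
        ...  # rest of the test body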
pandas/tests/io/parser/test_parse_dates.py (19 changes: 13 additions & 6 deletions)

@@ -1751,9 +1751,12 @@ def test_parse_timezone(all_parsers):
     "date_string",
     ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
 )
-def test_invalid_parse_delimited_date(all_parsers, date_string):
+def test_invalid_parse_delimited_date(all_parsers, date_string, using_infer_string):
     parser = all_parsers
-    expected = DataFrame({0: [date_string]}, dtype="object")
+    expected = DataFrame(
+        {0: [date_string]},
+        dtype="object" if not using_infer_string else "string[pyarrow_numpy]",
+    )
     result = parser.read_csv(
         StringIO(date_string),
         header=None,
@@ -2021,7 +2024,7 @@ def test_parse_dates_and_keep_original_column(all_parsers):
     tm.assert_frame_equal(result, expected)


-def test_dayfirst_warnings():
+def test_dayfirst_warnings(using_infer_string):
     # GH 12585

     # CASE 1: valid input
@@ -2053,7 +2056,11 @@

     # first in DD/MM/YYYY, second in MM/DD/YYYY
     input = "date\n31/12/2014\n03/30/2011"
-    expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date")
+    expected = Index(
+        ["31/12/2014", "03/30/2011"],
+        dtype="object" if not using_infer_string else "string[pyarrow_numpy]",
+        name="date",
+    )

     # A. use dayfirst=True
     res5 = read_csv(
@@ -2170,7 +2177,7 @@ def test_parse_dates_and_string_dtype(all_parsers):
     tm.assert_frame_equal(result, expected)


-def test_parse_dot_separated_dates(all_parsers):
+def test_parse_dot_separated_dates(all_parsers, using_infer_string):
     # https://github.com/pandas-dev/pandas/issues/2586
     parser = all_parsers
     data = """a,b
@@ -2179,7 +2186,7 @@ def test_parse_dot_separated_dates(all_parsers):
     if parser.engine == "pyarrow":
         expected_index = Index(
             ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"],
-            dtype="object",
+            dtype="object" if not using_infer_string else "string[pyarrow_numpy]",
             name="a",
         )
         warn = None
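
Note: the dtype="object" if not using_infer_string else "string[pyarrow_numpy]" conditional recurs in several hunks above. A hypothetical helper (not part of this commit) that states the intent once:

    import pandas as pd

    def expected_str_dtype(using_infer_string: bool) -> str:
        # Plain object dtype without the option, pyarrow-backed string dtype with it.
        return "string[pyarrow_numpy]" if using_infer_string else "object"

    expected = pd.Index(["31/12/2014", "03/30/2011"], dtype=expected_str_dtype(False), name="date")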
pandas/tests/io/parser/test_read_fwf.py (9 changes: 7 additions & 2 deletions)

@@ -26,6 +26,7 @@
     ArrowStringArray,
     StringArray,
 )
+from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

 from pandas.io.common import urlopen
 from pandas.io.parsers import (
@@ -960,9 +961,13 @@ def test_widths_and_usecols():
     tm.assert_frame_equal(result, expected)


-def test_dtype_backend(string_storage, dtype_backend):
+def test_dtype_backend(string_storage, dtype_backend, using_infer_string):
     # GH#50289
-    if string_storage == "python":
+    if using_infer_string:
+        pa = pytest.importorskip("pyarrow")
+        arr = ArrowStringArrayNumpySemantics(pa.array(["a", "b"]))
+        arr_na = ArrowStringArrayNumpySemantics(pa.array([None, "a"]))
+    elif string_storage == "python":
         arr = StringArray(np.array(["a", "b"], dtype=np.object_))
         arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
     else:
pandas/tests/io/parser/test_upcast.py (8 changes: 6 additions & 2 deletions)

@@ -16,6 +16,7 @@
     IntegerArray,
     StringArray,
 )
+from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics


 def test_maybe_upcast(any_real_numpy_dtype):
@@ -85,15 +86,18 @@ def test_maybe_upcaste_all_nan():


 @pytest.mark.parametrize("val", [na_values[np.object_], "c"])
-def test_maybe_upcast_object(val, string_storage):
+def test_maybe_upcast_object(val, string_storage, using_infer_string):
     # GH#36712
     pa = pytest.importorskip("pyarrow")

     with pd.option_context("mode.string_storage", string_storage):
         arr = np.array(["a", "b", val], dtype=np.object_)
         result = _maybe_upcast(arr, use_dtype_backend=True)

-        if string_storage == "python":
+        if using_infer_string:
+            exp_val = "c" if val == "c" else None
+            expected = ArrowStringArrayNumpySemantics(pa.array(["a", "b", exp_val]))
+        elif string_storage == "python":
             exp_val = "c" if val == "c" else NA
             expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_))
         else:
