TST: de-xfail chunksize pyarrow tests (#56041)
jbrockmendel authored Nov 18, 2023
1 parent 4ac5cf6 commit 47a596e
Showing 2 changed files with 105 additions and 21 deletions.
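
The change replaces the blanket xfail_pyarrow markers with explicit branches on parser.engine, asserting the ValueError that the pyarrow engine raises for unsupported options such as chunksize and nrows. Below is a minimal, self-contained sketch of that pattern; it uses plain pandas.read_csv with an engine parameter as a stand-in for the suite's all_parsers fixture, and the test name is illustrative only.

from io import StringIO

import pytest

import pandas as pd


@pytest.mark.parametrize("engine", ["c", "python", "pyarrow"])
def test_chunksize_per_engine(engine):
    data = "a,b,c\n1,2,3\n4,5,6\n7,8,9\n"

    if engine == "pyarrow":
        # Instead of xfail-ing, assert the documented error directly.
        msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
        with pytest.raises(ValueError, match=msg):
            pd.read_csv(StringIO(data), engine=engine, chunksize=2)
        return

    # The c and python engines return an iterable reader; three data rows
    # with chunksize=2 yield two chunks.
    with pd.read_csv(StringIO(data), engine=engine, chunksize=2) as reader:
        chunks = list(reader)
    assert len(chunks) == 2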
120 changes: 100 additions & 20 deletions pandas/tests/io/parser/common/test_chunksize.py
@@ -16,13 +16,11 @@
)
import pandas._testing as tm

xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)


@xfail_pyarrow # The 'chunksize' option is not supported
@pytest.mark.parametrize("index_col", [0, "index"])
def test_read_chunksize_with_index(all_parsers, index_col):
parser = all_parsers
@@ -48,14 +46,20 @@ def test_read_chunksize_with_index(all_parsers, index_col):
)
expected = expected.set_index("index")

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
list(reader)
return

with parser.read_csv(StringIO(data), index_col=0, chunksize=2) as reader:
chunks = list(reader)
tm.assert_frame_equal(chunks[0], expected[:2])
tm.assert_frame_equal(chunks[1], expected[2:4])
tm.assert_frame_equal(chunks[2], expected[4:])


@xfail_pyarrow # AssertionError: Regex pattern did not match
@pytest.mark.parametrize("chunksize", [1.3, "foo", 0])
def test_read_chunksize_bad(all_parsers, chunksize):
data = """index,A,B,C,D
@@ -68,13 +72,14 @@ def test_read_chunksize_bad(all_parsers, chunksize):
"""
parser = all_parsers
msg = r"'chunksize' must be an integer >=1"
if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"

with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=chunksize) as _:
pass


@xfail_pyarrow # The 'nrows' option is not supported
@pytest.mark.parametrize("chunksize", [2, 8])
def test_read_chunksize_and_nrows(all_parsers, chunksize):
# see gh-15755
@@ -89,12 +94,17 @@ def test_read_chunksize_and_nrows(all_parsers, chunksize):
parser = all_parsers
kwargs = {"index_col": 0, "nrows": 5}

if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return

expected = parser.read_csv(StringIO(data), **kwargs)
with parser.read_csv(StringIO(data), chunksize=chunksize, **kwargs) as reader:
tm.assert_frame_equal(concat(reader), expected)


@xfail_pyarrow # The 'chunksize' option is not supported
def test_read_chunksize_and_nrows_changing_size(all_parsers):
data = """index,A,B,C,D
foo,2,3,4,5
@@ -107,6 +117,12 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers):
parser = all_parsers
kwargs = {"index_col": 0, "nrows": 5}

if parser.engine == "pyarrow":
msg = "The 'nrows' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return

expected = parser.read_csv(StringIO(data), **kwargs)
with parser.read_csv(StringIO(data), chunksize=8, **kwargs) as reader:
tm.assert_frame_equal(reader.get_chunk(size=2), expected.iloc[:2])
@@ -116,7 +132,6 @@ def test_read_chunksize_and_nrows_changing_size(all_parsers):
reader.get_chunk(size=3)


@xfail_pyarrow # The 'chunksize' option is not supported
def test_get_chunk_passed_chunksize(all_parsers):
parser = all_parsers
data = """A,B,C
@@ -125,14 +140,20 @@ def test_get_chunk_passed_chunksize(all_parsers):
7,8,9
1,2,3"""

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=2) as reader:
reader.get_chunk()
return

with parser.read_csv(StringIO(data), chunksize=2) as reader:
result = reader.get_chunk()

expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # The 'chunksize' option is not supported
@pytest.mark.parametrize("kwargs", [{}, {"index_col": 0}])
def test_read_chunksize_compat(all_parsers, kwargs):
# see gh-12185
@@ -146,17 +167,35 @@ def test_read_chunksize_compat(all_parsers, kwargs):
"""
parser = all_parsers
result = parser.read_csv(StringIO(data), **kwargs)

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
concat(reader)
return

with parser.read_csv(StringIO(data), chunksize=2, **kwargs) as reader:
tm.assert_frame_equal(concat(reader), result)
via_reader = concat(reader)
tm.assert_frame_equal(via_reader, result)


@xfail_pyarrow # The 'chunksize' option is not supported
def test_read_chunksize_jagged_names(all_parsers):
# see gh-23509
parser = all_parsers
data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])

expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(
StringIO(data), names=range(10), chunksize=4
) as reader:
concat(reader)
return

with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
result = concat(reader)
tm.assert_frame_equal(result, expected)
@@ -194,7 +233,6 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
assert result.a.dtype == float


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_warn_if_chunks_have_mismatched_type(all_parsers):
warning_type = None
parser = all_parsers
@@ -212,17 +250,24 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):

buf = StringIO(data)

df = parser.read_csv_check_warnings(
warning_type,
r"Columns \(0\) have mixed types. "
"Specify dtype option on import or set low_memory=False.",
buf,
)
if parser.engine == "pyarrow":
df = parser.read_csv_check_warnings(
DeprecationWarning,
"Passing a BlockManager to DataFrame is deprecated",
buf,
check_stacklevel=False,
)
else:
df = parser.read_csv_check_warnings(
warning_type,
r"Columns \(0\) have mixed types. "
"Specify dtype option on import or set low_memory=False.",
buf,
)

assert df.a.dtype == object


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
@pytest.mark.parametrize("iterator", [True, False])
def test_empty_with_nrows_chunksize(all_parsers, iterator):
# see gh-9535
@@ -232,6 +277,18 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator):
nrows = 10
data = StringIO("foo,bar\n")

if parser.engine == "pyarrow":
msg = (
"The '(nrows|chunksize)' option is not supported with the 'pyarrow' engine"
)
with pytest.raises(ValueError, match=msg):
if iterator:
with parser.read_csv(data, chunksize=nrows) as reader:
next(iter(reader))
else:
parser.read_csv(data, nrows=nrows)
return

if iterator:
with parser.read_csv(data, chunksize=nrows) as reader:
result = next(iter(reader))
Expand All @@ -241,7 +298,6 @@ def test_empty_with_nrows_chunksize(all_parsers, iterator):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_read_csv_memory_growth_chunksize(all_parsers):
# see gh-24805
#
@@ -254,12 +310,19 @@ def test_read_csv_memory_growth_chunksize(all_parsers):
for i in range(1000):
f.write(str(i) + "\n")

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
with parser.read_csv(path, chunksize=20) as result:
for _ in result:
pass
return

with parser.read_csv(path, chunksize=20) as result:
for _ in result:
pass


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_chunksize_with_usecols_second_block_shorter(all_parsers):
# GH#21211
parser = all_parsers
@@ -268,6 +331,18 @@ def test_chunksize_with_usecols_second_block_shorter(all_parsers):
9,10,11
"""

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
StringIO(data),
names=["a", "b"],
chunksize=2,
usecols=[0, 1],
header=None,
)
return

result_chunks = parser.read_csv(
StringIO(data),
names=["a", "b"],
Expand All @@ -285,7 +360,6 @@ def test_chunksize_with_usecols_second_block_shorter(all_parsers):
tm.assert_frame_equal(result, expected_frames[i])


@xfail_pyarrow # ValueError: The 'chunksize' option is not supported
def test_chunksize_second_block_shorter(all_parsers):
# GH#21211
parser = all_parsers
@@ -295,6 +369,12 @@ def test_chunksize_second_block_shorter(all_parsers):
9,10,11
"""

if parser.engine == "pyarrow":
msg = "The 'chunksize' option is not supported with the 'pyarrow' engine"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), chunksize=2)
return

result_chunks = parser.read_csv(StringIO(data), chunksize=2)

expected_frames = [
6 changes: 5 additions & 1 deletion pandas/tests/io/parser/conftest.py
@@ -34,14 +34,18 @@ def read_csv_check_warnings(
warn_msg: str,
*args,
raise_on_extra_warnings=True,
check_stacklevel: bool = True,
**kwargs,
):
# We need to check the stacklevel here instead of in the tests
# since this is where read_csv is called and where the warning
# should point to.
kwargs = self.update_kwargs(kwargs)
with tm.assert_produces_warning(
warn_type, match=warn_msg, raise_on_extra_warnings=raise_on_extra_warnings
warn_type,
match=warn_msg,
raise_on_extra_warnings=raise_on_extra_warnings,
check_stacklevel=check_stacklevel,
):
return read_csv(*args, **kwargs)

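For reference, the conftest.py change threads a new check_stacklevel flag through the read_csv_check_warnings helper so individual tests (such as the pyarrow branch of test_warn_if_chunks_have_mismatched_type above) can opt out of the call-site check for warnings emitted from inside the engine. A stripped-down sketch of the same forwarding pattern, assuming only the public read_csv function and pandas' private tm.assert_produces_warning helper:

from io import StringIO

import pandas._testing as tm
from pandas import read_csv


def read_csv_check_warnings(
    warn_type,
    warn_msg,
    *args,
    raise_on_extra_warnings=True,
    check_stacklevel=True,
    **kwargs,
):
    # Forward the flag so a caller can disable the check that the warning
    # points at this read_csv call site.
    with tm.assert_produces_warning(
        warn_type,
        match=warn_msg,
        raise_on_extra_warnings=raise_on_extra_warnings,
        check_stacklevel=check_stacklevel,
    ):
        return read_csv(*args, **kwargs)


# Plain parse with no warning expected, so warn_type is None.
df = read_csv_check_warnings(None, "", StringIO("a,b\n1,2\n"))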
