TST (string dtype): resolve xfails in common IO tests #60320

Merged
13 changes: 6 additions & 7 deletions pandas/tests/io/test_clipboard.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import (
     PyperclipException,
     PyperclipWindowsException,
@@ -26,10 +24,6 @@
     init_qt_clipboard,
 )
 
-pytestmark = pytest.mark.xfail(
-    using_string_dtype(), reason="TODO(infer_string)", strict=False
-)
-
 
 def build_kwargs(sep, excel):
     kwargs = {}
@@ -351,7 +345,7 @@ def test_raw_roundtrip(self, data):
 
     @pytest.mark.parametrize("engine", ["c", "python"])
     def test_read_clipboard_dtype_backend(
-        self, clipboard, string_storage, dtype_backend, engine
+        self, clipboard, string_storage, dtype_backend, engine, using_infer_string
    ):
         # GH#50502
         if dtype_backend == "pyarrow":
@@ -396,6 +390,11 @@ def test_read_clipboard_dtype_backend(
             )
             expected["g"] = ArrowExtensionArray(pa.array([None, None]))
 
+        if using_infer_string:
+            expected.columns = expected.columns.astype(
+                pd.StringDtype(string_storage, na_value=np.nan)
+            )
+
         tm.assert_frame_equal(result, expected)
 
     def test_invalid_dtype_backend(self):
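As an aside (not part of the diff), here is a minimal sketch of the behavior the new `using_infer_string` branch accounts for: when pandas' string-dtype inference is enabled, string labels are inferred as the NaN-backed string dtype rather than `object`, so the expected frame's columns are cast to match what `read_clipboard` produces. The option name and the `StringDtype(storage, na_value=...)` constructor below assume a recent pandas development build (the PR itself uses the same constructor).

import numpy as np
import pandas as pd

# With string inference enabled, string labels no longer produce an
# object-dtype Index; they infer the NaN-backed string dtype instead.
with pd.option_context("future.infer_string", True):
    inferred = pd.Index(["a", "b", "c"])
    print(inferred.dtype)  # string dtype, not object

# The test mirrors this by casting its object-dtype expectation, e.g.:
expected_cols = pd.Index(["a", "b", "c"], dtype=object)
expected_cols = expected_cols.astype(pd.StringDtype("python", na_value=np.nan))
print(expected_cols.dtype)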
33 changes: 15 additions & 18 deletions pandas/tests/io/test_common.py
@@ -140,7 +140,6 @@ def test_bytesiowrapper_returns_correct_bytes(self):
         assert result == data.encode("utf-8")
 
     # Test that pyarrow can handle a file opened with get_handle
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_get_handle_pyarrow_compat(self):
         pa_csv = pytest.importorskip("pyarrow.csv")
 
@@ -155,6 +154,8 @@ def test_get_handle_pyarrow_compat(self):
         s = StringIO(data)
         with icom.get_handle(s, "rb", is_text=False) as handles:
             df = pa_csv.read_csv(handles.handle).to_pandas()
+            # TODO will have to update this when pyarrow' to_pandas() is fixed
+            expected = expected.astype("object")
         tm.assert_frame_equal(df, expected)
         assert not s.closed
 
@@ -338,7 +339,6 @@ def test_read_fspath_all(self, reader, module, path, datapath):
             ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"),
         ],
     )
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_write_fspath_all(self, writer_name, writer_kwargs, module):
         if writer_name in ["to_latex"]:  # uses Styler implementation
             pytest.importorskip("jinja2")
@@ -365,7 +365,7 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module):
             expected = f_path.read()
         assert result == expected
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
+    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) hdf support")
     def test_write_fspath_hdf5(self):
         # Same test as write_fspath_all, except HDF5 files aren't
         # necessarily byte-for-byte identical for a given dataframe, so we'll
@@ -438,14 +438,13 @@ def test_unknown_engine(self):
         with tm.ensure_clean() as path:
             df = pd.DataFrame(
                 1.1 * np.arange(120).reshape((30, 4)),
-                columns=pd.Index(list("ABCD"), dtype=object),
-                index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+                columns=pd.Index(list("ABCD")),
+                index=pd.Index([f"i-{i}" for i in range(30)]),
             )
             df.to_csv(path)
             with pytest.raises(ValueError, match="Unknown engine"):
                 pd.read_csv(path, engine="pyt")
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_binary_mode(self):
         """
         'encoding' shouldn't be passed to 'open' in binary mode.
@@ -455,8 +454,8 @@ def test_binary_mode(self):
         with tm.ensure_clean() as path:
             df = pd.DataFrame(
                 1.1 * np.arange(120).reshape((30, 4)),
-                columns=pd.Index(list("ABCD"), dtype=object),
-                index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+                columns=pd.Index(list("ABCD")),
+                index=pd.Index([f"i-{i}" for i in range(30)]),
             )
             df.to_csv(path, mode="w+b")
             tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
@@ -473,8 +472,8 @@ def test_warning_missing_utf_bom(self, encoding, compression_):
         """
         df = pd.DataFrame(
             1.1 * np.arange(120).reshape((30, 4)),
-            columns=pd.Index(list("ABCD"), dtype=object),
-            index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
         )
         with tm.ensure_clean() as path:
             with tm.assert_produces_warning(UnicodeWarning, match="byte order mark"):
@@ -504,15 +503,14 @@ def test_is_fsspec_url():
     assert icom.is_fsspec_url("RFC-3986+compliant.spec://something")
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize("encoding", [None, "utf-8"])
 @pytest.mark.parametrize("format", ["csv", "json"])
 def test_codecs_encoding(encoding, format):
     # GH39247
     expected = pd.DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=pd.Index(list("ABCD"), dtype=object),
-        index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
     )
     with tm.ensure_clean() as path:
         with codecs.open(path, mode="w", encoding=encoding) as handle:
@@ -525,13 +523,12 @@ def test_codecs_encoding(encoding, format):
     tm.assert_frame_equal(expected, df)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_codecs_get_writer_reader():
     # GH39247
     expected = pd.DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=pd.Index(list("ABCD"), dtype=object),
-        index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
     )
     with tm.ensure_clean() as path:
         with open(path, "wb") as handle:
@@ -556,8 +553,8 @@ def test_explicit_encoding(io_class, mode, msg):
     # wrong mode is requested
     expected = pd.DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=pd.Index(list("ABCD"), dtype=object),
-        index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
     )
     with io_class() as buffer:
         with pytest.raises(TypeError, match=msg):
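A recurring change in this file (and the ones below) is dropping the explicit `dtype=object` from the expected `Index` constructions, so the expectations track whatever the active default inference produces. A minimal sketch of the difference, assuming a pandas build that supports the `future.infer_string` option:

import pandas as pd

# Default behavior: plain string labels infer as object dtype.
print(pd.Index(list("ABCD")).dtype)  # object

# With string inference enabled, the same construction infers the string
# dtype, so pinning dtype=object in the expected frame would no longer match
# what the IO round trip returns.
with pd.option_context("future.infer_string", True):
    print(pd.Index(list("ABCD")).dtype)  # string dtype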
15 changes: 6 additions & 9 deletions pandas/tests/io/test_compression.py
@@ -12,8 +12,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.compat import is_platform_windows
 
 import pandas as pd
@@ -139,7 +137,6 @@ def test_compression_warning(compression_only):
                 df.to_csv(handles.handle, compression=compression_only)
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 def test_compression_binary(compression_only):
     """
     Binary file handles support compression.
@@ -148,8 +145,8 @@
     """
     df = pd.DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=pd.Index(list("ABCD"), dtype=object),
-        index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
     )
 
     # with a file
@@ -180,8 +177,8 @@ def test_gzip_reproducibility_file_name():
     """
     df = pd.DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=pd.Index(list("ABCD"), dtype=object),
-        index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
     )
     compression_options = {"method": "gzip", "mtime": 1}
 
@@ -203,8 +200,8 @@ def test_gzip_reproducibility_file_object():
     """
     df = pd.DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=pd.Index(list("ABCD"), dtype=object),
-        index=pd.Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
     )
     compression_options = {"method": "gzip", "mtime": 1}
 
5 changes: 2 additions & 3 deletions pandas/tests/io/test_gcs.py
@@ -158,7 +158,6 @@ def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str):
     assert result == expected
 
 
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
 @pytest.mark.parametrize("encoding", ["utf-8", "cp1251"])
 def test_to_csv_compression_encoding_gcs(
     gcs_buffer, compression_only, encoding, compression_to_extension
@@ -171,8 +170,8 @@ def test_to_csv_compression_encoding_gcs(
     """
     df = DataFrame(
         1.1 * np.arange(120).reshape((30, 4)),
-        columns=Index(list("ABCD"), dtype=object),
-        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
     )
 
     # reference of compressed and encoded file