Adjust tests in json folder for new string option #56197

Merged
merged 3 commits
Nov 30, 2023
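The changes below adapt the JSON tests to pandas' opt-in pyarrow-backed string dtype. As a rough, hedged sketch (not part of this PR) of the behavior the `using_infer_string` fixture toggles: it assumes pandas 2.1+ with pyarrow installed, and the `future.infer_string` option name is an assumption here, while the `string[pyarrow_numpy]` dtype string is the one the diff itself references.

```python
# Rough sketch: with the future "infer_string" option enabled, string data is
# inferred as the pyarrow-backed "string[pyarrow_numpy]" dtype instead of object,
# which is what the adjusted expectations in these tests account for.
import pandas as pd

print(pd.Series(["a", "b"]).dtype)  # object (current default)

with pd.option_context("future.infer_string", True):
    print(pd.Series(["a", "b"]).dtype)  # string[pyarrow_numpy]
```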
23 changes: 19 additions & 4 deletions pandas/tests/io/json/test_json_table_schema.py
@@ -56,7 +56,7 @@ def df_table():


class TestBuildSchema:
def test_build_table_schema(self, df_schema):
def test_build_table_schema(self, df_schema, using_infer_string):
result = build_table_schema(df_schema, version=False)
expected = {
"fields": [
@@ -68,6 +68,8 @@ def test_build_table_schema(self, df_schema):
],
"primaryKey": ["idx"],
}
if using_infer_string:
expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"}
assert result == expected
result = build_table_schema(df_schema)
assert "pandas_version" in result
@@ -97,7 +99,7 @@ def test_series_unnamed(self):
}
assert result == expected

def test_multiindex(self, df_schema):
def test_multiindex(self, df_schema, using_infer_string):
df = df_schema
idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)])
df.index = idx
@@ -114,6 +116,13 @@ def test_multiindex(self, df_schema):
],
"primaryKey": ["level_0", "level_1"],
}
if using_infer_string:
expected["fields"][0] = {
"name": "level_0",
"type": "any",
"extDtype": "string",
}
expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"}
assert result == expected

df.index.names = ["idx0", None]
@@ -156,7 +165,10 @@ def test_as_json_table_type_bool_data(self, bool_type):
def test_as_json_table_type_date_data(self, date_data):
assert as_json_table_type(date_data.dtype) == "datetime"

@pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])])
@pytest.mark.parametrize(
"str_data",
[pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)],
)
def test_as_json_table_type_string_data(self, str_data):
assert as_json_table_type(str_data.dtype) == "string"

@@ -261,7 +273,7 @@ def test_read_json_from_to_json_results(self):
tm.assert_frame_equal(result1, df)
tm.assert_frame_equal(result2, df)

def test_to_json(self, df_table):
def test_to_json(self, df_table, using_infer_string):
df = df_table
df.index.name = "idx"
result = df.to_json(orient="table", date_format="iso")
@@ -292,6 +304,9 @@ def test_to_json(self, df_table):
{"name": "H", "type": "datetime", "tz": "US/Central"},
]

if using_infer_string:
fields[2] = {"name": "B", "type": "any", "extDtype": "string"}

schema = {"fields": fields, "primaryKey": ["idx"]}
data = [
OrderedDict(
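The sketch below is a hedged illustration (not part of this PR) of the schema change the adjusted expectations above describe. It assumes pandas 2.1+ with pyarrow installed and imports `build_table_schema` from the same private module the test file exercises; the expected field shape is taken directly from the updated test.

```python
# Hedged sketch: for a pyarrow-backed string column, the Table Schema field is
# reported as type "any" with extDtype "string" rather than plain type "string".
import pandas as pd
from pandas.io.json._table_schema import build_table_schema

df = pd.DataFrame({"B": pd.array(["x", "y"], dtype="string[pyarrow_numpy]")})
schema = build_table_schema(df, version=False)
print(schema["fields"])
# Per the updated expectations, the "B" entry should be:
# {"name": "B", "type": "any", "extDtype": "string"}
```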
41 changes: 32 additions & 9 deletions pandas/tests/io/json/test_pandas.py
@@ -13,6 +13,8 @@
import numpy as np
import pytest

from pandas._config import using_pyarrow_string_dtype

from pandas.compat import IS64
import pandas.util._test_decorators as td

@@ -30,6 +32,7 @@
ArrowStringArray,
StringArray,
)
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

from pandas.io.json import ujson_dumps

@@ -238,7 +241,7 @@ def test_roundtrip_str_axes(self, orient, convert_axes, dtype):

@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_categorical(
self, request, orient, categorical_frame, convert_axes
self, request, orient, categorical_frame, convert_axes, using_infer_string
):
# TODO: create a better frame to test with and improve coverage
if orient in ("index", "columns"):
@@ -252,7 +255,9 @@ def test_roundtrip_categorical(
result = read_json(data, orient=orient, convert_axes=convert_axes)

expected = categorical_frame.copy()
expected.index = expected.index.astype(str) # Categorical not preserved
expected.index = expected.index.astype(
str if not using_infer_string else "string[pyarrow_numpy]"
) # Categorical not preserved
expected.index.name = None # index names aren't preserved in JSON
assert_json_roundtrip_equal(result, expected, orient)

@@ -518,9 +523,9 @@ def test_v12_compat(self, datapath):
df_iso = df.drop(["modified"], axis=1)
v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json")
df_unser_iso = read_json(v12_iso_json)
tm.assert_frame_equal(df_iso, df_unser_iso)
tm.assert_frame_equal(df_iso, df_unser_iso, check_column_type=False)

def test_blocks_compat_GH9037(self):
def test_blocks_compat_GH9037(self, using_infer_string):
index = pd.date_range("20000101", periods=10, freq="h")
# freq doesn't round-trip
index = DatetimeIndex(list(index), freq=None)
@@ -604,7 +609,9 @@ def test_blocks_compat_GH9037(self):
)

# JSON deserialisation always creates unicode strings
df_mixed.columns = df_mixed.columns.astype(np.str_)
df_mixed.columns = df_mixed.columns.astype(
np.str_ if not using_infer_string else "string[pyarrow_numpy]"
)
data = StringIO(df_mixed.to_json(orient="split"))
df_roundtrip = read_json(data, orient="split")
tm.assert_frame_equal(
@@ -676,16 +683,19 @@ def test_series_non_unique_index(self):
unserialized = read_json(
StringIO(s.to_json(orient="records")), orient="records", typ="series"
)
tm.assert_numpy_array_equal(s.values, unserialized.values)
tm.assert_equal(s.values, unserialized.values)

def test_series_default_orient(self, string_series):
assert string_series.to_json() == string_series.to_json(orient="index")

def test_series_roundtrip_simple(self, orient, string_series):
def test_series_roundtrip_simple(self, orient, string_series, using_infer_string):
data = StringIO(string_series.to_json(orient=orient))
result = read_json(data, typ="series", orient=orient)

expected = string_series
if using_infer_string and orient in ("split", "index", "columns"):
# These schemas don't contain dtypes, so we infer string
expected.index = expected.index.astype("string[pyarrow_numpy]")
if orient in ("values", "records"):
expected = expected.reset_index(drop=True)
if orient != "split":
@@ -1459,6 +1469,9 @@ def test_from_json_to_json_table_dtypes(self):
result = read_json(StringIO(dfjson), orient="table")
tm.assert_frame_equal(result, expected)

# TODO: We are casting to string which coerces None to NaN before casting back
# to object, ending up with incorrect na values
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion")
@pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
def test_to_json_from_json_columns_dtypes(self, orient):
# GH21892 GH33205
@@ -1716,6 +1729,11 @@ def test_to_json_indent(self, indent):

assert result == expected

@pytest.mark.skipif(
using_pyarrow_string_dtype(),
reason="Adjust expected when infer_string is default, no bug here, "
"just a complicated parametrization",
)
@pytest.mark.parametrize(
"orient,expected",
[
@@ -1991,7 +2009,7 @@ def test_json_uint64(self):
@pytest.mark.parametrize(
"orient", ["split", "records", "values", "index", "columns"]
)
def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
def test_read_json_dtype_backend(
self, string_storage, dtype_backend, orient, using_infer_string
):
# GH#50750
pa = pytest.importorskip("pyarrow")
df = DataFrame(
@@ -2007,7 +2027,10 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
}
)

if string_storage == "python":
if using_infer_string:
string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
elif string_storage == "python":
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))

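Most of the `test_pandas.py` adjustments come down to the same read-side effect: with the option enabled, `read_json` returns pyarrow-backed string data where it previously returned object. The sketch below (not part of this PR) illustrates that round trip; it assumes pandas 2.1+ with pyarrow installed and reuses the assumed `future.infer_string` option name from the first sketch.

```python
# Hedged sketch: under the future "infer_string" option, string columns read
# back from JSON come out as "string[pyarrow_numpy]" rather than object, which
# is why the roundtrip tests above adjust their expected index/column dtypes.
from io import StringIO

import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
payload = df.to_json(orient="split")

with pd.option_context("future.infer_string", True):
    roundtripped = pd.read_json(StringIO(payload), orient="split")
    print(roundtripped["B"].dtype)  # expected: string[pyarrow_numpy]
```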