Skip to content

Commit

Permalink
Adjust tests in json folder for new string option (#56197)
Browse files (browse the repository at this point in the history)
* BUG: read_json not handling string dtype when converting to dates

* Adjust tests in json folder for new string option
  • Loading branch information
phofl authored Nov 30, 2023
1 parent 00a0216 commit 02324e6
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 13 deletions.
23 changes: 19 additions & 4 deletions pandas/tests/io/json/test_json_table_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def df_table():


class TestBuildSchema:
def test_build_table_schema(self, df_schema):
def test_build_table_schema(self, df_schema, using_infer_string):
result = build_table_schema(df_schema, version=False)
expected = {
"fields": [
Expand All @@ -68,6 +68,8 @@ def test_build_table_schema(self, df_schema):
],
"primaryKey": ["idx"],
}
if using_infer_string:
expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"}
assert result == expected
result = build_table_schema(df_schema)
assert "pandas_version" in result
Expand Down Expand Up @@ -97,7 +99,7 @@ def test_series_unnamed(self):
}
assert result == expected

def test_multiindex(self, df_schema):
def test_multiindex(self, df_schema, using_infer_string):
df = df_schema
idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)])
df.index = idx
Expand All @@ -114,6 +116,13 @@ def test_multiindex(self, df_schema):
],
"primaryKey": ["level_0", "level_1"],
}
if using_infer_string:
expected["fields"][0] = {
"name": "level_0",
"type": "any",
"extDtype": "string",
}
expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"}
assert result == expected

df.index.names = ["idx0", None]
Expand Down Expand Up @@ -156,7 +165,10 @@ def test_as_json_table_type_bool_data(self, bool_type):
def test_as_json_table_type_date_data(self, date_data):
assert as_json_table_type(date_data.dtype) == "datetime"

@pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])])
@pytest.mark.parametrize(
"str_data",
[pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)],
)
def test_as_json_table_type_string_data(self, str_data):
assert as_json_table_type(str_data.dtype) == "string"

Expand Down Expand Up @@ -261,7 +273,7 @@ def test_read_json_from_to_json_results(self):
tm.assert_frame_equal(result1, df)
tm.assert_frame_equal(result2, df)

def test_to_json(self, df_table):
def test_to_json(self, df_table, using_infer_string):
df = df_table
df.index.name = "idx"
result = df.to_json(orient="table", date_format="iso")
Expand Down Expand Up @@ -292,6 +304,9 @@ def test_to_json(self, df_table):
{"name": "H", "type": "datetime", "tz": "US/Central"},
]

if using_infer_string:
fields[2] = {"name": "B", "type": "any", "extDtype": "string"}

schema = {"fields": fields, "primaryKey": ["idx"]}
data = [
OrderedDict(
Expand Down
41 changes: 32 additions & 9 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
import numpy as np
import pytest

from pandas._config import using_pyarrow_string_dtype

from pandas.compat import IS64
import pandas.util._test_decorators as td

Expand All @@ -30,6 +32,7 @@
ArrowStringArray,
StringArray,
)
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

from pandas.io.json import ujson_dumps

Expand Down Expand Up @@ -237,7 +240,7 @@ def test_roundtrip_str_axes(self, orient, convert_axes, dtype):

@pytest.mark.parametrize("convert_axes", [True, False])
def test_roundtrip_categorical(
self, request, orient, categorical_frame, convert_axes
self, request, orient, categorical_frame, convert_axes, using_infer_string
):
# TODO: create a better frame to test with and improve coverage
if orient in ("index", "columns"):
Expand All @@ -251,7 +254,9 @@ def test_roundtrip_categorical(
result = read_json(data, orient=orient, convert_axes=convert_axes)

expected = categorical_frame.copy()
expected.index = expected.index.astype(str) # Categorical not preserved
expected.index = expected.index.astype(
str if not using_infer_string else "string[pyarrow_numpy]"
) # Categorical not preserved
expected.index.name = None # index names aren't preserved in JSON
assert_json_roundtrip_equal(result, expected, orient)

Expand Down Expand Up @@ -517,9 +522,9 @@ def test_v12_compat(self, datapath):
df_iso = df.drop(["modified"], axis=1)
v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json")
df_unser_iso = read_json(v12_iso_json)
tm.assert_frame_equal(df_iso, df_unser_iso)
tm.assert_frame_equal(df_iso, df_unser_iso, check_column_type=False)

def test_blocks_compat_GH9037(self):
def test_blocks_compat_GH9037(self, using_infer_string):
index = pd.date_range("20000101", periods=10, freq="h")
# freq doesn't round-trip
index = DatetimeIndex(list(index), freq=None)
Expand Down Expand Up @@ -603,7 +608,9 @@ def test_blocks_compat_GH9037(self):
)

# JSON deserialisation always creates unicode strings
df_mixed.columns = df_mixed.columns.astype(np.str_)
df_mixed.columns = df_mixed.columns.astype(
np.str_ if not using_infer_string else "string[pyarrow_numpy]"
)
data = StringIO(df_mixed.to_json(orient="split"))
df_roundtrip = read_json(data, orient="split")
tm.assert_frame_equal(
Expand Down Expand Up @@ -675,16 +682,19 @@ def test_series_non_unique_index(self):
unserialized = read_json(
StringIO(s.to_json(orient="records")), orient="records", typ="series"
)
tm.assert_numpy_array_equal(s.values, unserialized.values)
tm.assert_equal(s.values, unserialized.values)

# Series.to_json() called without ``orient`` must produce exactly the same
# output as an explicit orient="index" call.
def test_series_default_orient(self, string_series):
assert string_series.to_json() == string_series.to_json(orient="index")

def test_series_roundtrip_simple(self, orient, string_series):
def test_series_roundtrip_simple(self, orient, string_series, using_infer_string):
data = StringIO(string_series.to_json(orient=orient))
result = read_json(data, typ="series", orient=orient)

expected = string_series
if using_infer_string and orient in ("split", "index", "columns"):
# These orients don't carry dtype information, so the index is inferred as string
expected.index = expected.index.astype("string[pyarrow_numpy]")
if orient in ("values", "records"):
expected = expected.reset_index(drop=True)
if orient != "split":
Expand Down Expand Up @@ -1458,6 +1468,9 @@ def test_from_json_to_json_table_dtypes(self):
result = read_json(StringIO(dfjson), orient="table")
tm.assert_frame_equal(result, expected)

# TODO: Casting to string coerces None to NaN, and casting back to object
# keeps that NaN, so the round trip ends up with incorrect NA values
@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion")
@pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
def test_to_json_from_json_columns_dtypes(self, orient):
# GH21892 GH33205
Expand Down Expand Up @@ -1715,6 +1728,11 @@ def test_to_json_indent(self, indent):

assert result == expected

@pytest.mark.skipif(
using_pyarrow_string_dtype(),
reason="Adjust expected when infer_string is default, no bug here, "
"just a complicated parametrization",
)
@pytest.mark.parametrize(
"orient,expected",
[
Expand Down Expand Up @@ -1990,7 +2008,9 @@ def test_json_uint64(self):
@pytest.mark.parametrize(
"orient", ["split", "records", "values", "index", "columns"]
)
def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
def test_read_json_dtype_backend(
self, string_storage, dtype_backend, orient, using_infer_string
):
# GH#50750
pa = pytest.importorskip("pyarrow")
df = DataFrame(
Expand All @@ -2006,7 +2026,10 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
}
)

if string_storage == "python":
if using_infer_string:
string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
elif string_storage == "python":
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))

Expand Down

0 comments on commit 02324e6

Please sign in to comment.