From 4a14a317dc64e09e5a5920ebea7793959cc61819 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 27 Nov 2023 00:09:30 +0100
Subject: [PATCH 1/2] BUG: read_json not handling string dtype when converting to dates

---
 doc/source/whatsnew/v2.2.0.rst           |  1 +
 pandas/io/json/_json.py                  | 11 +++++++--
 pandas/tests/io/json/test_compression.py | 30 ++++++++++++++----------
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index d252c19a95d4a..5cb99afdcb98d 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -530,6 +530,7 @@ I/O
 - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
 - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`)
 - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)
+- Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`)
 - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`)
 - Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`)
 - Bug in :meth:`pandas.read_excel` with ``engine="odf"`` (``ods`` files) when string contains annotation (:issue:`55200`)
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index e17fcea0aae71..9c56089560507 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -32,7 +32,10 @@
 from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import check_dtype_backend
 
-from pandas.core.dtypes.common import ensure_str
+from pandas.core.dtypes.common import (
+    ensure_str,
+    is_string_dtype,
+)
 from pandas.core.dtypes.dtypes import PeriodDtype
 from pandas.core.dtypes.generic import ABCIndex
 
@@ -1249,7 +1252,7 @@ def _try_convert_data(
         if self.dtype_backend is not lib.no_default and not isinstance(data, ABCIndex):
             # Fall through for conversion later on
             return data, True
-        elif data.dtype == "object":
+        elif is_string_dtype(data.dtype):
             # try float
             try:
                 data = data.astype("float64")
@@ -1301,6 +1304,10 @@ def _try_convert_to_date(self, data):
             return data, False
 
         new_data = data
+
+        if new_data.dtype == "string":
+            new_data = new_data.astype(object)
+
         if new_data.dtype == "object":
             try:
                 new_data = data.astype("int64")
diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py
index 410c20bb22d1e..ff7d34c85c015 100644
--- a/pandas/tests/io/json/test_compression.py
+++ b/pandas/tests/io/json/test_compression.py
@@ -93,27 +93,31 @@ def test_read_unsupported_compression_type():
         pd.read_json(path, compression="unsupported")
 
 
+@pytest.mark.parametrize(
+    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+)
 @pytest.mark.parametrize("to_infer", [True, False])
 @pytest.mark.parametrize("read_infer", [True, False])
 def test_to_json_compression(
-    compression_only, read_infer, to_infer, compression_to_extension
+    compression_only, read_infer, to_infer, compression_to_extension, infer_string
 ):
-    # see gh-15008
-    compression = compression_only
+    with pd.option_context("future.infer_string", infer_string):
+        # see gh-15008
+        compression = compression_only
 
-    # We'll complete file extension subsequently.
-    filename = "test."
-    filename += compression_to_extension[compression]
+        # We'll complete file extension subsequently.
+        filename = "test."
+        filename += compression_to_extension[compression]
 
-    df = pd.DataFrame({"A": [1]})
+        df = pd.DataFrame({"A": [1]})
 
-    to_compression = "infer" if to_infer else compression
-    read_compression = "infer" if read_infer else compression
+        to_compression = "infer" if to_infer else compression
+        read_compression = "infer" if read_infer else compression
 
-    with tm.ensure_clean(filename) as path:
-        df.to_json(path, compression=to_compression)
-        result = pd.read_json(path, compression=read_compression)
-        tm.assert_frame_equal(result, df)
+        with tm.ensure_clean(filename) as path:
+            df.to_json(path, compression=to_compression)
+            result = pd.read_json(path, compression=read_compression)
+            tm.assert_frame_equal(result, df)
 
 
 def test_to_json_compression_mode(compression):
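Note on the fix above: with ``future.infer_string`` enabled, columns parsed by ``read_json`` reach ``_try_convert_data`` as ``string`` dtype rather than ``object``, so the old ``data.dtype == "object"`` check skipped the float and date conversion paths entirely. The patch widens the check via ``is_string_dtype`` and has ``_try_convert_to_date`` cast ``string`` data back to object before the usual conversion attempts. A minimal sketch of the behavior being fixed follows; it is not part of the patch, and the exact reproducer is an assumption based on the diff and issue 56195:

```python
from io import StringIO

import pandas as pd

# Requires pyarrow; "future.infer_string" makes parsed string columns come
# back as string[pyarrow_numpy] instead of object dtype.
with pd.option_context("future.infer_string", True):
    data = StringIO('{"date": {"0": "2015-01-01", "1": "2016-01-01"}}')
    df = pd.read_json(data, convert_dates=["date"])
    # Before this fix the column stayed a string dtype; with the fix the
    # date conversion runs and yields datetime64[ns].
    print(df["date"].dtype)
```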
From 5882cd98f1ceebb5a811f5d15acb66b85c3555bc Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 27 Nov 2023 00:37:08 +0100
Subject: [PATCH 2/2] Adjust tests in json folder for new string option

---
 .../tests/io/json/test_json_table_schema.py | 23 +++++++++--
 pandas/tests/io/json/test_pandas.py         | 41 +++++++++++++++----
 2 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
index 79dbe448e9cbe..7569a74752bf2 100644
--- a/pandas/tests/io/json/test_json_table_schema.py
+++ b/pandas/tests/io/json/test_json_table_schema.py
@@ -56,7 +56,7 @@ def df_table():
 
 
 class TestBuildSchema:
-    def test_build_table_schema(self, df_schema):
+    def test_build_table_schema(self, df_schema, using_infer_string):
         result = build_table_schema(df_schema, version=False)
         expected = {
             "fields": [
@@ -68,6 +68,8 @@ def test_build_table_schema(self, df_schema):
             ],
             "primaryKey": ["idx"],
         }
+        if using_infer_string:
+            expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"}
         assert result == expected
         result = build_table_schema(df_schema)
         assert "pandas_version" in result
@@ -97,7 +99,7 @@ def test_series_unnamed(self):
         }
         assert result == expected
 
-    def test_multiindex(self, df_schema):
+    def test_multiindex(self, df_schema, using_infer_string):
         df = df_schema
         idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)])
         df.index = idx
@@ -114,6 +116,13 @@ def test_multiindex(self, df_schema):
             ],
             "primaryKey": ["level_0", "level_1"],
         }
+        if using_infer_string:
+            expected["fields"][0] = {
+                "name": "level_0",
+                "type": "any",
+                "extDtype": "string",
+            }
+            expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"}
         assert result == expected
 
         df.index.names = ["idx0", None]
@@ -156,7 +165,10 @@ def test_as_json_table_type_bool_data(self, bool_type):
     def test_as_json_table_type_date_data(self, date_data):
         assert as_json_table_type(date_data.dtype) == "datetime"
 
-    @pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])])
+    @pytest.mark.parametrize(
+        "str_data",
+        [pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)],
+    )
     def test_as_json_table_type_string_data(self, str_data):
         assert as_json_table_type(str_data.dtype) == "string"
 
@@ -261,7 +273,7 @@ def test_read_json_from_to_json_results(self):
         tm.assert_frame_equal(result1, df)
         tm.assert_frame_equal(result2, df)
 
-    def test_to_json(self, df_table):
+    def test_to_json(self, df_table, using_infer_string):
         df = df_table
         df.index.name = "idx"
         result = df.to_json(orient="table", date_format="iso")
@@ -292,6 +304,9 @@ def test_to_json(self, df_table):
             {"name": "H", "type": "datetime", "tz": "US/Central"},
         ]
 
+        if using_infer_string:
+            fields[2] = {"name": "B", "type": "any", "extDtype": "string"}
+
         schema = {"fields": fields, "primaryKey": ["idx"]}
         data = [
             OrderedDict(
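The schema expectations above change because ``build_table_schema`` describes extension dtypes through ``extDtype``: once ``infer_string`` makes string columns ``string`` dtype instead of ``object``, the field for column "B" is emitted as ``"type": "any"`` plus ``"extDtype": "string"``. Roughly, as a sketch; the non-inferred field shape is deduced from the test's default expectation, and output is abridged:

```python
import pandas as pd
from pandas.io.json._table_schema import build_table_schema

df = pd.DataFrame({"B": ["a", "b"]})
# object dtype -> the "B" field is {"name": "B", "type": "string"}
print(build_table_schema(df, version=False)["fields"])

with pd.option_context("future.infer_string", True):
    df = pd.DataFrame({"B": ["a", "b"]})
    # string dtype -> {"name": "B", "type": "any", "extDtype": "string"}
    print(build_table_schema(df, version=False)["fields"])
```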
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 7312facc44c26..fe7882a0979a4 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -13,6 +13,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.compat import IS64
 import pandas.util._test_decorators as td
 
@@ -30,6 +32,7 @@
     ArrowStringArray,
     StringArray,
 )
+from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
 
 from pandas.io.json import ujson_dumps
 
@@ -238,7 +241,7 @@ def test_roundtrip_str_axes(self, orient, convert_axes, dtype):
 
     @pytest.mark.parametrize("convert_axes", [True, False])
     def test_roundtrip_categorical(
-        self, request, orient, categorical_frame, convert_axes
+        self, request, orient, categorical_frame, convert_axes, using_infer_string
     ):
         # TODO: create a better frame to test with and improve coverage
         if orient in ("index", "columns"):
@@ -252,7 +255,9 @@ def test_roundtrip_categorical(
 
         result = read_json(data, orient=orient, convert_axes=convert_axes)
         expected = categorical_frame.copy()
-        expected.index = expected.index.astype(str)  # Categorical not preserved
+        expected.index = expected.index.astype(
+            str if not using_infer_string else "string[pyarrow_numpy]"
+        )  # Categorical not preserved
         expected.index.name = None  # index names aren't preserved in JSON
 
         assert_json_roundtrip_equal(result, expected, orient)
@@ -518,9 +523,9 @@ def test_v12_compat(self, datapath):
         df_iso = df.drop(["modified"], axis=1)
         v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json")
         df_unser_iso = read_json(v12_iso_json)
-        tm.assert_frame_equal(df_iso, df_unser_iso)
+        tm.assert_frame_equal(df_iso, df_unser_iso, check_column_type=False)
 
-    def test_blocks_compat_GH9037(self):
+    def test_blocks_compat_GH9037(self, using_infer_string):
         index = pd.date_range("20000101", periods=10, freq="h")  # freq doesn't round-trip
         index = DatetimeIndex(list(index), freq=None)
 
@@ -604,7 +609,9 @@ def test_blocks_compat_GH9037(self):
         )
 
         # JSON deserialisation always creates unicode strings
-        df_mixed.columns = df_mixed.columns.astype(np.str_)
+        df_mixed.columns = df_mixed.columns.astype(
+            np.str_ if not using_infer_string else "string[pyarrow_numpy]"
+        )
         data = StringIO(df_mixed.to_json(orient="split"))
         df_roundtrip = read_json(data, orient="split")
         tm.assert_frame_equal(
@@ -676,16 +683,19 @@ def test_series_non_unique_index(self):
         unserialized = read_json(
             StringIO(s.to_json(orient="records")), orient="records", typ="series"
         )
-        tm.assert_numpy_array_equal(s.values, unserialized.values)
+        tm.assert_equal(s.values, unserialized.values)
 
     def test_series_default_orient(self, string_series):
         assert string_series.to_json() == string_series.to_json(orient="index")
 
-    def test_series_roundtrip_simple(self, orient, string_series):
+    def test_series_roundtrip_simple(self, orient, string_series, using_infer_string):
         data = StringIO(string_series.to_json(orient=orient))
         result = read_json(data, typ="series", orient=orient)
 
         expected = string_series
+        if using_infer_string and orient in ("split", "index", "columns"):
+            # These schemas don't contain dtypes, so we infer string
+            expected.index = expected.index.astype("string[pyarrow_numpy]")
         if orient in ("values", "records"):
             expected = expected.reset_index(drop=True)
         if orient != "split":
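The ``test_series_roundtrip_simple`` adjustment above encodes a general point: the "split", "index", and "columns" layouts carry no dtype metadata, so whatever ``read_json`` infers wins, and under ``infer_string`` a string index comes back as ``string[pyarrow_numpy]`` rather than ``object``. A small sketch, with the dtype name assumed from the patch:

```python
from io import StringIO

import pandas as pd

with pd.option_context("future.infer_string", True):
    s = pd.Series([1, 2], index=["x", "y"])
    payload = s.to_json(orient="index")  # this layout stores no index dtype
    back = pd.read_json(StringIO(payload), typ="series", orient="index")
    print(back.index.dtype)  # string[pyarrow_numpy], not object
```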
@@ -1459,6 +1469,9 @@ def test_from_json_to_json_table_dtypes(self):
         result = read_json(StringIO(dfjson), orient="table")
         tm.assert_frame_equal(result, expected)
 
+    # TODO: We are casting to string which coerces None to NaN before casting back
+    # to object, ending up with incorrect na values
+    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion")
     @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
     def test_to_json_from_json_columns_dtypes(self, orient):
         # GH21892 GH33205
@@ -1715,6 +1728,11 @@ def test_to_json_indent(self, indent):
 
         assert result == expected
 
+    @pytest.mark.skipif(
+        using_pyarrow_string_dtype(),
+        reason="Adjust expected when infer_string is default, no bug here, "
+        "just a complicated parametrization",
+    )
     @pytest.mark.parametrize(
         "orient,expected",
         [
@@ -1990,7 +2008,9 @@ def test_json_uint64(self):
     @pytest.mark.parametrize(
         "orient", ["split", "records", "values", "index", "columns"]
    )
-    def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
+    def test_read_json_dtype_backend(
+        self, string_storage, dtype_backend, orient, using_infer_string
+    ):
         # GH#50750
         pa = pytest.importorskip("pyarrow")
         df = DataFrame(
@@ -2006,7 +2026,10 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
             }
         )
 
-        if string_storage == "python":
+        if using_infer_string:
+            string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
+            string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
+        elif string_storage == "python":
             string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
             string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
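For reference, the expected-array construction in ``test_read_json_dtype_backend`` now distinguishes the inferred-string storage from the classic ones. The snippet below mirrors the test code above; the ``ArrowStringArrayNumpySemantics`` import is taken from the patch, and pyarrow is assumed to be installed:

```python
import numpy as np
import pyarrow as pa

from pandas import NA
from pandas.core.arrays import StringArray
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

# Classic "python" storage: a StringArray over a NumPy object array,
# with pd.NA as the missing-value marker.
python_backed = StringArray(np.array(["a", "b", NA], dtype=np.object_))

# infer_string: pyarrow-backed storage with NumPy-like (NaN) semantics,
# which is what string[pyarrow_numpy] columns use under the hood.
arrow_backed = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
```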