From 4a14a317dc64e09e5a5920ebea7793959cc61819 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 27 Nov 2023 00:09:30 +0100
Subject: [PATCH 1/2] BUG: read_json not handling string dtype when converting to dates

---
 doc/source/whatsnew/v2.2.0.rst           |  1 +
 pandas/io/json/_json.py                  | 11 +++++++--
 pandas/tests/io/json/test_compression.py | 30 ++++++++++++++----------
 3 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst
index d252c19a95d4a..5cb99afdcb98d 100644
--- a/doc/source/whatsnew/v2.2.0.rst
+++ b/doc/source/whatsnew/v2.2.0.rst
@@ -530,6 +530,7 @@ I/O
 - Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
 - Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`)
 - Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)
+- Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`)
 - Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string value (:issue:`54994`)
 - Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`)
 - Bug in :meth:`pandas.read_excel` with ``engine="odf"`` (``ods`` files) when string contains annotation (:issue:`55200`)
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index e17fcea0aae71..9c56089560507 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -32,7 +32,10 @@
 from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import check_dtype_backend
 
-from pandas.core.dtypes.common import ensure_str
+from pandas.core.dtypes.common import (
+    ensure_str,
+    is_string_dtype,
+)
 from pandas.core.dtypes.dtypes import PeriodDtype
 from pandas.core.dtypes.generic import ABCIndex
 
@@ -1249,7 +1252,7 @@ def _try_convert_data(
         if self.dtype_backend is not lib.no_default and not isinstance(data, ABCIndex):
             # Fall through for conversion later on
             return data, True
-        elif data.dtype == "object":
+        elif is_string_dtype(data.dtype):
             # try float
             try:
                 data = data.astype("float64")
@@ -1301,6 +1304,10 @@ def _try_convert_to_date(self, data):
             return data, False
 
         new_data = data
+
+        if new_data.dtype == "string":
+            new_data = new_data.astype(object)
+
         if new_data.dtype == "object":
             try:
                 new_data = data.astype("int64")
diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py
index 410c20bb22d1e..ff7d34c85c015 100644
--- a/pandas/tests/io/json/test_compression.py
+++ b/pandas/tests/io/json/test_compression.py
@@ -93,27 +93,31 @@ def test_read_unsupported_compression_type():
         pd.read_json(path, compression="unsupported")
 
 
+@pytest.mark.parametrize(
+    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+)
 @pytest.mark.parametrize("to_infer", [True, False])
 @pytest.mark.parametrize("read_infer", [True, False])
 def test_to_json_compression(
-    compression_only, read_infer, to_infer, compression_to_extension
+    compression_only, read_infer, to_infer, compression_to_extension, infer_string
 ):
-    # see gh-15008
-    compression = compression_only
+    with pd.option_context("future.infer_string", infer_string):
+        # see gh-15008
+        compression = compression_only
 
-    # We'll complete file extension subsequently.
-    filename = "test."
-    filename += compression_to_extension[compression]
+        # We'll complete file extension subsequently.
+        filename = "test."
+        filename += compression_to_extension[compression]
 
-    df = pd.DataFrame({"A": [1]})
+        df = pd.DataFrame({"A": [1]})
 
-    to_compression = "infer" if to_infer else compression
-    read_compression = "infer" if read_infer else compression
+        to_compression = "infer" if to_infer else compression
+        read_compression = "infer" if read_infer else compression
 
-    with tm.ensure_clean(filename) as path:
-        df.to_json(path, compression=to_compression)
-        result = pd.read_json(path, compression=read_compression)
-        tm.assert_frame_equal(result, df)
+        with tm.ensure_clean(filename) as path:
+            df.to_json(path, compression=to_compression)
+            result = pd.read_json(path, compression=read_compression)
+            tm.assert_frame_equal(result, df)
 
 
 def test_to_json_compression_mode(compression):
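Note on the fix above: with ``future.infer_string`` enabled, columns parsed by ``read_json`` reach ``_try_convert_data`` as ``string`` dtype rather than ``object``, so the old ``data.dtype == "object"`` check skipped the float and date conversion paths entirely. The patch widens the check via ``is_string_dtype`` and has ``_try_convert_to_date`` cast ``string`` data back to object before the usual conversion attempts. A minimal sketch of the behavior being fixed follows; it is not part of the patch, and the exact reproducer is an assumption based on the diff and issue 56195:

```python
from io import StringIO

import pandas as pd

# Requires pyarrow; "future.infer_string" makes parsed string columns come
# back as string[pyarrow_numpy] instead of object dtype.
with pd.option_context("future.infer_string", True):
    data = StringIO('{"date": {"0": "2015-01-01", "1": "2016-01-01"}}')
    df = pd.read_json(data, convert_dates=["date"])
    # Before this fix the column stayed a string dtype; with the fix the
    # date conversion runs and yields datetime64[ns].
    print(df["date"].dtype)
```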
From 5882cd98f1ceebb5a811f5d15acb66b85c3555bc Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 27 Nov 2023 00:37:08 +0100
Subject: [PATCH 2/2] Adjust tests in json folder for new string option

---
 .../tests/io/json/test_json_table_schema.py | 23 +++++++++--
 pandas/tests/io/json/test_pandas.py         | 41 +++++++++++++++----
 2 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
index 79dbe448e9cbe..7569a74752bf2 100644
--- a/pandas/tests/io/json/test_json_table_schema.py
+++ b/pandas/tests/io/json/test_json_table_schema.py
@@ -56,7 +56,7 @@ def df_table():
 
 
 class TestBuildSchema:
-    def test_build_table_schema(self, df_schema):
+    def test_build_table_schema(self, df_schema, using_infer_string):
         result = build_table_schema(df_schema, version=False)
         expected = {
             "fields": [
@@ -68,6 +68,8 @@ def test_build_table_schema(self, df_schema):
             ],
             "primaryKey": ["idx"],
         }
+        if using_infer_string:
+            expected["fields"][2] = {"name": "B", "type": "any", "extDtype": "string"}
         assert result == expected
         result = build_table_schema(df_schema)
         assert "pandas_version" in result
@@ -97,7 +99,7 @@ def test_series_unnamed(self):
         }
         assert result == expected
 
-    def test_multiindex(self, df_schema):
+    def test_multiindex(self, df_schema, using_infer_string):
         df = df_schema
         idx = pd.MultiIndex.from_product([("a", "b"), (1, 2)])
         df.index = idx
@@ -114,6 +116,13 @@ def test_multiindex(self, df_schema):
             ],
             "primaryKey": ["level_0", "level_1"],
         }
+        if using_infer_string:
+            expected["fields"][0] = {
+                "name": "level_0",
+                "type": "any",
+                "extDtype": "string",
+            }
+            expected["fields"][3] = {"name": "B", "type": "any", "extDtype": "string"}
         assert result == expected
 
         df.index.names = ["idx0", None]
@@ -156,7 +165,10 @@ def test_as_json_table_type_bool_data(self, bool_type):
     def test_as_json_table_type_date_data(self, date_data):
         assert as_json_table_type(date_data.dtype) == "datetime"
 
-    @pytest.mark.parametrize("str_data", [pd.Series(["a", "b"]), pd.Index(["a", "b"])])
+    @pytest.mark.parametrize(
+        "str_data",
+        [pd.Series(["a", "b"], dtype=object), pd.Index(["a", "b"], dtype=object)],
+    )
     def test_as_json_table_type_string_data(self, str_data):
         assert as_json_table_type(str_data.dtype) == "string"
 
@@ -261,7 +273,7 @@ def test_read_json_from_to_json_results(self):
         tm.assert_frame_equal(result1, df)
         tm.assert_frame_equal(result2, df)
 
-    def test_to_json(self, df_table):
+    def test_to_json(self, df_table, using_infer_string):
         df = df_table
         df.index.name = "idx"
         result = df.to_json(orient="table", date_format="iso")
@@ -292,6 +304,9 @@ def test_to_json(self, df_table):
             {"name": "H", "type": "datetime", "tz": "US/Central"},
         ]
 
+        if using_infer_string:
+            fields[2] = {"name": "B", "type": "any", "extDtype": "string"}
+
         schema = {"fields": fields, "primaryKey": ["idx"]}
         data = [
             OrderedDict(
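The schema expectations above change because ``build_table_schema`` describes extension dtypes through ``extDtype``: once ``infer_string`` makes string columns ``string`` dtype instead of ``object``, the field for column "B" is emitted as ``"type": "any"`` plus ``"extDtype": "string"``. Roughly, as a sketch; the non-inferred field shape is deduced from the test's default expectation, and output is abridged:

```python
import pandas as pd
from pandas.io.json._table_schema import build_table_schema

df = pd.DataFrame({"B": ["a", "b"]})
# object dtype -> the "B" field is {"name": "B", "type": "string"}
print(build_table_schema(df, version=False)["fields"])

with pd.option_context("future.infer_string", True):
    df = pd.DataFrame({"B": ["a", "b"]})
    # string dtype -> {"name": "B", "type": "any", "extDtype": "string"}
    print(build_table_schema(df, version=False)["fields"])
```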
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 7312facc44c26..fe7882a0979a4 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -13,6 +13,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.compat import IS64
 import pandas.util._test_decorators as td
 
@@ -30,6 +32,7 @@
     ArrowStringArray,
     StringArray,
 )
+from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics
 
 from pandas.io.json import ujson_dumps
 
@@ -238,7 +241,7 @@ def test_roundtrip_str_axes(self, orient, convert_axes, dtype):
 
     @pytest.mark.parametrize("convert_axes", [True, False])
     def test_roundtrip_categorical(
-        self, request, orient, categorical_frame, convert_axes
+        self, request, orient, categorical_frame, convert_axes, using_infer_string
     ):
         # TODO: create a better frame to test with and improve coverage
         if orient in ("index", "columns"):
@@ -252,7 +255,9 @@ def test_roundtrip_categorical(
 
         result = read_json(data, orient=orient, convert_axes=convert_axes)
         expected = categorical_frame.copy()
-        expected.index = expected.index.astype(str)  # Categorical not preserved
+        expected.index = expected.index.astype(
+            str if not using_infer_string else "string[pyarrow_numpy]"
+        )  # Categorical not preserved
         expected.index.name = None  # index names aren't preserved in JSON
 
         assert_json_roundtrip_equal(result, expected, orient)
@@ -518,9 +523,9 @@ def test_v12_compat(self, datapath):
         df_iso = df.drop(["modified"], axis=1)
         v12_iso_json = os.path.join(dirpath, "tsframe_iso_v012.json")
         df_unser_iso = read_json(v12_iso_json)
-        tm.assert_frame_equal(df_iso, df_unser_iso)
+        tm.assert_frame_equal(df_iso, df_unser_iso, check_column_type=False)
 
-    def test_blocks_compat_GH9037(self):
+    def test_blocks_compat_GH9037(self, using_infer_string):
         index = pd.date_range("20000101", periods=10, freq="h")  # freq doesn't round-trip
         index = DatetimeIndex(list(index), freq=None)
 
@@ -604,7 +609,9 @@ def test_blocks_compat_GH9037(self):
         )
 
         # JSON deserialisation always creates unicode strings
-        df_mixed.columns = df_mixed.columns.astype(np.str_)
+        df_mixed.columns = df_mixed.columns.astype(
+            np.str_ if not using_infer_string else "string[pyarrow_numpy]"
+        )
         data = StringIO(df_mixed.to_json(orient="split"))
         df_roundtrip = read_json(data, orient="split")
         tm.assert_frame_equal(
@@ -676,16 +683,19 @@ def test_series_non_unique_index(self):
         unserialized = read_json(
             StringIO(s.to_json(orient="records")), orient="records", typ="series"
         )
-        tm.assert_numpy_array_equal(s.values, unserialized.values)
+        tm.assert_equal(s.values, unserialized.values)
 
     def test_series_default_orient(self, string_series):
         assert string_series.to_json() == string_series.to_json(orient="index")
 
-    def test_series_roundtrip_simple(self, orient, string_series):
+    def test_series_roundtrip_simple(self, orient, string_series, using_infer_string):
         data = StringIO(string_series.to_json(orient=orient))
         result = read_json(data, typ="series", orient=orient)
 
         expected = string_series
+        if using_infer_string and orient in ("split", "index", "columns"):
+            # These schemas don't contain dtypes, so we infer string
+            expected.index = expected.index.astype("string[pyarrow_numpy]")
         if orient in ("values", "records"):
             expected = expected.reset_index(drop=True)
         if orient != "split":
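The ``test_series_roundtrip_simple`` adjustment above encodes a general point: the "split", "index", and "columns" layouts carry no dtype metadata, so whatever ``read_json`` infers wins, and under ``infer_string`` a string index comes back as ``string[pyarrow_numpy]`` rather than ``object``. A small sketch, with the dtype name assumed from the patch:

```python
from io import StringIO

import pandas as pd

with pd.option_context("future.infer_string", True):
    s = pd.Series([1, 2], index=["x", "y"])
    payload = s.to_json(orient="index")  # this layout stores no index dtype
    back = pd.read_json(StringIO(payload), typ="series", orient="index")
    print(back.index.dtype)  # string[pyarrow_numpy], not object
```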
@@ -1459,6 +1469,9 @@ def test_from_json_to_json_table_dtypes(self):
         result = read_json(StringIO(dfjson), orient="table")
         tm.assert_frame_equal(result, expected)
 
+    # TODO: We are casting to string which coerces None to NaN before casting back
+    # to object, ending up with incorrect na values
+    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="incorrect na conversion")
     @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"])
     def test_to_json_from_json_columns_dtypes(self, orient):
         # GH21892 GH33205
@@ -1715,6 +1728,11 @@ def test_to_json_indent(self, indent):
 
         assert result == expected
 
+    @pytest.mark.skipif(
+        using_pyarrow_string_dtype(),
+        reason="Adjust expected when infer_string is default, no bug here, "
+        "just a complicated parametrization",
+    )
     @pytest.mark.parametrize(
         "orient,expected",
         [
@@ -1990,7 +2008,9 @@ def test_json_uint64(self):
     @pytest.mark.parametrize(
         "orient", ["split", "records", "values", "index", "columns"]
    )
-    def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
+    def test_read_json_dtype_backend(
+        self, string_storage, dtype_backend, orient, using_infer_string
+    ):
         # GH#50750
         pa = pytest.importorskip("pyarrow")
         df = DataFrame(
@@ -2006,7 +2026,10 @@ def test_read_json_dtype_backend(self, string_storage, dtype_backend, orient):
             }
         )
 
-        if string_storage == "python":
+        if using_infer_string:
+            string_array = ArrowStringArrayNumpySemantics(pa.array(["a", "b", "c"]))
+            string_array_na = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
+        elif string_storage == "python":
             string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
             string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
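For reference, the expected-array construction in ``test_read_json_dtype_backend`` now distinguishes the inferred-string storage from the classic ones. The snippet below mirrors the test code above; the ``ArrowStringArrayNumpySemantics`` import is taken from the patch, and pyarrow is assumed to be installed:

```python
import numpy as np
import pyarrow as pa

from pandas import NA
from pandas.core.arrays import StringArray
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

# Classic "python" storage: a StringArray over a NumPy object array,
# with pd.NA as the missing-value marker.
python_backed = StringArray(np.array(["a", "b", NA], dtype=np.object_))

# infer_string: pyarrow-backed storage with NumPy-like (NaN) semantics,
# which is what string[pyarrow_numpy] columns use under the hood.
arrow_backed = ArrowStringArrayNumpySemantics(pa.array(["a", "b", None]))
```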