Skip to content

Commit f6eee83

Browse files
authored
BUG: read_json not handling string dtype when converting to dates (#56195)
1 parent a29e4f6 commit f6eee83

File tree

3 files changed

+27
-15
lines changed

3 files changed

+27
-15
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,7 @@ I/O
531531
- Bug in :func:`read_csv` where ``on_bad_lines="warn"`` would write to ``stderr`` instead of raise a Python warning. This now yields a :class:`.errors.ParserWarning` (:issue:`54296`)
532532
- Bug in :func:`read_csv` with ``engine="pyarrow"`` where ``usecols`` wasn't working with a csv with no headers (:issue:`54459`)
533533
- Bug in :func:`read_excel`, with ``engine="xlrd"`` (``xls`` files) erroring when file contains NaNs/Infs (:issue:`54564`)
534+
- Bug in :func:`read_json` not handling dtype conversion properly if ``infer_string`` is set (:issue:`56195`)
534535
- Bug in :func:`to_excel`, with ``OdsWriter`` (``ods`` files) writing boolean/string values (:issue:`54994`)
535536
- Bug in :meth:`DataFrame.to_hdf` and :func:`read_hdf` with ``datetime64`` dtypes with non-nanosecond resolution failing to round-trip correctly (:issue:`55622`)
536537
- Bug in :meth:`pandas.read_excel` with ``engine="odf"`` (``ods`` files) when a string contains an annotation (:issue:`55200`)

pandas/io/json/_json.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,10 @@
3232
from pandas.util._exceptions import find_stack_level
3333
from pandas.util._validators import check_dtype_backend
3434

35-
from pandas.core.dtypes.common import ensure_str
35+
from pandas.core.dtypes.common import (
36+
ensure_str,
37+
is_string_dtype,
38+
)
3639
from pandas.core.dtypes.dtypes import PeriodDtype
3740
from pandas.core.dtypes.generic import ABCIndex
3841

@@ -1249,7 +1252,7 @@ def _try_convert_data(
12491252
if self.dtype_backend is not lib.no_default and not isinstance(data, ABCIndex):
12501253
# Fall through for conversion later on
12511254
return data, True
1252-
elif data.dtype == "object":
1255+
elif is_string_dtype(data.dtype):
12531256
# try float
12541257
try:
12551258
data = data.astype("float64")
@@ -1301,6 +1304,10 @@ def _try_convert_to_date(self, data):
13011304
return data, False
13021305

13031306
new_data = data
1307+
1308+
if new_data.dtype == "string":
1309+
new_data = new_data.astype(object)
1310+
13041311
if new_data.dtype == "object":
13051312
try:
13061313
new_data = data.astype("int64")

pandas/tests/io/json/test_compression.py

+17-13
Original file line numberDiff line numberDiff line change
@@ -93,27 +93,31 @@ def test_read_unsupported_compression_type():
9393
pd.read_json(path, compression="unsupported")
9494

9595

96+
@pytest.mark.parametrize(
97+
"infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
98+
)
9699
@pytest.mark.parametrize("to_infer", [True, False])
97100
@pytest.mark.parametrize("read_infer", [True, False])
98101
def test_to_json_compression(
99-
compression_only, read_infer, to_infer, compression_to_extension
102+
compression_only, read_infer, to_infer, compression_to_extension, infer_string
100103
):
101-
# see gh-15008
102-
compression = compression_only
104+
with pd.option_context("future.infer_string", infer_string):
105+
# see gh-15008
106+
compression = compression_only
103107

104-
# We'll complete file extension subsequently.
105-
filename = "test."
106-
filename += compression_to_extension[compression]
108+
# We'll complete file extension subsequently.
109+
filename = "test."
110+
filename += compression_to_extension[compression]
107111

108-
df = pd.DataFrame({"A": [1]})
112+
df = pd.DataFrame({"A": [1]})
109113

110-
to_compression = "infer" if to_infer else compression
111-
read_compression = "infer" if read_infer else compression
114+
to_compression = "infer" if to_infer else compression
115+
read_compression = "infer" if read_infer else compression
112116

113-
with tm.ensure_clean(filename) as path:
114-
df.to_json(path, compression=to_compression)
115-
result = pd.read_json(path, compression=read_compression)
116-
tm.assert_frame_equal(result, df)
117+
with tm.ensure_clean(filename) as path:
118+
df.to_json(path, compression=to_compression)
119+
result = pd.read_json(path, compression=read_compression)
120+
tm.assert_frame_equal(result, df)
117121

118122

119123
def test_to_json_compression_mode(compression):

0 commit comments

Comments
 (0)