Skip to content

Commit 7f8baa0

Browse files
authored
ENH: Add use_nullable_dtypes option to read_json (#50750)
* ENH: Add use_nullable_dtypes option to read_json (#50750)

* Add gh ref

* Add test
1 parent ef4cf72 commit 7f8baa0

File tree

3 files changed

+129
-3
lines changed

3 files changed

+129
-3
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
4141
* :func:`read_excel`
4242
* :func:`read_html`
4343
* :func:`read_xml`
44+
* :func:`read_json`
4445
* :func:`read_sql`
4546
* :func:`read_sql_query`
4647
* :func:`read_sql_table`
@@ -56,6 +57,7 @@ to select the nullable dtypes implementation.
5657
* :func:`read_excel`
5758
* :func:`read_html`
5859
* :func:`read_xml`
60+
* :func:`read_json`
5961
* :func:`read_parquet`
6062
* :func:`read_orc`
6163
* :func:`read_feather`

pandas/io/json/_json.py

+37-3
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
ensure_str,
4444
is_period_dtype,
4545
)
46+
from pandas.core.dtypes.generic import ABCIndex
4647

4748
from pandas import (
4849
DataFrame,
@@ -396,6 +397,7 @@ def read_json(
396397
compression: CompressionOptions = ...,
397398
nrows: int | None = ...,
398399
storage_options: StorageOptions = ...,
400+
use_nullable_dtypes: bool = ...,
399401
) -> JsonReader[Literal["frame"]]:
400402
...
401403

@@ -419,6 +421,7 @@ def read_json(
419421
compression: CompressionOptions = ...,
420422
nrows: int | None = ...,
421423
storage_options: StorageOptions = ...,
424+
use_nullable_dtypes: bool = ...,
422425
) -> JsonReader[Literal["series"]]:
423426
...
424427

@@ -442,6 +445,7 @@ def read_json(
442445
compression: CompressionOptions = ...,
443446
nrows: int | None = ...,
444447
storage_options: StorageOptions = ...,
448+
use_nullable_dtypes: bool = ...,
445449
) -> Series:
446450
...
447451

@@ -465,6 +469,7 @@ def read_json(
465469
compression: CompressionOptions = ...,
466470
nrows: int | None = ...,
467471
storage_options: StorageOptions = ...,
472+
use_nullable_dtypes: bool = ...,
468473
) -> DataFrame:
469474
...
470475

@@ -491,6 +496,7 @@ def read_json(
491496
compression: CompressionOptions = "infer",
492497
nrows: int | None = None,
493498
storage_options: StorageOptions = None,
499+
use_nullable_dtypes: bool = False,
494500
) -> DataFrame | Series | JsonReader:
495501
"""
496502
Convert a JSON string to pandas object.
@@ -629,6 +635,19 @@ def read_json(
629635
630636
.. versionadded:: 1.2.0
631637
638+
use_nullable_dtypes : bool, default False
639+
Whether or not to use nullable dtypes as default when reading data. If
640+
set to True, nullable dtypes are used for all dtypes that have a nullable
641+
implementation, even if no nulls are present.
642+
643+
The nullable dtype implementation can be configured by calling
644+
``pd.set_option("mode.dtype_backend", "pandas")`` to use
645+
numpy-backed nullable dtypes or
646+
``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
647+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
648+
649+
.. versionadded:: 2.0
650+
632651
Returns
633652
-------
634653
Series or DataFrame
@@ -740,6 +759,7 @@ def read_json(
740759
nrows=nrows,
741760
storage_options=storage_options,
742761
encoding_errors=encoding_errors,
762+
use_nullable_dtypes=use_nullable_dtypes,
743763
)
744764

745765
if chunksize:
@@ -775,6 +795,7 @@ def __init__(
775795
nrows: int | None,
776796
storage_options: StorageOptions = None,
777797
encoding_errors: str | None = "strict",
798+
use_nullable_dtypes: bool = False,
778799
) -> None:
779800

780801
self.orient = orient
@@ -794,6 +815,7 @@ def __init__(
794815
self.nrows = nrows
795816
self.encoding_errors = encoding_errors
796817
self.handles: IOHandles[str] | None = None
818+
self.use_nullable_dtypes = use_nullable_dtypes
797819

798820
if self.chunksize is not None:
799821
self.chunksize = validate_integer("chunksize", self.chunksize, 1)
@@ -903,7 +925,10 @@ def read(self) -> DataFrame | Series:
903925
obj = self._get_object_parser(self._combine_lines(data_lines))
904926
else:
905927
obj = self._get_object_parser(self.data)
906-
return obj
928+
if self.use_nullable_dtypes:
929+
return obj.convert_dtypes(infer_objects=False)
930+
else:
931+
return obj
907932

908933
def _get_object_parser(self, json) -> DataFrame | Series:
909934
"""
@@ -919,6 +944,7 @@ def _get_object_parser(self, json) -> DataFrame | Series:
919944
"keep_default_dates": self.keep_default_dates,
920945
"precise_float": self.precise_float,
921946
"date_unit": self.date_unit,
947+
"use_nullable_dtypes": self.use_nullable_dtypes,
922948
}
923949
obj = None
924950
if typ == "frame":
@@ -977,7 +1003,10 @@ def __next__(self) -> DataFrame | Series:
9771003
self.close()
9781004
raise ex
9791005

980-
return obj
1006+
if self.use_nullable_dtypes:
1007+
return obj.convert_dtypes(infer_objects=False)
1008+
else:
1009+
return obj
9811010

9821011
def __enter__(self) -> JsonReader[FrameSeriesStrT]:
9831012
return self
@@ -1013,6 +1042,7 @@ def __init__(
10131042
keep_default_dates: bool = False,
10141043
precise_float: bool = False,
10151044
date_unit=None,
1045+
use_nullable_dtypes: bool = False,
10161046
) -> None:
10171047
self.json = json
10181048

@@ -1037,6 +1067,7 @@ def __init__(
10371067
self.date_unit = date_unit
10381068
self.keep_default_dates = keep_default_dates
10391069
self.obj: DataFrame | Series | None = None
1070+
self.use_nullable_dtypes = use_nullable_dtypes
10401071

10411072
def check_keys_split(self, decoded) -> None:
10421073
"""
@@ -1119,7 +1150,10 @@ def _try_convert_data(
11191150
if result:
11201151
return new_data, True
11211152

1122-
if data.dtype == "object":
1153+
if self.use_nullable_dtypes and not isinstance(data, ABCIndex):
1154+
# Fall through for conversion later on
1155+
return data, True
1156+
elif data.dtype == "object":
11231157

11241158
# try float
11251159
try:

pandas/tests/io/json/test_pandas.py

+90
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,18 @@
1515

1616
import pandas as pd
1717
from pandas import (
18+
NA,
1819
DataFrame,
1920
DatetimeIndex,
2021
Series,
2122
Timestamp,
2223
read_json,
2324
)
2425
import pandas._testing as tm
26+
from pandas.core.arrays import (
27+
ArrowStringArray,
28+
StringArray,
29+
)
2530

2631

2732
def assert_json_roundtrip_equal(result, expected, orient):
@@ -1863,3 +1868,88 @@ def test_json_uint64(self):
18631868
df = DataFrame(data={"col1": [13342205958987758245, 12388075603347835679]})
18641869
result = df.to_json(orient="split")
18651870
assert result == expected
1871+
1872+
@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
1873+
@pytest.mark.parametrize(
1874+
"orient", ["split", "records", "values", "index", "columns"]
1875+
)
1876+
def test_read_json_nullable(self, string_storage, dtype_backend, orient):
1877+
# GH#50750
1878+
pa = pytest.importorskip("pyarrow")
1879+
df = DataFrame(
1880+
{
1881+
"a": Series([1, np.nan, 3], dtype="Int64"),
1882+
"b": Series([1, 2, 3], dtype="Int64"),
1883+
"c": Series([1.5, np.nan, 2.5], dtype="Float64"),
1884+
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
1885+
"e": [True, False, None],
1886+
"f": [True, False, True],
1887+
"g": ["a", "b", "c"],
1888+
"h": ["a", "b", None],
1889+
}
1890+
)
1891+
1892+
if string_storage == "python":
1893+
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
1894+
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
1895+
1896+
else:
1897+
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
1898+
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
1899+
1900+
out = df.to_json(orient=orient)
1901+
with pd.option_context("mode.string_storage", string_storage):
1902+
with pd.option_context("mode.dtype_backend", dtype_backend):
1903+
result = read_json(out, use_nullable_dtypes=True, orient=orient)
1904+
1905+
expected = DataFrame(
1906+
{
1907+
"a": Series([1, np.nan, 3], dtype="Int64"),
1908+
"b": Series([1, 2, 3], dtype="Int64"),
1909+
"c": Series([1.5, np.nan, 2.5], dtype="Float64"),
1910+
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
1911+
"e": Series([True, False, NA], dtype="boolean"),
1912+
"f": Series([True, False, True], dtype="boolean"),
1913+
"g": string_array,
1914+
"h": string_array_na,
1915+
}
1916+
)
1917+
1918+
if dtype_backend == "pyarrow":
1919+
1920+
from pandas.arrays import ArrowExtensionArray
1921+
1922+
expected = DataFrame(
1923+
{
1924+
col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
1925+
for col in expected.columns
1926+
}
1927+
)
1928+
1929+
if orient == "values":
1930+
expected.columns = list(range(0, 8))
1931+
1932+
tm.assert_frame_equal(result, expected)
1933+
1934+
@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
1935+
@pytest.mark.parametrize("orient", ["split", "records", "index"])
1936+
def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
1937+
# GH#50750
1938+
pa = pytest.importorskip("pyarrow")
1939+
ser = Series([1, np.nan, 3], dtype="Int64")
1940+
1941+
out = ser.to_json(orient=orient)
1942+
with pd.option_context("mode.string_storage", string_storage):
1943+
with pd.option_context("mode.dtype_backend", dtype_backend):
1944+
result = read_json(
1945+
out, use_nullable_dtypes=True, orient=orient, typ="series"
1946+
)
1947+
1948+
expected = Series([1, np.nan, 3], dtype="Int64")
1949+
1950+
if dtype_backend == "pyarrow":
1951+
from pandas.arrays import ArrowExtensionArray
1952+
1953+
expected = Series(ArrowExtensionArray(pa.array(expected, from_pandas=True)))
1954+
1955+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)