read_json engine keyword and pyarrow integration (#49249)

abkosar · web-flow · commit 94f9412c85e2 · 2023-02-10T13:26:44.000+01:00
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -2069,6 +2069,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
 * ``lines`` : reads file as one json object per line.
 * ``encoding`` : The encoding to use to decode py3 bytes.
 * ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration.
+* ``engine`` : Either ``"ujson"``, the built-in JSON parser, or ``"pyarrow"``, which dispatches to pyarrow's ``pyarrow.json.read_json``.
+  The ``"pyarrow"`` engine is only available when ``lines=True``.
 
 The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable.
 
@@ -2250,6 +2252,16 @@ For line-delimited json files, pandas can also return an iterator which reads in
       for chunk in reader:
           print(chunk)
 
+Line-delimited json can also be read using the pyarrow reader by specifying ``engine="pyarrow"``.
+
+.. ipython:: python
+
+   from io import BytesIO
+   df = pd.read_json(BytesIO(jsonl.encode()), lines=True, engine="pyarrow")
+   df
+
+.. versionadded:: 2.0.0
+
 .. _io.table_schema:
 
 Table schema
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -305,6 +305,7 @@ Other enhancements
 - Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`)
 - Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`)
 - Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`)
+- Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
 - Added support for SQLAlchemy 2.0 (:issue:`40686`)
 -
 
diff --git a/pandas/_typing.py b/pandas/_typing.py
@@ -324,6 +324,9 @@ def closed(self) -> bool:
 # read_csv engines
 CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
 
+# read_json engines
+JSONEngine = Literal["ujson", "pyarrow"]
+
 # read_xml parsers
 XMLParsers = Literal["lxml", "etree"]
 
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
@@ -21,7 +21,10 @@
 
 import numpy as np
 
-from pandas._config import using_nullable_dtypes
+from pandas._config import (
+    get_option,
+    using_nullable_dtypes,
+)
 
 from pandas._libs import lib
 from pandas._libs.json import (
@@ -34,11 +37,13 @@
     DtypeArg,
     FilePath,
     IndexLabel,
+    JSONEngine,
     JSONSerializable,
     ReadBuffer,
     StorageOptions,
     WriteBuffer,
 )
+from pandas.compat._optional import import_optional_dependency
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
 
@@ -401,6 +406,7 @@ def read_json(
     nrows: int | None = ...,
     storage_options: StorageOptions = ...,
     use_nullable_dtypes: bool = ...,
+    engine: JSONEngine = ...,
 ) -> JsonReader[Literal["frame"]]:
     ...
 
@@ -425,6 +431,7 @@ def read_json(
     nrows: int | None = ...,
     storage_options: StorageOptions = ...,
     use_nullable_dtypes: bool = ...,
+    engine: JSONEngine = ...,
 ) -> JsonReader[Literal["series"]]:
     ...
 
@@ -449,6 +456,7 @@ def read_json(
     nrows: int | None = ...,
     storage_options: StorageOptions = ...,
     use_nullable_dtypes: bool = ...,
+    engine: JSONEngine = ...,
 ) -> Series:
     ...
 
@@ -473,6 +481,7 @@ def read_json(
     nrows: int | None = ...,
     storage_options: StorageOptions = ...,
     use_nullable_dtypes: bool = ...,
+    engine: JSONEngine = ...,
 ) -> DataFrame:
     ...
 
@@ -500,6 +509,7 @@ def read_json(
     nrows: int | None = None,
     storage_options: StorageOptions = None,
     use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
+    engine: JSONEngine = "ujson",
 ) -> DataFrame | Series | JsonReader:
     """
     Convert a JSON string to pandas object.
@@ -653,6 +663,12 @@ def read_json(
 
         .. versionadded:: 2.0
 
+    engine : {{"ujson", "pyarrow"}}, default "ujson"
+        Parser engine to use. The ``"pyarrow"`` engine is only available when
+        ``lines=True``.
+
+        .. versionadded:: 2.0
+
     Returns
     -------
     Series or DataFrame
@@ -771,6 +787,7 @@ def read_json(
         storage_options=storage_options,
         encoding_errors=encoding_errors,
         use_nullable_dtypes=use_nullable_dtypes,
+        engine=engine,
     )
 
     if chunksize:
@@ -807,6 +824,7 @@ def __init__(
         storage_options: StorageOptions = None,
         encoding_errors: str | None = "strict",
         use_nullable_dtypes: bool = False,
+        engine: JSONEngine = "ujson",
     ) -> None:
 
         self.orient = orient
@@ -818,6 +836,7 @@ def __init__(
         self.precise_float = precise_float
         self.date_unit = date_unit
         self.encoding = encoding
+        self.engine = engine
         self.compression = compression
         self.storage_options = storage_options
         self.lines = lines
@@ -828,17 +847,32 @@ def __init__(
         self.handles: IOHandles[str] | None = None
         self.use_nullable_dtypes = use_nullable_dtypes
 
+        if self.engine not in {"pyarrow", "ujson"}:
+            raise ValueError(
+                f"The engine type {self.engine} is currently not supported."
+            )
         if self.chunksize is not None:
             self.chunksize = validate_integer("chunksize", self.chunksize, 1)
             if not self.lines:
                 raise ValueError("chunksize can only be passed if lines=True")
+            if self.engine == "pyarrow":
+                raise ValueError(
+                    "currently pyarrow engine doesn't support chunksize parameter"
+                )
         if self.nrows is not None:
             self.nrows = validate_integer("nrows", self.nrows, 0)
             if not self.lines:
                 raise ValueError("nrows can only be passed if lines=True")
-
-        data = self._get_data_from_filepath(filepath_or_buffer)
-        self.data = self._preprocess_data(data)
+        if self.engine == "pyarrow":
+            if not self.lines:
+                raise ValueError(
+                    "currently pyarrow engine only supports "
+                    "the line-delimited JSON format"
+                )
+            self.data = filepath_or_buffer
+        elif self.engine == "ujson":
+            data = self._get_data_from_filepath(filepath_or_buffer)
+            self.data = self._preprocess_data(data)
 
     def _preprocess_data(self, data):
         """
@@ -923,23 +957,45 @@ def read(self) -> DataFrame | Series:
         """
         obj: DataFrame | Series
         with self:
-            if self.lines:
-                if self.chunksize:
-                    obj = concat(self)
-                elif self.nrows:
-                    lines = list(islice(self.data, self.nrows))
-                    lines_json = self._combine_lines(lines)
-                    obj = self._get_object_parser(lines_json)
+            if self.engine == "pyarrow":
+                pyarrow_json = import_optional_dependency("pyarrow.json")
+                pa_table = pyarrow_json.read_json(self.data)
+                if self.use_nullable_dtypes:
+                    if get_option("mode.dtype_backend") == "pyarrow":
+                        from pandas.arrays import ArrowExtensionArray
+
+                        return DataFrame(
+                            {
+                                col_name: ArrowExtensionArray(pa_col)
+                                for col_name, pa_col in zip(
+                                    pa_table.column_names, pa_table.itercolumns()
+                                )
+                            }
+                        )
+                    elif get_option("mode.dtype_backend") == "pandas":
+                        from pandas.io._util import _arrow_dtype_mapping
+
+                        mapping = _arrow_dtype_mapping()
+                        return pa_table.to_pandas(types_mapper=mapping.get)
+                return pa_table.to_pandas()
+            elif self.engine == "ujson":
+                if self.lines:
+                    if self.chunksize:
+                        obj = concat(self)
+                    elif self.nrows:
+                        lines = list(islice(self.data, self.nrows))
+                        lines_json = self._combine_lines(lines)
+                        obj = self._get_object_parser(lines_json)
+                    else:
+                        data = ensure_str(self.data)
+                        data_lines = data.split("\n")
+                        obj = self._get_object_parser(self._combine_lines(data_lines))
                 else:
-                    data = ensure_str(self.data)
-                    data_lines = data.split("\n")
-                    obj = self._get_object_parser(self._combine_lines(data_lines))
-            else:
-                obj = self._get_object_parser(self.data)
-        if self.use_nullable_dtypes:
-            return obj.convert_dtypes(infer_objects=False)
-        else:
-            return obj
+                    obj = self._get_object_parser(self.data)
+                if self.use_nullable_dtypes:
+                    return obj.convert_dtypes(infer_objects=False)
+                else:
+                    return obj
 
     def _get_object_parser(self, json) -> DataFrame | Series:
         """
diff --git a/pandas/tests/io/json/conftest.py b/pandas/tests/io/json/conftest.py
@@ -7,3 +7,10 @@ def orient(request):
     Fixture for orients excluding the table format.
     """
     return request.param
+
+
+@pytest.fixture(params=["ujson", "pyarrow"])
+def engine(request):
+    if request.param == "pyarrow":
+        pytest.importorskip("pyarrow.json")
+    return request.param
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -1956,3 +1956,19 @@ def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
             expected = Series(ArrowExtensionArray(pa.array(expected, from_pandas=True)))
 
         tm.assert_series_equal(result, expected)
+
+
+def test_invalid_engine():
+    # GH 48893
+    ser = Series(range(1))
+    out = ser.to_json()
+    with pytest.raises(ValueError, match="The engine type foo"):
+        read_json(out, engine="foo")
+
+
+def test_pyarrow_engine_lines_false():
+    # GH 48893
+    ser = Series(range(1))
+    out = ser.to_json()
+    with pytest.raises(ValueError, match="currently pyarrow engine only supports"):
+        read_json(out, engine="pyarrow", lines=False)
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py

Original file line number	Diff line number	Diff line change
`@@ -305,6 +305,7 @@ Other enhancements`
`305`	`305`	- Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`)
`306`	`306`	- Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`)
`307`	`307`	- Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`)
	`308`	+- Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
`308`	`309`	- Added support for SQLAlchemy 2.0 (:issue:`40686`)
`309`	`310`	`-`
`310`	`311`