Skip to content

Commit 94f9412

Browse files
authored
read_json engine keyword and pyarrow integration (#49249)
1 parent 35d76e9 commit 94f9412

File tree

7 files changed

+223
-45
lines changed

7 files changed

+223
-45
lines changed

doc/source/user_guide/io.rst

+12
Original file line numberDiff line numberDiff line change
@@ -2069,6 +2069,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series``
20692069
* ``lines`` : reads file as one json object per line.
20702070
* ``encoding`` : The encoding to use to decode py3 bytes.
20712071
* ``chunksize`` : when used in combination with ``lines=True``, return a JsonReader which reads in ``chunksize`` lines per iteration.
2072+
* ``engine`` : Either ``"ujson"``, the built-in JSON parser, or ``"pyarrow"``, which dispatches to pyarrow's ``pyarrow.json.read_json``.
2073+
The ``"pyarrow"`` engine is only available when ``lines=True``.
20722074

20732075
The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable.
20742076

@@ -2250,6 +2252,16 @@ For line-delimited json files, pandas can also return an iterator which reads in
22502252
for chunk in reader:
22512253
print(chunk)
22522254
2255+
Line-delimited json can also be read using the pyarrow reader by specifying ``engine="pyarrow"``.
2256+
2257+
.. ipython:: python
2258+
2259+
from io import BytesIO
2260+
df = pd.read_json(BytesIO(jsonl.encode()), lines=True, engine="pyarrow")
2261+
df
2262+
2263+
.. versionadded:: 2.0.0
2264+
22532265
.. _io.table_schema:
22542266

22552267
Table schema

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ Other enhancements
305305
- Added :meth:`DatetimeIndex.as_unit` and :meth:`TimedeltaIndex.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`50616`)
306306
- Added :meth:`Series.dt.unit` and :meth:`Series.dt.as_unit` to convert to different resolutions; supported resolutions are "s", "ms", "us", and "ns" (:issue:`51223`)
307307
- Added new argument ``dtype`` to :func:`read_sql` to be consistent with :func:`read_sql_query` (:issue:`50797`)
308+
- Added new argument ``engine`` to :func:`read_json` to support parsing JSON with pyarrow by specifying ``engine="pyarrow"`` (:issue:`48893`)
308309
- Added support for SQLAlchemy 2.0 (:issue:`40686`)
309310
-
310311

pandas/_typing.py

+3
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,9 @@ def closed(self) -> bool:
324324
# read_csv engines
325325
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
326326

327+
# read_json engines
328+
JSONEngine = Literal["ujson", "pyarrow"]
329+
327330
# read_xml parsers
328331
XMLParsers = Literal["lxml", "etree"]
329332

pandas/io/json/_json.py

+76-20
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@
2121

2222
import numpy as np
2323

24-
from pandas._config import using_nullable_dtypes
24+
from pandas._config import (
25+
get_option,
26+
using_nullable_dtypes,
27+
)
2528

2629
from pandas._libs import lib
2730
from pandas._libs.json import (
@@ -34,11 +37,13 @@
3437
DtypeArg,
3538
FilePath,
3639
IndexLabel,
40+
JSONEngine,
3741
JSONSerializable,
3842
ReadBuffer,
3943
StorageOptions,
4044
WriteBuffer,
4145
)
46+
from pandas.compat._optional import import_optional_dependency
4247
from pandas.errors import AbstractMethodError
4348
from pandas.util._decorators import doc
4449

@@ -401,6 +406,7 @@ def read_json(
401406
nrows: int | None = ...,
402407
storage_options: StorageOptions = ...,
403408
use_nullable_dtypes: bool = ...,
409+
engine: JSONEngine = ...,
404410
) -> JsonReader[Literal["frame"]]:
405411
...
406412

@@ -425,6 +431,7 @@ def read_json(
425431
nrows: int | None = ...,
426432
storage_options: StorageOptions = ...,
427433
use_nullable_dtypes: bool = ...,
434+
engine: JSONEngine = ...,
428435
) -> JsonReader[Literal["series"]]:
429436
...
430437

@@ -449,6 +456,7 @@ def read_json(
449456
nrows: int | None = ...,
450457
storage_options: StorageOptions = ...,
451458
use_nullable_dtypes: bool = ...,
459+
engine: JSONEngine = ...,
452460
) -> Series:
453461
...
454462

@@ -473,6 +481,7 @@ def read_json(
473481
nrows: int | None = ...,
474482
storage_options: StorageOptions = ...,
475483
use_nullable_dtypes: bool = ...,
484+
engine: JSONEngine = ...,
476485
) -> DataFrame:
477486
...
478487

@@ -500,6 +509,7 @@ def read_json(
500509
nrows: int | None = None,
501510
storage_options: StorageOptions = None,
502511
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
512+
engine: JSONEngine = "ujson",
503513
) -> DataFrame | Series | JsonReader:
504514
"""
505515
Convert a JSON string to pandas object.
@@ -653,6 +663,12 @@ def read_json(
653663
654664
.. versionadded:: 2.0
655665
666+
engine : {{"ujson", "pyarrow"}}, default "ujson"
667+
Parser engine to use. The ``"pyarrow"`` engine is only available when
668+
``lines=True``.
669+
670+
.. versionadded:: 2.0
671+
656672
Returns
657673
-------
658674
Series or DataFrame
@@ -771,6 +787,7 @@ def read_json(
771787
storage_options=storage_options,
772788
encoding_errors=encoding_errors,
773789
use_nullable_dtypes=use_nullable_dtypes,
790+
engine=engine,
774791
)
775792

776793
if chunksize:
@@ -807,6 +824,7 @@ def __init__(
807824
storage_options: StorageOptions = None,
808825
encoding_errors: str | None = "strict",
809826
use_nullable_dtypes: bool = False,
827+
engine: JSONEngine = "ujson",
810828
) -> None:
811829

812830
self.orient = orient
@@ -818,6 +836,7 @@ def __init__(
818836
self.precise_float = precise_float
819837
self.date_unit = date_unit
820838
self.encoding = encoding
839+
self.engine = engine
821840
self.compression = compression
822841
self.storage_options = storage_options
823842
self.lines = lines
@@ -828,17 +847,32 @@ def __init__(
828847
self.handles: IOHandles[str] | None = None
829848
self.use_nullable_dtypes = use_nullable_dtypes
830849

850+
if self.engine not in {"pyarrow", "ujson"}:
851+
raise ValueError(
852+
f"The engine type {self.engine} is currently not supported."
853+
)
831854
if self.chunksize is not None:
832855
self.chunksize = validate_integer("chunksize", self.chunksize, 1)
833856
if not self.lines:
834857
raise ValueError("chunksize can only be passed if lines=True")
858+
if self.engine == "pyarrow":
859+
raise ValueError(
860+
"currently pyarrow engine doesn't support chunksize parameter"
861+
)
835862
if self.nrows is not None:
836863
self.nrows = validate_integer("nrows", self.nrows, 0)
837864
if not self.lines:
838865
raise ValueError("nrows can only be passed if lines=True")
839-
840-
data = self._get_data_from_filepath(filepath_or_buffer)
841-
self.data = self._preprocess_data(data)
866+
if self.engine == "pyarrow":
867+
if not self.lines:
868+
raise ValueError(
869+
"currently pyarrow engine only supports "
870+
"the line-delimited JSON format"
871+
)
872+
self.data = filepath_or_buffer
873+
elif self.engine == "ujson":
874+
data = self._get_data_from_filepath(filepath_or_buffer)
875+
self.data = self._preprocess_data(data)
842876

843877
def _preprocess_data(self, data):
844878
"""
@@ -923,23 +957,45 @@ def read(self) -> DataFrame | Series:
923957
"""
924958
obj: DataFrame | Series
925959
with self:
926-
if self.lines:
927-
if self.chunksize:
928-
obj = concat(self)
929-
elif self.nrows:
930-
lines = list(islice(self.data, self.nrows))
931-
lines_json = self._combine_lines(lines)
932-
obj = self._get_object_parser(lines_json)
960+
if self.engine == "pyarrow":
961+
pyarrow_json = import_optional_dependency("pyarrow.json")
962+
pa_table = pyarrow_json.read_json(self.data)
963+
if self.use_nullable_dtypes:
964+
if get_option("mode.dtype_backend") == "pyarrow":
965+
from pandas.arrays import ArrowExtensionArray
966+
967+
return DataFrame(
968+
{
969+
col_name: ArrowExtensionArray(pa_col)
970+
for col_name, pa_col in zip(
971+
pa_table.column_names, pa_table.itercolumns()
972+
)
973+
}
974+
)
975+
elif get_option("mode.dtype_backend") == "pandas":
976+
from pandas.io._util import _arrow_dtype_mapping
977+
978+
mapping = _arrow_dtype_mapping()
979+
return pa_table.to_pandas(types_mapper=mapping.get)
980+
return pa_table.to_pandas()
981+
elif self.engine == "ujson":
982+
if self.lines:
983+
if self.chunksize:
984+
obj = concat(self)
985+
elif self.nrows:
986+
lines = list(islice(self.data, self.nrows))
987+
lines_json = self._combine_lines(lines)
988+
obj = self._get_object_parser(lines_json)
989+
else:
990+
data = ensure_str(self.data)
991+
data_lines = data.split("\n")
992+
obj = self._get_object_parser(self._combine_lines(data_lines))
933993
else:
934-
data = ensure_str(self.data)
935-
data_lines = data.split("\n")
936-
obj = self._get_object_parser(self._combine_lines(data_lines))
937-
else:
938-
obj = self._get_object_parser(self.data)
939-
if self.use_nullable_dtypes:
940-
return obj.convert_dtypes(infer_objects=False)
941-
else:
942-
return obj
994+
obj = self._get_object_parser(self.data)
995+
if self.use_nullable_dtypes:
996+
return obj.convert_dtypes(infer_objects=False)
997+
else:
998+
return obj
943999

9441000
def _get_object_parser(self, json) -> DataFrame | Series:
9451001
"""

pandas/tests/io/json/conftest.py

+7
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,10 @@ def orient(request):
77
Fixture for orients excluding the table format.
88
"""
99
return request.param
10+
11+
12+
@pytest.fixture(params=["ujson", "pyarrow"])
13+
def engine(request):
14+
if request.param == "pyarrow":
15+
pytest.importorskip("pyarrow.json")
16+
return request.param

pandas/tests/io/json/test_pandas.py

+16
Original file line numberDiff line numberDiff line change
@@ -1956,3 +1956,19 @@ def test_read_json_nullable_series(self, string_storage, dtype_backend, orient):
19561956
expected = Series(ArrowExtensionArray(pa.array(expected, from_pandas=True)))
19571957

19581958
tm.assert_series_equal(result, expected)
1959+
1960+
1961+
def test_invalid_engine():
1962+
# GH 48893
1963+
ser = Series(range(1))
1964+
out = ser.to_json()
1965+
with pytest.raises(ValueError, match="The engine type foo"):
1966+
read_json(out, engine="foo")
1967+
1968+
1969+
def test_pyarrow_engine_lines_false():
1970+
# GH 48893
1971+
ser = Series(range(1))
1972+
out = ser.to_json()
1973+
with pytest.raises(ValueError, match="currently pyarrow engine only supports"):
1974+
read_json(out, engine="pyarrow", lines=False)

0 commit comments

Comments
 (0)