Skip to content

Commit b85057e

Browse files
committed
ENH: Add engine keyword to read_json to enable reading from pyarrow pandas-dev#48893
1 parent 627d1b6 commit b85057e

File tree

5 files changed

+201
-47
lines changed

5 files changed

+201
-47
lines changed

pandas/_typing.py

+3
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,9 @@ def closed(self) -> bool:
319319
# read_csv engines
320320
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
321321

322+
# read_json engines
323+
JSONEngine = Literal["ujson", "pyarrow"]
324+
322325
# read_xml parsers
323326
XMLParsers = Literal["lxml", "etree"]
324327

pandas/io/json/_json.py

+69-14
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
DtypeArg,
3232
FilePath,
3333
IndexLabel,
34+
JSONEngine,
3435
JSONSerializable,
3536
ReadBuffer,
3637
StorageOptions,
@@ -69,6 +70,7 @@
6970
build_table_schema,
7071
parse_table_schema,
7172
)
73+
from pandas.io.json.arrow_json_parser_wrapper import ArrowJsonParserWrapper
7274
from pandas.io.parsers.readers import validate_integer
7375

7476
if TYPE_CHECKING:
@@ -389,6 +391,7 @@ def read_json(
389391
date_unit: str | None = ...,
390392
encoding: str | None = ...,
391393
encoding_errors: str | None = ...,
394+
engine: JSONEngine = ...,
392395
lines: bool = ...,
393396
chunksize: int,
394397
compression: CompressionOptions = ...,
@@ -417,6 +420,7 @@ def read_json(
417420
compression: CompressionOptions = ...,
418421
nrows: int | None = ...,
419422
storage_options: StorageOptions = ...,
423+
engine: JSONEngine = ...,
420424
) -> JsonReader[Literal["series"]]:
421425
...
422426

@@ -440,6 +444,7 @@ def read_json(
440444
compression: CompressionOptions = ...,
441445
nrows: int | None = ...,
442446
storage_options: StorageOptions = ...,
447+
engine: JSONEngine = ...,
443448
) -> Series:
444449
...
445450

@@ -463,6 +468,7 @@ def read_json(
463468
compression: CompressionOptions = ...,
464469
nrows: int | None = ...,
465470
storage_options: StorageOptions = ...,
471+
engine: JSONEngine = ...,
466472
) -> DataFrame:
467473
...
468474

@@ -489,6 +495,7 @@ def read_json(
489495
compression: CompressionOptions = "infer",
490496
nrows: int | None = None,
491497
storage_options: StorageOptions = None,
498+
engine: JSONEngine = "ujson",
492499
) -> DataFrame | Series | JsonReader:
493500
"""
494501
Convert a JSON string to pandas object.
@@ -597,6 +604,9 @@ def read_json(
597604
598605
.. versionadded:: 1.3.0
599606
607+
engine : {{'ujson', 'pyarrow'}}, default "ujson"
608+
Parser engine to use.
609+
600610
lines : bool, default False
601611
Read the file as a json object per line.
602612
@@ -738,6 +748,7 @@ def read_json(
738748
nrows=nrows,
739749
storage_options=storage_options,
740750
encoding_errors=encoding_errors,
751+
engine=engine,
741752
)
742753

743754
if chunksize:
@@ -773,6 +784,7 @@ def __init__(
773784
nrows: int | None,
774785
storage_options: StorageOptions = None,
775786
encoding_errors: str | None = "strict",
787+
engine: JSONEngine = "ujson",
776788
) -> None:
777789

778790
self.orient = orient
@@ -784,6 +796,7 @@ def __init__(
784796
self.precise_float = precise_float
785797
self.date_unit = date_unit
786798
self.encoding = encoding
799+
self.engine = engine
787800
self.compression = compression
788801
self.storage_options = storage_options
789802
self.lines = lines
@@ -801,9 +814,48 @@ def __init__(
801814
self.nrows = validate_integer("nrows", self.nrows, 0)
802815
if not self.lines:
803816
raise ValueError("nrows can only be passed if lines=True")
817+
if self.engine == "pyarrow":
818+
if not self.lines:
819+
raise ValueError(
820+
"currently pyarrow engine only supports "
821+
"the line-delimited JSON format"
822+
)
823+
if self.engine not in ["pyarrow", "ujson"]:
824+
raise ValueError(
825+
f"The engine type {self.engine} is currently not supported."
826+
)
827+
828+
if self.engine == "pyarrow":
829+
self._engine = self._make_engine(filepath_or_buffer)
830+
if self.engine == "ujson":
831+
data = self._get_data_from_filepath(filepath_or_buffer)
832+
self.data = self._preprocess_data(data)
833+
834+
def _make_engine(
835+
self,
836+
filepath_or_buffer: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
837+
) -> ArrowJsonParserWrapper:
838+
839+
if not isinstance(filepath_or_buffer, list):
840+
is_text = False
841+
mode = "rb"
842+
self.handles = get_handle(
843+
self._get_data_from_filepath(filepath_or_buffer),
844+
mode=mode,
845+
encoding=self.encoding,
846+
is_text=is_text,
847+
compression=self.compression,
848+
storage_options=self.storage_options,
849+
errors=self.encoding_errors,
850+
)
851+
filepath_or_buffer = self.handles.handle
804852

805-
data = self._get_data_from_filepath(filepath_or_buffer)
806-
self.data = self._preprocess_data(data)
853+
try:
854+
return ArrowJsonParserWrapper(filepath_or_buffer)
855+
except Exception:
856+
if self.handles is not None:
857+
self.handles.close()
858+
raise
807859

808860
def _preprocess_data(self, data):
809861
"""
@@ -888,19 +940,22 @@ def read(self) -> DataFrame | Series:
888940
"""
889941
obj: DataFrame | Series
890942
with self:
891-
if self.lines:
892-
if self.chunksize:
893-
obj = concat(self)
894-
elif self.nrows:
895-
lines = list(islice(self.data, self.nrows))
896-
lines_json = self._combine_lines(lines)
897-
obj = self._get_object_parser(lines_json)
943+
if self.engine == "pyarrow":
944+
obj = self._engine.read()
945+
if self.engine == "ujson":
946+
if self.lines:
947+
if self.chunksize:
948+
obj = concat(self)
949+
elif self.nrows:
950+
lines = list(islice(self.data, self.nrows))
951+
lines_json = self._combine_lines(lines)
952+
obj = self._get_object_parser(lines_json)
953+
else:
954+
data = ensure_str(self.data)
955+
data_lines = data.split("\n")
956+
obj = self._get_object_parser(self._combine_lines(data_lines))
898957
else:
899-
data = ensure_str(self.data)
900-
data_lines = data.split("\n")
901-
obj = self._get_object_parser(self._combine_lines(data_lines))
902-
else:
903-
obj = self._get_object_parser(self.data)
958+
obj = self._get_object_parser(self.data)
904959
return obj
905960

906961
def _get_object_parser(self, json) -> DataFrame | Series:
pandas/io/json/arrow_json_parser_wrapper.py

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
from pandas._typing import ReadBuffer
6+
from pandas.compat._optional import import_optional_dependency
7+
8+
if TYPE_CHECKING:
9+
from pandas import DataFrame
10+
11+
12+
class ArrowJsonParserWrapper:
13+
"""
14+
Wrapper for the pyarrow engine for read_json()
15+
"""
16+
17+
def __init__(self, src: ReadBuffer[bytes]) -> None:
18+
self.src = src
19+
20+
def read(self) -> DataFrame:
21+
"""
22+
Reads the contents of a JSON file into a DataFrame and
23+
processes it according to the kwargs passed in the
24+
constructor.
25+
26+
Returns
27+
-------
28+
DataFrame
29+
The DataFrame created from the JSON file.
30+
"""
31+
pyarrow_json = import_optional_dependency("pyarrow.json")
32+
table = pyarrow_json.read_json(self.src)
33+
34+
frame = table.to_pandas()
35+
return frame

pandas/tests/io/json/conftest.py

+24
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,27 @@ def orient(request):
77
Fixture for orients excluding the table format.
88
"""
99
return request.param
10+
11+
12+
@pytest.fixture
def json_dir_path(datapath):
    """
    The directory path to the data files needed for parser tests.
    """
    return datapath("io", "json", "data")
18+
19+
20+
@pytest.fixture(params=["ujson", "pyarrow"])
def engine(request):
    """
    Fixture parametrizing the ``read_json`` parser engine under test.
    """
    return request.param
23+
24+
25+
@pytest.fixture
def json_engine_pyarrow_xfail(request):
    """
    Fixture that xfails a test if the engine is pyarrow.
    """
    engine = request.getfixturevalue("engine")
    if engine == "pyarrow":
        mark = pytest.mark.xfail(reason="pyarrow doesn't support this.")
        # applymarker is the documented public pytest API for attaching a
        # marker to the current test, rather than reaching into request.node.
        request.applymarker(mark)

0 commit comments

Comments
 (0)