Skip to content

Commit 9f915d7

Browse files
committed
added ArrowJsonParser and tests
1 parent 28c4629 commit 9f915d7

File tree

4 files changed

+192
-38
lines changed

4 files changed

+192
-38
lines changed

pandas/io/json/_json.py

+52-38
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,7 @@
7373
build_table_schema,
7474
parse_table_schema,
7575
)
76-
from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper
77-
from pandas.io.parsers.base_parser import ParserBase
76+
from pandas.io.json.arrow_json_parser_wrapper import ArrowJsonParserWrapper
7877
from pandas.io.parsers.readers import validate_integer
7978

8079
if TYPE_CHECKING:
@@ -383,7 +382,7 @@ def read_json(
383382
date_unit: str | None = ...,
384383
encoding: str | None = ...,
385384
encoding_errors: str | None = ...,
386-
engine: JSONEngine | None = ...,
385+
engine: JSONEngine = ...,
387386
lines: bool = ...,
388387
chunksize: int,
389388
compression: CompressionOptions = ...,
@@ -408,7 +407,7 @@ def read_json(
408407
date_unit: str | None = ...,
409408
encoding: str | None = ...,
410409
encoding_errors: str | None = ...,
411-
engine: JSONEngine | None = ...,
410+
engine: JSONEngine = ...,
412411
lines: bool = ...,
413412
chunksize: int,
414413
compression: CompressionOptions = ...,
@@ -433,7 +432,7 @@ def read_json(
433432
date_unit: str | None = ...,
434433
encoding: str | None = ...,
435434
encoding_errors: str | None = ...,
436-
engine: JSONEngine | None = ...,
435+
engine: JSONEngine = ...,
437436
lines: bool = ...,
438437
chunksize: None = ...,
439438
compression: CompressionOptions = ...,
@@ -457,7 +456,7 @@ def read_json(
457456
date_unit: str | None = ...,
458457
encoding: str | None = ...,
459458
encoding_errors: str | None = ...,
460-
engine: JSONEngine | None = None,
459+
engine: JSONEngine = ...,
461460
lines: bool = ...,
462461
chunksize: None = ...,
463462
compression: CompressionOptions = ...,
@@ -486,7 +485,7 @@ def read_json(
486485
date_unit: str | None = None,
487486
encoding: str | None = None,
488487
encoding_errors: str | None = "strict",
489-
engine: JSONEngine | None = None,
488+
engine: JSONEngine = "ujson",
490489
lines: bool = False,
491490
chunksize: int | None = None,
492491
compression: CompressionOptions = "infer",
@@ -615,7 +614,7 @@ def read_json(
615614
616615
.. versionadded:: 1.3.0
617616
618-
engine : {{'ujson', 'pyarrow'}}
617+
engine : {{'ujson', 'pyarrow'}}, default "ujson"
619618
Parser engine to use.
620619
621620
lines : bool, default False
@@ -792,13 +791,13 @@ def __init__(
792791
precise_float: bool,
793792
date_unit,
794793
encoding,
795-
engine,
796794
lines: bool,
797795
chunksize: int | None,
798796
compression: CompressionOptions,
799797
nrows: int | None,
800798
storage_options: StorageOptions = None,
801799
encoding_errors: str | None = "strict",
800+
engine: JSONEngine = "ujson",
802801
) -> None:
803802

804803
self.orient = orient
@@ -829,33 +828,45 @@ def __init__(
829828
self.nrows = validate_integer("nrows", self.nrows, 0)
830829
if not self.lines:
831830
raise ValueError("nrows can only be passed if lines=True")
831+
if self.engine == "pyarrow":
832+
if not self.lines:
833+
raise ValueError(
834+
"currently pyarrow engine only supports "
835+
"the line-delimited JSON format"
836+
)
832837

833-
if engine is not None:
838+
if self.engine == "pyarrow":
834839
self._engine = self._make_engine(filepath_or_buffer, self.engine)
835-
else:
840+
if self.engine == "ujson":
836841
data = self._get_data_from_filepath(filepath_or_buffer)
837842
self.data = self._preprocess_data(data)
838843

839844
def _make_engine(
840845
self,
841846
filepath_or_buffer: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
842-
engine: JSONEngine,
843-
) -> ParserBase:
844-
845-
mapping: dict[str, type[ParserBase]] = {
846-
"pyarrow": ArrowParserWrapper,
847-
"ujson": ...,
848-
}
849-
850-
if engine not in mapping:
851-
raise ValueError(
852-
f"Unknown engine: {engine} (valid options are {mapping.keys()})"
853-
)
847+
engine: JSONEngine = "pyarrow",
848+
) -> ArrowJsonParserWrapper:
854849

855850
if not isinstance(filepath_or_buffer, list):
856-
...
851+
is_text = False
852+
mode = "rb"
853+
self.handles = get_handle(
854+
filepath_or_buffer,
855+
mode=mode,
856+
encoding=self.encoding,
857+
is_text=is_text,
858+
compression=self.compression,
859+
storage_options=self.storage_options,
860+
errors=self.encoding_errors,
861+
)
862+
filepath_or_buffer = self.handles.handle
857863

858-
return mapping[engine](filepath_or_buffer)
864+
try:
865+
return ArrowJsonParserWrapper(filepath_or_buffer)
866+
except Exception:
867+
if self.handles is not None:
868+
self.handles.close()
869+
raise
859870

860871
def _preprocess_data(self, data):
861872
"""
@@ -939,20 +950,23 @@ def read(self) -> DataFrame | Series:
939950
Read the whole JSON input into a pandas object.
940951
"""
941952
obj: DataFrame | Series
942-
if self.lines:
943-
if self.chunksize:
944-
obj = concat(self)
945-
elif self.nrows:
946-
lines = list(islice(self.data, self.nrows))
947-
lines_json = self._combine_lines(lines)
948-
obj = self._get_object_parser(lines_json)
953+
if self.engine == "pyarrow":
954+
obj = self._engine.read()
955+
if self.engine == "ujson":
956+
if self.lines:
957+
if self.chunksize:
958+
obj = concat(self)
959+
elif self.nrows:
960+
lines = list(islice(self.data, self.nrows))
961+
lines_json = self._combine_lines(lines)
962+
obj = self._get_object_parser(lines_json)
963+
else:
964+
data = ensure_str(self.data)
965+
data_lines = data.split("\n")
966+
obj = self._get_object_parser(self._combine_lines(data_lines))
949967
else:
950-
data = ensure_str(self.data)
951-
data_lines = data.split("\n")
952-
obj = self._get_object_parser(self._combine_lines(data_lines))
953-
else:
954-
obj = self._get_object_parser(self.data)
955-
self.close()
968+
obj = self._get_object_parser(self.data)
969+
self.close()
956970
return obj
957971

958972
def _get_object_parser(self, json) -> DataFrame | Series:
+100
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
5+
from pandas._typing import ReadBuffer
6+
from pandas.compat._optional import import_optional_dependency
7+
8+
from pandas.core.dtypes.inference import is_integer
9+
10+
if TYPE_CHECKING:
11+
from pandas import DataFrame
12+
13+
14+
class ArrowJsonParserWrapper:
15+
"""
16+
Wrapper for the pyarrow engine for read_json()
17+
"""
18+
19+
def __init__(self, src: ReadBuffer[bytes]) -> None:
20+
super().__init__()
21+
self.src = src
22+
23+
def _parse_kwd(self) -> None:
24+
"""
25+
Validates keywords before passing to pyarrow
26+
"""
27+
...
28+
29+
def _get_pyarrow_options(self) -> None:
30+
...
31+
32+
def read(self) -> DataFrame:
33+
"""
34+
Reads the contents of a JSON file into a DataFrame and
35+
processes it according to the kwargs passed in the
36+
constructor.
37+
38+
Returns
39+
-------
40+
DataFrame
41+
The DataFrame created from the JSON file.
42+
"""
43+
pyarrow_json = import_optional_dependency("pyarrow.json")
44+
table = pyarrow_json.read_json(self.src)
45+
46+
frame = table.to_pandas()
47+
return frame
48+
49+
def _finalize_output(self, frame: DataFrame) -> DataFrame:
50+
"""
51+
Processes data read in based on kwargs.
52+
53+
Parameters
54+
----------
55+
frame: DataFrame
56+
The DataFrame to process.
57+
58+
Returns
59+
-------
60+
DataFrame
61+
The processed DataFrame.
62+
"""
63+
num_cols = len(frame.columns)
64+
multi_index_named = True
65+
if self.header is None:
66+
if self.names is None:
67+
if self.prefix is not None:
68+
self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
69+
elif self.header is None:
70+
self.names = range(num_cols)
71+
if len(self.names) != num_cols:
72+
# usecols is passed through to pyarrow, we only handle index col here
73+
# The only way self.names is not the same length as number of cols is
74+
# if we have int index_col. We should just pad the names(they will get
75+
# removed anyways) to expected length then.
76+
self.names = list(range(num_cols - len(self.names))) + self.names
77+
multi_index_named = False
78+
frame.columns = self.names
79+
# we only need the frame not the names
80+
frame.columns, frame = self._do_date_conversions(frame.columns, frame)
81+
if self.index_col is not None:
82+
for i, item in enumerate(self.index_col):
83+
if is_integer(item):
84+
self.index_col[i] = frame.columns[item]
85+
else:
86+
# String case
87+
if item not in frame.columns:
88+
raise ValueError(f"Index {item} invalid")
89+
frame.set_index(self.index_col, drop=True, inplace=True)
90+
# Clear names if headerless and no name given
91+
if self.header is None and not multi_index_named:
92+
frame.index.names = [None] * len(frame.index.names)
93+
94+
if self.kwds.get("dtype") is not None:
95+
try:
96+
frame = frame.astype(self.kwds.get("dtype"))
97+
except TypeError as e:
98+
# GH#44901 reraise to keep api consistent
99+
raise ValueError(e)
100+
return frame

pandas/tests/io/json/conftest.py

+8
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,11 @@ def orient(request):
77
Fixture for orients excluding the table format.
88
"""
99
return request.param
10+
11+
12+
@pytest.fixture
13+
def json_dir_path(datapath):
14+
"""
15+
The directory path to the data files needed for parser tests.
16+
"""
17+
return datapath("io", "json", "data")

pandas/tests/io/json/test_readlines.py

+32
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from io import StringIO
2+
import os
23
from pathlib import Path
34
from typing import Iterator
45

@@ -27,6 +28,37 @@ def test_read_jsonl():
2728
tm.assert_frame_equal(result, expected)
2829

2930

31+
def test_read_jsonl_engine_pyarrow(json_dir_path):
32+
# '{"a": 1, "b": 2}\n{"a": 3, "b": 4}\n{"a": 5, "b": 6}'
33+
34+
result = read_json(
35+
os.path.join(json_dir_path, "line_delimited.json"),
36+
lines=True,
37+
engine="pyarrow",
38+
)
39+
expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
40+
tm.assert_frame_equal(result, expected)
41+
42+
43+
@pytest.mark.xfail
44+
def test_read_jsonl_engine_pyarrow_lines_false(json_dir_path):
45+
result = read_json(
46+
os.path.join(json_dir_path, "line_delimited.json"),
47+
engine="pyarrow",
48+
)
49+
expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
50+
tm.assert_frame_equal(result, expected)
51+
52+
53+
@pytest.mark.xfail
54+
def test_read_jsonl_engine_pyarrow_json_string():
55+
result = read_json(
56+
'{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}', engine="pyarrow"
57+
)
58+
expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
59+
tm.assert_frame_equal(result, expected)
60+
61+
3062
def test_read_datetime():
3163
# GH33787
3264
df = DataFrame(

0 commit comments

Comments
 (0)