import numpy as np

-from pandas._config import using_nullable_dtypes
+from pandas._config import (
+    get_option,
+    using_nullable_dtypes,
+)

from pandas._libs import lib
from pandas._libs.json import (
@@ -34,11 +37,13 @@
    DtypeArg,
    FilePath,
    IndexLabel,
+    JSONEngine,
    JSONSerializable,
    ReadBuffer,
    StorageOptions,
    WriteBuffer,
)
+from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc

@@ -401,6 +406,7 @@ def read_json(
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    use_nullable_dtypes: bool = ...,
+    engine: JSONEngine = ...,
) -> JsonReader[Literal["frame"]]:
    ...

@@ -425,6 +431,7 @@ def read_json(
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    use_nullable_dtypes: bool = ...,
+    engine: JSONEngine = ...,
) -> JsonReader[Literal["series"]]:
    ...

@@ -449,6 +456,7 @@ def read_json(
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    use_nullable_dtypes: bool = ...,
+    engine: JSONEngine = ...,
) -> Series:
    ...

@@ -473,6 +481,7 @@ def read_json(
    nrows: int | None = ...,
    storage_options: StorageOptions = ...,
    use_nullable_dtypes: bool = ...,
+    engine: JSONEngine = ...,
) -> DataFrame:
    ...

@@ -500,6 +509,7 @@ def read_json(
    nrows: int | None = None,
    storage_options: StorageOptions = None,
    use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
+    engine: JSONEngine = "ujson",
) -> DataFrame | Series | JsonReader:
    """
    Convert a JSON string to pandas object.
@@ -653,6 +663,12 @@ def read_json(

        .. versionadded:: 2.0

+    engine : {{"ujson", "pyarrow"}}, default "ujson"
+        Parser engine to use. The ``"pyarrow"`` engine is only available when
+        ``lines=True``.
+
+        .. versionadded:: 2.0
+
    Returns
    -------
    Series or DataFrame
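
A minimal usage sketch of the keyword documented above ("data.jsonl" is a placeholder path, and pyarrow must be installed for the second call):

import pandas as pd

# Default engine (ujson) works with and without lines=True.
df = pd.read_json("data.jsonl", lines=True)

# The pyarrow engine only accepts line-delimited JSON, so lines=True is required.
df_pa = pd.read_json("data.jsonl", lines=True, engine="pyarrow")
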
@@ -771,6 +787,7 @@ def read_json(
        storage_options=storage_options,
        encoding_errors=encoding_errors,
        use_nullable_dtypes=use_nullable_dtypes,
+        engine=engine,
    )

    if chunksize:
@@ -807,6 +824,7 @@ def __init__(
        storage_options: StorageOptions = None,
        encoding_errors: str | None = "strict",
        use_nullable_dtypes: bool = False,
+        engine: JSONEngine = "ujson",
    ) -> None:

        self.orient = orient
@@ -818,6 +836,7 @@ def __init__(
        self.precise_float = precise_float
        self.date_unit = date_unit
        self.encoding = encoding
+        self.engine = engine
        self.compression = compression
        self.storage_options = storage_options
        self.lines = lines
@@ -828,17 +847,32 @@ def __init__(
        self.handles: IOHandles[str] | None = None
        self.use_nullable_dtypes = use_nullable_dtypes

+        if self.engine not in {"pyarrow", "ujson"}:
+            raise ValueError(
+                f"The engine type {self.engine} is currently not supported."
+            )
        if self.chunksize is not None:
            self.chunksize = validate_integer("chunksize", self.chunksize, 1)
            if not self.lines:
                raise ValueError("chunksize can only be passed if lines=True")
+            if self.engine == "pyarrow":
+                raise ValueError(
+                    "currently pyarrow engine doesn't support chunksize parameter"
+                )
        if self.nrows is not None:
            self.nrows = validate_integer("nrows", self.nrows, 0)
            if not self.lines:
                raise ValueError("nrows can only be passed if lines=True")
-
-        data = self._get_data_from_filepath(filepath_or_buffer)
-        self.data = self._preprocess_data(data)
+        if self.engine == "pyarrow":
+            if not self.lines:
+                raise ValueError(
+                    "currently pyarrow engine only supports "
+                    "the line-delimited JSON format"
+                )
+            self.data = filepath_or_buffer
+        elif self.engine == "ujson":
+            data = self._get_data_from_filepath(filepath_or_buffer)
+            self.data = self._preprocess_data(data)

    def _preprocess_data(self, data):
        """
@@ -923,23 +957,45 @@ def read(self) -> DataFrame | Series:
        """
        obj: DataFrame | Series
        with self:
-            if self.lines:
-                if self.chunksize:
-                    obj = concat(self)
-                elif self.nrows:
-                    lines = list(islice(self.data, self.nrows))
-                    lines_json = self._combine_lines(lines)
-                    obj = self._get_object_parser(lines_json)
+            if self.engine == "pyarrow":
+                pyarrow_json = import_optional_dependency("pyarrow.json")
+                pa_table = pyarrow_json.read_json(self.data)
+                if self.use_nullable_dtypes:
+                    if get_option("mode.dtype_backend") == "pyarrow":
+                        from pandas.arrays import ArrowExtensionArray
+
+                        return DataFrame(
+                            {
+                                col_name: ArrowExtensionArray(pa_col)
+                                for col_name, pa_col in zip(
+                                    pa_table.column_names, pa_table.itercolumns()
+                                )
+                            }
+                        )
+                    elif get_option("mode.dtype_backend") == "pandas":
+                        from pandas.io._util import _arrow_dtype_mapping
+
+                        mapping = _arrow_dtype_mapping()
+                        return pa_table.to_pandas(types_mapper=mapping.get)
+                return pa_table.to_pandas()
+            elif self.engine == "ujson":
+                if self.lines:
+                    if self.chunksize:
+                        obj = concat(self)
+                    elif self.nrows:
+                        lines = list(islice(self.data, self.nrows))
+                        lines_json = self._combine_lines(lines)
+                        obj = self._get_object_parser(lines_json)
+                    else:
+                        data = ensure_str(self.data)
+                        data_lines = data.split("\n")
+                        obj = self._get_object_parser(self._combine_lines(data_lines))
                else:
-                    data = ensure_str(self.data)
-                    data_lines = data.split("\n")
-                    obj = self._get_object_parser(self._combine_lines(data_lines))
-            else:
-                obj = self._get_object_parser(self.data)
-            if self.use_nullable_dtypes:
-                return obj.convert_dtypes(infer_objects=False)
-            else:
-                return obj
+                    obj = self._get_object_parser(self.data)
+                if self.use_nullable_dtypes:
+                    return obj.convert_dtypes(infer_objects=False)
+                else:
+                    return obj

    def _get_object_parser(self, json) -> DataFrame | Series:
        """