31
31
DtypeArg ,
32
32
FilePath ,
33
33
IndexLabel ,
34
+ JSONEngine ,
34
35
JSONSerializable ,
35
36
ReadBuffer ,
36
37
StorageOptions ,
69
70
build_table_schema ,
70
71
parse_table_schema ,
71
72
)
73
+ from pandas .io .json .arrow_json_parser_wrapper import ArrowJsonParserWrapper
72
74
from pandas .io .parsers .readers import validate_integer
73
75
74
76
if TYPE_CHECKING :
@@ -389,6 +391,7 @@ def read_json(
389
391
date_unit : str | None = ...,
390
392
encoding : str | None = ...,
391
393
encoding_errors : str | None = ...,
394
+ engine : JSONEngine = ...,
392
395
lines : bool = ...,
393
396
chunksize : int ,
394
397
compression : CompressionOptions = ...,
@@ -417,6 +420,7 @@ def read_json(
417
420
compression : CompressionOptions = ...,
418
421
nrows : int | None = ...,
419
422
storage_options : StorageOptions = ...,
423
+ engine : JSONEngine = ...,
420
424
) -> JsonReader [Literal ["series" ]]:
421
425
...
422
426
@@ -440,6 +444,7 @@ def read_json(
440
444
compression : CompressionOptions = ...,
441
445
nrows : int | None = ...,
442
446
storage_options : StorageOptions = ...,
447
+ engine : JSONEngine = ...,
443
448
) -> Series :
444
449
...
445
450
@@ -463,6 +468,7 @@ def read_json(
463
468
compression : CompressionOptions = ...,
464
469
nrows : int | None = ...,
465
470
storage_options : StorageOptions = ...,
471
+ engine : JSONEngine = ...,
466
472
) -> DataFrame :
467
473
...
468
474
@@ -489,6 +495,7 @@ def read_json(
489
495
compression : CompressionOptions = "infer" ,
490
496
nrows : int | None = None ,
491
497
storage_options : StorageOptions = None ,
498
+ engine : JSONEngine = "ujson" ,
492
499
) -> DataFrame | Series | JsonReader :
493
500
"""
494
501
Convert a JSON string to pandas object.
@@ -597,6 +604,9 @@ def read_json(
597
604
598
605
.. versionadded:: 1.3.0
599
606
607
+ engine : {{'ujson', 'pyarrow'}}, default "ujson"
608
+ Parser engine to use.
609
+
600
610
lines : bool, default False
601
611
Read the file as a json object per line.
602
612
@@ -738,6 +748,7 @@ def read_json(
738
748
nrows = nrows ,
739
749
storage_options = storage_options ,
740
750
encoding_errors = encoding_errors ,
751
+ engine = engine ,
741
752
)
742
753
743
754
if chunksize :
@@ -773,6 +784,7 @@ def __init__(
773
784
nrows : int | None ,
774
785
storage_options : StorageOptions = None ,
775
786
encoding_errors : str | None = "strict" ,
787
+ engine : JSONEngine = "ujson" ,
776
788
) -> None :
777
789
778
790
self .orient = orient
@@ -784,6 +796,7 @@ def __init__(
784
796
self .precise_float = precise_float
785
797
self .date_unit = date_unit
786
798
self .encoding = encoding
799
+ self .engine = engine
787
800
self .compression = compression
788
801
self .storage_options = storage_options
789
802
self .lines = lines
@@ -801,9 +814,48 @@ def __init__(
801
814
self .nrows = validate_integer ("nrows" , self .nrows , 0 )
802
815
if not self .lines :
803
816
raise ValueError ("nrows can only be passed if lines=True" )
817
+ if self .engine == "pyarrow" :
818
+ if not self .lines :
819
+ raise ValueError (
820
+ "currently pyarrow engine only supports "
821
+ "the line-delimited JSON format"
822
+ )
823
+ if self .engine not in ["pyarrow" , "ujson" ]:
824
+ raise ValueError (
825
+ f"The engine type { self .engine } is currently not supported."
826
+ )
827
+
828
+ if self .engine == "pyarrow" :
829
+ self ._engine = self ._make_engine (filepath_or_buffer )
830
+ if self .engine == "ujson" :
831
+ data = self ._get_data_from_filepath (filepath_or_buffer )
832
+ self .data = self ._preprocess_data (data )
833
+
834
    def _make_engine(
        self,
        filepath_or_buffer: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
    ) -> ArrowJsonParserWrapper:
        """
        Open the input and wrap it in an ``ArrowJsonParserWrapper`` for the
        pyarrow engine.

        Parameters
        ----------
        filepath_or_buffer : FilePath or ReadBuffer
            Path, URL, or open buffer to read from. A list is passed through
            to the wrapper unchanged; anything else is first resolved to a
            binary handle via ``get_handle``.

        Returns
        -------
        ArrowJsonParserWrapper

        Raises
        ------
        Exception
            Whatever ``ArrowJsonParserWrapper`` raises; any handle opened
            here is closed before re-raising.
        """

        if not isinstance(filepath_or_buffer, list):
            # pyarrow consumes raw bytes, so open in binary mode ("rb")
            # rather than decoding to text here.
            is_text = False
            mode = "rb"
            # Keep the handle on self so close()/context-exit can release it.
            self.handles = get_handle(
                self._get_data_from_filepath(filepath_or_buffer),
                mode=mode,
                encoding=self.encoding,
                is_text=is_text,
                compression=self.compression,
                storage_options=self.storage_options,
                errors=self.encoding_errors,
            )
            filepath_or_buffer = self.handles.handle

        try:
            return ArrowJsonParserWrapper(filepath_or_buffer)
        except Exception:
            # Don't leak the file handle if wrapper construction fails.
            # NOTE(review): assumes ``self.handles`` is initialized (e.g. to
            # None) before this method runs — confirm in ``__init__``, which
            # is outside this view.
            if self.handles is not None:
                self.handles.close()
            raise
807
859
808
860
def _preprocess_data (self , data ):
809
861
"""
@@ -888,19 +940,22 @@ def read(self) -> DataFrame | Series:
888
940
"""
889
941
obj : DataFrame | Series
890
942
with self :
891
- if self .lines :
892
- if self .chunksize :
893
- obj = concat (self )
894
- elif self .nrows :
895
- lines = list (islice (self .data , self .nrows ))
896
- lines_json = self ._combine_lines (lines )
897
- obj = self ._get_object_parser (lines_json )
943
+ if self .engine == "pyarrow" :
944
+ obj = self ._engine .read ()
945
+ if self .engine == "ujson" :
946
+ if self .lines :
947
+ if self .chunksize :
948
+ obj = concat (self )
949
+ elif self .nrows :
950
+ lines = list (islice (self .data , self .nrows ))
951
+ lines_json = self ._combine_lines (lines )
952
+ obj = self ._get_object_parser (lines_json )
953
+ else :
954
+ data = ensure_str (self .data )
955
+ data_lines = data .split ("\n " )
956
+ obj = self ._get_object_parser (self ._combine_lines (data_lines ))
898
957
else :
899
- data = ensure_str (self .data )
900
- data_lines = data .split ("\n " )
901
- obj = self ._get_object_parser (self ._combine_lines (data_lines ))
902
- else :
903
- obj = self ._get_object_parser (self .data )
958
+ obj = self ._get_object_parser (self .data )
904
959
return obj
905
960
906
961
def _get_object_parser (self , json ) -> DataFrame | Series :
0 commit comments