 import numpy as np
 import pandas as pd
-from pandas.core.arrays.arrow.array import ArrowExtensionArray
-from pandas.core.arrays.masked import BaseMaskedArray
-from pandas.core.dtypes.common import is_dict_like, is_integer, is_list_like, is_scalar
-from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses
+import pandas.arrays as arrays
+import pandas.core.dtypes.common as common
+import pandas.core.indexers as indexers
 import pyarrow as pa
-import pyarrow.compute as pc


 @pd.api.extensions.register_extension_dtype
@@ -63,78 +60,81 @@ def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray:
         return JSONArray(array)


-class JSONArray(ArrowExtensionArray):
+class JSONArray(arrays.ArrowExtensionArray):
     """Extension array that handles BigQuery JSON data, leveraging a string-based
     pyarrow array for storage. It enables seamless conversion to JSON objects when
     accessing individual elements."""

     _dtype = JSONDtype()

     def __init__(self, values, dtype=None, copy=False) -> None:
-        if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string(
-            values.type
-        ):
-            values = pc.cast(values, pa.large_string())
-
-        super().__init__(values)
         self._dtype = JSONDtype()
-
-        if not pa.types.is_large_string(self._pa_array.type) and not (
-            pa.types.is_dictionary(self._pa_array.type)
-            and pa.types.is_large_string(self._pa_array.type.value_type)
-        ):
-            raise ValueError(
-                "ArrowStringArray requires a PyArrow (chunked) array of "
-                "large_string type"
-            )
+        if isinstance(values, pa.Array):
+            self._pa_array = pa.chunked_array([values])
+        elif isinstance(values, pa.ChunkedArray):
+            self._pa_array = values
+        else:
+            raise ValueError(f"Unsupported type '{type(values)}' for JSONArray")

     @classmethod
     def _box_pa(
         cls, value, pa_type: pa.DataType | None = None
     ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
         """Box value into a pyarrow Array, ChunkedArray or Scalar."""
         if isinstance(value, pa.Scalar) or not (
-            is_list_like(value) and not is_dict_like(value)
+            common.is_list_like(value) and not common.is_dict_like(value)
         ):
             return cls._box_pa_scalar(value, pa_type)
         return cls._box_pa_array(value, pa_type)

     @classmethod
     def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
         """Box value into a pyarrow Scalar."""
-        value = JSONArray._seralizate_json(value)
-        pa_scalar = super()._box_pa_scalar(value, pa_type)
-        if pa.types.is_string(pa_scalar.type) and pa_type is None:
-            pa_scalar = pc.cast(pa_scalar, pa.large_string())
+        if isinstance(value, pa.Scalar):
+            pa_scalar = value
+        if pd.isna(value):
+            pa_scalar = pa.scalar(None, type=pa_type)
+        else:
+            value = JSONArray._serialize_json(value)
+            pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)
+
+        if pa_type is not None and pa_scalar.type != pa_type:
+            pa_scalar = pa_scalar.cast(pa_type)
         return pa_scalar

     @classmethod
     def _box_pa_array(
         cls, value, pa_type: pa.DataType | None = None, copy: bool = False
     ) -> pa.Array | pa.ChunkedArray:
         """Box value into a pyarrow Array or ChunkedArray."""
-        if (
-            not isinstance(value, cls)
-            and not isinstance(value, (pa.Array, pa.ChunkedArray))
-            and not isinstance(value, BaseMaskedArray)
-        ):
-            value = [JSONArray._seralizate_json(x) for x in value]
-        pa_array = super()._box_pa_array(value, pa_type)
-        if pa.types.is_string(pa_array.type) and pa_type is None:
-            pa_array = pc.cast(pa_array, pa.large_string())
+        if isinstance(value, cls):
+            pa_array = value._pa_array
+        elif isinstance(value, (pa.Array, pa.ChunkedArray)):
+            pa_array = value
+        else:
+            try:
+                value = [JSONArray._serialize_json(x) for x in value]
+                pa_array = pa.array(value, type=pa_type, from_pandas=True)
+            except (pa.ArrowInvalid, pa.ArrowTypeError):
+                # GH50430: let pyarrow infer type, then cast
+                pa_array = pa.array(value, from_pandas=True)
+
+        if pa_type is not None and pa_array.type != pa_type:
+            pa_array = pa_array.cast(pa_type)
+
         return pa_array

     @classmethod
     def _from_sequence(cls, scalars, *, dtype=None, copy=False):
         """Construct a new ExtensionArray from a sequence of scalars."""
         result = []
         for scalar in scalars:
-            result.append(JSONArray._seralizate_json(scalar))
+            result.append(JSONArray._serialize_json(scalar))
         return cls(pa.array(result, type=pa.large_string(), from_pandas=True))

     @classmethod
     def _from_sequence_of_strings(
-        cls, strings, *, dtype: ExtensionDtype, copy: bool = False
+        cls, strings, *, dtype, copy: bool = False
     ) -> JSONArray:
         """Construct a new ExtensionArray from a sequence of strings."""
         return cls._from_sequence(strings, dtype=dtype, copy=copy)
@@ -152,7 +152,7 @@ def _from_factorized(cls, values, original):
         return cls._from_sequence(values, dtype=original.dtype)

     @staticmethod
-    def _seralizate_json(value):
+    def _serialize_json(value):
         """A static method that converts a JSON value into a string representation."""
         if isinstance(value, str) or pd.isna(value):
             return value
@@ -176,19 +176,19 @@ def dtype(self) -> JSONDtype:

     def __contains__(self, key) -> bool:
         """Return for `item in self`."""
-        return super().__contains__(JSONArray._seralizate_json(key))
+        return super().__contains__(JSONArray._serialize_json(key))

     def insert(self, loc: int, item) -> JSONArray:
         """
         Make new ExtensionArray inserting new item at location. Follows Python
         list.append semantics for negative values.
         """
-        val = JSONArray._seralizate_json(item)
+        val = JSONArray._serialize_json(item)
         return super().insert(loc, val)

     def __getitem__(self, item):
         """Select a subset of self."""
-        item = check_array_indexer(self, item)
+        item = indexers.check_array_indexer(self, item)

         if isinstance(item, np.ndarray):
             if not len(item):
@@ -203,9 +203,9 @@ def __getitem__(self, item):
                     "boolean arrays are valid indices."
                 )
         elif isinstance(item, tuple):
-            item = unpack_tuple_and_ellipses(item)
+            item = indexers.unpack_tuple_and_ellipses(item)

-        if is_scalar(item) and not is_integer(item):
+        if common.is_scalar(item) and not common.is_integer(item):
             # e.g. "foo" or 2.5
             # exception message copied from numpy
             raise IndexError(
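
Usage sketch (not part of the diff): a minimal example of how the `JSONDtype`/`JSONArray` pair defined above is typically consumed. It assumes both names are in scope (the import path for the shipping package is an assumption and is omitted here), and it relies only on behavior visible in the diff: the dtype is registered via `@pd.api.extensions.register_extension_dtype`, `_from_sequence` serializes scalars into a `large_string` pyarrow array, and the class docstring says element access converts back to JSON objects.

```python
# Minimal sketch, assuming JSONDtype and JSONArray from the module above are importable.
import pandas as pd

# Raw JSON strings plus a missing value; construction goes through
# JSONArray._from_sequence, which stores everything as large_string pyarrow data.
data = ['{"name": "x", "value": 1}', '{"items": [1, 2, 3]}', None]
s = pd.Series(data, dtype=JSONDtype())

print(s.dtype)  # the registered JSON extension dtype
print(s[1])     # expected: a deserialized JSON object, per the class docstring
```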