30
30
31
31
@pd .api .extensions .register_extension_dtype
32
32
class JSONDtype (pd .api .extensions .ExtensionDtype ):
33
- """Extension dtype for JSON data."""
33
+ """Extension dtype for BigQuery JSON data."""
34
34
35
35
name = "dbjson"
36
36
37
37
@property
def na_value(self) -> "pd.api.typing.NAType":
    """Return the missing-value sentinel used for this dtype.

    Returns
    -------
    pandas.api.typing.NAType
        Always the ``pd.NA`` singleton.
    """
    # The annotation names the *type* of ``pd.NA``; ``pd.NA`` itself is a
    # value and is not a valid annotation for static type checkers.
    return pd.NA
40
41
41
42
@property
def type(self) -> type[dict]:
    """Return the scalar type reported for elements of this dtype.

    Returns
    -------
    type
        Always ``dict``.

    Notes
    -----
    NOTE(review): deserialized JSON elements may presumably also be lists,
    strings or numbers; ``dict`` appears to be the representative scalar
    type only -- confirm against the array's ``__getitem__``.
    """
    return dict
44
46
45
47
@property
@@ -62,7 +64,9 @@ def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray:
62
64
63
65
64
66
class JSONArray (ArrowExtensionArray ):
65
- """Extension array containing JSON data."""
67
+ """Extension array that handles BigQuery JSON data, leveraging a string-based
68
+ pyarrow array for storage. It enables seamless conversion to JSON objects when
69
+ accessing individual elements."""
66
70
67
71
_dtype = JSONDtype ()
68
72
@@ -88,18 +92,7 @@ def __init__(self, values, dtype=None, copy=False) -> None:
88
92
def _box_pa (
89
93
cls , value , pa_type : pa .DataType | None = None
90
94
) -> pa .Array | pa .ChunkedArray | pa .Scalar :
91
- """
92
- Box value into a pyarrow Array, ChunkedArray or Scalar.
93
-
94
- Parameters
95
- ----------
96
- value : any
97
- pa_type : pa.DataType | None
98
-
99
- Returns
100
- -------
101
- pa.Array or pa.ChunkedArray or pa.Scalar
102
- """
95
+ """Box value into a pyarrow Array, ChunkedArray or Scalar."""
103
96
if isinstance (value , pa .Scalar ) or not (
104
97
is_list_like (value ) and not is_dict_like (value )
105
98
):
@@ -108,18 +101,7 @@ def _box_pa(
108
101
109
102
@classmethod
110
103
def _box_pa_scalar (cls , value , pa_type : pa .DataType | None = None ) -> pa .Scalar :
111
- """
112
- Box value into a pyarrow Scalar.
113
-
114
- Parameters
115
- ----------
116
- value : any
117
- pa_type : pa.DataType | None
118
-
119
- Returns
120
- -------
121
- pa.Scalar
122
- """
104
+ """Box value into a pyarrow Scalar."""
123
105
value = JSONArray ._seralizate_json (value )
124
106
pa_scalar = super ()._box_pa_scalar (value , pa_type )
125
107
if pa .types .is_string (pa_scalar .type ) and pa_type is None :
@@ -130,18 +112,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
130
112
def _box_pa_array (
131
113
cls , value , pa_type : pa .DataType | None = None , copy : bool = False
132
114
) -> pa .Array | pa .ChunkedArray :
133
- """
134
- Box value into a pyarrow Array or ChunkedArray.
135
-
136
- Parameters
137
- ----------
138
- value : Sequence
139
- pa_type : pa.DataType | None
140
-
141
- Returns
142
- -------
143
- pa.Array or pa.ChunkedArray
144
- """
115
+ """Box value into a pyarrow Array or ChunkedArray."""
145
116
if (
146
117
not isinstance (value , cls )
147
118
and not isinstance (value , (pa .Array , pa .ChunkedArray ))
@@ -155,18 +126,7 @@ def _box_pa_array(
155
126
156
127
@classmethod
157
128
def _from_sequence (cls , scalars , * , dtype = None , copy = False ):
158
- # TODO: check _from_arrow APIs etc.
159
- # from pandas.core.arrays.masked import BaseMaskedArray
160
-
161
- # if isinstance(scalars, BaseMaskedArray):
162
- # # avoid costly conversion to object dtype in ensure_string_array and
163
- # # numerical issues with Float32Dtype
164
- # na_values = scalars._mask
165
- # result = scalars._data
166
- # # result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
167
- # return cls(pa.array(result, mask=na_values, type=pa.large_string()))
168
- # elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
169
- # return cls(pc.cast(scalars, pa.large_string()))
129
+ """Construct a new ExtensionArray from a sequence of scalars."""
170
130
result = []
171
131
for scalar in scalars :
172
132
result .append (JSONArray ._seralizate_json (scalar ))
@@ -176,10 +136,12 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False):
176
136
def _from_sequence_of_strings(
    cls, strings, *, dtype: ExtensionDtype, copy: bool = False
) -> JSONArray:
    """Build an array from a sequence of strings.

    Strings need no special handling here, so construction is delegated
    unchanged to ``cls._from_sequence`` with the same ``dtype`` and ``copy``.
    """
    build = cls._from_sequence
    return build(strings, dtype=dtype, copy=copy)
180
141
181
142
@staticmethod
182
143
def _seralizate_json (value ):
144
+ """A static method that converts a JSON value into a string representation."""
183
145
if isinstance (value , str ) or pd .isna (value ):
184
146
return value
185
147
else :
@@ -189,6 +151,7 @@ def _seralizate_json(value):
189
151
190
152
@staticmethod
191
153
def _deserialize_json (value ):
154
+ """A static method that converts a JSON string back into its original value."""
192
155
if not pd .isna (value ):
193
156
return json .loads (value )
194
157
else :
@@ -200,40 +163,24 @@ def dtype(self) -> JSONDtype:
200
163
return self ._dtype
201
164
202
165
def __contains__(self, key) -> bool:
    """Implement ``key in self``.

    ``key`` is first passed through ``JSONArray._seralizate_json`` and the
    membership test itself is delegated to the parent class.
    """
    serialized_key = JSONArray._seralizate_json(key)
    return super().__contains__(serialized_key)
204
168
205
169
def insert(self, loc: int, item) -> JSONArray:
    """Return a new array with ``item`` inserted at position ``loc``.

    ``item`` is passed through ``JSONArray._seralizate_json`` before the
    insertion is delegated to the parent class; handling of negative
    ``loc`` values is whatever the parent implementation provides.
    """
    return super().insert(loc, JSONArray._seralizate_json(item))
208
176
209
177
@classmethod
def _from_factorized(cls, values, original):
    """Rebuild an array from factorized ``values``.

    The result is constructed with ``cls._from_sequence`` using the dtype
    of ``original``.
    """
    source_dtype = original.dtype
    return cls._from_sequence(values, dtype=source_dtype)
212
181
213
182
def __getitem__ (self , item ):
214
- """Select a subset of self.
215
-
216
- Parameters
217
- ----------
218
- item : int, slice, or ndarray
219
- * int: The position in 'self' to get.
220
- * slice: A slice object, where 'start', 'stop', and 'step' are
221
- integers or None
222
- * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'
223
-
224
- Returns
225
- -------
226
- item : scalar or ExtensionArray
227
-
228
- Notes
229
- -----
230
- For scalar ``item``, return a scalar value suitable for the array's
231
- type. This should be an instance of ``self.dtype.type``.
232
- For slice ``key``, return an instance of ``ExtensionArray``, even
233
- if the slice is length 0 or 1.
234
- For a boolean mask, return an instance of ``ExtensionArray``, filtered
235
- to the values where ``item`` is True.
236
- """
183
+ """Select a subset of self."""
237
184
item = check_array_indexer (self , item )
238
185
239
186
if isinstance (item , np .ndarray ):
@@ -283,37 +230,17 @@ def __getitem__(self, item):
283
230
return scalar
284
231
285
232
def __iter__(self):
    """Yield the elements of the array one at a time.

    Each stored pyarrow scalar is converted with ``as_py()`` and passed to
    ``JSONArray._deserialize_json``; a ``None`` result is replaced by the
    dtype's ``na_value``.
    """
    for stored in self._pa_array:
        decoded = JSONArray._deserialize_json(stored.as_py())
        # na_value is looked up lazily, only when an element is missing.
        yield self._dtype.na_value if decoded is None else decoded
295
240
296
- @classmethod
297
- def _result_converter (cls , values , na = None ):
298
- return pd .BooleanDtype ().__from_arrow__ (values )
299
-
300
241
@classmethod
def _concat_same_type(cls, to_concat) -> JSONArray:
    """Concatenate several JSONArray objects into one.

    Parameters
    ----------
    to_concat : sequence of JSONArray
        Arrays whose underlying pyarrow chunks are gathered in order.

    Returns
    -------
    JSONArray
        New array wrapping a ``large_string`` chunked array built from all
        collected chunks.
    """
    pieces = []
    for member in to_concat:
        pieces.extend(member._pa_array.iterchunks())
    combined = pa.chunked_array(pieces, type=pa.large_string())
    return cls(combined)
316
-
317
- def _pad_or_backfill (self , * , method , limit = None , copy = True ):
318
- # GH#56616 - test EA method without limit_area argument
319
- return super ()._pad_or_backfill (method = method , limit = limit , copy = copy )
0 commit comments