14
14
15
15
from __future__ import annotations
16
16
17
+ import json
17
18
import typing
18
19
19
20
import numpy as np
20
21
import pandas as pd
21
- from pandas ._libs import lib
22
22
from pandas .core .arrays .arrow .array import ArrowExtensionArray
23
- from pandas .core .arrays .numeric import NumericDtype
24
- from pandas .core .dtypes .common import is_integer , is_scalar , pandas_dtype
23
+ from pandas .core .arrays .masked import BaseMaskedArray
24
+ from pandas .core .dtypes .common import is_dict_like , is_integer , is_list_like , is_scalar
25
25
from pandas .core .dtypes .dtypes import ExtensionDtype
26
26
from pandas .core .indexers import check_array_indexer , unpack_tuple_and_ellipses
27
27
import pyarrow as pa
@@ -84,8 +84,43 @@ def __init__(self, values, dtype=None, copy=False) -> None:
84
84
"large_string type"
85
85
)
86
86
87
@classmethod
def _box_pa(
    cls, value, pa_type: pa.DataType | None = None
) -> pa.Array | pa.ChunkedArray | pa.Scalar:
    """
    Convert ``value`` into the matching pyarrow container.

    Parameters
    ----------
    value : any
        A ``pa.Scalar``, a dict-like, or any non-list-like object is boxed
        as a scalar; every other list-like is boxed as an array.
    pa_type : pa.DataType | None
        Forwarded unchanged to the scalar/array boxing helper.

    Returns
    -------
    pa.Array or pa.ChunkedArray or pa.Scalar
    """
    # An existing pyarrow scalar never needs the list-like inspection.
    if isinstance(value, pa.Scalar):
        return cls._box_pa_scalar(value, pa_type)

    # Dict-likes are list-like too, but here they are single JSON values.
    if is_list_like(value) and not is_dict_like(value):
        return cls._box_pa_array(value, pa_type)

    return cls._box_pa_scalar(value, pa_type)
108
+
87
109
@classmethod
88
110
def _box_pa_scalar (cls , value , pa_type : pa .DataType | None = None ) -> pa .Scalar :
111
+ """
112
+ Box value into a pyarrow Scalar.
113
+
114
+ Parameters
115
+ ----------
116
+ value : any
117
+ pa_type : pa.DataType | None
118
+
119
+ Returns
120
+ -------
121
+ pa.Scalar
122
+ """
123
+ value = JSONArray ._seralizate_json (value )
89
124
pa_scalar = super ()._box_pa_scalar (value , pa_type )
90
125
if pa .types .is_string (pa_scalar .type ) and pa_type is None :
91
126
pa_scalar = pc .cast (pa_scalar , pa .large_string ())
@@ -95,27 +130,46 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
95
130
def _box_pa_array(
    cls, value, pa_type: pa.DataType | None = None, copy: bool = False
) -> pa.Array | pa.ChunkedArray:
    """
    Convert a sequence into a pyarrow Array or ChunkedArray.

    Parameters
    ----------
    value : Sequence
        Instances of this class, pyarrow arrays and masked arrays are
        handed to the parent implementation as they are; any other input
        has each element serialized to JSON text first.
    pa_type : pa.DataType | None
        Forwarded to the parent implementation.
    copy : bool, default False
        Not forwarded to the parent implementation.

    Returns
    -------
    pa.Array or pa.ChunkedArray
    """
    already_boxed = (cls, pa.Array, pa.ChunkedArray, BaseMaskedArray)
    if not isinstance(value, already_boxed):
        value = [JSONArray._seralizate_json(element) for element in value]

    boxed = super()._box_pa_array(value, pa_type)

    # Without an explicit target type, plain strings are widened to
    # large_string.
    if pa.types.is_string(boxed.type) and pa_type is None:
        return pc.cast(boxed, pa.large_string())
    return boxed
102
155
103
156
@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
    """
    Build an array from an iterable of scalars, storing each as JSON text.

    Parameters
    ----------
    scalars : iterable
        Every element is passed through ``JSONArray._seralizate_json``.
    dtype : optional
        Not used by this implementation.
    copy : bool, default False
        Not used by this implementation.

    Returns
    -------
    An instance of ``cls`` wrapping a ``large_string`` pyarrow array.
    """
    # TODO: check _from_arrow APIs etc. BaseMaskedArray and pyarrow
    # Array/ChunkedArray inputs previously had dedicated conversion paths
    # here; they now go through the same per-element serialization.
    serialized = [JSONArray._seralizate_json(scalar) for scalar in scalars]
    return cls(pa.array(serialized, type=pa.large_string(), from_pandas=True))
120
174
121
175
@classmethod
@@ -124,30 +178,45 @@ def _from_sequence_of_strings(
124
178
) -> JSONArray :
125
179
return cls ._from_sequence (strings , dtype = dtype , copy = copy )
126
180
181
@staticmethod
def _seralizate_json(value):
    """
    Serialize ``value`` to JSON text.

    Parameters
    ----------
    value : any
        Strings are assumed to already be JSON text and are returned as is.
        Missing values (anything ``pd.isna`` reports as NA) are returned
        unchanged. Everything else is encoded with ``json.dumps``.

    Returns
    -------
    str or the original missing value
    """
    if isinstance(value, str):
        return value

    # ``pd.isna`` is elementwise for lists, so using its result in a
    # boolean context raises for e.g. ``[1, 2]`` and mistakes ``[None]``
    # for a missing value. JSON containers are never missing themselves,
    # so skip the NA check for them.
    if not isinstance(value, (dict, list, tuple)) and pd.isna(value):
        return value

    # `sort_keys=True` sorts dictionary keys before serialization, making
    # JSON comparisons deterministic.
    return json.dumps(value, sort_keys=True)
189
+
190
@staticmethod
def _deserialize_json(value):
    """
    Decode JSON text into a Python object.

    Missing values (as reported by ``pd.isna``) are returned unchanged;
    anything else is parsed with ``json.loads``.
    """
    return value if pd.isna(value) else json.loads(value)
196
+
127
197
@property
def dtype(self) -> JSONDtype:
    """Return the JSONDtype instance stored on this array."""
    return self._dtype
131
201
132
- def insert (self , loc : int , item ) -> JSONArray :
133
- if not isinstance (item , str ) and not pd .isna (item ):
134
- raise TypeError ("Scalar must be NA or str" )
135
- return super ().insert (loc , item )
202
def __contains__(self, key) -> bool:
    """
    Membership test performed on the serialized JSON form of ``key``.

    ``key`` is first passed through ``JSONArray._seralizate_json`` and the
    result is handed to the parent class's ``__contains__``.
    """
    serialized_key = JSONArray._seralizate_json(key)
    return super().__contains__(serialized_key)
204
+
205
+ # def __contains__(self, key) -> bool:
206
+ # # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
207
+ # if pd.isna(key) and key is not self.dtype.na_value:
208
+ # if self.dtype.kind == "f" and lib.is_float(key):
209
+ # return pc.any(pc.is_nan(self._pa_array)).as_py()
136
210
137
- def astype (self , dtype , copy : bool = True ):
138
- dtype = pandas_dtype (dtype )
211
+ # # e.g. date or timestamp types we do not allow None here to match pd.NA
212
+ # return False
213
+ # # TODO: maybe complex? object?
139
214
140
- if dtype == self .dtype :
141
- if copy :
142
- return self .copy ()
143
- return self
144
- elif isinstance (dtype , NumericDtype ):
145
- data = self ._pa_array .cast (pa .from_numpy_dtype (dtype .numpy_dtype ))
146
- return dtype .__from_arrow__ (data )
147
- elif isinstance (dtype , np .dtype ) and np .issubdtype (dtype , np .floating ):
148
- return self .to_numpy (dtype = dtype , na_value = np .nan )
215
+ # return bool(super().__contains__(key))
149
216
150
- return super ().astype (dtype , copy = copy )
217
def insert(self, loc: int, item) -> JSONArray:
    """
    Insert ``item`` at position ``loc``.

    The item is serialized with ``JSONArray._seralizate_json`` before
    being delegated to the parent class's ``insert``.
    """
    return super().insert(loc, JSONArray._seralizate_json(item))
151
220
152
221
@classmethod
153
222
def _from_factorized (cls , values , original ):
@@ -219,12 +288,23 @@ def __getitem__(self, item):
219
288
if isinstance (value , pa .ChunkedArray ):
220
289
return type (self )(value )
221
290
else :
222
- scalar = value .as_py ()
291
+ scalar = JSONArray . _deserialize_json ( value .as_py () )
223
292
if scalar is None :
224
293
return self ._dtype .na_value
225
294
else :
226
295
return scalar
227
296
297
def __iter__(self):
    """
    Iterate over the elements of the array as decoded Python objects.

    Each stored pyarrow scalar is converted with ``as_py`` and decoded via
    ``JSONArray._deserialize_json``. A decoded ``None`` is yielded as the
    dtype's ``na_value``; note that stored JSON ``null`` text also decodes
    to ``None``.
    """
    for pa_scalar in self._pa_array:
        decoded = JSONArray._deserialize_json(pa_scalar.as_py())
        yield self._dtype.na_value if decoded is None else decoded
307
+
228
308
@classmethod
229
309
def _result_converter (cls , values , na = None ):
230
310
return pd .BooleanDtype ().__from_arrow__ (values )
0 commit comments