Skip to content

Commit ed906ef

Browse files
committed
box and unbox between string(storage) and dict(getitem)
1 parent 2b03662 commit ed906ef

File tree

2 files changed

+428
-80
lines changed

2 files changed

+428
-80
lines changed

db_dtypes/json.py

Lines changed: 114 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@
1414

1515
from __future__ import annotations
1616

17+
import json
1718
import typing
1819

1920
import numpy as np
2021
import pandas as pd
21-
from pandas._libs import lib
2222
from pandas.core.arrays.arrow.array import ArrowExtensionArray
23-
from pandas.core.arrays.numeric import NumericDtype
24-
from pandas.core.dtypes.common import is_integer, is_scalar, pandas_dtype
23+
from pandas.core.arrays.masked import BaseMaskedArray
24+
from pandas.core.dtypes.common import is_dict_like, is_integer, is_list_like, is_scalar
2525
from pandas.core.dtypes.dtypes import ExtensionDtype
2626
from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses
2727
import pyarrow as pa
@@ -84,8 +84,43 @@ def __init__(self, values, dtype=None, copy=False) -> None:
8484
"large_string type"
8585
)
8686

87+
@classmethod
def _box_pa(
    cls, value, pa_type: pa.DataType | None = None
) -> pa.Array | pa.ChunkedArray | pa.Scalar:
    """
    Convert ``value`` to its pyarrow representation.

    A ``pa.Scalar``, a value that is not list-like, or a dict-like value is
    boxed through ``_box_pa_scalar``; any other list-like value is boxed
    through ``_box_pa_array``.

    Parameters
    ----------
    value : any
    pa_type : pa.DataType | None
        Forwarded unchanged to the scalar/array boxing method.

    Returns
    -------
    pa.Array or pa.ChunkedArray or pa.Scalar
    """
    if isinstance(value, pa.Scalar):
        return cls._box_pa_scalar(value, pa_type)

    # A dict-like is treated as a single value, not as a sequence of values.
    holds_many = is_list_like(value) and not is_dict_like(value)
    boxer = cls._box_pa_array if holds_many else cls._box_pa_scalar
    return boxer(value, pa_type)
108+
87109
@classmethod
88110
def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
111+
"""
112+
Box value into a pyarrow Scalar.
113+
114+
Parameters
115+
----------
116+
value : any
117+
pa_type : pa.DataType | None
118+
119+
Returns
120+
-------
121+
pa.Scalar
122+
"""
123+
value = JSONArray._seralizate_json(value)
89124
pa_scalar = super()._box_pa_scalar(value, pa_type)
90125
if pa.types.is_string(pa_scalar.type) and pa_type is None:
91126
pa_scalar = pc.cast(pa_scalar, pa.large_string())
@@ -95,27 +130,46 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
95130
def _box_pa_array(
    cls, value, pa_type: pa.DataType | None = None, copy: bool = False
) -> pa.Array | pa.ChunkedArray:
    """
    Convert a sequence to a pyarrow Array or ChunkedArray.

    Unless ``value`` is already an instance of this class, a pyarrow
    Array/ChunkedArray, or a ``BaseMaskedArray``, every element is first
    passed through ``JSONArray._seralizate_json``. When no ``pa_type`` is
    requested and the boxed result has pyarrow ``string`` type, it is cast
    to ``large_string``.

    Parameters
    ----------
    value : Sequence
    pa_type : pa.DataType | None
    copy : bool, default False
        NOTE(review): accepted but not forwarded to the parent call.

    Returns
    -------
    pa.Array or pa.ChunkedArray
    """
    passthrough_types = (cls, pa.Array, pa.ChunkedArray, BaseMaskedArray)
    if not isinstance(value, passthrough_types):
        value = [JSONArray._seralizate_json(element) for element in value]

    boxed = super()._box_pa_array(value, pa_type)
    if pa_type is None and pa.types.is_string(boxed.type):
        boxed = pc.cast(boxed, pa.large_string())
    return boxed
102155

103156
@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
    """
    Build an array from an iterable of scalars.

    Each element is passed through ``JSONArray._seralizate_json`` and the
    results are stored as a pyarrow ``large_string`` array. ``dtype`` and
    ``copy`` are accepted but not used in this body.
    """
    # TODO: check _from_arrow APIs etc. The former fast paths for
    # BaseMaskedArray and pa.Array / pa.ChunkedArray inputs are disabled.
    serialized = [JSONArray._seralizate_json(item) for item in scalars]
    storage = pa.array(serialized, type=pa.large_string(), from_pandas=True)
    return cls(storage)
120174

121175
@classmethod
@@ -124,30 +178,45 @@ def _from_sequence_of_strings(
124178
) -> JSONArray:
125179
return cls._from_sequence(strings, dtype=dtype, copy=copy)
126180

181+
@staticmethod
def _seralizate_json(value):
    """
    Convert a value into the string form used for storage.

    Parameters
    ----------
    value : any
        A ``str`` is returned unchanged (presumably it already holds JSON
        text -- confirm with callers). A scalar missing value (as reported
        by ``pd.isna``) is returned unchanged. Anything else is encoded
        with ``json.dumps``.

    Returns
    -------
    str or the original missing value

    Raises
    ------
    TypeError
        Propagated from ``json.dumps`` when ``value`` cannot be encoded.
    """
    if isinstance(value, str):
        return value

    # `pd.isna` answers element-wise for list-likes (returning an array), so
    # using it directly as a condition raises for multi-element lists and
    # misclassifies e.g. `[None]` as missing. Only ask it about scalars.
    if not is_list_like(value) and pd.isna(value):
        return value

    # `sort_keys=True` sorts dictionary keys before serialization, making
    # JSON comparisons deterministic.
    return json.dumps(value, sort_keys=True)
189+
190+
@staticmethod
def _deserialize_json(value):
    """
    Decode a stored JSON string back into a Python object.

    A missing value (as reported by ``pd.isna``) is handed back untouched;
    anything else is parsed with ``json.loads``.
    """
    if pd.isna(value):
        return value
    return json.loads(value)
196+
127197
@property
def dtype(self) -> JSONDtype:
    """The JSONDtype instance attached to this array."""
    json_dtype = self._dtype
    return json_dtype
131201

132-
def insert(self, loc: int, item) -> JSONArray:
133-
if not isinstance(item, str) and not pd.isna(item):
134-
raise TypeError("Scalar must be NA or str")
135-
return super().insert(loc, item)
202+
def __contains__(self, key) -> bool:
    """Check membership of ``key`` after converting it to its serialized form."""
    serialized_key = JSONArray._seralizate_json(key)
    return super().__contains__(serialized_key)
204+
205+
# def __contains__(self, key) -> bool:
206+
# # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
207+
# if pd.isna(key) and key is not self.dtype.na_value:
208+
# if self.dtype.kind == "f" and lib.is_float(key):
209+
# return pc.any(pc.is_nan(self._pa_array)).as_py()
136210

137-
def astype(self, dtype, copy: bool = True):
138-
dtype = pandas_dtype(dtype)
211+
# # e.g. date or timestamp types we do not allow None here to match pd.NA
212+
# return False
213+
# # TODO: maybe complex? object?
139214

140-
if dtype == self.dtype:
141-
if copy:
142-
return self.copy()
143-
return self
144-
elif isinstance(dtype, NumericDtype):
145-
data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
146-
return dtype.__from_arrow__(data)
147-
elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
148-
return self.to_numpy(dtype=dtype, na_value=np.nan)
215+
# return bool(super().__contains__(key))
149216

150-
return super().astype(dtype, copy=copy)
217+
def insert(self, loc: int, item) -> JSONArray:
    """
    Insert ``item`` at position ``loc`` via the parent implementation.

    ``item`` is passed through ``JSONArray._seralizate_json`` before the
    parent ``insert`` is called.
    """
    return super().insert(loc, JSONArray._seralizate_json(item))
151220

152221
@classmethod
153222
def _from_factorized(cls, values, original):
@@ -219,12 +288,23 @@ def __getitem__(self, item):
219288
if isinstance(value, pa.ChunkedArray):
220289
return type(self)(value)
221290
else:
222-
scalar = value.as_py()
291+
scalar = JSONArray._deserialize_json(value.as_py())
223292
if scalar is None:
224293
return self._dtype.na_value
225294
else:
226295
return scalar
227296

297+
def __iter__(self):
    """
    Yield the array's elements one by one.

    Each pyarrow scalar is converted with ``as_py`` and decoded through
    ``JSONArray._deserialize_json``; a ``None`` result is replaced by the
    dtype's ``na_value``.
    """
    for pa_scalar in self._pa_array:
        decoded = JSONArray._deserialize_json(pa_scalar.as_py())
        yield self._dtype.na_value if decoded is None else decoded
307+
228308
@classmethod
def _result_converter(cls, values, na=None):
    """
    Convert ``values`` via ``pd.BooleanDtype().__from_arrow__``.

    ``na`` is accepted but not used in this body.
    """
    boolean_dtype = pd.BooleanDtype()
    return boolean_dtype.__from_arrow__(values)

0 commit comments

Comments
 (0)