From d0e94d51c8823c44f27263ccb75efff86b15ac71 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 8 Jul 2024 22:51:03 +0000 Subject: [PATCH 01/28] Copy JSONDtype and JSONArray from tests/extension/json and their tests --- db_dtypes/__init__.py | 3 + db_dtypes/json.py | 273 +++++++++++ tests/compliance/json/conftest.py | 223 +++++++++ tests/compliance/json/test_json_compliance.py | 444 ++++++++++++++++++ .../json/test_json_compliance_1_5.py | 31 ++ 5 files changed, 974 insertions(+) create mode 100644 db_dtypes/json.py create mode 100644 tests/compliance/json/conftest.py create mode 100644 tests/compliance/json/test_json_compliance.py create mode 100644 tests/compliance/json/test_json_compliance_1_5.py diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index ad4ea33..076270f 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -28,6 +28,7 @@ import pyarrow.compute from db_dtypes import core +from db_dtypes.json import JSONArray, JSONDtype from db_dtypes.version import __version__ date_dtype_name = "dbdate" @@ -341,6 +342,8 @@ def __sub__(self, other): "__version__", "DateArray", "DateDtype", + "JSONDtype", + "JSONArray", "TimeArray", "TimeDtype", ] diff --git a/db_dtypes/json.py b/db_dtypes/json.py new file mode 100644 index 0000000..72f4c2c --- /dev/null +++ b/db_dtypes/json.py @@ -0,0 +1,273 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from collections import UserDict, abc +import itertools +import numbers +import string +import sys +from typing import TYPE_CHECKING, Any + +import numpy as np +import pandas as pd +from pandas.api.extensions import ExtensionArray, ExtensionDtype +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike +from pandas.core.dtypes.common import is_bool_dtype, is_list_like, pandas_dtype +from pandas.core.indexers import unpack_tuple_and_ellipses + +if TYPE_CHECKING: + from collections.abc import Mapping + + from pandas._typing import type_t + + +@pd.api.extensions.register_extension_dtype +class JSONDtype(pd.api.extensions.ExtensionDtype): + """Extension dtype for JSON data.""" + + # type = str + + type = abc.Mapping + name = "dbjson" + # na_value = pd.NA # TODO: StringDtype is libmissing.NA + + na_value: Mapping[str, Any] = UserDict() + # _is_numeric = False + # _is_boolean = False + + @classmethod + def construct_array_type(cls): + """Return the array type associated with this dtype.""" + return JSONArray + + # @staticmethod + # def __from_arrow__( + # array: Union[pyarrow.Array, pyarrow.ChunkedArray] + # ) -> "JSONArray": + # """Convert to JSONArray from an Arrow array. 
+ + # See: + # https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow + # """ + # if isinstance(array, pyarrow.Array): + # chunks = [array] + # else: + # chunks = array.chunks + + # results = [] + # for arr in chunks: + # # convert chunk by chunk to numpy and concatenate then, to avoid + # # overflow for large string data when concatenating the pyarrow arrays + # arr = arr.to_numpy(zero_copy_only=False) + # arr = ensure_string_array(arr, na_value=pandas.NA) + # results.append(arr) + + # if len(chunks) == 0: + # arr = numpy.array([], dtype=str) + # else: + # arr = numpy.concatenate(results) + + # return JSONArray(arr) + + # # TODO: codes from StringDtype + # # # Bypass validation inside StringArray constructor, see GH#47781 + # # new_string_array = StringArray.__new__(StringArray) + # # NDArrayBacked.__init__( + # # new_string_array, + # # arr, + # # StringDtype(storage="python"), + # # ) + # # return new_string_array + + +class JSONArray(pd.api.extensions.ExtensionArray): + """Extension array containing JSON data.""" + + dtype = JSONDtype() + __array_priority__ = 1000 + + def __init__(self, values, dtype=None, copy=False) -> None: + for val in values: + if not isinstance(val, self.dtype.type): + raise TypeError(f"All values must be of type {str(self.dtype.type)}: actual {type(val)}") + self.data = values + + # Some aliases for common attribute names to ensure pandas supports + # these + self._items = self._data = self.data + # those aliases are currently not working due to assumptions + # in internal code (GH-20735) + # self._values = self.values = self.data + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy=False): + return cls(scalars) + + @classmethod + def _from_factorized(cls, values, original): + return cls([UserDict(x) for x in values if x != ()]) + + def __getitem__(self, item): + if isinstance(item, tuple): + item = unpack_tuple_and_ellipses(item) + + if isinstance(item, numbers.Integral): + return self.data[item] + elif isinstance(item, slice) and item == slice(None): + # Make sure we get a view + return type(self)(self.data) + elif isinstance(item, slice): + # slice + return type(self)(self.data[item]) + elif not is_list_like(item): + # e.g. 
"foo" or 2.5 + # exception message copied from numpy + raise IndexError( + r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " + r"(`None`) and integer or boolean arrays are valid indices" + ) + else: + item = pd.api.indexers.check_array_indexer(self, item) + if is_bool_dtype(item.dtype): + return type(self)._from_sequence( + [x for x, m in zip(self, item) if m], dtype=self.dtype + ) + # integer + return type(self)([self.data[i] for i in item]) + + def __setitem__(self, key, value) -> None: + if isinstance(key, numbers.Integral): + self.data[key] = value + else: + if not isinstance(value, (type(self), abc.Sequence)): + # broadcast value + value = itertools.cycle([value]) + + if isinstance(key, np.ndarray) and key.dtype == "bool": + # masking + for i, (k, v) in enumerate(zip(key, value)): + if k: + assert isinstance(v, self.dtype.type) + self.data[i] = v + else: + for k, v in zip(key, value): + assert isinstance(v, self.dtype.type) + self.data[k] = v + + def __len__(self) -> int: + return len(self.data) + + def __eq__(self, other): + return NotImplemented + + def __ne__(self, other): + return NotImplemented + + def __array__(self, dtype=None, copy=None): + if dtype is None: + dtype = object + if dtype == object: + # on py38 builds it looks like numpy is inferring to a non-1D array + return construct_1d_object_array_from_listlike(list(self)) + return np.asarray(self.data, dtype=dtype) + + @property + def nbytes(self) -> int: + return sys.getsizeof(self.data) + + def isna(self): + return np.array([x == self.dtype.na_value for x in self.data], dtype=bool) + + def take(self, indexer, allow_fill=False, fill_value=None): + # re-implement here, since NumPy has trouble setting + # sized objects like UserDicts into scalar slots of + # an ndarary. + indexer = np.asarray(indexer) + msg = ( + "Index is out of bounds or cannot do a " + "non-empty take from an empty array." + ) + + if allow_fill: + # Do not allow any custom na_value + if fill_value is None: + fill_value = self.dtype.na_value + # bounds check + if (indexer < -1).any(): + raise ValueError + try: + output = [ + self.data[loc] if loc != -1 else fill_value for loc in indexer + ] + except IndexError as err: + raise IndexError(msg) from err + else: + try: + output = [self.data[loc] for loc in indexer] + except IndexError as err: + raise IndexError(msg) from err + + return type(self)._from_sequence(output, dtype=self.dtype) + + def copy(self): + return type(self)(self.data[:]) + + def astype(self, dtype, copy=True): + # NumPy has issues when all the dicts are the same length. + # np.array([UserDict(...), UserDict(...)]) fails, + # but np.array([{...}, {...}]) works, so cast. + from pandas.core.arrays.string_ import StringDtype + + dtype = pandas_dtype(dtype) + # needed to add this check for the Series constructor + if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: + if copy: + return self.copy() + return self + elif isinstance(dtype, StringDtype): + value = self.astype(str) # numpy doesn't like nested dicts + arr_cls = dtype.construct_array_type() + return arr_cls._from_sequence(value, dtype=dtype, copy=False) + elif not copy: + return np.asarray([dict(x) for x in self], dtype=dtype) + else: + return np.array([dict(x) for x in self], dtype=dtype, copy=copy) + + def unique(self): + # Parent method doesn't work since np.array will try to infer + # a 2-dim object. 
+ return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}]) + + @classmethod + def _concat_same_type(cls, to_concat): + data = list(itertools.chain.from_iterable(x.data for x in to_concat)) + return cls(data) + + def _values_for_factorize(self): + frozen = self._values_for_argsort() + if len(frozen) == 0: + # factorize_array expects 1-d array, this is a len-0 2-d array. + frozen = frozen.ravel() + return frozen, () + + def _values_for_argsort(self): + # Bypass NumPy's shape inference to get a (N,) array of tuples. + frozen = [tuple(x.items()) for x in self] + return construct_1d_object_array_from_listlike(frozen) + + def _pad_or_backfill(self, *, method, limit=None, copy=True): + # GH#56616 - test EA method without limit_area argument + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py new file mode 100644 index 0000000..67e5fb6 --- /dev/null +++ b/tests/compliance/json/conftest.py @@ -0,0 +1,223 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +from collections import UserDict, abc +import operator +import sys + +import numpy as np +import pytest +from typing import TYPE_CHECKING, Any + +import pandas as pd +import pandas._testing as tm +import string +from pandas.tests.extension import base + +from db_dtypes import JSONArray, JSONDtype + +from collections import ( + UserDict, + abc, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + +def make_data(): + # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer + rng = np.random.default_rng(2) + return [ + UserDict( + [ + (rng.choice(list(string.ascii_letters)), rng.integers(0, 100)) + for _ in range(rng.integers(0, 10)) + ] + ) + for _ in range(100) + ] + + +# We intentionally don't run base.BaseSetitemTests because pandas' +# internals has trouble setting sequences of values into scalar positions. +unhashable = pytest.mark.xfail(reason="Unhashable") + + +@pytest.fixture +def dtype(): + return JSONDtype() + + +@pytest.fixture +def data(): + """Length-100 PeriodArray for semantics test.""" + data = make_data() + + # Why the while loop? NumPy is unable to construct an ndarray from + # equal-length ndarrays. Many of our operations involve coercing the + # EA to an ndarray of objects. To avoid random test failures, we ensure + # that our data is coercible to an ndarray. Several tests deal with only + # the first two elements, so that's what we'll check. + + while len(data[0]) == len(data[1]): + data = make_data() + + return JSONArray(data) + + +@pytest.fixture +def data_for_twos(dtype): + """ + Length-100 array in which all the elements are two. + + Call pytest.skip in your fixture if the dtype does not support divmod. + """ + if not (dtype._is_numeric or dtype.kind == "m"): + # Object-dtypes may want to allow this, but for the most part + # only numeric and timedelta-like dtypes will need to implement this. 
+ pytest.skip(f"{dtype} is not a numeric dtype") + + raise NotImplementedError + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return JSONArray([{}, {"a": 10}]) + + +@pytest.fixture +def data_for_sorting(): + return JSONArray([{"b": 1}, {"c": 4}, {"a": 2, "c": 3}]) + + +@pytest.fixture +def data_missing_for_sorting(): + return JSONArray([{"b": 1}, {}, {"a": 4}]) + + +@pytest.fixture +def na_cmp(): + return operator.eq + + +@pytest.fixture +def data_for_grouping(): + return JSONArray( + [ + {"b": 1}, + {"b": 1}, + {}, + {}, + {"a": 0, "c": 2}, + {"a": 0, "c": 2}, + {"b": 1}, + {"c": 2}, + ] + ) + +@pytest.fixture +def data_repeated(data): + """ + Generate many datasets. + + Parameters + ---------- + data : fixture implementing `data` + + Returns + ------- + Callable[[int], Generator]: + A callable that takes a `count` argument and + returns a generator yielding `count` datasets. + """ + + def gen(count): + for _ in range(count): + yield data + + return gen + + +_all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"] + +@pytest.fixture(params=_all_numeric_accumulations) +def all_numeric_accumulations(request): + """ + Fixture for numeric accumulation names + """ + return request.param + + +_all_boolean_reductions = ["all", "any"] + + +@pytest.fixture(params=_all_boolean_reductions) +def all_boolean_reductions(request): + """ + Fixture for boolean reduction names. + """ + return request.param + + +_all_numeric_reductions = [ + "count", + "sum", + "max", + "min", + "mean", + "prod", + "std", + "var", + "median", + "kurt", + "skew", + "sem", +] + + +@pytest.fixture(params=_all_numeric_reductions) +def all_numeric_reductions(request): + """ + Fixture for numeric reduction names. + """ + return request.param + + +@pytest.fixture(params=tm.arithmetic_dunder_methods) +def all_arithmetic_operators(request): + """ + Fixture for dunder names for common arithmetic operations. + """ + return request.param + +@pytest.fixture +def na_value(): + """ + The scalar missing value for this type. Default 'None'. + """ + return UserDict() + +@pytest.fixture(params=["data", "data_missing"]) +def all_data(request, data, data_missing): + """Parametrized fixture returning 'data' or 'data_missing' integer arrays. + + Used to test dtype conversion with and without missing values. + """ + if request.param == "data": + return data + elif request.param == "data_missing": + return data_missing + diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py new file mode 100644 index 0000000..359430d --- /dev/null +++ b/tests/compliance/json/test_json_compliance.py @@ -0,0 +1,444 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tests for extension interface compliance, inherited from pandas. 
+ +See: +https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/decimal/test_decimal.py +and +https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/test_period.py +""" + +import collections +import operator +import sys + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +import string +from pandas.tests.extension import base + +from db_dtypes import JSONArray, JSONDtype + +# We intentionally don't run base.BaseSetitemTests because pandas' +# internals has trouble setting sequences of values into scalar positions. +unhashable = pytest.mark.xfail(reason="Unhashable") + + +class TestJSONArray(base.ExtensionTests): + @pytest.mark.xfail( + reason="comparison method not implemented for JSONArray (GH-37867)" + ) + def test_contains(self, data): + # GH-37867 + super().test_contains(data) + + @pytest.mark.xfail(reason="not implemented constructor from dtype") + def test_from_dtype(self, data): + # construct from our dtype & string dtype + super().test_from_dtype(data) + + @pytest.mark.xfail(reason="RecursionError, GH-33900") + def test_series_constructor_no_data_with_index(self, dtype, na_value): + # RecursionError: maximum recursion depth exceeded in comparison + rec_limit = sys.getrecursionlimit() + try: + # Limit to avoid stack overflow on Windows CI + sys.setrecursionlimit(100) + super().test_series_constructor_no_data_with_index(dtype, na_value) + finally: + sys.setrecursionlimit(rec_limit) + + @pytest.mark.xfail(reason="RecursionError, GH-33900") + def test_series_constructor_scalar_na_with_index(self, dtype, na_value): + # RecursionError: maximum recursion depth exceeded in comparison + rec_limit = sys.getrecursionlimit() + try: + # Limit to avoid stack overflow on Windows CI + sys.setrecursionlimit(100) + super().test_series_constructor_scalar_na_with_index(dtype, na_value) + finally: + sys.setrecursionlimit(rec_limit) + + @pytest.mark.xfail(reason="collection as scalar, GH-33901") + def test_series_constructor_scalar_with_index(self, data, dtype): + # TypeError: All values must be of type + rec_limit = sys.getrecursionlimit() + try: + # Limit to avoid stack overflow on Windows CI + sys.setrecursionlimit(100) + super().test_series_constructor_scalar_with_index(data, dtype) + finally: + sys.setrecursionlimit(rec_limit) + + @pytest.mark.xfail(reason="Different definitions of NA") + def test_stack(self): + """ + The test does .astype(object).stack(). If we happen to have + any missing values in `data`, then we'll end up with different + rows since we consider `{}` NA, but `.astype(object)` doesn't. + """ + super().test_stack() + + @pytest.mark.xfail(reason="dict for NA") + def test_unstack(self, data, index): + # The base test has NaN for the expected NA value. 
        # this matches otherwise
        return super().test_unstack(data, index)

    @pytest.mark.xfail(reason="Setting a dict as a scalar")
    def test_fillna_series(self):
        """We treat dictionaries as a mapping in fillna, not a scalar."""
        super().test_fillna_series()

    @pytest.mark.xfail(reason="Setting a dict as a scalar")
    def test_fillna_frame(self):
        """We treat dictionaries as a mapping in fillna, not a scalar."""
        super().test_fillna_frame()

    @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path")
    def test_fillna_limit_frame(self, data_missing):
        # GH#58001
        super().test_fillna_limit_frame(data_missing)

    @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path")
    def test_fillna_limit_series(self, data_missing):
        # GH#58001
        super().test_fillna_limit_series(data_missing)

    @pytest.mark.parametrize(
        "limit_area, input_ilocs, expected_ilocs",
        [
            ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
            ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
            ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
            ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
            ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
            ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
            ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
            ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
        ],
    )
    def test_ffill_limit_area(
        self, data_missing, limit_area, input_ilocs, expected_ilocs
    ):
        # GH#56616
        msg = "JSONArray does not implement limit_area"
        with pytest.raises(NotImplementedError, match=msg):
            super().test_ffill_limit_area(
                data_missing, limit_area, input_ilocs, expected_ilocs
            )

    @unhashable
    def test_value_counts(self, all_data, dropna):
        super().test_value_counts(all_data, dropna)

    @unhashable
    def test_value_counts_with_normalize(self, data):
        super().test_value_counts_with_normalize(data)

    @unhashable
    def test_sort_values_frame(self):
        # TODO (EA.factorize): see if _values_for_factorize allows this.
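        # Sorting the frame goes through a factorize step that hashes the
        # column's values; UserDict instances are unhashable, hence the
        # @unhashable xfail marker above.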
+ super().test_sort_values_frame() + + @pytest.mark.xfail(reason="combine for JSONArray not supported") + def test_combine_le(self, data_repeated): + super().test_combine_le(data_repeated) + + @pytest.mark.xfail( + reason="combine for JSONArray not supported - " + "may pass depending on random data", + strict=False, + raises=AssertionError, + ) + def test_combine_first(self, data): + super().test_combine_first(data) + + @pytest.mark.xfail(reason="broadcasting error") + def test_where_series(self, data, na_value): + # Fails with + # *** ValueError: operands could not be broadcast together + # with shapes (4,) (4,) (0,) + super().test_where_series(data, na_value) + + @pytest.mark.xfail(reason="Can't compare dicts.") + def test_searchsorted(self, data_for_sorting): + super().test_searchsorted(data_for_sorting) + + @pytest.mark.xfail(reason="Can't compare dicts.") + def test_equals(self, data, na_value, as_series): + super().test_equals(data, na_value, as_series) + + @pytest.mark.skip("fill-value is interpreted as a dict of values") + def test_fillna_copy_frame(self, data_missing): + super().test_fillna_copy_frame(data_missing) + + @pytest.mark.xfail(reason="Fails with CoW") + def test_equals_same_data_different_object(self, data): + super().test_equals_same_data_different_object(data) + + @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)") + def test_astype_str(self): + """This currently fails in NumPy on np.array(self, dtype=str) with + + *** ValueError: setting an array element with a sequence + """ + super().test_astype_str() + + @unhashable + def test_groupby_extension_transform(self): + """ + This currently fails in Series.name.setter, since the + name must be hashable, but the value is a dictionary. + I think this is what we want, i.e. `.name` should be the original + values, and not the values for factorization. + """ + super().test_groupby_extension_transform() + + @unhashable + def test_groupby_extension_apply(self): + """ + This fails in Index._do_unique_check with + + > hash(val) + E TypeError: unhashable type: 'UserDict' with + + I suspect that once we support Index[ExtensionArray], + we'll be able to dispatch unique. + """ + super().test_groupby_extension_apply() + + @unhashable + def test_groupby_extension_agg(self): + """ + This fails when we get to tm.assert_series_equal when left.index + contains dictionaries, which are not hashable. + """ + super().test_groupby_extension_agg() + + @unhashable + def test_groupby_extension_no_sort(self): + """ + This fails when we get to tm.assert_series_equal when left.index + contains dictionaries, which are not hashable. 
+ """ + super().test_groupby_extension_no_sort() + + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): + if len(data[0]) != 1: + mark = pytest.mark.xfail(reason="raises in coercing to Series") + request.applymarker(mark) + super().test_arith_frame_with_scalar(data, all_arithmetic_operators) + + def test_compare_array(self, data, comparison_op, request): + if comparison_op.__name__ in ["eq", "ne"]: + mark = pytest.mark.xfail(reason="Comparison methods not implemented") + request.applymarker(mark) + super().test_compare_array(data, comparison_op) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_loc_scalar_mixed(self, data): + super().test_setitem_loc_scalar_mixed(data) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_loc_scalar_multiple_homogoneous(self, data): + super().test_setitem_loc_scalar_multiple_homogoneous(data) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_iloc_scalar_mixed(self, data): + super().test_setitem_iloc_scalar_mixed(data) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_iloc_scalar_multiple_homogoneous(self, data): + super().test_setitem_iloc_scalar_multiple_homogoneous(data) + + @pytest.mark.parametrize( + "mask", + [ + np.array([True, True, True, False, False]), + pd.array([True, True, True, False, False], dtype="boolean"), + pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"), + ], + ids=["numpy-array", "boolean-array", "boolean-array-na"], + ) + def test_setitem_mask(self, data, mask, box_in_series, request): + if box_in_series: + mark = pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + request.applymarker(mark) + elif not isinstance(mask, np.ndarray): + mark = pytest.mark.xfail(reason="Issues unwanted DeprecationWarning") + request.applymarker(mark) + super().test_setitem_mask(data, mask, box_in_series) + + def test_setitem_mask_raises(self, data, box_in_series, request): + if not box_in_series: + mark = pytest.mark.xfail(reason="Fails to raise") + request.applymarker(mark) + + super().test_setitem_mask_raises(data, box_in_series) + + @pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): + super().test_setitem_mask_boolean_array_with_na(data, box_in_series) + + @pytest.mark.parametrize( + "idx", + [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_setitem_integer_array(self, data, idx, box_in_series, request): + if box_in_series: + mark = pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + request.applymarker(mark) + super().test_setitem_integer_array(data, idx, box_in_series) + + @pytest.mark.xfail(reason="list indices must be integers or slices, not NAType") + @pytest.mark.parametrize( + "idx, box_in_series", + [ + ([0, 1, 2, pd.NA], False), + pytest.param( + [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948") + ), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), True), + ], + ids=["list-False", "list-True", "integer-array-False", "integer-array-True"], + ) + def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): + 
super().test_setitem_integer_with_missing_raises(data, idx, box_in_series) + + @pytest.mark.xfail(reason="Fails to raise") + def test_setitem_scalar_key_sequence_raise(self, data): + super().test_setitem_scalar_key_sequence_raise(data) + + def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request): + if "full_slice" in request.node.name: + mark = pytest.mark.xfail(reason="slice is not iterable") + request.applymarker(mark) + super().test_setitem_with_expansion_dataframe_column(data, full_indexer) + + @pytest.mark.xfail(reason="slice is not iterable") + def test_setitem_frame_2d_values(self, data): + super().test_setitem_frame_2d_values(data) + + @pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + @pytest.mark.parametrize("setter", ["loc", None]) + def test_setitem_mask_broadcast(self, data, setter): + super().test_setitem_mask_broadcast(data, setter) + + @pytest.mark.xfail( + reason="cannot set using a slice indexer with a different length" + ) + def test_setitem_slice(self, data, box_in_series): + super().test_setitem_slice(data, box_in_series) + + @pytest.mark.xfail(reason="slice object is not iterable") + def test_setitem_loc_iloc_slice(self, data): + super().test_setitem_loc_iloc_slice(data) + + @pytest.mark.xfail(reason="slice object is not iterable") + def test_setitem_slice_mismatch_length_raises(self, data): + super().test_setitem_slice_mismatch_length_raises(data) + + @pytest.mark.xfail(reason="slice object is not iterable") + def test_setitem_slice_array(self, data): + super().test_setitem_slice_array(data) + + @pytest.mark.xfail(reason="Fail to raise") + def test_setitem_invalid(self, data, invalid_scalar): + super().test_setitem_invalid(data, invalid_scalar) + + @pytest.mark.xfail(reason="only integer scalar arrays can be converted") + def test_setitem_2d_values(self, data): + super().test_setitem_2d_values(data) + + @pytest.mark.xfail(reason="data type 'json' not understood") + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + +def custom_assert_series_equal(left, right, *args, **kwargs): + # NumPy doesn't handle an array of equal-length UserDicts. + # The default assert_series_equal eventually does a + # Series.values, which raises. We work around it by + # converting the UserDicts to dicts. 
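    # The dtype registered above is named "dbjson" (see JSONDtype.name), so
    # the check below keys off that name rather than pandas' upstream "json".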
    if left.dtype.name == "dbjson":
        assert left.dtype == right.dtype
        left = pd.Series(
            JSONArray(left.values.astype(object)), index=left.index, name=left.name
        )
        right = pd.Series(
            JSONArray(right.values.astype(object)),
            index=right.index,
            name=right.name,
        )
    tm.assert_series_equal(left, right, *args, **kwargs)


def custom_assert_frame_equal(left, right, *args, **kwargs):
    obj_type = kwargs.get("obj", "DataFrame")
    tm.assert_index_equal(
        left.columns,
        right.columns,
        exact=kwargs.get("check_column_type", "equiv"),
        check_names=kwargs.get("check_names", True),
        check_exact=kwargs.get("check_exact", False),
        check_categorical=kwargs.get("check_categorical", True),
        obj=f"{obj_type}.columns",
    )

    # Select only the columns whose dtype matches the registered "dbjson" name.
    jsons = left.dtypes[left.dtypes == "dbjson"].index

    for col in jsons:
        custom_assert_series_equal(left[col], right[col], *args, **kwargs)

    left = left.drop(columns=jsons)
    right = right.drop(columns=jsons)
    tm.assert_frame_equal(left, right, *args, **kwargs)


def test_custom_asserts():
    # This would always trigger the KeyError from trying to put
    # an array of equal-length UserDicts inside an ndarray.
    data = JSONArray(
        [
            dict({"a": 1}),
            dict({"b": 2}),
            dict({"c": 3}),
        ]
    )
    a = pd.Series(data)
    custom_assert_series_equal(a, a)
    custom_assert_frame_equal(a.to_frame(), a.to_frame())

    b = pd.Series(data.take([0, 0, 1]))
    with pytest.raises(AssertionError):
        custom_assert_series_equal(a, b)

    with pytest.raises(AssertionError):
        custom_assert_frame_equal(a.to_frame(), b.to_frame())
diff --git a/tests/compliance/json/test_json_compliance_1_5.py b/tests/compliance/json/test_json_compliance_1_5.py
new file mode 100644
index 0000000..ee2d878
--- /dev/null
+++ b/tests/compliance/json/test_json_compliance_1_5.py
@@ -0,0 +1,31 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tests for extension interface compliance, inherited from pandas.
+ +See: +https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/decimal/test_decimal.py +and +https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/test_period.py +""" + +from pandas.tests.extension import base +import pytest + +# NDArrayBacked2DTests suite added in https://github.com/pandas-dev/pandas/pull/44974 +pytest.importorskip("pandas", minversion="1.5.0dev") + + +# class Test2DCompat(base.NDArrayBacked2DTests): +# pass From 1d33703a908445b1f5679568272fca9470e06288 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 9 Jul 2024 17:32:21 +0000 Subject: [PATCH 02/28] formatting --- db_dtypes/json.py | 4 +++- tests/compliance/json/conftest.py | 18 ++++++++---------- tests/compliance/json/test_json_compliance.py | 5 ++--- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 72f4c2c..2ae4dc5 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -102,7 +102,9 @@ class JSONArray(pd.api.extensions.ExtensionArray): def __init__(self, values, dtype=None, copy=False) -> None: for val in values: if not isinstance(val, self.dtype.type): - raise TypeError(f"All values must be of type {str(self.dtype.type)}: actual {type(val)}") + raise TypeError( + f"All values must be of type {str(self.dtype.type)}: actual {type(val)}" + ) self.data = values # Some aliases for common attribute names to ensure pandas supports diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py index 67e5fb6..775a302 100644 --- a/tests/compliance/json/conftest.py +++ b/tests/compliance/json/conftest.py @@ -15,27 +15,22 @@ import collections from collections import UserDict, abc import operator +import string import sys - -import numpy as np -import pytest from typing import TYPE_CHECKING, Any +import numpy as np import pandas as pd import pandas._testing as tm -import string from pandas.tests.extension import base +import pytest from db_dtypes import JSONArray, JSONDtype -from collections import ( - UserDict, - abc, -) - if TYPE_CHECKING: from collections.abc import Mapping + def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer rng = np.random.default_rng(2) @@ -128,6 +123,7 @@ def data_for_grouping(): ] ) + @pytest.fixture def data_repeated(data): """ @@ -153,6 +149,7 @@ def gen(count): _all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"] + @pytest.fixture(params=_all_numeric_accumulations) def all_numeric_accumulations(request): """ @@ -203,6 +200,7 @@ def all_arithmetic_operators(request): """ return request.param + @pytest.fixture def na_value(): """ @@ -210,6 +208,7 @@ def na_value(): """ return UserDict() + @pytest.fixture(params=["data", "data_missing"]) def all_data(request, data, data_missing): """Parametrized fixture returning 'data' or 'data_missing' integer arrays. 
@@ -220,4 +219,3 @@ def all_data(request, data, data_missing): return data elif request.param == "data_missing": return data_missing - diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index 359430d..e0be6d7 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -22,15 +22,14 @@ import collections import operator +import string import sys import numpy as np -import pytest - import pandas as pd import pandas._testing as tm -import string from pandas.tests.extension import base +import pytest from db_dtypes import JSONArray, JSONDtype From de3120ad4556f32eb86b061ca9b821b823a7034d Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 16 Jul 2024 20:36:58 +0000 Subject: [PATCH 03/28] converts to ArrowStringArray --- db_dtypes/json.py | 400 ++++++++---------- docs/conf.py | 2 +- samples/snippets/noxfile.py | 1 - tests/compliance/json/conftest.py | 96 ++--- tests/compliance/json/test_json_compliance.py | 381 +++-------------- .../json/test_json_compliance_1_5.py | 31 -- 6 files changed, 290 insertions(+), 621 deletions(-) delete mode 100644 tests/compliance/json/test_json_compliance_1_5.py diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 2ae4dc5..814e8d6 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -14,261 +14,237 @@ from __future__ import annotations -from collections import UserDict, abc -import itertools -import numbers -import string -import sys -from typing import TYPE_CHECKING, Any +import typing import numpy as np import pandas as pd -from pandas.api.extensions import ExtensionArray, ExtensionDtype -from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike -from pandas.core.dtypes.common import is_bool_dtype, is_list_like, pandas_dtype -from pandas.core.indexers import unpack_tuple_and_ellipses - -if TYPE_CHECKING: - from collections.abc import Mapping - - from pandas._typing import type_t +from pandas._libs import lib +from pandas.core.arrays.arrow.array import ArrowExtensionArray +from pandas.core.arrays.numeric import NumericDtype +from pandas.core.dtypes.common import is_integer, is_scalar, pandas_dtype +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses +import pyarrow as pa +import pyarrow.compute as pc @pd.api.extensions.register_extension_dtype class JSONDtype(pd.api.extensions.ExtensionDtype): """Extension dtype for JSON data.""" - # type = str - - type = abc.Mapping name = "dbjson" - # na_value = pd.NA # TODO: StringDtype is libmissing.NA - na_value: Mapping[str, Any] = UserDict() - # _is_numeric = False - # _is_boolean = False + @property + def na_value(self) -> pd.NA: + return pd.NA + + @property + def type(self) -> type[str]: + return str + + @property + def _is_numeric(self) -> bool: + return False + + @property + def _is_boolean(self) -> bool: + return False @classmethod def construct_array_type(cls): """Return the array type associated with this dtype.""" return JSONArray - # @staticmethod - # def __from_arrow__( - # array: Union[pyarrow.Array, pyarrow.ChunkedArray] - # ) -> "JSONArray": - # """Convert to JSONArray from an Arrow array. 
- - # See: - # https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow - # """ - # if isinstance(array, pyarrow.Array): - # chunks = [array] - # else: - # chunks = array.chunks - - # results = [] - # for arr in chunks: - # # convert chunk by chunk to numpy and concatenate then, to avoid - # # overflow for large string data when concatenating the pyarrow arrays - # arr = arr.to_numpy(zero_copy_only=False) - # arr = ensure_string_array(arr, na_value=pandas.NA) - # results.append(arr) - - # if len(chunks) == 0: - # arr = numpy.array([], dtype=str) - # else: - # arr = numpy.concatenate(results) - - # return JSONArray(arr) - - # # TODO: codes from StringDtype - # # # Bypass validation inside StringArray constructor, see GH#47781 - # # new_string_array = StringArray.__new__(StringArray) - # # NDArrayBacked.__init__( - # # new_string_array, - # # arr, - # # StringDtype(storage="python"), - # # ) - # # return new_string_array - - -class JSONArray(pd.api.extensions.ExtensionArray): + @staticmethod + def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray: + """Convert to JSONArray from an Arrow array.""" + return JSONArray(array) + + +class JSONArray(ArrowExtensionArray): """Extension array containing JSON data.""" - dtype = JSONDtype() - __array_priority__ = 1000 + _dtype = JSONDtype() def __init__(self, values, dtype=None, copy=False) -> None: - for val in values: - if not isinstance(val, self.dtype.type): - raise TypeError( - f"All values must be of type {str(self.dtype.type)}: actual {type(val)}" - ) - self.data = values + if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( + values.type + ): + values = pc.cast(values, pa.large_string()) + + super().__init__(values) + self._dtype = JSONDtype() + + if not pa.types.is_large_string(self._pa_array.type) and not ( + pa.types.is_dictionary(self._pa_array.type) + and pa.types.is_large_string(self._pa_array.type.value_type) + ): + raise ValueError( + "ArrowStringArray requires a PyArrow (chunked) array of " + "large_string type" + ) + + @classmethod + def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: + pa_scalar = super()._box_pa_scalar(value, pa_type) + if pa.types.is_string(pa_scalar.type) and pa_type is None: + pa_scalar = pc.cast(pa_scalar, pa.large_string()) + return pa_scalar - # Some aliases for common attribute names to ensure pandas supports - # these - self._items = self._data = self.data - # those aliases are currently not working due to assumptions - # in internal code (GH-20735) - # self._values = self.values = self.data + @classmethod + def _box_pa_array( + cls, value, pa_type: pa.DataType | None = None, copy: bool = False + ) -> pa.Array | pa.ChunkedArray: + pa_array = super()._box_pa_array(value, pa_type) + if pa.types.is_string(pa_array.type) and pa_type is None: + pa_array = pc.cast(pa_array, pa.large_string()) + return pa_array @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): - return cls(scalars) + from pandas.core.arrays.masked import BaseMaskedArray + + if isinstance(scalars, BaseMaskedArray): + # avoid costly conversion to object dtype in ensure_string_array and + # numerical issues with Float32Dtype + na_values = scalars._mask + result = scalars._data + result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + return cls(pa.array(result, mask=na_values, type=pa.large_string())) + elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): + return 
cls(pc.cast(scalars, pa.large_string())) + + # convert non-na-likes to str + result = lib.ensure_string_array(scalars, copy=copy) + return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) + + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: ExtensionDtype, copy: bool = False + ) -> JSONArray: + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + @property + def dtype(self) -> JSONDtype: + """An instance of JSONDtype""" + return self._dtype + + def insert(self, loc: int, item) -> JSONArray: + if not isinstance(item, str) and not pd.isna(item): + raise TypeError("Scalar must be NA or str") + return super().insert(loc, item) + + def astype(self, dtype, copy: bool = True): + dtype = pandas_dtype(dtype) + + if dtype == self.dtype: + if copy: + return self.copy() + return self + elif isinstance(dtype, NumericDtype): + data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) + return dtype.__from_arrow__(data) + elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating): + return self.to_numpy(dtype=dtype, na_value=np.nan) + + return super().astype(dtype, copy=copy) @classmethod def _from_factorized(cls, values, original): - return cls([UserDict(x) for x in values if x != ()]) + return cls._from_sequence(values, dtype=original.dtype) def __getitem__(self, item): - if isinstance(item, tuple): + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, np.ndarray): + if not len(item): + return type(self)(pa.chunked_array([], type=pa.string())) + elif item.dtype.kind in "iu": + return self.take(item) + elif item.dtype.kind == "b": + return type(self)(self._pa_array.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + elif isinstance(item, tuple): item = unpack_tuple_and_ellipses(item) - if isinstance(item, numbers.Integral): - return self.data[item] - elif isinstance(item, slice) and item == slice(None): - # Make sure we get a view - return type(self)(self.data) - elif isinstance(item, slice): - # slice - return type(self)(self.data[item]) - elif not is_list_like(item): + if is_scalar(item) and not is_integer(item): # e.g. "foo" or 2.5 # exception message copied from numpy raise IndexError( r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " r"(`None`) and integer or boolean arrays are valid indices" ) + # We are not an array indexer, so maybe e.g. a slice or integer + # indexer. We dispatch to pyarrow. 
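        # The slice normalization below works around a pyarrow quirk: with a
        # negative step, a stop beyond -len(self) would otherwise produce an
        # empty result instead of slicing through to the start of the array.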
+ if isinstance(item, slice): + # Arrow bug https://github.com/apache/arrow/issues/38768 + if item.start == item.stop: + pass + elif ( + item.stop is not None + and item.stop < -len(self) + and item.step is not None + and item.step < 0 + ): + item = slice(item.start, None, item.step) + + value = self._pa_array[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) else: - item = pd.api.indexers.check_array_indexer(self, item) - if is_bool_dtype(item.dtype): - return type(self)._from_sequence( - [x for x, m in zip(self, item) if m], dtype=self.dtype - ) - # integer - return type(self)([self.data[i] for i in item]) - - def __setitem__(self, key, value) -> None: - if isinstance(key, numbers.Integral): - self.data[key] = value - else: - if not isinstance(value, (type(self), abc.Sequence)): - # broadcast value - value = itertools.cycle([value]) - - if isinstance(key, np.ndarray) and key.dtype == "bool": - # masking - for i, (k, v) in enumerate(zip(key, value)): - if k: - assert isinstance(v, self.dtype.type) - self.data[i] = v + scalar = value.as_py() + if scalar is None: + return self._dtype.na_value else: - for k, v in zip(key, value): - assert isinstance(v, self.dtype.type) - self.data[k] = v - - def __len__(self) -> int: - return len(self.data) - - def __eq__(self, other): - return NotImplemented + return scalar - def __ne__(self, other): - return NotImplemented - - def __array__(self, dtype=None, copy=None): - if dtype is None: - dtype = object - if dtype == object: - # on py38 builds it looks like numpy is inferring to a non-1D array - return construct_1d_object_array_from_listlike(list(self)) - return np.asarray(self.data, dtype=dtype) - - @property - def nbytes(self) -> int: - return sys.getsizeof(self.data) - - def isna(self): - return np.array([x == self.dtype.na_value for x in self.data], dtype=bool) - - def take(self, indexer, allow_fill=False, fill_value=None): - # re-implement here, since NumPy has trouble setting - # sized objects like UserDicts into scalar slots of - # an ndarary. - indexer = np.asarray(indexer) - msg = ( - "Index is out of bounds or cannot do a " - "non-empty take from an empty array." - ) - - if allow_fill: - # Do not allow any custom na_value - if fill_value is None: - fill_value = self.dtype.na_value - # bounds check - if (indexer < -1).any(): - raise ValueError - try: - output = [ - self.data[loc] if loc != -1 else fill_value for loc in indexer - ] - except IndexError as err: - raise IndexError(msg) from err - else: - try: - output = [self.data[loc] for loc in indexer] - except IndexError as err: - raise IndexError(msg) from err - - return type(self)._from_sequence(output, dtype=self.dtype) - - def copy(self): - return type(self)(self.data[:]) - - def astype(self, dtype, copy=True): - # NumPy has issues when all the dicts are the same length. - # np.array([UserDict(...), UserDict(...)]) fails, - # but np.array([{...}, {...}]) works, so cast. 
- from pandas.core.arrays.string_ import StringDtype - - dtype = pandas_dtype(dtype) - # needed to add this check for the Series constructor - if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: - if copy: - return self.copy() - return self - elif isinstance(dtype, StringDtype): - value = self.astype(str) # numpy doesn't like nested dicts - arr_cls = dtype.construct_array_type() - return arr_cls._from_sequence(value, dtype=dtype, copy=False) - elif not copy: - return np.asarray([dict(x) for x in self], dtype=dtype) - else: - return np.array([dict(x) for x in self], dtype=dtype, copy=copy) - - def unique(self): - # Parent method doesn't work since np.array will try to infer - # a 2-dim object. - return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}]) + @classmethod + def _result_converter(cls, values, na=None): + return pd.BooleanDtype().__from_arrow__(values) @classmethod - def _concat_same_type(cls, to_concat): - data = list(itertools.chain.from_iterable(x.data for x in to_concat)) - return cls(data) - - def _values_for_factorize(self): - frozen = self._values_for_argsort() - if len(frozen) == 0: - # factorize_array expects 1-d array, this is a len-0 2-d array. - frozen = frozen.ravel() - return frozen, () - - def _values_for_argsort(self): - # Bypass NumPy's shape inference to get a (N,) array of tuples. - frozen = [tuple(x.items()) for x in self] - return construct_1d_object_array_from_listlike(frozen) + def _concat_same_type(cls, to_concat) -> JSONArray: + """ + Concatenate multiple JSONArray. + + Parameters + ---------- + to_concat : sequence of JSONArray + + Returns + ------- + JSONArray + """ + chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] + arr = pa.chunked_array(chunks, type=pa.large_string()) + return cls(arr) def _pad_or_backfill(self, *, method, limit=None, copy=True): # GH#56616 - test EA method without limit_area argument diff --git a/docs/conf.py b/docs/conf.py index 00e0013..672daff 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,9 +24,9 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys import os import shlex +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py index 3b71359..c36d5f2 100644 --- a/samples/snippets/noxfile.py +++ b/samples/snippets/noxfile.py @@ -22,7 +22,6 @@ import nox - # WARNING - WARNING - WARNING - WARNING - WARNING # WARNING - WARNING - WARNING - WARNING - WARNING # DO NOT EDIT THIS FILE EVER! diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py index 775a302..f323f65 100644 --- a/tests/compliance/json/conftest.py +++ b/tests/compliance/json/conftest.py @@ -12,42 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. -import collections -from collections import UserDict, abc -import operator -import string -import sys -from typing import TYPE_CHECKING, Any + +import json import numpy as np import pandas as pd import pandas._testing as tm -from pandas.tests.extension import base import pytest from db_dtypes import JSONArray, JSONDtype -if TYPE_CHECKING: - from collections.abc import Mapping - def make_data(): - # TODO: Use a regular dict. 
See _NDFrameIndexer._setitem_with_indexer - rng = np.random.default_rng(2) - return [ - UserDict( - [ - (rng.choice(list(string.ascii_letters)), rng.integers(0, 100)) - for _ in range(rng.integers(0, 10)) - ] - ) - for _ in range(100) + # Sample data with varied lengths. + samples = [ + {"id": 1, "bool_value": True}, # Boolean + {"id": 2, "float_num": 3.14159}, # Floating + {"id": 3, "date": "2024-07-16"}, # Dates (as strings) + {"id": 4, "null_field": None}, # Null + {"list_data": [10, 20, 30]}, # Lists + {"person": {"name": "Alice", "age": 35}}, # Nested objects + {"address": {"street": "123 Main St", "city": "Anytown"}}, + {"order": {"items": ["book", "pen"], "total": 15.99}}, ] - - -# We intentionally don't run base.BaseSetitemTests because pandas' -# internals has trouble setting sequences of values into scalar positions. -unhashable = pytest.mark.xfail(reason="Unhashable") + return np.random.default_rng(2).choice(samples, size=100) @pytest.fixture @@ -67,9 +55,10 @@ def data(): # the first two elements, so that's what we'll check. while len(data[0]) == len(data[1]): + print(data) data = make_data() - return JSONArray(data) + return JSONArray._from_sequence(data) @pytest.fixture @@ -79,47 +68,56 @@ def data_for_twos(dtype): Call pytest.skip in your fixture if the dtype does not support divmod. """ - if not (dtype._is_numeric or dtype.kind == "m"): - # Object-dtypes may want to allow this, but for the most part - # only numeric and timedelta-like dtypes will need to implement this. - pytest.skip(f"{dtype} is not a numeric dtype") - - raise NotImplementedError + pytest.skip(f"{dtype} is not a numeric dtype") @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - return JSONArray([{}, {"a": 10}]) + return JSONArray._from_sequence([None, {"a": 10}]) @pytest.fixture def data_for_sorting(): - return JSONArray([{"b": 1}, {"c": 4}, {"a": 2, "c": 3}]) + return JSONArray._from_sequence( + [json.dumps({"b": 1}), json.dumps({"c": 4}), json.dumps({"a": 2, "c": 3})] + ) @pytest.fixture def data_missing_for_sorting(): - return JSONArray([{"b": 1}, {}, {"a": 4}]) + return JSONArray._from_sequence([json.dumps({"b": 1}), None, json.dumps({"a": 4})]) @pytest.fixture def na_cmp(): - return operator.eq + """ + Binary operator for comparing NA values. + + Should return a function of two arguments that returns + True if both arguments are (scalar) NA for your type. + + By default, uses ``operator.is_`` + """ + + def cmp(a, b): + return lambda left, right: pd.isna(left) and pd.isna(right) + + return cmp @pytest.fixture def data_for_grouping(): - return JSONArray( + return JSONArray._from_sequence( [ - {"b": 1}, - {"b": 1}, - {}, - {}, - {"a": 0, "c": 2}, - {"a": 0, "c": 2}, - {"b": 1}, - {"c": 2}, + json.dumps({"b": 1}), + json.dumps({"b": 1}), + None, + None, + json.dumps({"a": 0, "c": 2}), + json.dumps({"a": 0, "c": 2}), + json.dumps({"b": 1}), + json.dumps({"c": 2}), ] ) @@ -201,14 +199,6 @@ def all_arithmetic_operators(request): return request.param -@pytest.fixture -def na_value(): - """ - The scalar missing value for this type. Default 'None'. - """ - return UserDict() - - @pytest.fixture(params=["data", "data_missing"]) def all_data(request, data, data_missing): """Parametrized fixture returning 'data' or 'data_missing' integer arrays. 
diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index e0be6d7..d46b935 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -20,18 +20,15 @@ https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/test_period.py """ -import collections -import operator -import string -import sys +import typing -import numpy as np import pandas as pd import pandas._testing as tm from pandas.tests.extension import base +import pyarrow as pa import pytest -from db_dtypes import JSONArray, JSONDtype +from db_dtypes import JSONArray # We intentionally don't run base.BaseSetitemTests because pandas' # internals has trouble setting sequences of values into scalar positions. @@ -39,86 +36,6 @@ class TestJSONArray(base.ExtensionTests): - @pytest.mark.xfail( - reason="comparison method not implemented for JSONArray (GH-37867)" - ) - def test_contains(self, data): - # GH-37867 - super().test_contains(data) - - @pytest.mark.xfail(reason="not implemented constructor from dtype") - def test_from_dtype(self, data): - # construct from our dtype & string dtype - super().test_from_dtype(data) - - @pytest.mark.xfail(reason="RecursionError, GH-33900") - def test_series_constructor_no_data_with_index(self, dtype, na_value): - # RecursionError: maximum recursion depth exceeded in comparison - rec_limit = sys.getrecursionlimit() - try: - # Limit to avoid stack overflow on Windows CI - sys.setrecursionlimit(100) - super().test_series_constructor_no_data_with_index(dtype, na_value) - finally: - sys.setrecursionlimit(rec_limit) - - @pytest.mark.xfail(reason="RecursionError, GH-33900") - def test_series_constructor_scalar_na_with_index(self, dtype, na_value): - # RecursionError: maximum recursion depth exceeded in comparison - rec_limit = sys.getrecursionlimit() - try: - # Limit to avoid stack overflow on Windows CI - sys.setrecursionlimit(100) - super().test_series_constructor_scalar_na_with_index(dtype, na_value) - finally: - sys.setrecursionlimit(rec_limit) - - @pytest.mark.xfail(reason="collection as scalar, GH-33901") - def test_series_constructor_scalar_with_index(self, data, dtype): - # TypeError: All values must be of type - rec_limit = sys.getrecursionlimit() - try: - # Limit to avoid stack overflow on Windows CI - sys.setrecursionlimit(100) - super().test_series_constructor_scalar_with_index(data, dtype) - finally: - sys.setrecursionlimit(rec_limit) - - @pytest.mark.xfail(reason="Different definitions of NA") - def test_stack(self): - """ - The test does .astype(object).stack(). If we happen to have - any missing values in `data`, then we'll end up with different - rows since we consider `{}` NA, but `.astype(object)` doesn't. - """ - super().test_stack() - - @pytest.mark.xfail(reason="dict for NA") - def test_unstack(self, data, index): - # The base test has NaN for the expected NA value. 
- # this matches otherwise - return super().test_unstack(data, index) - - @pytest.mark.xfail(reason="Setting a dict as a scalar") - def test_fillna_series(self): - """We treat dictionaries as a mapping in fillna, not a scalar.""" - super().test_fillna_series() - - @pytest.mark.xfail(reason="Setting a dict as a scalar") - def test_fillna_frame(self): - """We treat dictionaries as a mapping in fillna, not a scalar.""" - super().test_fillna_frame() - - @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path") - def test_fillna_limit_frame(self, data_missing): - # GH#58001 - super().test_fillna_limit_frame(data_missing) - - @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path") - def test_fillna_limit_series(self, data_missing): - # GH#58001 - super().test_fillna_limit_frame(data_missing) - @pytest.mark.parametrize( "limit_area, input_ilocs, expected_ilocs", [ @@ -142,19 +59,10 @@ def test_ffill_limit_area( data_missing, limit_area, input_ilocs, expected_ilocs ) - @unhashable - def test_value_counts(self, all_data, dropna): - super().test_value_counts(all_data, dropna) - @unhashable def test_value_counts_with_normalize(self, data): super().test_value_counts_with_normalize(data) - @unhashable - def test_sort_values_frame(self): - # TODO (EA.factorize): see if _values_for_factorize allows this. - super().test_sort_values_frame() - @pytest.mark.xfail(reason="combine for JSONArray not supported") def test_combine_le(self, data_repeated): super().test_combine_le(data_repeated) @@ -168,75 +76,17 @@ def test_combine_le(self, data_repeated): def test_combine_first(self, data): super().test_combine_first(data) - @pytest.mark.xfail(reason="broadcasting error") - def test_where_series(self, data, na_value): - # Fails with - # *** ValueError: operands could not be broadcast together - # with shapes (4,) (4,) (0,) - super().test_where_series(data, na_value) - - @pytest.mark.xfail(reason="Can't compare dicts.") - def test_searchsorted(self, data_for_sorting): - super().test_searchsorted(data_for_sorting) - - @pytest.mark.xfail(reason="Can't compare dicts.") - def test_equals(self, data, na_value, as_series): - super().test_equals(data, na_value, as_series) - - @pytest.mark.skip("fill-value is interpreted as a dict of values") - def test_fillna_copy_frame(self, data_missing): - super().test_fillna_copy_frame(data_missing) + @pytest.mark.skip(reason="2D support not implemented for JSONArray") + def test_view(self, data): + super().test_view(data) - @pytest.mark.xfail(reason="Fails with CoW") - def test_equals_same_data_different_object(self, data): - super().test_equals_same_data_different_object(data) + @pytest.mark.skip(reason="2D support not implemented for JSONArray") + def test_setitem_preserves_views(self, data): + super().test_setitem_preserves_views(data) - @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)") - def test_astype_str(self): - """This currently fails in NumPy on np.array(self, dtype=str) with - - *** ValueError: setting an array element with a sequence - """ - super().test_astype_str() - - @unhashable - def test_groupby_extension_transform(self): - """ - This currently fails in Series.name.setter, since the - name must be hashable, but the value is a dictionary. - I think this is what we want, i.e. `.name` should be the original - values, and not the values for factorization. 
- """ - super().test_groupby_extension_transform() - - @unhashable - def test_groupby_extension_apply(self): - """ - This fails in Index._do_unique_check with - - > hash(val) - E TypeError: unhashable type: 'UserDict' with - - I suspect that once we support Index[ExtensionArray], - we'll be able to dispatch unique. - """ - super().test_groupby_extension_apply() - - @unhashable - def test_groupby_extension_agg(self): - """ - This fails when we get to tm.assert_series_equal when left.index - contains dictionaries, which are not hashable. - """ - super().test_groupby_extension_agg() - - @unhashable - def test_groupby_extension_no_sort(self): - """ - This fails when we get to tm.assert_series_equal when left.index - contains dictionaries, which are not hashable. - """ - super().test_groupby_extension_no_sort() + @pytest.mark.skip(reason="2D support not implemented for JSONArray") + def test_transpose(self, data): + super().test_transpose(data) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): if len(data[0]) != 1: @@ -244,159 +94,46 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): request.applymarker(mark) super().test_arith_frame_with_scalar(data, all_arithmetic_operators) - def test_compare_array(self, data, comparison_op, request): - if comparison_op.__name__ in ["eq", "ne"]: - mark = pytest.mark.xfail(reason="Comparison methods not implemented") - request.applymarker(mark) - super().test_compare_array(data, comparison_op) - - @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") - def test_setitem_loc_scalar_mixed(self, data): - super().test_setitem_loc_scalar_mixed(data) - - @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") - def test_setitem_loc_scalar_multiple_homogoneous(self, data): - super().test_setitem_loc_scalar_multiple_homogoneous(data) - - @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") - def test_setitem_iloc_scalar_mixed(self, data): - super().test_setitem_iloc_scalar_mixed(data) - - @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") - def test_setitem_iloc_scalar_multiple_homogoneous(self, data): - super().test_setitem_iloc_scalar_multiple_homogoneous(data) - - @pytest.mark.parametrize( - "mask", - [ - np.array([True, True, True, False, False]), - pd.array([True, True, True, False, False], dtype="boolean"), - pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"), - ], - ids=["numpy-array", "boolean-array", "boolean-array-na"], - ) - def test_setitem_mask(self, data, mask, box_in_series, request): - if box_in_series: - mark = pytest.mark.xfail( - reason="cannot set using a list-like indexer with a different length" - ) - request.applymarker(mark) - elif not isinstance(mask, np.ndarray): - mark = pytest.mark.xfail(reason="Issues unwanted DeprecationWarning") - request.applymarker(mark) - super().test_setitem_mask(data, mask, box_in_series) - - def test_setitem_mask_raises(self, data, box_in_series, request): - if not box_in_series: - mark = pytest.mark.xfail(reason="Fails to raise") - request.applymarker(mark) - - super().test_setitem_mask_raises(data, box_in_series) - - @pytest.mark.xfail( - reason="cannot set using a list-like indexer with a different length" - ) - def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): - super().test_setitem_mask_boolean_array_with_na(data, box_in_series) - - @pytest.mark.parametrize( - "idx", - [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 
1, 2])], - ids=["list", "integer-array", "numpy-array"], - ) - def test_setitem_integer_array(self, data, idx, box_in_series, request): - if box_in_series: - mark = pytest.mark.xfail( - reason="cannot set using a list-like indexer with a different length" - ) - request.applymarker(mark) - super().test_setitem_integer_array(data, idx, box_in_series) - - @pytest.mark.xfail(reason="list indices must be integers or slices, not NAType") - @pytest.mark.parametrize( - "idx, box_in_series", - [ - ([0, 1, 2, pd.NA], False), - pytest.param( - [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948") - ), - (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), - (pd.array([0, 1, 2, pd.NA], dtype="Int64"), True), - ], - ids=["list-False", "list-True", "integer-array-False", "integer-array-True"], - ) - def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): - super().test_setitem_integer_with_missing_raises(data, idx, box_in_series) - - @pytest.mark.xfail(reason="Fails to raise") - def test_setitem_scalar_key_sequence_raise(self, data): - super().test_setitem_scalar_key_sequence_raise(data) - - def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request): - if "full_slice" in request.node.name: - mark = pytest.mark.xfail(reason="slice is not iterable") + def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): + if len(data[0]) != 1: + mark = pytest.mark.xfail(reason="raises in coercing to Series") request.applymarker(mark) - super().test_setitem_with_expansion_dataframe_column(data, full_indexer) - - @pytest.mark.xfail(reason="slice is not iterable") - def test_setitem_frame_2d_values(self, data): - super().test_setitem_frame_2d_values(data) - - @pytest.mark.xfail( - reason="cannot set using a list-like indexer with a different length" - ) - @pytest.mark.parametrize("setter", ["loc", None]) - def test_setitem_mask_broadcast(self, data, setter): - super().test_setitem_mask_broadcast(data, setter) - - @pytest.mark.xfail( - reason="cannot set using a slice indexer with a different length" - ) - def test_setitem_slice(self, data, box_in_series): - super().test_setitem_slice(data, box_in_series) - - @pytest.mark.xfail(reason="slice object is not iterable") - def test_setitem_loc_iloc_slice(self, data): - super().test_setitem_loc_iloc_slice(data) - - @pytest.mark.xfail(reason="slice object is not iterable") - def test_setitem_slice_mismatch_length_raises(self, data): - super().test_setitem_slice_mismatch_length_raises(data) - - @pytest.mark.xfail(reason="slice object is not iterable") - def test_setitem_slice_array(self, data): - super().test_setitem_slice_array(data) - - @pytest.mark.xfail(reason="Fail to raise") - def test_setitem_invalid(self, data, invalid_scalar): - super().test_setitem_invalid(data, invalid_scalar) - - @pytest.mark.xfail(reason="only integer scalar arrays can be converted") - def test_setitem_2d_values(self, data): - super().test_setitem_2d_values(data) - - @pytest.mark.xfail(reason="data type 'json' not understood") - @pytest.mark.parametrize("engine", ["c", "python"]) - def test_EA_types(self, engine, data, request): - super().test_EA_types(engine, data, request) - - -def custom_assert_series_equal(left, right, *args, **kwargs): - # NumPy doesn't handle an array of equal-length UserDicts. - # The default assert_series_equal eventually does a - # Series.values, which raises. We work around it by - # converting the UserDicts to dicts. 
- if left.dtype.name == "json": - assert left.dtype == right.dtype - left = pd.Series( - JSONArray(left.values.astype(object)), index=left.index, name=left.name - ) - right = pd.Series( - JSONArray(right.values.astype(object)), - index=right.index, - name=right.name, - ) - tm.assert_series_equal(left, right, *args, **kwargs) + super().test_arith_series_with_scalar(data, all_arithmetic_operators) + + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + return op_name in ["min", "max"] + + def _get_expected_exception( + self, op_name: str, obj, other + ) -> type[Exception] | None: + if op_name in ["__divmod__", "__rdivmod__"]: + if isinstance(obj, pd.Series) or isinstance(other, pd.Series): + return NotImplementedError + return TypeError + elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: + return NotImplementedError + elif op_name in ["__mul__", "__rmul__"]: + # Can only multiply strings by integers + return TypeError + elif op_name in [ + "__truediv__", + "__rtruediv__", + "__floordiv__", + "__rfloordiv__", + "__sub__", + "__rsub__", + ]: + return pa.ArrowNotImplementedError + + return None + + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): + dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj)) + if op_name in ["__add__", "__radd__"]: + cast_to = dtype + else: + cast_to = "boolean[pyarrow]" # type: ignore[assignment] + return pointwise_result.astype(cast_to) def custom_assert_frame_equal(left, right, *args, **kwargs): @@ -414,7 +151,7 @@ def custom_assert_frame_equal(left, right, *args, **kwargs): jsons = (left.dtypes == "json").index for col in jsons: - custom_assert_series_equal(left[col], right[col], *args, **kwargs) + tm.assert_series_equal(left[col], right[col], *args, **kwargs) left = left.drop(columns=jsons) right = right.drop(columns=jsons) @@ -422,22 +159,20 @@ def custom_assert_frame_equal(left, right, *args, **kwargs): def test_custom_asserts(): - # This would always trigger the KeyError from trying to put - # an array of equal-length UserDicts inside an ndarray. - data = JSONArray( + data = JSONArray._from_sequence( [ - dict({"a": 1}), - dict({"b": 2}), - dict({"c": 3}), + {"a": 1}, + {"b": 2}, + {"c": 3}, ] ) a = pd.Series(data) - custom_assert_series_equal(a, a) + tm.assert_series_equal(a, a) custom_assert_frame_equal(a.to_frame(), a.to_frame()) b = pd.Series(data.take([0, 0, 1])) with pytest.raises(AssertionError): - custom_assert_series_equal(a, b) + tm.assert_series_equal(a, b) with pytest.raises(AssertionError): custom_assert_frame_equal(a.to_frame(), b.to_frame()) diff --git a/tests/compliance/json/test_json_compliance_1_5.py b/tests/compliance/json/test_json_compliance_1_5.py deleted file mode 100644 index ee2d878..0000000 --- a/tests/compliance/json/test_json_compliance_1_5.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Tests for extension interface compliance, inherited from pandas. 
- -See: -https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/decimal/test_decimal.py -and -https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/test_period.py -""" - -from pandas.tests.extension import base -import pytest - -# NDArrayBacked2DTests suite added in https://github.com/pandas-dev/pandas/pull/44974 -pytest.importorskip("pandas", minversion="1.5.0dev") - - -# class Test2DCompat(base.NDArrayBacked2DTests): -# pass From 8bd13ccf8e2e70e7557abca7d3cf57d513c63d61 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 22 Jul 2024 21:39:25 +0000 Subject: [PATCH 04/28] box and unbox between string(storage) and dict(getitem) --- db_dtypes/json.py | 148 +++++-- tests/compliance/json/test_json_compliance.py | 360 +++++++++++++++--- 2 files changed, 428 insertions(+), 80 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 814e8d6..390cd36 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -14,14 +14,14 @@ from __future__ import annotations +import json import typing import numpy as np import pandas as pd -from pandas._libs import lib from pandas.core.arrays.arrow.array import ArrowExtensionArray -from pandas.core.arrays.numeric import NumericDtype -from pandas.core.dtypes.common import is_integer, is_scalar, pandas_dtype +from pandas.core.arrays.masked import BaseMaskedArray +from pandas.core.dtypes.common import is_dict_like, is_integer, is_list_like, is_scalar from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses import pyarrow as pa @@ -84,8 +84,43 @@ def __init__(self, values, dtype=None, copy=False) -> None: "large_string type" ) + @classmethod + def _box_pa( + cls, value, pa_type: pa.DataType | None = None + ) -> pa.Array | pa.ChunkedArray | pa.Scalar: + """ + Box value into a pyarrow Array, ChunkedArray or Scalar. + + Parameters + ---------- + value : any + pa_type : pa.DataType | None + + Returns + ------- + pa.Array or pa.ChunkedArray or pa.Scalar + """ + if isinstance(value, pa.Scalar) or not ( + is_list_like(value) and not is_dict_like(value) + ): + return cls._box_pa_scalar(value, pa_type) + return cls._box_pa_array(value, pa_type) + @classmethod def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: + """ + Box value into a pyarrow Scalar. + + Parameters + ---------- + value : any + pa_type : pa.DataType | None + + Returns + ------- + pa.Scalar + """ + value = JSONArray._seralizate_json(value) pa_scalar = super()._box_pa_scalar(value, pa_type) if pa.types.is_string(pa_scalar.type) and pa_type is None: pa_scalar = pc.cast(pa_scalar, pa.large_string()) @@ -95,6 +130,24 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: def _box_pa_array( cls, value, pa_type: pa.DataType | None = None, copy: bool = False ) -> pa.Array | pa.ChunkedArray: + """ + Box value into a pyarrow Array or ChunkedArray. 
+ + Parameters + ---------- + value : Sequence + pa_type : pa.DataType | None + + Returns + ------- + pa.Array or pa.ChunkedArray + """ + if ( + not isinstance(value, cls) + and not isinstance(value, (pa.Array, pa.ChunkedArray)) + and not isinstance(value, BaseMaskedArray) + ): + value = [JSONArray._seralizate_json(x) for x in value] pa_array = super()._box_pa_array(value, pa_type) if pa.types.is_string(pa_array.type) and pa_type is None: pa_array = pc.cast(pa_array, pa.large_string()) @@ -102,20 +155,21 @@ def _box_pa_array( @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): - from pandas.core.arrays.masked import BaseMaskedArray - - if isinstance(scalars, BaseMaskedArray): - # avoid costly conversion to object dtype in ensure_string_array and - # numerical issues with Float32Dtype - na_values = scalars._mask - result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - return cls(pa.array(result, mask=na_values, type=pa.large_string())) - elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): - return cls(pc.cast(scalars, pa.large_string())) - - # convert non-na-likes to str - result = lib.ensure_string_array(scalars, copy=copy) + # TODO: check _from_arrow APIs etc. + # from pandas.core.arrays.masked import BaseMaskedArray + + # if isinstance(scalars, BaseMaskedArray): + # # avoid costly conversion to object dtype in ensure_string_array and + # # numerical issues with Float32Dtype + # na_values = scalars._mask + # result = scalars._data + # # result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + # return cls(pa.array(result, mask=na_values, type=pa.large_string())) + # elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): + # return cls(pc.cast(scalars, pa.large_string())) + result = [] + for scalar in scalars: + result.append(JSONArray._seralizate_json(scalar)) return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) @classmethod @@ -124,30 +178,45 @@ def _from_sequence_of_strings( ) -> JSONArray: return cls._from_sequence(strings, dtype=dtype, copy=copy) + @staticmethod + def _seralizate_json(value): + if isinstance(value, str) or pd.isna(value): + return value + else: + # `sort_keys=True` sorts dictionary keys before serialization, making + # JSON comparisons deterministic. + return json.dumps(value, sort_keys=True) + + @staticmethod + def _deserialize_json(value): + if not pd.isna(value): + return json.loads(value) + else: + return value + @property def dtype(self) -> JSONDtype: """An instance of JSONDtype""" return self._dtype - def insert(self, loc: int, item) -> JSONArray: - if not isinstance(item, str) and not pd.isna(item): - raise TypeError("Scalar must be NA or str") - return super().insert(loc, item) + def __contains__(self, key) -> bool: + return super().__contains__(JSONArray._seralizate_json(key)) + + # def __contains__(self, key) -> bool: + # # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604 + # if pd.isna(key) and key is not self.dtype.na_value: + # if self.dtype.kind == "f" and lib.is_float(key): + # return pc.any(pc.is_nan(self._pa_array)).as_py() - def astype(self, dtype, copy: bool = True): - dtype = pandas_dtype(dtype) + # # e.g. date or timestamp types we do not allow None here to match pd.NA + # return False + # # TODO: maybe complex? object? 
- if dtype == self.dtype: - if copy: - return self.copy() - return self - elif isinstance(dtype, NumericDtype): - data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) - return dtype.__from_arrow__(data) - elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating): - return self.to_numpy(dtype=dtype, na_value=np.nan) + # return bool(super().__contains__(key)) - return super().astype(dtype, copy=copy) + def insert(self, loc: int, item) -> JSONArray: + val = JSONArray._seralizate_json(item) + return super().insert(loc, val) @classmethod def _from_factorized(cls, values, original): @@ -219,12 +288,23 @@ def __getitem__(self, item): if isinstance(value, pa.ChunkedArray): return type(self)(value) else: - scalar = value.as_py() + scalar = JSONArray._deserialize_json(value.as_py()) if scalar is None: return self._dtype.na_value else: return scalar + def __iter__(self): + """ + Iterate over elements of the array. + """ + for value in self._pa_array: + val = JSONArray._deserialize_json(value.as_py()) + if val is None: + yield self._dtype.na_value + else: + yield val + @classmethod def _result_converter(cls, values, na=None): return pd.BooleanDtype().__from_arrow__(values) diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index d46b935..b4d55a6 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -11,29 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -Tests for extension interface compliance, inherited from pandas. - -See: -https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/decimal/test_decimal.py -and -https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/test_period.py -""" +import json import typing +import numpy as np import pandas as pd import pandas._testing as tm +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.tests.extension import base -import pyarrow as pa import pytest from db_dtypes import JSONArray -# We intentionally don't run base.BaseSetitemTests because pandas' -# internals has trouble setting sequences of values into scalar positions. -unhashable = pytest.mark.xfail(reason="Unhashable") - class TestJSONArray(base.ExtensionTests): @pytest.mark.parametrize( @@ -59,10 +49,35 @@ def test_ffill_limit_area( data_missing, limit_area, input_ilocs, expected_ilocs ) - @unhashable + @pytest.mark.xfail(reason="Unhashable") def test_value_counts_with_normalize(self, data): super().test_value_counts_with_normalize(data) + @pytest.mark.xfail(reason="Unhashable") + def test_groupby_extension_transform(self): + """ + This currently fails in Series.name.setter, since the + name must be hashable, but the value is a dictionary. + I think this is what we want, i.e. `.name` should be the original + values, and not the values for factorization. + """ + super().test_groupby_extension_transform() + + @pytest.mark.xfail(reason="Unhashable") + def test_groupby_extension_apply(self): + """ + This fails in Index._do_unique_check with + > hash(val) + E TypeError: unhashable type: 'dict' with + I suspect that once we support Index[ExtensionArray], + we'll be able to dispatch unique. 
+ """ + super().test_groupby_extension_apply() + + @pytest.mark.xfail(reason="Unhashable") + def test_sort_values_frame(self): + super().test_sort_values_frame() + @pytest.mark.xfail(reason="combine for JSONArray not supported") def test_combine_le(self, data_repeated): super().test_combine_le(data_repeated) @@ -88,45 +103,45 @@ def test_setitem_preserves_views(self, data): def test_transpose(self, data): super().test_transpose(data) - def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): - if len(data[0]) != 1: - mark = pytest.mark.xfail(reason="raises in coercing to Series") - request.applymarker(mark) + @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): super().test_arith_frame_with_scalar(data, all_arithmetic_operators) - def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): - if len(data[0]) != 1: - mark = pytest.mark.xfail(reason="raises in coercing to Series") - request.applymarker(mark) + @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + super().test_arith_series_with_scalar(data, all_arithmetic_operators) + + @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): super().test_arith_series_with_scalar(data, all_arithmetic_operators) + @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") + def test_arith_series_with_array(self, data, all_arithmetic_operators): + super().test_arith_series_with_array(data, all_arithmetic_operators) + + @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") + def test_add_series_with_extension_array(self, data): + super().test_add_series_with_extension_array(data, data) + + @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") + def test_divmod(self, data): + super().test_divmod(data, data) + + def test_compare_array(self, data, comparison_op, request): + if comparison_op.__name__ not in ["eq", "ne"]: + mark = pytest.mark.xfail(reason="Comparison methods not implemented") + request.applymarker(mark) + super().test_compare_array(data, comparison_op) + + def test_compare_scalar(self, data, comparison_op, request): + if comparison_op.__name__ not in ["eq", "ne"]: + mark = pytest.mark.xfail(reason="Comparison methods not implemented") + request.applymarker(mark) + super().test_compare_scalar(data, comparison_op) + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return op_name in ["min", "max"] - def _get_expected_exception( - self, op_name: str, obj, other - ) -> type[Exception] | None: - if op_name in ["__divmod__", "__rdivmod__"]: - if isinstance(obj, pd.Series) or isinstance(other, pd.Series): - return NotImplementedError - return TypeError - elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: - return NotImplementedError - elif op_name in ["__mul__", "__rmul__"]: - # Can only multiply strings by integers - return TypeError - elif op_name in [ - "__truediv__", - "__rtruediv__", - "__floordiv__", - "__rfloordiv__", - "__sub__", - "__rsub__", - ]: - return pa.ArrowNotImplementedError - - return None - def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: @@ -135,6 +150,259 @@ def 
_cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         cast_to = "boolean[pyarrow]"  # type: ignore[assignment]
         return pointwise_result.astype(cast_to)
 
+    @pytest.mark.skip(reason="'<' not supported between instances of 'dict' and 'dict'")
+    def test_searchsorted(self, data_for_sorting, as_series):
+        super().test_searchsorted(data_for_sorting, as_series)
+
+    def test_astype_str(self, data):
+        # Use `json.dumps(x, sort_keys=True)` instead of the `str(x)` the super method uses.
+        result = pd.Series(data[:5]).astype(str)
+        expected = pd.Series(
+            [json.dumps(x, sort_keys=True) for x in data[:5]], dtype=str
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "nullable_string_dtype",
+        [
+            "string[python]",
+            "string[pyarrow]",
+        ],
+    )
+    def test_astype_string(self, data, nullable_string_dtype):
+        # Use `json.dumps(x, sort_keys=True)` instead of the `str(x)` the super method uses.
+        result = pd.Series(data[:5]).astype(nullable_string_dtype)
+        expected = pd.Series(
+            [json.dumps(x, sort_keys=True) for x in data[:5]],
+            dtype=nullable_string_dtype,
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_array_interface(self, data):
+        result = np.array(data)
+        # Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method.
+        assert result[0] == json.dumps(data[0])
+
+        result = np.array(data, dtype=object)
+        # Use `json.dumps(x)` instead of passing `x` directly to the super method.
+        expected = np.array([json.dumps(x) for x in data], dtype=object)
+        if expected.ndim > 1:
+            # nested data, explicitly construct as 1D
+            expected = construct_1d_object_array_from_listlike(list(data))
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.xfail(reason="Setting a dict as a scalar")
+    def test_fillna_series(self):
+        """We treat dictionaries as a mapping in fillna, not a scalar."""
+        super().test_fillna_series()
+
+    @pytest.mark.xfail(reason="Setting a dict as a scalar")
+    def test_fillna_frame(self):
+        """We treat dictionaries as a mapping in fillna, not a scalar."""
+        super().test_fillna_frame()
+
+    @pytest.mark.skip("fill-value is interpreted as a dict of values")
+    def test_fillna_copy_frame(self, data_missing):
+        super().test_fillna_copy_frame(data_missing)
+
+    def test_from_dtype(self, data):
+        # construct from our dtype & string dtype
+        dtype = data.dtype
+
+        expected = pd.Series(data)
+        result = pd.Series(list(data), dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+        result = pd.Series(list(data), dtype=str(dtype))
+        tm.assert_series_equal(result, expected)
+
+        # Use `{"col1": data}` instead of passing `data` directly to the super method.
+        # This prevents the DataFrame constructor from attempting to interpret the
+        # dictionary as column headers.
+
+        # gh-30280
+        expected = pd.DataFrame({"col1": data}).astype(dtype)
+        result = pd.DataFrame({"col1": list(data)}, dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+        result = pd.DataFrame({"col1": list(data)}, dtype=str(dtype))
+        tm.assert_frame_equal(result, expected)
+
+    def test_series_constructor_scalar_with_index(self, data, dtype):
+        # Use json.dumps(data[0]) instead of passing data[0] directly to the super method.
+        # This prevents the Series constructor from interpreting the dict's keys
+        # as index labels.
+        scalar = json.dumps(data[0])
+        result = pd.Series(scalar, index=[1, 2, 3], dtype=dtype)
+        expected = pd.Series([scalar] * 3, index=[1, 2, 3], dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+        result = pd.Series(scalar, index=["foo"], dtype=dtype)
+        expected = pd.Series([scalar], index=["foo"], dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+    # We patch `json.dumps` into the base.BaseSetitemTests methods below because
+    # pandas' internals has trouble setting sequences of values into scalar positions.
+
+    @pytest.mark.parametrize(
+        "idx",
+        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
+        ids=["list", "integer-array", "numpy-array"],
+    )
+    def test_setitem_integer_array(self, data, idx, box_in_series):
+        arr = data[:5].copy()
+        expected = data.take([0, 0, 0, 3, 4])
+
+        if box_in_series:
+            arr = pd.Series(arr)
+            expected = pd.Series(expected)
+
+        # Use json.dumps(arr[0]) instead of passing arr[0] directly to the super method.
+        arr[idx] = json.dumps(arr[0])
+        tm.assert_equal(arr, expected)
+
+    @pytest.mark.parametrize("setter", ["loc", None])
+    def test_setitem_mask_broadcast(self, data, setter):
+        ser = pd.Series(data)
+        mask = np.zeros(len(data), dtype=bool)
+        mask[:2] = True
+
+        if setter:  # loc
+            target = getattr(ser, setter)
+        else:  # __setitem__
+            target = ser
+
+        # Use json.dumps(data[10]) instead of passing data[10] directly to the super method.
+        target[mask] = json.dumps(data[10])
+        assert ser[0] == data[10]
+        assert ser[1] == data[10]
+
+    def test_setitem_loc_scalar_mixed(self, data):
+        df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
+        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
+        df.loc[0, "B"] = json.dumps(data[1])
+        assert df.loc[0, "B"] == data[1]
+
+    def test_setitem_loc_scalar_single(self, data):
+        df = pd.DataFrame({"B": data})
+        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
+        df.loc[10, "B"] = json.dumps(data[1])
+        assert df.loc[10, "B"] == data[1]
+
+    def test_setitem_loc_iloc_slice(self, data):
+        arr = data[:5].copy()
+        s = pd.Series(arr, index=["a", "b", "c", "d", "e"])
+        expected = pd.Series(data.take([0, 0, 0, 3, 4]), index=s.index)
+
+        result = s.copy()
+        # Use json.dumps(data[0]) instead of passing data[0] directly to the super method.
+        result.iloc[:3] = json.dumps(data[0])
+        tm.assert_equal(result, expected)
+
+        result = s.copy()
+        result.loc[:"c"] = json.dumps(data[0])
+        tm.assert_equal(result, expected)
+
+    def test_setitem_iloc_scalar_single(self, data):
+        df = pd.DataFrame({"B": data})
+        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
+        df.iloc[10, 0] = json.dumps(data[1])
+        assert df.loc[10, "B"] == data[1]
+
+    def test_setitem_iloc_scalar_mixed(self, data):
+        df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
+        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
+        df.iloc[0, 1] = json.dumps(data[1])
+        assert df.loc[0, "B"] == data[1]
+
+    @pytest.mark.xfail(reason="eq not implemented for ")
+    def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
+        super().test_setitem_mask_boolean_array_with_na(data, box_in_series)
+
+    @pytest.mark.parametrize("setter", ["loc", "iloc"])
+    def test_setitem_scalar(self, data, setter):
+        arr = pd.Series(data)
+        setter = getattr(arr, setter)
+        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
+ setter[0] = json.dumps(data[1]) + assert arr[0] == data[1] + + @pytest.mark.parametrize( + "mask", + [ + np.array([True, True, True, False, False]), + pd.array([True, True, True, False, False], dtype="boolean"), + pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"), + ], + ids=["numpy-array", "boolean-array", "boolean-array-na"], + ) + def test_setitem_mask(self, data, mask, box_in_series): + arr = data[:5].copy() + expected = arr.take([0, 0, 0, 3, 4]) + if box_in_series: + arr = pd.Series(arr) + expected = pd.Series(expected) + # Use json.dumps(data[0]) instead of passing data[0] directly to the super method. + arr[mask] = json.dumps(data[0]) + tm.assert_equal(expected, arr) + + def test_setitem_with_expansion_row(self, data, na_value): + df = pd.DataFrame({"data": data[:1]}) + + # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. + df.loc[1, "data"] = json.dumps(data[1]) + expected = pd.DataFrame({"data": data[:2]}) + tm.assert_frame_equal(df, expected) + + # https://github.com/pandas-dev/pandas/issues/47284 + df.loc[2, "data"] = na_value + expected = pd.DataFrame( + {"data": pd.Series([data[0], data[1], na_value], dtype=data.dtype)} + ) + tm.assert_frame_equal(df, expected) + + def test_setitem_iloc_scalar_multiple_homogoneous(self, data): + df = pd.DataFrame({"A": data, "B": data}) + # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. + df.iloc[10, 1] = json.dumps(data[1]) + assert df.loc[10, "B"] == data[1] + + def test_setitem_loc_scalar_multiple_homogoneous(self, data): + df = pd.DataFrame({"A": data, "B": data}) + # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. + df.loc[10, "B"] = json.dumps(data[1]) + assert df.loc[10, "B"] == data[1] + + def test_setitem_slice(self, data, box_in_series): + arr = data[:5].copy() + expected = data.take([0, 0, 0, 3, 4]) + if box_in_series: + arr = pd.Series(arr) + expected = pd.Series(expected) + + # Use json.dumps(data[0]) instead of passing data[0] directly to the super method. + arr[:3] = json.dumps(data[0]) + tm.assert_equal(arr, expected) + + @pytest.mark.xfail(reason="only integer scalar arrays can be converted") + def test_setitem_2d_values(self, data): + super().test_setitem_2d_values(data) + + @pytest.mark.xfail(reason="data type 'json' not understood") + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + def test_getitem_scalar(self, data): + result = data[0] + # While JSONDtype internally stores data as pyarrow strings + # (equivalent to data.dtype.type), it is deliberately designed to return a + # dictionary as the result. 
+ assert isinstance(result, dict) + + result = pd.Series(data)[0] + assert isinstance(result, dict) + def custom_assert_frame_equal(left, right, *args, **kwargs): obj_type = kwargs.get("obj", "DataFrame") From e29585de76ee05351a16cd5e8900998d770c975b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 22 Jul 2024 21:42:00 +0000 Subject: [PATCH 05/28] minor --- db_dtypes/json.py | 12 ------ tests/compliance/json/test_json_compliance.py | 42 ------------------- 2 files changed, 54 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 390cd36..431da43 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -202,18 +202,6 @@ def dtype(self) -> JSONDtype: def __contains__(self, key) -> bool: return super().__contains__(JSONArray._seralizate_json(key)) - # def __contains__(self, key) -> bool: - # # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604 - # if pd.isna(key) and key is not self.dtype.na_value: - # if self.dtype.kind == "f" and lib.is_float(key): - # return pc.any(pc.is_nan(self._pa_array)).as_py() - - # # e.g. date or timestamp types we do not allow None here to match pd.NA - # return False - # # TODO: maybe complex? object? - - # return bool(super().__contains__(key)) - def insert(self, loc: int, item) -> JSONArray: val = JSONArray._seralizate_json(item) return super().insert(loc, val) diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index b4d55a6..30a25d6 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -402,45 +402,3 @@ def test_getitem_scalar(self, data): result = pd.Series(data)[0] assert isinstance(result, dict) - - -def custom_assert_frame_equal(left, right, *args, **kwargs): - obj_type = kwargs.get("obj", "DataFrame") - tm.assert_index_equal( - left.columns, - right.columns, - exact=kwargs.get("check_column_type", "equiv"), - check_names=kwargs.get("check_names", True), - check_exact=kwargs.get("check_exact", False), - check_categorical=kwargs.get("check_categorical", True), - obj=f"{obj_type}.columns", - ) - - jsons = (left.dtypes == "json").index - - for col in jsons: - tm.assert_series_equal(left[col], right[col], *args, **kwargs) - - left = left.drop(columns=jsons) - right = right.drop(columns=jsons) - tm.assert_frame_equal(left, right, *args, **kwargs) - - -def test_custom_asserts(): - data = JSONArray._from_sequence( - [ - {"a": 1}, - {"b": 2}, - {"c": 3}, - ] - ) - a = pd.Series(data) - tm.assert_series_equal(a, a) - custom_assert_frame_equal(a.to_frame(), a.to_frame()) - - b = pd.Series(data.take([0, 0, 1])) - with pytest.raises(AssertionError): - tm.assert_series_equal(a, b) - - with pytest.raises(AssertionError): - custom_assert_frame_equal(a.to_frame(), b.to_frame()) From 84690ee2f7f5a7db6b6e3eb59a09b596c81209ec Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 22 Jul 2024 22:22:37 +0000 Subject: [PATCH 06/28] fix test_getitem_scalar test --- db_dtypes/json.py | 2 +- tests/compliance/json/test_json_compliance.py | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 431da43..43336bf 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -40,7 +40,7 @@ def na_value(self) -> pd.NA: @property def type(self) -> type[str]: - return str + return dict @property def _is_numeric(self) -> bool: diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index 30a25d6..f434f6e 100644 --- 
a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -392,13 +392,3 @@ def test_setitem_2d_values(self, data): @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data, request): super().test_EA_types(engine, data, request) - - def test_getitem_scalar(self, data): - result = data[0] - # While JSONDtype internally stores data as pyarrow strings - # (equivalent to data.dtype.type), it is deliberately designed to return a - # dictionary as the result. - assert isinstance(result, dict) - - result = pd.Series(data)[0] - assert isinstance(result, dict) From d11cc873756f859d55933e8539c52b89a089050d Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 22 Jul 2024 22:45:24 +0000 Subject: [PATCH 07/28] add docstring and remove unused functions --- db_dtypes/json.py | 117 ++++-------------- tests/compliance/json/test_json_compliance.py | 23 ---- 2 files changed, 22 insertions(+), 118 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 43336bf..ce89b0d 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -30,16 +30,18 @@ @pd.api.extensions.register_extension_dtype class JSONDtype(pd.api.extensions.ExtensionDtype): - """Extension dtype for JSON data.""" + """Extension dtype for BigQuery JSON data.""" name = "dbjson" @property def na_value(self) -> pd.NA: + """Default NA value to use for this type.""" return pd.NA @property def type(self) -> type[str]: + """Return the scalar type for the array, e.g. int.""" return dict @property @@ -62,7 +64,9 @@ def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray: class JSONArray(ArrowExtensionArray): - """Extension array containing JSON data.""" + """Extension array that handles BigQuery JSON data, leveraging a string-based + pyarrow array for storage. It enables seamless conversion to JSON objects when + accessing individual elements.""" _dtype = JSONDtype() @@ -88,18 +92,7 @@ def __init__(self, values, dtype=None, copy=False) -> None: def _box_pa( cls, value, pa_type: pa.DataType | None = None ) -> pa.Array | pa.ChunkedArray | pa.Scalar: - """ - Box value into a pyarrow Array, ChunkedArray or Scalar. - - Parameters - ---------- - value : any - pa_type : pa.DataType | None - - Returns - ------- - pa.Array or pa.ChunkedArray or pa.Scalar - """ + """Box value into a pyarrow Array, ChunkedArray or Scalar.""" if isinstance(value, pa.Scalar) or not ( is_list_like(value) and not is_dict_like(value) ): @@ -108,18 +101,7 @@ def _box_pa( @classmethod def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: - """ - Box value into a pyarrow Scalar. - - Parameters - ---------- - value : any - pa_type : pa.DataType | None - - Returns - ------- - pa.Scalar - """ + """Box value into a pyarrow Scalar.""" value = JSONArray._seralizate_json(value) pa_scalar = super()._box_pa_scalar(value, pa_type) if pa.types.is_string(pa_scalar.type) and pa_type is None: @@ -130,18 +112,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: def _box_pa_array( cls, value, pa_type: pa.DataType | None = None, copy: bool = False ) -> pa.Array | pa.ChunkedArray: - """ - Box value into a pyarrow Array or ChunkedArray. 
- - Parameters - ---------- - value : Sequence - pa_type : pa.DataType | None - - Returns - ------- - pa.Array or pa.ChunkedArray - """ + """Box value into a pyarrow Array or ChunkedArray.""" if ( not isinstance(value, cls) and not isinstance(value, (pa.Array, pa.ChunkedArray)) @@ -155,18 +126,7 @@ def _box_pa_array( @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): - # TODO: check _from_arrow APIs etc. - # from pandas.core.arrays.masked import BaseMaskedArray - - # if isinstance(scalars, BaseMaskedArray): - # # avoid costly conversion to object dtype in ensure_string_array and - # # numerical issues with Float32Dtype - # na_values = scalars._mask - # result = scalars._data - # # result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - # return cls(pa.array(result, mask=na_values, type=pa.large_string())) - # elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): - # return cls(pc.cast(scalars, pa.large_string())) + """Construct a new ExtensionArray from a sequence of scalars.""" result = [] for scalar in scalars: result.append(JSONArray._seralizate_json(scalar)) @@ -176,10 +136,12 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False): def _from_sequence_of_strings( cls, strings, *, dtype: ExtensionDtype, copy: bool = False ) -> JSONArray: + """Construct a new ExtensionArray from a sequence of strings.""" return cls._from_sequence(strings, dtype=dtype, copy=copy) @staticmethod def _seralizate_json(value): + """A static method that converts a JSON value into a string representation.""" if isinstance(value, str) or pd.isna(value): return value else: @@ -189,6 +151,7 @@ def _seralizate_json(value): @staticmethod def _deserialize_json(value): + """A static method that converts a JSON string back into its original value.""" if not pd.isna(value): return json.loads(value) else: @@ -200,40 +163,24 @@ def dtype(self) -> JSONDtype: return self._dtype def __contains__(self, key) -> bool: + """Return for `item in self`.""" return super().__contains__(JSONArray._seralizate_json(key)) def insert(self, loc: int, item) -> JSONArray: + """ + Make new ExtensionArray inserting new item at location. Follows Python + list.append semantics for negative values. + """ val = JSONArray._seralizate_json(item) return super().insert(loc, val) @classmethod def _from_factorized(cls, values, original): + """Reconstruct an ExtensionArray after factorization.""" return cls._from_sequence(values, dtype=original.dtype) def __getitem__(self, item): - """Select a subset of self. - - Parameters - ---------- - item : int, slice, or ndarray - * int: The position in 'self' to get. - * slice: A slice object, where 'start', 'stop', and 'step' are - integers or None - * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' - - Returns - ------- - item : scalar or ExtensionArray - - Notes - ----- - For scalar ``item``, return a scalar value suitable for the array's - type. This should be an instance of ``self.dtype.type``. - For slice ``key``, return an instance of ``ExtensionArray``, even - if the slice is length 0 or 1. - For a boolean mask, return an instance of ``ExtensionArray``, filtered - to the values where ``item`` is True. - """ + """Select a subset of self.""" item = check_array_indexer(self, item) if isinstance(item, np.ndarray): @@ -283,9 +230,7 @@ def __getitem__(self, item): return scalar def __iter__(self): - """ - Iterate over elements of the array. 
- """ + """Iterate over elements of the array.""" for value in self._pa_array: val = JSONArray._deserialize_json(value.as_py()) if val is None: @@ -293,27 +238,9 @@ def __iter__(self): else: yield val - @classmethod - def _result_converter(cls, values, na=None): - return pd.BooleanDtype().__from_arrow__(values) - @classmethod def _concat_same_type(cls, to_concat) -> JSONArray: - """ - Concatenate multiple JSONArray. - - Parameters - ---------- - to_concat : sequence of JSONArray - - Returns - ------- - JSONArray - """ + """Concatenate multiple JSONArray.""" chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] arr = pa.chunked_array(chunks, type=pa.large_string()) return cls(arr) - - def _pad_or_backfill(self, *, method, limit=None, copy=True): - # GH#56616 - test EA method without limit_area argument - return super()._pad_or_backfill(method=method, limit=limit, copy=copy) diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index f434f6e..89c13ec 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -26,29 +26,6 @@ class TestJSONArray(base.ExtensionTests): - @pytest.mark.parametrize( - "limit_area, input_ilocs, expected_ilocs", - [ - ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), - ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), - ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), - ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), - ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), - ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), - ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), - ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), - ], - ) - def test_ffill_limit_area( - self, data_missing, limit_area, input_ilocs, expected_ilocs - ): - # GH#56616 - msg = "JSONArray does not implement limit_area" - with pytest.raises(NotImplementedError, match=msg): - super().test_ffill_limit_area( - data_missing, limit_area, input_ilocs, expected_ilocs - ) - @pytest.mark.xfail(reason="Unhashable") def test_value_counts_with_normalize(self, data): super().test_value_counts_with_normalize(data) From 60da5700f7c2353c185d9f641bbef93c8e67d70b Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 22 Jul 2024 22:54:13 +0000 Subject: [PATCH 08/28] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- docs/conf.py | 2 +- samples/snippets/noxfile.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 672daff..00e0013 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,9 +24,9 @@ # All configuration values have a default; values that are commented out # serve to show the default. +import sys import os import shlex -import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py index c36d5f2..3b71359 100644 --- a/samples/snippets/noxfile.py +++ b/samples/snippets/noxfile.py @@ -22,6 +22,7 @@ import nox + # WARNING - WARNING - WARNING - WARNING - WARNING # WARNING - WARNING - WARNING - WARNING - WARNING # DO NOT EDIT THIS FILE EVER! 
From 48ee67db1f010e89f391b1b7ed58ea79b6b112a8 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 22 Jul 2024 22:54:26 +0000 Subject: [PATCH 09/28] fix lint --- db_dtypes/json.py | 24 +++++++++---------- tests/compliance/json/test_json_compliance.py | 6 ----- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index ce89b0d..62c44e8 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -139,6 +139,18 @@ def _from_sequence_of_strings( """Construct a new ExtensionArray from a sequence of strings.""" return cls._from_sequence(strings, dtype=dtype, copy=copy) + @classmethod + def _concat_same_type(cls, to_concat) -> JSONArray: + """Concatenate multiple JSONArray.""" + chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] + arr = pa.chunked_array(chunks, type=pa.large_string()) + return cls(arr) + + @classmethod + def _from_factorized(cls, values, original): + """Reconstruct an ExtensionArray after factorization.""" + return cls._from_sequence(values, dtype=original.dtype) + @staticmethod def _seralizate_json(value): """A static method that converts a JSON value into a string representation.""" @@ -174,11 +186,6 @@ def insert(self, loc: int, item) -> JSONArray: val = JSONArray._seralizate_json(item) return super().insert(loc, val) - @classmethod - def _from_factorized(cls, values, original): - """Reconstruct an ExtensionArray after factorization.""" - return cls._from_sequence(values, dtype=original.dtype) - def __getitem__(self, item): """Select a subset of self.""" item = check_array_indexer(self, item) @@ -237,10 +244,3 @@ def __iter__(self): yield self._dtype.na_value else: yield val - - @classmethod - def _concat_same_type(cls, to_concat) -> JSONArray: - """Concatenate multiple JSONArray.""" - chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] - arr = pa.chunked_array(chunks, type=pa.large_string()) - return cls(arr) diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index 89c13ec..02185fc 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -22,8 +22,6 @@ from pandas.tests.extension import base import pytest -from db_dtypes import JSONArray - class TestJSONArray(base.ExtensionTests): @pytest.mark.xfail(reason="Unhashable") @@ -88,10 +86,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): def test_arith_series_with_scalar(self, data, all_arithmetic_operators): super().test_arith_series_with_scalar(data, all_arithmetic_operators) - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") - def test_arith_series_with_scalar(self, data, all_arithmetic_operators): - super().test_arith_series_with_scalar(data, all_arithmetic_operators) - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") def test_arith_series_with_array(self, data, all_arithmetic_operators): super().test_arith_series_with_array(data, all_arithmetic_operators) From 91d5016a2642b66f5a7fc817c592a19b6b59cd7d Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 24 Jul 2024 21:52:48 +0000 Subject: [PATCH 10/28] address some comments --- db_dtypes/json.py | 88 +++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 62c44e8..8fcfbd1 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -19,13 +19,10 @@ import numpy as np import pandas as pd -from 
pandas.core.arrays.arrow.array import ArrowExtensionArray -from pandas.core.arrays.masked import BaseMaskedArray -from pandas.core.dtypes.common import is_dict_like, is_integer, is_list_like, is_scalar -from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses +import pandas.arrays as arrays +import pandas.core.dtypes.common as common +import pandas.core.indexers as indexers import pyarrow as pa -import pyarrow.compute as pc @pd.api.extensions.register_extension_dtype @@ -63,7 +60,7 @@ def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray: return JSONArray(array) -class JSONArray(ArrowExtensionArray): +class JSONArray(arrays.ArrowExtensionArray): """Extension array that handles BigQuery JSON data, leveraging a string-based pyarrow array for storage. It enables seamless conversion to JSON objects when accessing individual elements.""" @@ -71,22 +68,13 @@ class JSONArray(ArrowExtensionArray): _dtype = JSONDtype() def __init__(self, values, dtype=None, copy=False) -> None: - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( - values.type - ): - values = pc.cast(values, pa.large_string()) - - super().__init__(values) self._dtype = JSONDtype() - - if not pa.types.is_large_string(self._pa_array.type) and not ( - pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_large_string(self._pa_array.type.value_type) - ): - raise ValueError( - "ArrowStringArray requires a PyArrow (chunked) array of " - "large_string type" - ) + if isinstance(values, pa.Array): + self._pa_array = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self._pa_array = values + else: + raise ValueError(f"Unsupported type '{type(values)}' for JSONArray") @classmethod def _box_pa( @@ -94,7 +82,7 @@ def _box_pa( ) -> pa.Array | pa.ChunkedArray | pa.Scalar: """Box value into a pyarrow Array, ChunkedArray or Scalar.""" if isinstance(value, pa.Scalar) or not ( - is_list_like(value) and not is_dict_like(value) + common.is_list_like(value) and not common.is_dict_like(value) ): return cls._box_pa_scalar(value, pa_type) return cls._box_pa_array(value, pa_type) @@ -102,10 +90,16 @@ def _box_pa( @classmethod def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: """Box value into a pyarrow Scalar.""" - value = JSONArray._seralizate_json(value) - pa_scalar = super()._box_pa_scalar(value, pa_type) - if pa.types.is_string(pa_scalar.type) and pa_type is None: - pa_scalar = pc.cast(pa_scalar, pa.large_string()) + if isinstance(value, pa.Scalar): + pa_scalar = value + if pd.isna(value): + pa_scalar = pa.scalar(None, type=pa_type) + else: + value = JSONArray._serialize_json(value) + pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True) + + if pa_type is not None and pa_scalar.type != pa_type: + pa_scalar = pa_scalar.cast(pa_type) return pa_scalar @classmethod @@ -113,15 +107,21 @@ def _box_pa_array( cls, value, pa_type: pa.DataType | None = None, copy: bool = False ) -> pa.Array | pa.ChunkedArray: """Box value into a pyarrow Array or ChunkedArray.""" - if ( - not isinstance(value, cls) - and not isinstance(value, (pa.Array, pa.ChunkedArray)) - and not isinstance(value, BaseMaskedArray) - ): - value = [JSONArray._seralizate_json(x) for x in value] - pa_array = super()._box_pa_array(value, pa_type) - if pa.types.is_string(pa_array.type) and pa_type is None: - pa_array = pc.cast(pa_array, pa.large_string()) + if isinstance(value, cls): + pa_array = 
value._pa_array + elif isinstance(value, (pa.Array, pa.ChunkedArray)): + pa_array = value + else: + try: + value = [JSONArray._serialize_json(x) for x in value] + pa_array = pa.array(value, type=pa_type, from_pandas=True) + except (pa.ArrowInvalid, pa.ArrowTypeError): + # GH50430: let pyarrow infer type, then cast + pa_array = pa.array(value, from_pandas=True) + + if pa_type is not None and pa_array.type != pa_type: + pa_array = pa_array.cast(pa_type) + return pa_array @classmethod @@ -129,12 +129,12 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False): """Construct a new ExtensionArray from a sequence of scalars.""" result = [] for scalar in scalars: - result.append(JSONArray._seralizate_json(scalar)) + result.append(JSONArray._serialize_json(scalar)) return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: ExtensionDtype, copy: bool = False + cls, strings, *, dtype, copy: bool = False ) -> JSONArray: """Construct a new ExtensionArray from a sequence of strings.""" return cls._from_sequence(strings, dtype=dtype, copy=copy) @@ -152,7 +152,7 @@ def _from_factorized(cls, values, original): return cls._from_sequence(values, dtype=original.dtype) @staticmethod - def _seralizate_json(value): + def _serialize_json(value): """A static method that converts a JSON value into a string representation.""" if isinstance(value, str) or pd.isna(value): return value @@ -176,19 +176,19 @@ def dtype(self) -> JSONDtype: def __contains__(self, key) -> bool: """Return for `item in self`.""" - return super().__contains__(JSONArray._seralizate_json(key)) + return super().__contains__(JSONArray._serialize_json(key)) def insert(self, loc: int, item) -> JSONArray: """ Make new ExtensionArray inserting new item at location. Follows Python list.append semantics for negative values. """ - val = JSONArray._seralizate_json(item) + val = JSONArray._serialize_json(item) return super().insert(loc, val) def __getitem__(self, item): """Select a subset of self.""" - item = check_array_indexer(self, item) + item = indexers.check_array_indexer(self, item) if isinstance(item, np.ndarray): if not len(item): @@ -203,9 +203,9 @@ def __getitem__(self, item): "boolean arrays are valid indices." ) elif isinstance(item, tuple): - item = unpack_tuple_and_ellipses(item) + item = indexers.unpack_tuple_and_ellipses(item) - if is_scalar(item) and not is_integer(item): + if common.is_scalar(item) and not common.is_integer(item): # e.g. 
"foo" or 2.5 # exception message copied from numpy raise IndexError( From 191deef76a6e587e274a0448490b3f8aad1d8701 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 30 Jul 2024 20:09:47 +0000 Subject: [PATCH 11/28] supports all types except Array --- db_dtypes/json.py | 75 ++++++++--- tests/compliance/json/test_json_compliance.py | 117 +++++------------- 2 files changed, 93 insertions(+), 99 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 8fcfbd1..e19a357 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -23,7 +23,16 @@ import pandas.core.dtypes.common as common import pandas.core.indexers as indexers import pyarrow as pa +import pyarrow.compute +ARROW_CMP_FUNCS = { + "eq": pyarrow.compute.equal, + "ne": pyarrow.compute.not_equal, + "lt": pyarrow.compute.less, + "gt": pyarrow.compute.greater, + "le": pyarrow.compute.less_equal, + "ge": pyarrow.compute.greater_equal, +} @pd.api.extensions.register_extension_dtype class JSONDtype(pd.api.extensions.ExtensionDtype): @@ -130,7 +139,7 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False): result = [] for scalar in scalars: result.append(JSONArray._serialize_json(scalar)) - return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) + return cls(pa.array(result, type=pa.string(), from_pandas=True)) @classmethod def _from_sequence_of_strings( @@ -143,7 +152,7 @@ def _from_sequence_of_strings( def _concat_same_type(cls, to_concat) -> JSONArray: """Concatenate multiple JSONArray.""" chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] - arr = pa.chunked_array(chunks, type=pa.large_string()) + arr = pa.chunked_array(chunks, type=pa.string()) return cls(arr) @classmethod @@ -154,7 +163,7 @@ def _from_factorized(cls, values, original): @staticmethod def _serialize_json(value): """A static method that converts a JSON value into a string representation.""" - if isinstance(value, str) or pd.isna(value): + if pd.isna(value): return value else: # `sort_keys=True` sorts dictionary keys before serialization, making @@ -174,17 +183,10 @@ def dtype(self) -> JSONDtype: """An instance of JSONDtype""" return self._dtype - def __contains__(self, key) -> bool: - """Return for `item in self`.""" - return super().__contains__(JSONArray._serialize_json(key)) - - def insert(self, loc: int, item) -> JSONArray: - """ - Make new ExtensionArray inserting new item at location. Follows Python - list.append semantics for negative values. 
-        """
-        val = JSONArray._serialize_json(item)
-        return super().insert(loc, val)
+    def _cmp_method(self, other, op):
+        pc_func = ARROW_CMP_FUNCS[op.__name__]
+        result = pc_func(self._pa_array, self._box_pa(other))
+        return arrays.ArrowExtensionArray(result)
 
     def __getitem__(self, item):
         """Select a subset of self."""
@@ -244,3 +246,48 @@ def __iter__(self):
                 yield self._dtype.na_value
             else:
                 yield val
+
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
+        """Return a scalar result of performing the reduction operation."""
+        if name in ["min", "max"]:
+            raise TypeError("JSONArray does not support min/max reduction.")
+        return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
+
+    def __array__(
+        self, dtype = None, copy = None
+    ) -> np.ndarray:
+        """Correctly construct numpy arrays when passed to `np.asarray()`."""
+        return self.to_numpy(dtype=dtype)
+
+    def to_numpy(self, dtype = None, copy = False, na_value = pd.NA) -> np.ndarray:
+        dtype, na_value = self._to_numpy_dtype_inference(dtype, na_value, self._hasna)
+        pa_type = self._pa_array.type
+        if not self._hasna or pd.isna(na_value) or pa.types.is_null(pa_type):
+            data = self
+        else:
+            data = self.fillna(na_value)
+        result = np.array(list(data), dtype=dtype)
+
+        if data._hasna:
+            result[data.isna()] = na_value
+        return result
+
+    def _to_numpy_dtype_inference(
+        self, dtype, na_value, hasna
+    ):
+        if dtype is not None:
+            dtype = np.dtype(dtype)
+
+        if dtype is None or not hasna:
+            na_value = self.dtype.na_value
+        elif dtype.kind == "f":  # type: ignore[union-attr]
+            na_value = np.nan
+        elif dtype.kind == "M":  # type: ignore[union-attr]
+            na_value = np.datetime64("nat")
+        elif dtype.kind == "m":  # type: ignore[union-attr]
+            na_value = np.timedelta64("nat")
+        else:
+            na_value = self.dtype.na_value
+        return dtype, na_value
\ No newline at end of file
diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py
index 02185fc..af7e543 100644
--- a/tests/compliance/json/test_json_compliance.py
+++ b/tests/compliance/json/test_json_compliance.py
@@ -21,6 +21,7 @@
 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
 from pandas.tests.extension import base
 import pytest
+import db_dtypes
 
 
 class TestJSONArray(base.ExtensionTests):
@@ -111,7 +112,7 @@ def test_compare_scalar(self, data, comparison_op, request):
         super().test_compare_scalar(data, comparison_op)
 
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
-        return op_name in ["min", "max"]
+        return False
 
     def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj))
@@ -125,43 +126,6 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
     def test_searchsorted(self, data_for_sorting, as_series):
         super().test_searchsorted(self, data_for_sorting, as_series)
 
-    def test_astype_str(self, data):
-        # Use `json.dumps(str)` instead of passing `str(obj)` directly to the super method.
-        result = pd.Series(data[:5]).astype(str)
-        expected = pd.Series(
-            [json.dumps(x, sort_keys=True) for x in data[:5]], dtype=str
-        )
-        tm.assert_series_equal(result, expected)
-
-    @pytest.mark.parametrize(
-        "nullable_string_dtype",
-        [
-            "string[python]",
-            "string[pyarrow]",
-        ],
-    )
-    def test_astype_string(self, data, nullable_string_dtype):
-        # Use `json.dumps(str)` instead of passing `str(obj)` directly to the super method. 
- result = pd.Series(data[:5]).astype(nullable_string_dtype) - expected = pd.Series( - [json.dumps(x, sort_keys=True) for x in data[:5]], - dtype=nullable_string_dtype, - ) - tm.assert_series_equal(result, expected) - - def test_array_interface(self, data): - result = np.array(data) - # Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method. - assert result[0] == json.dumps(data[0]) - - result = np.array(data, dtype=object) - # Use `json.dumps(x)` instead of passing `x` directly to the super method. - expected = np.array([json.dumps(x) for x in data], dtype=object) - if expected.ndim > 1: - # nested data, explicitly construct as 1D - expected = construct_1d_object_array_from_listlike(list(data)) - tm.assert_numpy_array_equal(result, expected) - @pytest.mark.xfail(reason="Setting a dict as a scalar") def test_fillna_series(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" @@ -212,7 +176,7 @@ def test_series_constructor_scalar_with_index(self, data, dtype): expected = pd.Series([scalar], index=["foo"], dtype=dtype) tm.assert_series_equal(result, expected) - # Patching `json.dumps` to base.BaseSetitemTests because pandas' internals has + # Patching `[....] * len()` to base.BaseSetitemTests because pandas' internals # has trouble setting sequences of values into scalar positions. @pytest.mark.parametrize( @@ -228,8 +192,8 @@ def test_setitem_integer_array(self, data, idx, box_in_series): arr = pd.Series(arr) expected = pd.Series(expected) - # Use json.dumps(arr[0]) instead of passing arr[0] directly to the super method. - arr[idx] = json.dumps(arr[0]) + # Use `[arr[0]] * len()` instead of passing `arr[0]` directly to the super method. + arr[idx] = [arr[0]] * len(arr[idx]) tm.assert_equal(arr, expected) @pytest.mark.parametrize("setter", ["loc", None]) @@ -243,22 +207,20 @@ def test_setitem_mask_broadcast(self, data, setter): else: # __setitem__ target = ser - # Use json.dumps(data[10]) instead of passing data[10] directly to the super method. - target[mask] = json.dumps(data[10]) + # Use `[data[10]] * len()` instead of passing `data[10]` directly to the super method. + target[mask] = [data[10]] * len(target[mask]) assert ser[0] == data[10] assert ser[1] == data[10] def test_setitem_loc_scalar_mixed(self, data): df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.loc[0, "B"] = json.dumps(data[1]) + # Use `[data[1]]` instead of passing `data[1]` directly to the super method. + df.loc[0, "B"] = [data[1]] assert df.loc[0, "B"] == data[1] + @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") def test_setitem_loc_scalar_single(self, data): - df = pd.DataFrame({"B": data}) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.loc[10, "B"] = json.dumps(data[1]) - assert df.loc[10, "B"] == data[1] + super().test_setitem_loc_scalar_single(data) def test_setitem_loc_iloc_slice(self, data): arr = data[:5].copy() @@ -266,37 +228,33 @@ def test_setitem_loc_iloc_slice(self, data): expected = pd.Series(data.take([0, 0, 0, 3, 4]), index=s.index) result = s.copy() - # Use json.dumps(data[0]) instead of passing data[0] directly to the super method. - result.iloc[:3] = json.dumps(data[0]) + # Use `[data[0]] * len()` instead of passing `data[0]` directly to the super method. 
+ result.iloc[:3] = [data[0]] * len(result.iloc[:3]) tm.assert_equal(result, expected) result = s.copy() - result.loc[:"c"] = json.dumps(data[0]) + result.loc[:"c"] = [data[0]] * len(result.loc[:"c"]) tm.assert_equal(result, expected) + @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") def test_setitem_iloc_scalar_single(self, data): - df = pd.DataFrame({"B": data}) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.iloc[10, 0] = json.dumps(data[1]) - assert df.loc[10, "B"] == data[1] + super().test_setitem_iloc_scalar_single(data) def test_setitem_iloc_scalar_mixed(self, data): df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.iloc[0, 1] = json.dumps(data[1]) + # Use `[data[1]] * len()` instead of passing `data[1]` directly to the super method. + df.iloc[0, 1] = [data[1]] * len(df.iloc[0, 1]) assert df.loc[0, "B"] == data[1] - @pytest.mark.xfail(reaons="eq not implemented for ") + @pytest.mark.xfail(reason="eq not implemented for ") def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): super().test_setitem_mask_boolean_array_with_na(data, box_in_series) @pytest.mark.parametrize("setter", ["loc", "iloc"]) + + @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") def test_setitem_scalar(self, data, setter): - arr = pd.Series(data) - setter = getattr(arr, setter) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - setter[0] = json.dumps(data[1]) - assert arr[0] == data[1] + super().test_setitem_scalar(data, setter) @pytest.mark.parametrize( "mask", @@ -313,35 +271,24 @@ def test_setitem_mask(self, data, mask, box_in_series): if box_in_series: arr = pd.Series(arr) expected = pd.Series(expected) - # Use json.dumps(data[0]) instead of passing data[0] directly to the super method. - arr[mask] = json.dumps(data[0]) + # Use `[data[0]] * len()` instead of passing `data[0]` directly to the super method. + arr[mask] = [data[0]] * len(arr[mask]) tm.assert_equal(expected, arr) + @pytest.mark.xfail(reasons="Setting a `dict` to an expansion row is not supported") def test_setitem_with_expansion_row(self, data, na_value): - df = pd.DataFrame({"data": data[:1]}) - - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.loc[1, "data"] = json.dumps(data[1]) - expected = pd.DataFrame({"data": data[:2]}) - tm.assert_frame_equal(df, expected) - - # https://github.com/pandas-dev/pandas/issues/47284 - df.loc[2, "data"] = na_value - expected = pd.DataFrame( - {"data": pd.Series([data[0], data[1], na_value], dtype=data.dtype)} - ) - tm.assert_frame_equal(df, expected) + super().test_setitem_with_expansion_row(data, na_value) def test_setitem_iloc_scalar_multiple_homogoneous(self, data): df = pd.DataFrame({"A": data, "B": data}) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.iloc[10, 1] = json.dumps(data[1]) + # Use `[data[1]]` instead of passing `data[1]` directly to the super method. + df.iloc[10, 1] = [data[1]] assert df.loc[10, "B"] == data[1] def test_setitem_loc_scalar_multiple_homogoneous(self, data): df = pd.DataFrame({"A": data, "B": data}) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.loc[10, "B"] = json.dumps(data[1]) + # Use `[data[1]]` instead of passing `data[1]` directly to the super method. 
+        df.loc[10, "B"] = [data[1]]
         assert df.loc[10, "B"] == data[1]
 
     def test_setitem_slice(self, data, box_in_series):
@@ -351,8 +298,8 @@ def test_setitem_slice(self, data, box_in_series):
         arr = pd.Series(arr)
         expected = pd.Series(expected)
 
-        # Use json.dumps(data[0]) instead of passing data[0] directly to the super method.
-        arr[:3] = json.dumps(data[0])
+        # Use `[data[0]] * 3` instead of passing `data[0]` directly to the super method.
+        arr[:3] = [data[0]] * 3
         tm.assert_equal(arr, expected)
 
     @pytest.mark.xfail(reason="only integer scalar arrays can be converted")

From 7422f7aab80b7a6ed205f94e18a02228eec7e8a9 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Tue, 30 Jul 2024 23:36:49 +0000
Subject: [PATCH 12/28] support array type

---
 db_dtypes/json.py                             | 41 +-----------
 tests/compliance/json/conftest.py             | 38 ++++++------
 tests/compliance/json/test_json_compliance.py | 62 ++++++++++++++++++-
 3 files changed, 83 insertions(+), 58 deletions(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index e19a357..72d5fa9 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -34,6 +34,7 @@
     "ge": pyarrow.compute.greater_equal,
 }
 
+
 @pd.api.extensions.register_extension_dtype
 class JSONDtype(pd.api.extensions.ExtensionDtype):
     """Extension dtype for BigQuery JSON data."""
@@ -90,6 +91,7 @@ def _box_pa(
         cls, value, pa_type: pa.DataType | None = None
     ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
         """Box value into a pyarrow Array, ChunkedArray or Scalar."""
+
         if isinstance(value, pa.Scalar) or not (
             common.is_list_like(value) and not common.is_dict_like(value)
         ):
@@ -163,7 +165,7 @@ def _from_factorized(cls, values, original):
     @staticmethod
     def _serialize_json(value):
         """A static method that converts a JSON value into a string representation."""
-        if pd.isna(value):
+        if not common.is_list_like(value) and pd.isna(value):
             return value
         else:
             # `sort_keys=True` sorts dictionary keys before serialization, making
@@ -254,40 +256,3 @@ def _reduce(
         self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
     ):
         """Return a scalar result of performing the reduction operation."""
         if name in ["min", "max"]:
             raise TypeError("JSONArray does not support min/max reduction.")
         return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
-
-    def __array__(
-        self, dtype = None, copy = None
-    ) -> np.ndarray:
-        """Correctly construct numpy arrays when passed to `np.asarray()`."""
-        return self.to_numpy(dtype=dtype)
-
-    def to_numpy(self, dtype = None, copy = False, na_value = pd.NA) -> np.ndarray:
-        dtype, na_value = self._to_numpy_dtype_inference(dtype, na_value, self._hasna)
-        pa_type = self._pa_array.type
-        if not self._hasna or pd.isna(na_value) or pa.types.is_null(pa_type):
-            data = self
-        else:
-            data = self.fillna(na_value)
-        result = np.array(list(data), dtype=dtype)
-
-        if data._hasna:
-            result[data.isna()] = na_value
-        return result
-
-    def _to_numpy_dtype_inference(
-        self, dtype, na_value, hasna
-    ):
-        if dtype is not None:
-            dtype = np.dtype(dtype)
-
-        if dtype is None or not hasna:
-            na_value = self.dtype.na_value
-        elif dtype.kind == "f":  # type: ignore[union-attr]
-            na_value = np.nan
-        elif dtype.kind == "M":  # type: ignore[union-attr]
-            na_value = np.datetime64("nat")
-        elif dtype.kind == "m":  # type: ignore[union-attr]
-            na_value = np.timedelta64("nat")
-        else:
-            na_value = self.dtype.na_value
-        return dtype, na_value
\ No newline at end of file
diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py
index f323f65..6e98650 100644
--- a/tests/compliance/json/conftest.py
+++ b/tests/compliance/json/conftest.py
@@ -14,6 +14,7 @@
 
 
 import json
+import random
 
 import numpy as np
 import pandas as pd
@@ -24,18 
+25,29 @@ def make_data():
-    # Sample data with varied lengths.
+    # Since the `np.array` constructor needs a consistent shape after the first
+    # dimension, the sample data in this instance doesn't include the array type.
     samples = [
-        {"id": 1, "bool_value": True},  # Boolean
-        {"id": 2, "float_num": 3.14159},  # Floating
-        {"id": 3, "date": "2024-07-16"},  # Dates (as strings)
-        {"id": 4, "null_field": None},  # Null
-        {"list_data": [10, 20, 30]},  # Lists
-        {"person": {"name": "Alice", "age": 35}},  # Nested objects
+        True,  # Boolean
+        100,  # Int
+        0.98,  # Float
+        "str",  # String
+        {"bool_value": True},  # Dict with a boolean
+        {"float_num": 3.14159},  # Dict with a float
+        {"date": "2024-07-16"},  # Dict with a date (as strings)
+        {"null_field": None},  # Dict with a null
+        {"list_data": [10, 20, 30]},  # Dict with a list
+        {"person": {"name": "Alice", "age": 35}},  # Dict with nested objects
         {"address": {"street": "123 Main St", "city": "Anytown"}},
         {"order": {"items": ["book", "pen"], "total": 15.99}},
     ]
-    return np.random.default_rng(2).choice(samples, size=100)
+    data = np.random.default_rng(2).choice(samples, size=100)
+    # This replaces a single data item with an array. We skip the first two
+    # items to avoid failures in some `setitem` tests, where setting with a
+    # list is ambiguous in this context.
+    id = random.randint(3, 99)
+    data[id] = [0.1, 0.2]  # Array
+    return data
 
 
 @pytest.fixture
@@ -48,16 +60,6 @@ def data():
     """Length-100 PeriodArray for semantics test."""
     data = make_data()
 
-    # Why the while loop? NumPy is unable to construct an ndarray from
-    # equal-length ndarrays. Many of our operations involve coercing the
-    # EA to an ndarray of objects. To avoid random test failures, we ensure
-    # that our data is coercible to an ndarray. Several tests deal with only
-    # the first two elements, so that's what we'll check.
-
-    while len(data[0]) == len(data[1]):
-        print(data)
-        data = make_data()
-
     return JSONArray._from_sequence(data)
 
 
diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py
index af7e543..69425c8 100644
--- a/tests/compliance/json/test_json_compliance.py
+++ b/tests/compliance/json/test_json_compliance.py
@@ -21,7 +21,6 @@
 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
 from pandas.tests.extension import base
 import pytest
-import db_dtypes
 
 
 class TestJSONArray(base.ExtensionTests):
@@ -126,6 +125,43 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
     def test_searchsorted(self, data_for_sorting, as_series):
         super().test_searchsorted(self, data_for_sorting, as_series)
 
+    def test_astype_str(self, data):
+        # Use `json.dumps(str)` instead of passing `str(obj)` directly to the super method. 
+ result = pd.Series(data[:5]).astype(nullable_string_dtype) + expected = pd.Series( + [json.dumps(x, sort_keys=True) for x in data[:5]], + dtype=nullable_string_dtype, + ) + tm.assert_series_equal(result, expected) + + def test_array_interface(self, data): + result = np.array(data) + # Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method. + assert result[0] == json.dumps(data[0]) + + result = np.array(data, dtype=object) + # Use `json.dumps(x)` instead of passing `x` directly to the super method. + expected = np.array([json.dumps(x) for x in data], dtype=object) + if expected.ndim > 1: + # nested data, explicitly construct as 1D + expected = construct_1d_object_array_from_listlike(list(data)) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.xfail(reason="Setting a dict as a scalar") def test_fillna_series(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" @@ -251,7 +287,6 @@ def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): super().test_setitem_mask_boolean_array_with_na(data, box_in_series) @pytest.mark.parametrize("setter", ["loc", "iloc"]) - @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") def test_setitem_scalar(self, data, setter): super().test_setitem_scalar(data, setter) @@ -310,3 +345,26 @@ def test_setitem_2d_values(self, data): @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data, request): super().test_EA_types(engine, data, request) + + @pytest.mark.xfail( + reason="`to_numpy` returns serialized JSON, " + + "while `__getitem__` returns JSON objects." + ) + def test_setitem_frame_2d_values(self, data): + super().test_setitem_frame_2d_values(data) + + @pytest.mark.xfail( + reason="`to_numpy` returns serialized JSON, " + + "while `__getitem__` returns JSON objects." + ) + def test_transpose_frame(self, data): + # `DataFrame.T` calls `to_numpy` to get results. + super().test_transpose_frame(data) + + @pytest.mark.xfail( + reason="`to_numpy` returns serialized JSON, " + + "while `__getitem__` returns JSON objects." + ) + def test_where_series(self, data, na_value, as_frame): + # `Series.where` calls `to_numpy` to get results. + super().test_where_series(data, na_value, as_frame) From 22a099b9fb34f1af6d33ef36b857156c7a312e4f Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 2 Aug 2024 20:40:09 +0000 Subject: [PATCH 13/28] only import when pandas version is higher than 1.5.0 --- db_dtypes/__init__.py | 10 +++++++-- db_dtypes/json.py | 9 ++++++-- tests/compliance/json/test_json_compliance.py | 21 +++++++++++++------ 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 076270f..4cb45c5 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -28,7 +28,6 @@ import pyarrow.compute from db_dtypes import core -from db_dtypes.json import JSONArray, JSONDtype from db_dtypes.version import __version__ date_dtype_name = "dbdate" @@ -44,7 +43,14 @@ # nanosecond precision when boxing scalars. _NP_BOX_DTYPE = "datetime64[us]" -pandas_release = packaging.version.parse(pandas.__version__).release + +# To use JSONArray and JSONDtype, you'll need Pandas 1.5.0 or later. With the removal +# of Python 3.7 compatibility, the minimum Pandas version will be updated to 1.5.0. 
+if packaging.version.Version(pandas.__version__) >= packaging.version.Version("1.5.0"):
+    from db_dtypes.json import JSONArray, JSONDtype
+else:
+    JSONArray = None
+    JSONDtype = None
 
 
 @pandas.api.extensions.register_extension_dtype
diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 72d5fa9..0cf88d6 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -48,8 +48,13 @@ def na_value(self) -> pd.NA:
 
     @property
     def type(self) -> type[str]:
-        """Return the scalar type for the array, e.g. int."""
-        return dict
+        """
+        Return the scalar type for the array elements.
+        The standard JSON data types can be one of `dict`, `list`, `str`, `int`, `float`,
+        `bool` and `None`. However, this method returns a `str` type to indicate its
+        storage type, because a union of multiple types is not well supported in pandas.
+        """
+        return str
 
     @property
     def _is_numeric(self) -> bool:
diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py
index 69425c8..faa2a20 100644
--- a/tests/compliance/json/test_json_compliance.py
+++ b/tests/compliance/json/test_json_compliance.py
@@ -18,12 +18,11 @@
 import numpy as np
 import pandas as pd
 import pandas._testing as tm
-from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
-from pandas.tests.extension import base
+import pandas.tests.extension.base
 import pytest
 
 
-class TestJSONArray(base.ExtensionTests):
+class TestJSONArray(pandas.tests.extension.base.ExtensionTests):
@@ -157,9 +156,9 @@ def test_array_interface(self, data):
 
     result = np.array(data, dtype=object)
     # Use `json.dumps(x)` instead of passing `x` directly to the super method.
     expected = np.array([json.dumps(x) for x in data], dtype=object)
-    if expected.ndim > 1:
-        # nested data, explicitly construct as 1D
-        expected = construct_1d_object_array_from_listlike(list(data))
+    # if expected.ndim > 1:
+    #     # nested data, explicitly construct as 1D
+    #     expected = construct_1d_object_array_from_listlike(list(data))
     tm.assert_numpy_array_equal(result, expected)
 
 @pytest.mark.xfail(reason="Setting a dict as a scalar")
@@ -212,6 +211,16 @@ def test_series_constructor_scalar_with_index(self, data, dtype):
     expected = pd.Series([scalar], index=["foo"], dtype=dtype)
     tm.assert_series_equal(result, expected)
 
+    @pytest.mark.xfail(reason="Unhashable")
+    def test_getitem_scalar(self, data):
+        """
+        `_getitem_` can return any JSON-types objects while `data.dtype.type` returns
+        a string to indicate its storage type.
+        > assert isinstance(result, data.dtype.type)
+        E AssertionError
+        """
+        super().test_getitem_scalar()
+
     # Patching `[....] * len()` to base.BaseSetitemTests because pandas' internals
     # has trouble setting sequences of values into scalar positions. 
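For orientation between patches: the guarded import above makes JSONDtype and
JSONArray available only on pandas 1.5.0 and newer, binding both names to None
otherwise. A minimal usage sketch of the behavior established so far follows
(illustrative only, not part of any patch in this series; it assumes an
environment with pandas >= 1.5.0 so the import succeeds):

    import pandas as pd

    import db_dtypes

    if db_dtypes.JSONDtype is not None:  # bound to None on pandas < 1.5.0
        data = db_dtypes.JSONArray._from_sequence([{"a": 1}, "str", None])
        s = pd.Series(data)
        assert s.dtype.name == "dbjson"  # the registered extension dtype name
        assert s[0] == {"a": 1}          # scalar access returns Python objects
        assert pd.isna(s[2])             # missing values surface as pandas NA
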
From 77339a0f91c13fcba3e8de8766a3e378220cc259 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Sat, 3 Aug 2024 05:44:56 +0000 Subject: [PATCH 14/28] exclude groupby and other tests --- tests/compliance/json/conftest.py | 24 -- tests/compliance/json/test_json_compliance.py | 406 ++++++++---------- 2 files changed, 186 insertions(+), 244 deletions(-) diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py index 6e98650..20fe2f6 100644 --- a/tests/compliance/json/conftest.py +++ b/tests/compliance/json/conftest.py @@ -108,22 +108,6 @@ def cmp(a, b): return cmp -@pytest.fixture -def data_for_grouping(): - return JSONArray._from_sequence( - [ - json.dumps({"b": 1}), - json.dumps({"b": 1}), - None, - None, - json.dumps({"a": 0, "c": 2}), - json.dumps({"a": 0, "c": 2}), - json.dumps({"b": 1}), - json.dumps({"c": 2}), - ] - ) - - @pytest.fixture def data_repeated(data): """ @@ -193,14 +177,6 @@ def all_numeric_reductions(request): return request.param -@pytest.fixture(params=tm.arithmetic_dunder_methods) -def all_arithmetic_operators(request): - """ - Fixture for dunder names for common arithmetic operations. - """ - return request.param - - @pytest.fixture(params=["data", "data_missing"]) def all_data(request, data, data_missing): """Parametrized fixture returning 'data' or 'data_missing' integer arrays. diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index faa2a20..443dc60 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -18,112 +18,15 @@ import numpy as np import pandas as pd import pandas._testing as tm -import pandas.tests.extension.base +import pandas.tests.extension.base as base import pytest -class TestJSONArray(pandas.tests.extension.base.ExtensionTests): - @pytest.mark.xfail(reason="Unhashable") - def test_value_counts_with_normalize(self, data): - super().test_value_counts_with_normalize(data) - - @pytest.mark.xfail(reason="Unhashable") - def test_groupby_extension_transform(self): - """ - This currently fails in Series.name.setter, since the - name must be hashable, but the value is a dictionary. - I think this is what we want, i.e. `.name` should be the original - values, and not the values for factorization. - """ - super().test_groupby_extension_transform() - - @pytest.mark.xfail(reason="Unhashable") - def test_groupby_extension_apply(self): - """ - This fails in Index._do_unique_check with - > hash(val) - E TypeError: unhashable type: 'dict' with - I suspect that once we support Index[ExtensionArray], - we'll be able to dispatch unique. 
- """ - super().test_groupby_extension_apply() - - @pytest.mark.xfail(reason="Unhashable") - def test_sort_values_frame(self): - super().test_sort_values_frame() - - @pytest.mark.xfail(reason="combine for JSONArray not supported") - def test_combine_le(self, data_repeated): - super().test_combine_le(data_repeated) - - @pytest.mark.xfail( - reason="combine for JSONArray not supported - " - "may pass depending on random data", - strict=False, - raises=AssertionError, - ) - def test_combine_first(self, data): - super().test_combine_first(data) - - @pytest.mark.skip(reason="2D support not implemented for JSONArray") - def test_view(self, data): - super().test_view(data) - - @pytest.mark.skip(reason="2D support not implemented for JSONArray") - def test_setitem_preserves_views(self, data): - super().test_setitem_preserves_views(data) - - @pytest.mark.skip(reason="2D support not implemented for JSONArray") - def test_transpose(self, data): - super().test_transpose(data) - - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") - def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): - super().test_arith_frame_with_scalar(data, all_arithmetic_operators) - - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") - def test_arith_series_with_scalar(self, data, all_arithmetic_operators): - super().test_arith_series_with_scalar(data, all_arithmetic_operators) - - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") - def test_arith_series_with_array(self, data, all_arithmetic_operators): - super().test_arith_series_with_array(data, all_arithmetic_operators) - - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") - def test_add_series_with_extension_array(self, data): - super().test_add_series_with_extension_array(data, data) - - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") - def test_divmod(self, data): - super().test_divmod(data, data) +class TestJSONArrayAccumulate(base.BaseAccumulateTests): + pass - def test_compare_array(self, data, comparison_op, request): - if comparison_op.__name__ not in ["eq", "ne"]: - mark = pytest.mark.xfail(reason="Comparison methods not implemented") - request.applymarker(mark) - super().test_compare_array(data, comparison_op) - - def test_compare_scalar(self, data, comparison_op, request): - if comparison_op.__name__ not in ["eq", "ne"]: - mark = pytest.mark.xfail(reason="Comparison methods not implemented") - request.applymarker(mark) - super().test_compare_scalar(data, comparison_op) - - def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: - return False - - def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): - dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj)) - if op_name in ["__add__", "__radd__"]: - cast_to = dtype - else: - cast_to = "boolean[pyarrow]" # type: ignore[assignment] - return pointwise_result.astype(cast_to) - - @pytest.mark.skip(reason="'<' not supported between instances of 'dict' and 'dict'") - def test_searchsorted(self, data_for_sorting, as_series): - super().test_searchsorted(self, data_for_sorting, as_series) +class TestJSONArrayCasting(base.BaseCastingTests): def test_astype_str(self, data): # Use `json.dumps(str)` instead of passing `str(obj)` directly to the super method. 
result = pd.Series(data[:5]).astype(str) @@ -148,33 +51,8 @@ def test_astype_string(self, data, nullable_string_dtype): ) tm.assert_series_equal(result, expected) - def test_array_interface(self, data): - result = np.array(data) - # Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method. - assert result[0] == json.dumps(data[0]) - - result = np.array(data, dtype=object) - # Use `json.dumps(x)` instead of passing `x` directly to the super method. - expected = np.array([json.dumps(x) for x in data], dtype=object) - # if expected.ndim > 1: - # # nested data, explicitly construct as 1D - # expected = construct_1d_object_array_from_listlike(list(data)) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.xfail(reason="Setting a dict as a scalar") - def test_fillna_series(self): - """We treat dictionaries as a mapping in fillna, not a scalar.""" - super().test_fillna_series() - - @pytest.mark.xfail(reason="Setting a dict as a scalar") - def test_fillna_frame(self): - """We treat dictionaries as a mapping in fillna, not a scalar.""" - super().test_fillna_frame() - - @pytest.mark.skip("fill-value is interpreted as a dict of values") - def test_fillna_copy_frame(self, data_missing): - super().test_fillna_copy_frame(data_missing) +class TestJSONArrayConstructors(base.BaseConstructorsTests): def test_from_dtype(self, data): # construct from our dtype & string dtype dtype = data.dtype @@ -211,7 +89,18 @@ def test_series_constructor_scalar_with_index(self, data, dtype): expected = pd.Series([scalar], index=["foo"], dtype=dtype) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(reason="Unhashable") + +@pytest.mark.skip(reason="BigQuery does not allow group by a JSON-type column.") +class TestJSONArrayGroupby(base.BaseGroupbyTests): + pass + + +class TestJSONArrayDtype(base.BaseDtypeTests): + pass + + +class TestJSONArrayGetitem(base.BaseGetitemTests): + @pytest.mark.xfail(reason="JSONDtype's type returns its storage type.") def test_getitem_scalar(self, data): """ `_getitem_` can return any JSON-types objects while `data.dtype.type` returns @@ -219,8 +108,139 @@ def test_getitem_scalar(self, data): > assert isinstance(result, data.dtype.type) E AssertionError """ - super().test_getitem_scalar() + super().test_getitem_scalar(data) + + +class TestJSONArrayIndex(base.BaseIndexTests): + pass + + +class TestJSONArrayInterface(base.BaseInterfaceTests): + def test_array_interface(self, data): + result = np.array(data) + # Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method. + assert result[0] == json.dumps(data[0]) + + result = np.array(data, dtype=object) + # Use `json.dumps(x)` instead of passing `x` directly to the super method. 
+ expected = np.array([json.dumps(x) for x in data], dtype=object) + # if expected.ndim > 1: + # # nested data, explicitly construct as 1D + # expected = construct_1d_object_array_from_listlike(list(data)) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.skip(reason="2D support not implemented for JSONArray") + def test_view(self, data): + super().test_view(data) + + +class TestJSONArrayParsing(base.BaseParsingTests): + @pytest.mark.xfail(reason="data type 'json' not understood") + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + +class TestJSONArrayMethods(base.BaseMethodsTests): + @pytest.mark.xfail(reason="Unhashable") + def test_value_counts_with_normalize(self, data): + super().test_value_counts_with_normalize(data) + + @pytest.mark.skip("fill-value is interpreted as a dict of values") + def test_fillna_copy_frame(self, data_missing): + super().test_fillna_copy_frame(data_missing) + + @pytest.mark.xfail(reason="combine for JSONArray not supported") + def test_combine_le(self, data_repeated): + super().test_combine_le(data_repeated) + + @pytest.mark.skip(reason="'<' not supported between instances of 'dict' and 'dict'") + def test_searchsorted(self, data_for_sorting, as_series): + super().test_searchsorted(self, data_for_sorting, as_series) + + @pytest.mark.xfail( + reason="`to_numpy` returns serialized JSON, " + + "while `__getitem__` returns JSON objects." + ) + def test_where_series(self, data, na_value, as_frame): + # `Series.where` calls `to_numpy` to get results. + super().test_where_series(data, na_value, as_frame) + + @pytest.mark.skip(reason="BigQuery does not allow group by a JSON-type column.") + def test_factorize(self, data_for_grouping): + super().test_factorize(data_for_grouping) + + @pytest.mark.skip(reason="BigQuery does not allow group by a JSON-type column.") + def test_factorize_equivalence(self, data_for_grouping): + super().test_factorize_equivalence(data_for_grouping) + +class TestJSONArrayMissing(base.BaseMissingTests): + @pytest.mark.xfail(reason="Setting a dict as a scalar") + def test_fillna_series(self): + """We treat dictionaries as a mapping in fillna, not a scalar.""" + super().test_fillna_series() + + @pytest.mark.xfail(reason="Setting a dict as a scalar") + def test_fillna_frame(self): + """We treat dictionaries as a mapping in fillna, not a scalar.""" + super().test_fillna_frame() + + +@pytest.mark.skip(reason="BigQuery JSON does not allow Arithmetic Ops.") +class TestJSONArrayArithmeticOps(base.BaseArithmeticOpsTests): + pass + + +class TestJSONArrayComparisonOps(base.BaseComparisonOpsTests): + def test_compare_array(self, data, comparison_op, request): + if comparison_op.__name__ not in ["eq", "ne"]: + mark = pytest.mark.xfail(reason="Comparison methods not implemented") + request.applymarker(mark) + super().test_compare_array(data, comparison_op) + + def test_compare_scalar(self, data, comparison_op, request): + if comparison_op.__name__ not in ["eq", "ne"]: + mark = pytest.mark.xfail(reason="Comparison methods not implemented") + request.applymarker(mark) + super().test_compare_scalar(data, comparison_op) + + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): + dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj)) + if op_name in ["__add__", "__radd__"]: + cast_to = dtype + else: + cast_to = "boolean[pyarrow]" # type: ignore[assignment] + return pointwise_result.astype(cast_to) + + +class 
TestJSONArrayUnaryOps(base.BaseUnaryOpsTests): + pass + + +class TestJSONArrayPrinting(base.BasePrintingTests): + pass + + +class TestJSONArrayReduce(base.BaseReduceTests): + pass + + +class TestJSONArrayReshaping(base.BaseReshapingTests): + @pytest.mark.skip(reason="2D support not implemented for JSONArray") + def test_transpose(self, data): + super().test_transpose(data) + + @pytest.mark.xfail( + reason="`to_numpy` returns serialized JSON, " + + "while `__getitem__` returns JSON objects." + ) + def test_transpose_frame(self, data): + # `DataFrame.T` calls `to_numpy` to get results. + super().test_transpose_frame(data) + + +class TestJSONArraySetitem(base.BaseSetitemTests): # Patching `[....] * len()` to base.BaseSetitemTests because pandas' internals # has trouble setting sequences of values into scalar positions. @@ -241,65 +261,6 @@ def test_setitem_integer_array(self, data, idx, box_in_series): arr[idx] = [arr[0]] * len(arr[idx]) tm.assert_equal(arr, expected) - @pytest.mark.parametrize("setter", ["loc", None]) - def test_setitem_mask_broadcast(self, data, setter): - ser = pd.Series(data) - mask = np.zeros(len(data), dtype=bool) - mask[:2] = True - - if setter: # loc - target = getattr(ser, setter) - else: # __setitem__ - target = ser - - # Use `[data[10]] * len()` instead of passing `data[10]` directly to the super method. - target[mask] = [data[10]] * len(target[mask]) - assert ser[0] == data[10] - assert ser[1] == data[10] - - def test_setitem_loc_scalar_mixed(self, data): - df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - # Use `[data[1]]` instead of passing `data[1]` directly to the super method. - df.loc[0, "B"] = [data[1]] - assert df.loc[0, "B"] == data[1] - - @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") - def test_setitem_loc_scalar_single(self, data): - super().test_setitem_loc_scalar_single(data) - - def test_setitem_loc_iloc_slice(self, data): - arr = data[:5].copy() - s = pd.Series(arr, index=["a", "b", "c", "d", "e"]) - expected = pd.Series(data.take([0, 0, 0, 3, 4]), index=s.index) - - result = s.copy() - # Use `[data[0]] * len()` instead of passing `data[0]` directly to the super method. - result.iloc[:3] = [data[0]] * len(result.iloc[:3]) - tm.assert_equal(result, expected) - - result = s.copy() - result.loc[:"c"] = [data[0]] * len(result.loc[:"c"]) - tm.assert_equal(result, expected) - - @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") - def test_setitem_iloc_scalar_single(self, data): - super().test_setitem_iloc_scalar_single(data) - - def test_setitem_iloc_scalar_mixed(self, data): - df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - # Use `[data[1]] * len()` instead of passing `data[1]` directly to the super method. 
- df.iloc[0, 1] = [data[1]] * len(df.iloc[0, 1]) - assert df.loc[0, "B"] == data[1] - - @pytest.mark.xfail(reason="eq not implemented for ") - def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): - super().test_setitem_mask_boolean_array_with_na(data, box_in_series) - - @pytest.mark.parametrize("setter", ["loc", "iloc"]) - @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") - def test_setitem_scalar(self, data, setter): - super().test_setitem_scalar(data, setter) - @pytest.mark.parametrize( "mask", [ @@ -319,21 +280,19 @@ def test_setitem_mask(self, data, mask, box_in_series): arr[mask] = [data[0]] * len(arr[mask]) tm.assert_equal(expected, arr) - @pytest.mark.xfail(reasons="Setting a `dict` to an expansion row is not supported") - def test_setitem_with_expansion_row(self, data, na_value): - super().test_setitem_with_expansion_row(data, na_value) + def test_setitem_loc_iloc_slice(self, data): + arr = data[:5].copy() + s = pd.Series(arr, index=["a", "b", "c", "d", "e"]) + expected = pd.Series(data.take([0, 0, 0, 3, 4]), index=s.index) - def test_setitem_iloc_scalar_multiple_homogoneous(self, data): - df = pd.DataFrame({"A": data, "B": data}) - # Use `[data[1]]` instead of passing `data[1]` directly to the super method. - df.iloc[10, 1] = [data[1]] - assert df.loc[10, "B"] == data[1] + result = s.copy() + # Use `[data[0]] * len()` instead of passing `data[0]` directly to the super method. + result.iloc[:3] = [data[0]] * len(result.iloc[:3]) + tm.assert_equal(result, expected) - def test_setitem_loc_scalar_multiple_homogoneous(self, data): - df = pd.DataFrame({"A": data, "B": data}) - # Use `[data[1]]` instead of passing `data[1]` directly to the super method. - df.loc[10, "B"] = [data[1]] - assert df.loc[10, "B"] == data[1] + result = s.copy() + result.loc[:"c"] = [data[0]] * len(result.loc[:"c"]) + tm.assert_equal(result, expected) def test_setitem_slice(self, data, box_in_series): arr = data[:5].copy() @@ -350,11 +309,6 @@ def test_setitem_slice(self, data, box_in_series): def test_setitem_2d_values(self, data): super().test_setitem_2d_values(data) - @pytest.mark.xfail(reason="data type 'json' not understood") - @pytest.mark.parametrize("engine", ["c", "python"]) - def test_EA_types(self, engine, data, request): - super().test_EA_types(engine, data, request) - @pytest.mark.xfail( reason="`to_numpy` returns serialized JSON, " + "while `__getitem__` returns JSON objects." @@ -362,18 +316,30 @@ def test_EA_types(self, engine, data, request): def test_setitem_frame_2d_values(self, data): super().test_setitem_frame_2d_values(data) - @pytest.mark.xfail( - reason="`to_numpy` returns serialized JSON, " - + "while `__getitem__` returns JSON objects." - ) - def test_transpose_frame(self, data): - # `DataFrame.T` calls `to_numpy` to get results. - super().test_transpose_frame(data) + @pytest.mark.parametrize("setter", ["loc", None]) + def test_setitem_mask_broadcast(self, data, setter): + ser = pd.Series(data) + mask = np.zeros(len(data), dtype=bool) + mask[:2] = True - @pytest.mark.xfail( - reason="`to_numpy` returns serialized JSON, " - + "while `__getitem__` returns JSON objects." - ) - def test_where_series(self, data, na_value, as_frame): - # `Series.where` calls `to_numpy` to get results. - super().test_where_series(data, na_value, as_frame) + if setter: # loc + target = getattr(ser, setter) + else: # __setitem__ + target = ser + + # Use `[data[10]] * len()` instead of passing `data[10]` directly to the super method. 
+ target[mask] = [data[10]] * len(target[mask]) + assert ser[0] == data[10] + assert ser[1] == data[10] + + @pytest.mark.xfail(reason="eq not implemented for ") + def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): + super().test_setitem_mask_boolean_array_with_na(data, box_in_series) + + @pytest.mark.skip(reason="2D support not implemented for JSONArray") + def test_setitem_preserves_views(self, data): + super().test_setitem_preserves_views(data) + + +class TestJSONArrayDim2Compat(base.Dim2CompatTests): + pass From 279882508ab2aee9f8c81746d2d03efb75d5328b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 5 Aug 2024 17:59:49 +0000 Subject: [PATCH 15/28] others --- tests/compliance/json/conftest.py | 8 -------- tests/compliance/json/test_json_compliance.py | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py index 20fe2f6..74870c4 100644 --- a/tests/compliance/json/conftest.py +++ b/tests/compliance/json/conftest.py @@ -18,7 +18,6 @@ import numpy as np import pandas as pd -import pandas._testing as tm import pytest from db_dtypes import JSONArray, JSONDtype @@ -79,13 +78,6 @@ def data_missing(): return JSONArray._from_sequence([None, {"a": 10}]) -@pytest.fixture -def data_for_sorting(): - return JSONArray._from_sequence( - [json.dumps({"b": 1}), json.dumps({"c": 4}), json.dumps({"a": 2, "c": 3})] - ) - - @pytest.fixture def data_missing_for_sorting(): return JSONArray._from_sequence([json.dumps({"b": 1}), None, json.dumps({"a": 4})]) diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index 443dc60..18610a0 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -174,6 +174,22 @@ def test_factorize(self, data_for_grouping): def test_factorize_equivalence(self, data_for_grouping): super().test_factorize_equivalence(data_for_grouping) + @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.") + def test_argsort(self, data_for_sorting): + super().test_argsort(data_for_sorting) + + @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.") + def test_argmin_argmax(self, data_for_sorting): + super().test_argmin_argmax(data_for_sorting) + + @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.") + def test_sort_values(self, data_for_sorting): + super().test_sort_values(data_for_sorting) + + @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.") + def test_sort_values_frame(self, data_for_sorting): + super().test_sort_values_frame(data_for_sorting) + class TestJSONArrayMissing(base.BaseMissingTests): @pytest.mark.xfail(reason="Setting a dict as a scalar") From efe72cceb5b346e8d74e4ab54e47d78a6205bdb1 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 6 Aug 2024 18:49:30 +0000 Subject: [PATCH 16/28] skip jsondtype and jsonarray --- db_dtypes/__init__.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 4cb45c5..dd17fb1 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -343,13 +343,21 @@ def __sub__(self, other): return super().__sub__(other) - -__all__ = [ - "__version__", - "DateArray", - "DateDtype", - "JSONDtype", - "JSONArray", - "TimeArray", - "TimeDtype", -] +if not JSONArray or not JSONDtype: + __all__ = [ + "__version__", + "DateArray", + 
"DateDtype", + "TimeArray", + "TimeDtype", + ] +else: + __all__ = [ + "__version__", + "DateArray", + "DateDtype", + "JSONDtype", + "JSONArray", + "TimeArray", + "TimeDtype", + ] From 98adb5a1cbb96542f2d1be6ca56f2d80328e6df7 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 6 Aug 2024 18:50:59 +0000 Subject: [PATCH 17/28] fixing --- db_dtypes/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index dd17fb1..d27e93e 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -343,6 +343,7 @@ def __sub__(self, other): return super().__sub__(other) + if not JSONArray or not JSONDtype: __all__ = [ "__version__", From 790f2577601865c2d922d462445ce3576ee5acc1 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 6 Aug 2024 18:57:21 +0000 Subject: [PATCH 18/28] fix coverage file name --- .github/workflows/unittest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 81ff447..0c2dca0 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -76,7 +76,7 @@ jobs: python -m pip install nox - name: Run compliance tests env: - COVERAGE_FILE: .coverage-${{ matrix.python }} + COVERAGE_FILE: .coverage-compliance-${{ matrix.python }} run: | nox -s compliance-${{ matrix.python }} - name: Upload coverage results From 8800b6bc11f4600e5ec8b6ba5864d151672d63ec Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 6 Aug 2024 23:01:11 +0000 Subject: [PATCH 19/28] add a simple unit test --- tests/unit/test_json.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 tests/unit/test_json.py diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py new file mode 100644 index 0000000..538fb4d --- /dev/null +++ b/tests/unit/test_json.py @@ -0,0 +1,34 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import datetime as dt +from typing import Optional + +import pandas +import pandas.api.extensions +import pandas.testing +import pyarrow +import pytest + +import packaging.version + +import db_dtypes + +is_supported_version = packaging.version.Version(pandas.__version__) >= packaging.version.Version("1.5.0") + +@pytest.mark.skipif(not is_supported_version, reason="requires Pandas 1.5.0 and above") +def test_constructor_from_sequence(): + json_obj = [0, "str", {"a": 0, "b": 1}] + data = db_dtypes.JSONArray._from_sequence(json_obj) From b4cfcd91d10003f41f242df965f8d957e68a4eb5 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 7 Aug 2024 21:50:41 +0000 Subject: [PATCH 20/28] unit-test for some functionalities --- db_dtypes/json.py | 20 +++-------- tests/unit/test_json.py | 74 ++++++++++++++++++++++++++++++++++------- 2 files changed, 67 insertions(+), 27 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 0cf88d6..a8a6caa 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -15,7 +15,6 @@ from __future__ import annotations import json -import typing import numpy as np import pandas as pd @@ -69,10 +68,10 @@ def construct_array_type(cls): """Return the array type associated with this dtype.""" return JSONArray - @staticmethod - def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray: - """Convert to JSONArray from an Arrow array.""" - return JSONArray(array) + # @staticmethod + # def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray: + # """Convert to JSONArray from an Arrow array.""" + # return JSONArray(array) class JSONArray(arrays.ArrowExtensionArray): @@ -143,18 +142,9 @@ def _box_pa_array( @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): """Construct a new ExtensionArray from a sequence of scalars.""" - result = [] - for scalar in scalars: - result.append(JSONArray._serialize_json(scalar)) + result = [JSONArray._serialize_json(scalar) for scalar in scalars] return cls(pa.array(result, type=pa.string(), from_pandas=True)) - @classmethod - def _from_sequence_of_strings( - cls, strings, *, dtype, copy: bool = False - ) -> JSONArray: - """Construct a new ExtensionArray from a sequence of strings.""" - return cls._from_sequence(strings, dtype=dtype, copy=copy) - @classmethod def _concat_same_type(cls, to_concat) -> JSONArray: """Concatenate multiple JSONArray.""" diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py index 538fb4d..d18586a 100644 --- a/tests/unit/test_json.py +++ b/tests/unit/test_json.py @@ -13,22 +13,72 @@ # limitations under the License. -import datetime as dt -from typing import Optional +import json -import pandas -import pandas.api.extensions +import pandas as pd import pandas.testing -import pyarrow import pytest -import packaging.version - import db_dtypes -is_supported_version = packaging.version.Version(pandas.__version__) >= packaging.version.Version("1.5.0") +# Check for minimum Pandas version. 
+pytest.importorskip("pandas", minversion="1.5.0") + + +# # Python data types mirroring all standard JSON types +# https://json-schema.org/understanding-json-schema/reference/type +JSON_DATA = { + "boolean": True, + "int": 100, + "float": 0.98, + "string": "hello world", + "array": [0.1, 0.2], + "dict": { + "null_field": None, + "order": { + "items": ["book", "pen", "computer"], + "total": 15.99, + "address": {"street": "123 Main St", "city": "Anytown"}, + }, + }, + "null": None, +} + + +def test_get_items(): + data = db_dtypes.JSONArray._from_sequence(JSON_DATA.values()) + for id, key in enumerate(JSON_DATA.keys()): + if key == "null": + assert pd.isna(data[id]) + else: + assert data[id] == JSON_DATA[key] + + +def test_get_items_unbox_object(): + data = db_dtypes.JSONArray._from_sequence([JSON_DATA["dict"]]) + assert len(data[0]) == 2 + + assert data[0]["null_field"] is None + assert data[0]["order"]["address"]["city"] == "Anytown" + assert len(data[0]["order"]["items"]) == 3 + assert data[0]["order"]["items"][0] == "book" + + with pytest.raises(KeyError): + data[0]["unknown"] + + +def test_to_numpy(): + s = pd.Series(db_dtypes.JSONArray._from_sequence(JSON_DATA.values())) + data = s.to_numpy() + for id, key in enumerate(JSON_DATA.keys()): + if key == "null": + assert pd.isna(data[id]) + else: + assert data[id] == json.dumps(JSON_DATA[key], sort_keys=True) + -@pytest.mark.skipif(not is_supported_version, reason="requires Pandas 1.5.0 and above") -def test_constructor_from_sequence(): - json_obj = [0, "str", {"a": 0, "b": 1}] - data = db_dtypes.JSONArray._from_sequence(json_obj) +def test_deterministic_json_serialization(): + x = {"a": 0, "b": 1} + y = {"b": 1, "a": 0} + data = db_dtypes.JSONArray._from_sequence([x]) + assert y in data From 17f560e6414c9e7ff4380099558723aa37790b76 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 7 Aug 2024 22:17:37 +0000 Subject: [PATCH 21/28] address comments --- db_dtypes/json.py | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index a8a6caa..3a2b0ee 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -24,15 +24,6 @@ import pyarrow as pa import pyarrow.compute -ARROW_CMP_FUNCS = { - "eq": pyarrow.compute.equal, - "ne": pyarrow.compute.not_equal, - "lt": pyarrow.compute.less, - "gt": pyarrow.compute.greater, - "le": pyarrow.compute.less_equal, - "ge": pyarrow.compute.greater_equal, -} - @pd.api.extensions.register_extension_dtype class JSONDtype(pd.api.extensions.ExtensionDtype): @@ -68,11 +59,6 @@ def construct_array_type(cls): """Return the array type associated with this dtype.""" return JSONArray - # @staticmethod - # def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray: - # """Convert to JSONArray from an Arrow array.""" - # return JSONArray(array) - class JSONArray(arrays.ArrowExtensionArray): """Extension array that handles BigQuery JSON data, leveraging a string-based @@ -95,26 +81,26 @@ def _box_pa( cls, value, pa_type: pa.DataType | None = None ) -> pa.Array | pa.ChunkedArray | pa.Scalar: """Box value into a pyarrow Array, ChunkedArray or Scalar.""" + if pa_type is not None and pa_type != pa.string(): + raise ValueError(f"Unsupported type '{pa_type}' for JSONArray") if isinstance(value, pa.Scalar) or not ( common.is_list_like(value) and not common.is_dict_like(value) ): - return cls._box_pa_scalar(value, pa_type) - return cls._box_pa_array(value, pa_type) + return cls._box_pa_scalar(value) + return 

From 7add79219a23f435a3a515a798363c02a6e8e003 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 8 Aug 2024 00:17:56 +0000
Subject: [PATCH 22/28] fix test coverage

---
 db_dtypes/json.py       | 50 ++++++++---------------------------
 tests/unit/test_json.py | 25 ++++++++++++++++++---
 2 files changed, 31 insertions(+), 44 deletions(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 3a2b0ee..9db42d6 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -81,8 +81,7 @@ def _box_pa(
         cls, value, pa_type: pa.DataType | None = None
     ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
         """Box value into a pyarrow Array, ChunkedArray or Scalar."""
-        if pa_type is not None and pa_type != pa.string():
-            raise ValueError(f"Unsupported type '{pa_type}' for JSONArray")
+        assert pa_type is None or pa_type == pa.string()
 
         if isinstance(value, pa.Scalar) or not (
             common.is_list_like(value) and not common.is_dict_like(value)
@@ -93,8 +92,6 @@ def _box_pa_scalar(cls, value) -> pa.Scalar:
         """Box value into a pyarrow Scalar."""
-        if isinstance(value, pa.Scalar):
-            pa_scalar = value
         if pd.isna(value):
             pa_scalar = pa.scalar(None, type=pa.string())
         else:
             value = JSONArray._serialize_json(value)
             pa_scalar = pa.scalar(value, type=pa.string(), from_pandas=True)
@@ -104,33 +101,21 @@ def _box_pa_scalar(cls, value) -> pa.Scalar:
         return pa_scalar
 
     @classmethod
-    def _box_pa_array(
-        cls, value, pa_type: pa.DataType | None = None, copy: bool = False
-    ) -> pa.Array | pa.ChunkedArray:
+    def _box_pa_array(cls, value, copy: bool = False) -> pa.Array | pa.ChunkedArray:
         """Box value into a pyarrow Array or ChunkedArray."""
         if isinstance(value, cls):
             pa_array = value._pa_array
-        elif isinstance(value, (pa.Array, pa.ChunkedArray)):
-            pa_array = value
         else:
-            try:
-                value = [JSONArray._serialize_json(x) for x in value]
-                pa_array = pa.array(value, type=pa_type, from_pandas=True)
-            except (pa.ArrowInvalid, pa.ArrowTypeError):
-                # https://github.com/pandas-dev/pandas/pull/50430:
-                # let pyarrow infer type, then cast
-                pa_array = pa.array(value, from_pandas=True)
-
-            if pa_type is not None and pa_array.type != pa_type:
-                pa_array = pa_array.cast(pa_type)
-
+            value = [JSONArray._serialize_json(x) for x in value]
+            pa_array = pa.array(value, type=pa.string(), from_pandas=True)
         return pa_array
 
     @classmethod
     def _from_sequence(cls, scalars, *, dtype=None, copy=False):
         """Construct a new ExtensionArray from a sequence of scalars."""
-        result = [JSONArray._serialize_json(scalar) for scalar in scalars]
-        return cls(pa.array(result, type=pa.string(), from_pandas=True))
+        pa_array = cls._box_pa(scalars)
+        arr = cls(pa_array)
+        return arr
 
     @classmethod
     def _concat_same_type(cls, to_concat) -> JSONArray:
@@ -137,13 +122,8 @@ def _concat_same_type(cls, to_concat) -> JSONArray:
         """Concatenate multiple JSONArray."""
         chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
         arr = pa.chunked_array(chunks, type=pa.string())
         return cls(arr)
 
-    @classmethod
-    def _from_factorized(cls, values, original):
-        """Reconstruct an ExtensionArray after factorization."""
-        return cls._from_sequence(values, dtype=original.dtype)
-
     @staticmethod
     def _serialize_json(value):
         """A static method that converts a JSON value into a string representation."""
@@ -202,19 +182,6 @@ def __getitem__(self, item):
                 r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                 r"(`None`) and integer or boolean arrays are valid indices"
             )
-        # We are not an array indexer, so maybe e.g. a slice or integer
-        # indexer. We dispatch to pyarrow.
-        if isinstance(item, slice):
-            # Arrow bug https://github.com/apache/arrow/issues/38768
-            if item.start == item.stop:
-                pass
-            elif (
-                item.stop is not None
-                and item.stop < -len(self)
-                and item.step is not None
-                and item.step < 0
-            ):
-                item = slice(item.start, None, item.step)
 
         value = self._pa_array[item]
         if isinstance(value, pa.ChunkedArray):
@@ -229,7 +196,8 @@ def __iter__(self):
 
         """Iterate over elements of the array."""
         for value in self._pa_array:
-            val = JSONArray._deserialize_json(value.as_py())
+            val = value.as_py()
+            # val = JSONArray._deserialize_json(value.as_py())
             if val is None:
                 yield self._dtype.na_value
             else:
diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py
index d18586a..5e389a2 100644
--- a/tests/unit/test_json.py
+++ b/tests/unit/test_json.py
@@ -15,8 +15,8 @@
 
 import json
 
+import numpy as np
 import pandas as pd
-import pandas.testing
 import pytest
 
 import db_dtypes
@@ -45,7 +45,12 @@
 }
 
 
-def test_get_items():
+def test_construct_w_unsupported_types():
+    with pytest.raises(ValueError):
+        db_dtypes.JSONArray(100)
+
+
+def test_getitems_return_json_objects():
     data = db_dtypes.JSONArray._from_sequence(JSON_DATA.values())
     for id, key in enumerate(JSON_DATA.keys()):
         if key == "null":
@@ -54,7 +59,7 @@ def test_get_items():
             assert data[id] == JSON_DATA[key]
 
 
-def test_get_items_unbox_object():
+def test_getitems_w_unboxed_dict():
     data = db_dtypes.JSONArray._from_sequence([JSON_DATA["dict"]])
     assert len(data[0]) == 2
 
@@ -67,6 +72,20 @@ def test_get_items_unbox_object():
         data[0]["unknown"]
 
 
+def test_getitems_w_invalid_numpy_array():
+    data = db_dtypes.JSONArray._from_sequence(JSON_DATA.values())
+    idx = np.array(["str"])
+    with pytest.raises(IndexError):
+        data[idx]
+
+
+def test_getitems_when_iter_with_null():
+    data = db_dtypes.JSONArray._from_sequence([JSON_DATA["null"]])
+    s = pd.Series(data)
+    result = s[:1].item()
+    assert pd.isna(result)
+
+
 def test_to_numpy():
     s = pd.Series(db_dtypes.JSONArray._from_sequence(JSON_DATA.values()))
     data = s.to_numpy()
     for id, key in enumerate(JSON_DATA.keys()):
         if key == "null":
             assert pd.isna(data[id])
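
[Note on PATCH 22]
`_from_sequence` now funnels through `_box_pa`, so a plain Python sequence, an
existing `JSONArray`, or a scalar all take the same boxing path. An illustrative
sketch (assumes db_dtypes with this patch applied):

    import db_dtypes

    data = db_dtypes.JSONArray._from_sequence([True, 100, {"a": 0}, None])

    # Membership compares against the deserialized values, as in
    # test_deterministic_json_serialization above.
    assert {"a": 0} in data
    assert len(data) == 4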

From ba516c71862069fbecf1ef098a44d0f2da577f3c Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 8 Aug 2024 00:22:41 +0000
Subject: [PATCH 23/28] fixing

---
 db_dtypes/json.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 9db42d6..6045b07 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -196,8 +196,7 @@ def __getitem__(self, item):
 
     def __iter__(self):
         """Iterate over elements of the array."""
         for value in self._pa_array:
-            val = value.as_py()
-            # val = JSONArray._deserialize_json(value.as_py())
+            val = JSONArray._deserialize_json(value.as_py())
             if val is None:
                 yield self._dtype.na_value

From 0185f0847341976e3261b84705db7c5c6585d168 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?=
Date: Thu, 8 Aug 2024 10:03:27 -0500
Subject: [PATCH 24/28] Update db_dtypes/json.py

---
 db_dtypes/json.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 6045b07..27fdc15 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -153,7 +153,8 @@ def _cmp_method(self, other, op):
         elif op.__name__ == "ne":
             result = pyarrow.compute.not_equal(self._pa_array, self._box_pa(other))
         else:
-            raise NotImplementedError(f"{op.__name__} not implemented for JSONArray")
+            # Comparison is not a meaningful one. We don't want to support sorting by JSON columns.
+            raise TypeError(f"{op.__name__} not supported for JSONArray")
         return arrays.ArrowExtensionArray(result)
 
     def __getitem__(self, item):
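
[Note on PATCH 24]
Ordering operators now fail fast with `TypeError` instead of `NotImplementedError`,
which matches how pandas signals an unsupported comparison. A sketch of the expected
failure mode (assumes db_dtypes with this patch applied):

    import pytest

    import db_dtypes

    data = db_dtypes.JSONArray._from_sequence([10, 20])

    # lt/gt/le/ge are rejected: sorting by JSON columns is deliberately unsupported.
    with pytest.raises(TypeError):
        data < data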

From dac34315969da0a8f930a19f242bfbf55b22f15d Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 8 Aug 2024 17:09:36 +0000
Subject: [PATCH 25/28] fixing

---
 db_dtypes/json.py       | 2 +-
 tests/unit/test_json.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 27fdc15..9f92134 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -171,7 +171,7 @@ def __getitem__(self, item):
             else:
                 raise IndexError(
                     "Only integers, slices and integer or "
-                    "boolean arrays are valid indices."
+                    + "boolean arrays are valid indices."
                 )
         elif isinstance(item, tuple):
             item = indexers.unpack_tuple_and_ellipses(item)
diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py
index 5e389a2..dc76d6d 100644
--- a/tests/unit/test_json.py
+++ b/tests/unit/test_json.py
@@ -25,7 +25,7 @@
 pytest.importorskip("pandas", minversion="1.5.0")
 
 
-# # Python data types mirroring all standard JSON types
+# Python data types mirroring all standard JSON types:
 # https://json-schema.org/understanding-json-schema/reference/type
 JSON_DATA = {
     "boolean": True,
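
[Note on PATCH 25]
The `IndexError` message itself is unchanged; adjacent string literals already
concatenate implicitly, and the added `+` only makes the continuation explicit.
A two-line check of that equivalence:

    implicit = ("Only integers, slices and integer or "
                "boolean arrays are valid indices.")
    explicit = ("Only integers, slices and integer or "
                + "boolean arrays are valid indices.")
    assert implicit == explicit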

From 780024237cc11f8a291b56472f8b868235470f5c Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 8 Aug 2024 17:43:15 +0000
Subject: [PATCH 26/28] fixing

---
 db_dtypes/json.py       | 15 ++++++++-------
 tests/unit/test_json.py |  7 -------
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 9f92134..0192e66 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -120,7 +120,11 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False):
 
     @classmethod
     def _concat_same_type(cls, to_concat) -> JSONArray:
         """Concatenate multiple JSONArray."""
-        chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
+        chunks = [
+            pa_array_chunks
+            for item in to_concat
+            for pa_array_chunks in item._pa_array.iterchunks()
+        ]
         arr = pa.chunked_array(chunks, type=pa.string())
         return cls(arr)
@@ -166,13 +170,10 @@ def __getitem__(self, item):
             if not len(item):
                 return type(self)(pa.chunked_array([], type=pa.string()))
             elif item.dtype.kind in "iu":
                 return self.take(item)
-            elif item.dtype.kind == "b":
-                return type(self)(self._pa_array.filter(item))
             else:
-                raise IndexError(
-                    "Only integers, slices and integer or "
-                    + "boolean arrays are valid indices."
-                )
+                # `check_array_indexer` should verify that the assertion holds true.
+                assert item.dtype.kind == "b"
+                return type(self)(self._pa_array.filter(item))
         elif isinstance(item, tuple):
             item = indexers.unpack_tuple_and_ellipses(item)
diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py
index dc76d6d..ea2be7a 100644
--- a/tests/unit/test_json.py
+++ b/tests/unit/test_json.py
@@ -72,13 +72,6 @@ def test_getitems_w_unboxed_dict():
     with pytest.raises(KeyError):
         data[0]["unknown"]
 
 
-def test_getitems_w_invalid_numpy_array():
-    data = db_dtypes.JSONArray._from_sequence(JSON_DATA.values())
-    idx = np.array(["str"])
-    with pytest.raises(IndexError):
-        data[idx]
-
-
 def test_getitems_when_iter_with_null():
     data = db_dtypes.JSONArray._from_sequence([JSON_DATA["null"]])
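
[Note on PATCH 26]
After this patch `__getitem__` keeps two ndarray paths: integer arrays dispatch to
`take`, and anything else is asserted to be a boolean mask and filtered through
pyarrow. A sketch of both paths (assumes db_dtypes with this patch applied):

    import numpy as np

    import db_dtypes

    data = db_dtypes.JSONArray._from_sequence([{"a": 0}, 100, "hello"])

    taken = data[np.array([0, 2])]                 # integer indexer -> take()
    masked = data[np.array([True, False, True])]   # boolean mask -> filter()
    assert len(taken) == len(masked) == 2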

From 913d0bc977c7e2f9893dc4b4070cb2c5fca48035 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 8 Aug 2024 18:05:50 +0000
Subject: [PATCH 27/28] add pyarrow_dtypes

---
 db_dtypes/json.py | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 0192e66..ed04b72 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -46,6 +46,11 @@ def type(self) -> type[str]:
         """
         return str
 
+    @property
+    def pyarrow_dtype(self):
+        """Return the pyarrow data type used for storing data in the pyarrow array."""
+        return pa.string()
+
     @property
     def _is_numeric(self) -> bool:
         return False
@@ -81,7 +86,7 @@ def _box_pa(
         cls, value, pa_type: pa.DataType | None = None
     ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
         """Box value into a pyarrow Array, ChunkedArray or Scalar."""
-        assert pa_type is None or pa_type == pa.string()
+        assert pa_type is None or pa_type == cls._dtype.pyarrow_dtype
 
         if isinstance(value, pa.Scalar) or not (
             common.is_list_like(value) and not common.is_dict_like(value)
@@ -93,10 +98,12 @@ def _box_pa_scalar(cls, value) -> pa.Scalar:
         """Box value into a pyarrow Scalar."""
         if pd.isna(value):
-            pa_scalar = pa.scalar(None, type=pa.string())
+            pa_scalar = pa.scalar(None, type=cls._dtype.pyarrow_dtype)
         else:
             value = JSONArray._serialize_json(value)
-            pa_scalar = pa.scalar(value, type=pa.string(), from_pandas=True)
+            pa_scalar = pa.scalar(
+                value, type=cls._dtype.pyarrow_dtype, from_pandas=True
+            )
 
         return pa_scalar
@@ -107,7 +114,7 @@ def _box_pa_array(cls, value, copy: bool = False) -> pa.Array | pa.ChunkedArray:
         """Box value into a pyarrow Array or ChunkedArray."""
         if isinstance(value, cls):
             pa_array = value._pa_array
         else:
             value = [JSONArray._serialize_json(x) for x in value]
-            pa_array = pa.array(value, type=pa.string(), from_pandas=True)
+            pa_array = pa.array(value, type=cls._dtype.pyarrow_dtype, from_pandas=True)
         return pa_array
@@ -117,17 +124,6 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False):
         """Construct a new ExtensionArray from a sequence of scalars."""
         pa_array = cls._box_pa(scalars)
         arr = cls(pa_array)
         return arr
 
-    @classmethod
-    def _concat_same_type(cls, to_concat) -> JSONArray:
-        """Concatenate multiple JSONArray."""
-        chunks = [
-            pa_array_chunks
-            for item in to_concat
-            for pa_array_chunks in item._pa_array.iterchunks()
-        ]
-        arr = pa.chunked_array(chunks, type=pa.string())
-        return cls(arr)
-
     @staticmethod
@@ -174,7 +170,7 @@ def __getitem__(self, item):
 
         if isinstance(item, np.ndarray):
             if not len(item):
-                return type(self)(pa.chunked_array([], type=pa.string()))
+                return type(self)(pa.chunked_array([], type=self.dtype.pyarrow_dtype))
             elif item.dtype.kind in "iu":
                 return self.take(item)
             else:
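
[Note on PATCH 27]
`pyarrow_dtype` gives the storage type a single definition instead of repeating
`pa.string()` at each call site. A quick check of the new property (assumes
db_dtypes with this patch applied):

    import pyarrow as pa

    import db_dtypes

    dtype = db_dtypes.JSONDtype()
    assert dtype.pyarrow_dtype == pa.string()  # JSON is stored as Arrow strings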

From 01eef453e17bcce2c1760ec40baa2ce32587cd97 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 8 Aug 2024 18:14:12 +0000
Subject: [PATCH 28/28] fixing

---
 tests/unit/test_json.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py
index ea2be7a..c48635d 100644
--- a/tests/unit/test_json.py
+++ b/tests/unit/test_json.py
@@ -15,7 +15,6 @@
 
 import json
 
-import numpy as np
 import pandas as pd
 import pytest
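
[Note on PATCH 28]
With the now-unused `numpy` import removed, the series is complete. An end-to-end
sketch of the dtype as it stands after these 28 patches (assumes db_dtypes is
installed):

    import pandas as pd

    import db_dtypes  # noqa: F401  (registers the "dbjson" extension dtype)

    s = pd.Series([{"a": 0}, None, [1, 2]], dtype="dbjson")
    assert pd.isna(s[1])   # nulls come back as a missing value
    assert s[0]["a"] == 0  # dicts are unboxed back into Python objects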