From d0e94d51c8823c44f27263ccb75efff86b15ac71 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 8 Jul 2024 22:51:03 +0000 Subject: [PATCH 01/28] Copy JSONDtype and JSONArray from tests/extension/json and their tests --- db_dtypes/__init__.py | 3 + db_dtypes/json.py | 273 +++++++++++ tests/compliance/json/conftest.py | 223 +++++++++ tests/compliance/json/test_json_compliance.py | 444 ++++++++++++++++++ .../json/test_json_compliance_1_5.py | 31 ++ 5 files changed, 974 insertions(+) create mode 100644 db_dtypes/json.py create mode 100644 tests/compliance/json/conftest.py create mode 100644 tests/compliance/json/test_json_compliance.py create mode 100644 tests/compliance/json/test_json_compliance_1_5.py diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index ad4ea33..076270f 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -28,6 +28,7 @@ import pyarrow.compute from db_dtypes import core +from db_dtypes.json import JSONArray, JSONDtype from db_dtypes.version import __version__ date_dtype_name = "dbdate" @@ -341,6 +342,8 @@ def __sub__(self, other): "__version__", "DateArray", "DateDtype", + "JSONDtype", + "JSONArray", "TimeArray", "TimeDtype", ] diff --git a/db_dtypes/json.py b/db_dtypes/json.py new file mode 100644 index 0000000..72f4c2c --- /dev/null +++ b/db_dtypes/json.py @@ -0,0 +1,273 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from collections import UserDict, abc +import itertools +import numbers +import string +import sys +from typing import TYPE_CHECKING, Any + +import numpy as np +import pandas as pd +from pandas.api.extensions import ExtensionArray, ExtensionDtype +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike +from pandas.core.dtypes.common import is_bool_dtype, is_list_like, pandas_dtype +from pandas.core.indexers import unpack_tuple_and_ellipses + +if TYPE_CHECKING: + from collections.abc import Mapping + + from pandas._typing import type_t + + +@pd.api.extensions.register_extension_dtype +class JSONDtype(pd.api.extensions.ExtensionDtype): + """Extension dtype for JSON data.""" + + # type = str + + type = abc.Mapping + name = "dbjson" + # na_value = pd.NA # TODO: StringDtype is libmissing.NA + + na_value: Mapping[str, Any] = UserDict() + # _is_numeric = False + # _is_boolean = False + + @classmethod + def construct_array_type(cls): + """Return the array type associated with this dtype.""" + return JSONArray + + # @staticmethod + # def __from_arrow__( + # array: Union[pyarrow.Array, pyarrow.ChunkedArray] + # ) -> "JSONArray": + # """Convert to JSONArray from an Arrow array. 
+ + # See: + # https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow + # """ + # if isinstance(array, pyarrow.Array): + # chunks = [array] + # else: + # chunks = array.chunks + + # results = [] + # for arr in chunks: + # # convert chunk by chunk to numpy and concatenate then, to avoid + # # overflow for large string data when concatenating the pyarrow arrays + # arr = arr.to_numpy(zero_copy_only=False) + # arr = ensure_string_array(arr, na_value=pandas.NA) + # results.append(arr) + + # if len(chunks) == 0: + # arr = numpy.array([], dtype=str) + # else: + # arr = numpy.concatenate(results) + + # return JSONArray(arr) + + # # TODO: codes from StringDtype + # # # Bypass validation inside StringArray constructor, see GH#47781 + # # new_string_array = StringArray.__new__(StringArray) + # # NDArrayBacked.__init__( + # # new_string_array, + # # arr, + # # StringDtype(storage="python"), + # # ) + # # return new_string_array + + +class JSONArray(pd.api.extensions.ExtensionArray): + """Extension array containing JSON data.""" + + dtype = JSONDtype() + __array_priority__ = 1000 + + def __init__(self, values, dtype=None, copy=False) -> None: + for val in values: + if not isinstance(val, self.dtype.type): + raise TypeError(f"All values must be of type {str(self.dtype.type)}: actual {type(val)}") + self.data = values + + # Some aliases for common attribute names to ensure pandas supports + # these + self._items = self._data = self.data + # those aliases are currently not working due to assumptions + # in internal code (GH-20735) + # self._values = self.values = self.data + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy=False): + return cls(scalars) + + @classmethod + def _from_factorized(cls, values, original): + return cls([UserDict(x) for x in values if x != ()]) + + def __getitem__(self, item): + if isinstance(item, tuple): + item = unpack_tuple_and_ellipses(item) + + if isinstance(item, numbers.Integral): + return self.data[item] + elif isinstance(item, slice) and item == slice(None): + # Make sure we get a view + return type(self)(self.data) + elif isinstance(item, slice): + # slice + return type(self)(self.data[item]) + elif not is_list_like(item): + # e.g. 
"foo" or 2.5 + # exception message copied from numpy + raise IndexError( + r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " + r"(`None`) and integer or boolean arrays are valid indices" + ) + else: + item = pd.api.indexers.check_array_indexer(self, item) + if is_bool_dtype(item.dtype): + return type(self)._from_sequence( + [x for x, m in zip(self, item) if m], dtype=self.dtype + ) + # integer + return type(self)([self.data[i] for i in item]) + + def __setitem__(self, key, value) -> None: + if isinstance(key, numbers.Integral): + self.data[key] = value + else: + if not isinstance(value, (type(self), abc.Sequence)): + # broadcast value + value = itertools.cycle([value]) + + if isinstance(key, np.ndarray) and key.dtype == "bool": + # masking + for i, (k, v) in enumerate(zip(key, value)): + if k: + assert isinstance(v, self.dtype.type) + self.data[i] = v + else: + for k, v in zip(key, value): + assert isinstance(v, self.dtype.type) + self.data[k] = v + + def __len__(self) -> int: + return len(self.data) + + def __eq__(self, other): + return NotImplemented + + def __ne__(self, other): + return NotImplemented + + def __array__(self, dtype=None, copy=None): + if dtype is None: + dtype = object + if dtype == object: + # on py38 builds it looks like numpy is inferring to a non-1D array + return construct_1d_object_array_from_listlike(list(self)) + return np.asarray(self.data, dtype=dtype) + + @property + def nbytes(self) -> int: + return sys.getsizeof(self.data) + + def isna(self): + return np.array([x == self.dtype.na_value for x in self.data], dtype=bool) + + def take(self, indexer, allow_fill=False, fill_value=None): + # re-implement here, since NumPy has trouble setting + # sized objects like UserDicts into scalar slots of + # an ndarary. + indexer = np.asarray(indexer) + msg = ( + "Index is out of bounds or cannot do a " + "non-empty take from an empty array." + ) + + if allow_fill: + # Do not allow any custom na_value + if fill_value is None: + fill_value = self.dtype.na_value + # bounds check + if (indexer < -1).any(): + raise ValueError + try: + output = [ + self.data[loc] if loc != -1 else fill_value for loc in indexer + ] + except IndexError as err: + raise IndexError(msg) from err + else: + try: + output = [self.data[loc] for loc in indexer] + except IndexError as err: + raise IndexError(msg) from err + + return type(self)._from_sequence(output, dtype=self.dtype) + + def copy(self): + return type(self)(self.data[:]) + + def astype(self, dtype, copy=True): + # NumPy has issues when all the dicts are the same length. + # np.array([UserDict(...), UserDict(...)]) fails, + # but np.array([{...}, {...}]) works, so cast. + from pandas.core.arrays.string_ import StringDtype + + dtype = pandas_dtype(dtype) + # needed to add this check for the Series constructor + if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: + if copy: + return self.copy() + return self + elif isinstance(dtype, StringDtype): + value = self.astype(str) # numpy doesn't like nested dicts + arr_cls = dtype.construct_array_type() + return arr_cls._from_sequence(value, dtype=dtype, copy=False) + elif not copy: + return np.asarray([dict(x) for x in self], dtype=dtype) + else: + return np.array([dict(x) for x in self], dtype=dtype, copy=copy) + + def unique(self): + # Parent method doesn't work since np.array will try to infer + # a 2-dim object. 
+ return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}]) + + @classmethod + def _concat_same_type(cls, to_concat): + data = list(itertools.chain.from_iterable(x.data for x in to_concat)) + return cls(data) + + def _values_for_factorize(self): + frozen = self._values_for_argsort() + if len(frozen) == 0: + # factorize_array expects 1-d array, this is a len-0 2-d array. + frozen = frozen.ravel() + return frozen, () + + def _values_for_argsort(self): + # Bypass NumPy's shape inference to get a (N,) array of tuples. + frozen = [tuple(x.items()) for x in self] + return construct_1d_object_array_from_listlike(frozen) + + def _pad_or_backfill(self, *, method, limit=None, copy=True): + # GH#56616 - test EA method without limit_area argument + return super()._pad_or_backfill(method=method, limit=limit, copy=copy) diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py new file mode 100644 index 0000000..67e5fb6 --- /dev/null +++ b/tests/compliance/json/conftest.py @@ -0,0 +1,223 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +from collections import UserDict, abc +import operator +import sys + +import numpy as np +import pytest +from typing import TYPE_CHECKING, Any + +import pandas as pd +import pandas._testing as tm +import string +from pandas.tests.extension import base + +from db_dtypes import JSONArray, JSONDtype + +from collections import ( + UserDict, + abc, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + +def make_data(): + # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer + rng = np.random.default_rng(2) + return [ + UserDict( + [ + (rng.choice(list(string.ascii_letters)), rng.integers(0, 100)) + for _ in range(rng.integers(0, 10)) + ] + ) + for _ in range(100) + ] + + +# We intentionally don't run base.BaseSetitemTests because pandas' +# internals has trouble setting sequences of values into scalar positions. +unhashable = pytest.mark.xfail(reason="Unhashable") + + +@pytest.fixture +def dtype(): + return JSONDtype() + + +@pytest.fixture +def data(): + """Length-100 PeriodArray for semantics test.""" + data = make_data() + + # Why the while loop? NumPy is unable to construct an ndarray from + # equal-length ndarrays. Many of our operations involve coercing the + # EA to an ndarray of objects. To avoid random test failures, we ensure + # that our data is coercible to an ndarray. Several tests deal with only + # the first two elements, so that's what we'll check. + + while len(data[0]) == len(data[1]): + data = make_data() + + return JSONArray(data) + + +@pytest.fixture +def data_for_twos(dtype): + """ + Length-100 array in which all the elements are two. + + Call pytest.skip in your fixture if the dtype does not support divmod. + """ + if not (dtype._is_numeric or dtype.kind == "m"): + # Object-dtypes may want to allow this, but for the most part + # only numeric and timedelta-like dtypes will need to implement this. 
+ pytest.skip(f"{dtype} is not a numeric dtype") + + raise NotImplementedError + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return JSONArray([{}, {"a": 10}]) + + +@pytest.fixture +def data_for_sorting(): + return JSONArray([{"b": 1}, {"c": 4}, {"a": 2, "c": 3}]) + + +@pytest.fixture +def data_missing_for_sorting(): + return JSONArray([{"b": 1}, {}, {"a": 4}]) + + +@pytest.fixture +def na_cmp(): + return operator.eq + + +@pytest.fixture +def data_for_grouping(): + return JSONArray( + [ + {"b": 1}, + {"b": 1}, + {}, + {}, + {"a": 0, "c": 2}, + {"a": 0, "c": 2}, + {"b": 1}, + {"c": 2}, + ] + ) + +@pytest.fixture +def data_repeated(data): + """ + Generate many datasets. + + Parameters + ---------- + data : fixture implementing `data` + + Returns + ------- + Callable[[int], Generator]: + A callable that takes a `count` argument and + returns a generator yielding `count` datasets. + """ + + def gen(count): + for _ in range(count): + yield data + + return gen + + +_all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"] + +@pytest.fixture(params=_all_numeric_accumulations) +def all_numeric_accumulations(request): + """ + Fixture for numeric accumulation names + """ + return request.param + + +_all_boolean_reductions = ["all", "any"] + + +@pytest.fixture(params=_all_boolean_reductions) +def all_boolean_reductions(request): + """ + Fixture for boolean reduction names. + """ + return request.param + + +_all_numeric_reductions = [ + "count", + "sum", + "max", + "min", + "mean", + "prod", + "std", + "var", + "median", + "kurt", + "skew", + "sem", +] + + +@pytest.fixture(params=_all_numeric_reductions) +def all_numeric_reductions(request): + """ + Fixture for numeric reduction names. + """ + return request.param + + +@pytest.fixture(params=tm.arithmetic_dunder_methods) +def all_arithmetic_operators(request): + """ + Fixture for dunder names for common arithmetic operations. + """ + return request.param + +@pytest.fixture +def na_value(): + """ + The scalar missing value for this type. Default 'None'. + """ + return UserDict() + +@pytest.fixture(params=["data", "data_missing"]) +def all_data(request, data, data_missing): + """Parametrized fixture returning 'data' or 'data_missing' integer arrays. + + Used to test dtype conversion with and without missing values. + """ + if request.param == "data": + return data + elif request.param == "data_missing": + return data_missing + diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py new file mode 100644 index 0000000..359430d --- /dev/null +++ b/tests/compliance/json/test_json_compliance.py @@ -0,0 +1,444 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tests for extension interface compliance, inherited from pandas. 
+ +See: +https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/decimal/test_decimal.py +and +https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/test_period.py +""" + +import collections +import operator +import sys + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +import string +from pandas.tests.extension import base + +from db_dtypes import JSONArray, JSONDtype + +# We intentionally don't run base.BaseSetitemTests because pandas' +# internals has trouble setting sequences of values into scalar positions. +unhashable = pytest.mark.xfail(reason="Unhashable") + + +class TestJSONArray(base.ExtensionTests): + @pytest.mark.xfail( + reason="comparison method not implemented for JSONArray (GH-37867)" + ) + def test_contains(self, data): + # GH-37867 + super().test_contains(data) + + @pytest.mark.xfail(reason="not implemented constructor from dtype") + def test_from_dtype(self, data): + # construct from our dtype & string dtype + super().test_from_dtype(data) + + @pytest.mark.xfail(reason="RecursionError, GH-33900") + def test_series_constructor_no_data_with_index(self, dtype, na_value): + # RecursionError: maximum recursion depth exceeded in comparison + rec_limit = sys.getrecursionlimit() + try: + # Limit to avoid stack overflow on Windows CI + sys.setrecursionlimit(100) + super().test_series_constructor_no_data_with_index(dtype, na_value) + finally: + sys.setrecursionlimit(rec_limit) + + @pytest.mark.xfail(reason="RecursionError, GH-33900") + def test_series_constructor_scalar_na_with_index(self, dtype, na_value): + # RecursionError: maximum recursion depth exceeded in comparison + rec_limit = sys.getrecursionlimit() + try: + # Limit to avoid stack overflow on Windows CI + sys.setrecursionlimit(100) + super().test_series_constructor_scalar_na_with_index(dtype, na_value) + finally: + sys.setrecursionlimit(rec_limit) + + @pytest.mark.xfail(reason="collection as scalar, GH-33901") + def test_series_constructor_scalar_with_index(self, data, dtype): + # TypeError: All values must be of type + rec_limit = sys.getrecursionlimit() + try: + # Limit to avoid stack overflow on Windows CI + sys.setrecursionlimit(100) + super().test_series_constructor_scalar_with_index(data, dtype) + finally: + sys.setrecursionlimit(rec_limit) + + @pytest.mark.xfail(reason="Different definitions of NA") + def test_stack(self): + """ + The test does .astype(object).stack(). If we happen to have + any missing values in `data`, then we'll end up with different + rows since we consider `{}` NA, but `.astype(object)` doesn't. + """ + super().test_stack() + + @pytest.mark.xfail(reason="dict for NA") + def test_unstack(self, data, index): + # The base test has NaN for the expected NA value. 
        # this matches otherwise
        return super().test_unstack(data, index)

    @pytest.mark.xfail(reason="Setting a dict as a scalar")
    def test_fillna_series(self):
        """We treat dictionaries as a mapping in fillna, not a scalar."""
        super().test_fillna_series()

    @pytest.mark.xfail(reason="Setting a dict as a scalar")
    def test_fillna_frame(self):
        """We treat dictionaries as a mapping in fillna, not a scalar."""
        super().test_fillna_frame()

    @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path")
    def test_fillna_limit_frame(self, data_missing):
        # GH#58001
        super().test_fillna_limit_frame(data_missing)

    @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path")
    def test_fillna_limit_series(self, data_missing):
        # GH#58001
        super().test_fillna_limit_series(data_missing)

    @pytest.mark.parametrize(
        "limit_area, input_ilocs, expected_ilocs",
        [
            ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]),
            ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]),
            ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]),
            ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]),
            ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]),
            ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]),
            ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]),
            ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]),
        ],
    )
    def test_ffill_limit_area(
        self, data_missing, limit_area, input_ilocs, expected_ilocs
    ):
        # GH#56616
        msg = "JSONArray does not implement limit_area"
        with pytest.raises(NotImplementedError, match=msg):
            super().test_ffill_limit_area(
                data_missing, limit_area, input_ilocs, expected_ilocs
            )

    @unhashable
    def test_value_counts(self, all_data, dropna):
        super().test_value_counts(all_data, dropna)

    @unhashable
    def test_value_counts_with_normalize(self, data):
        super().test_value_counts_with_normalize(data)

    @unhashable
    def test_sort_values_frame(self):
        # TODO (EA.factorize): see if _values_for_factorize allows this.
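        # Sorting the frame goes through a factorize step that hashes the
        # column's values; UserDict instances are unhashable, hence the
        # @unhashable xfail marker above.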
+ super().test_sort_values_frame() + + @pytest.mark.xfail(reason="combine for JSONArray not supported") + def test_combine_le(self, data_repeated): + super().test_combine_le(data_repeated) + + @pytest.mark.xfail( + reason="combine for JSONArray not supported - " + "may pass depending on random data", + strict=False, + raises=AssertionError, + ) + def test_combine_first(self, data): + super().test_combine_first(data) + + @pytest.mark.xfail(reason="broadcasting error") + def test_where_series(self, data, na_value): + # Fails with + # *** ValueError: operands could not be broadcast together + # with shapes (4,) (4,) (0,) + super().test_where_series(data, na_value) + + @pytest.mark.xfail(reason="Can't compare dicts.") + def test_searchsorted(self, data_for_sorting): + super().test_searchsorted(data_for_sorting) + + @pytest.mark.xfail(reason="Can't compare dicts.") + def test_equals(self, data, na_value, as_series): + super().test_equals(data, na_value, as_series) + + @pytest.mark.skip("fill-value is interpreted as a dict of values") + def test_fillna_copy_frame(self, data_missing): + super().test_fillna_copy_frame(data_missing) + + @pytest.mark.xfail(reason="Fails with CoW") + def test_equals_same_data_different_object(self, data): + super().test_equals_same_data_different_object(data) + + @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)") + def test_astype_str(self): + """This currently fails in NumPy on np.array(self, dtype=str) with + + *** ValueError: setting an array element with a sequence + """ + super().test_astype_str() + + @unhashable + def test_groupby_extension_transform(self): + """ + This currently fails in Series.name.setter, since the + name must be hashable, but the value is a dictionary. + I think this is what we want, i.e. `.name` should be the original + values, and not the values for factorization. + """ + super().test_groupby_extension_transform() + + @unhashable + def test_groupby_extension_apply(self): + """ + This fails in Index._do_unique_check with + + > hash(val) + E TypeError: unhashable type: 'UserDict' with + + I suspect that once we support Index[ExtensionArray], + we'll be able to dispatch unique. + """ + super().test_groupby_extension_apply() + + @unhashable + def test_groupby_extension_agg(self): + """ + This fails when we get to tm.assert_series_equal when left.index + contains dictionaries, which are not hashable. + """ + super().test_groupby_extension_agg() + + @unhashable + def test_groupby_extension_no_sort(self): + """ + This fails when we get to tm.assert_series_equal when left.index + contains dictionaries, which are not hashable. 
+ """ + super().test_groupby_extension_no_sort() + + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): + if len(data[0]) != 1: + mark = pytest.mark.xfail(reason="raises in coercing to Series") + request.applymarker(mark) + super().test_arith_frame_with_scalar(data, all_arithmetic_operators) + + def test_compare_array(self, data, comparison_op, request): + if comparison_op.__name__ in ["eq", "ne"]: + mark = pytest.mark.xfail(reason="Comparison methods not implemented") + request.applymarker(mark) + super().test_compare_array(data, comparison_op) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_loc_scalar_mixed(self, data): + super().test_setitem_loc_scalar_mixed(data) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_loc_scalar_multiple_homogoneous(self, data): + super().test_setitem_loc_scalar_multiple_homogoneous(data) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_iloc_scalar_mixed(self, data): + super().test_setitem_iloc_scalar_mixed(data) + + @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") + def test_setitem_iloc_scalar_multiple_homogoneous(self, data): + super().test_setitem_iloc_scalar_multiple_homogoneous(data) + + @pytest.mark.parametrize( + "mask", + [ + np.array([True, True, True, False, False]), + pd.array([True, True, True, False, False], dtype="boolean"), + pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"), + ], + ids=["numpy-array", "boolean-array", "boolean-array-na"], + ) + def test_setitem_mask(self, data, mask, box_in_series, request): + if box_in_series: + mark = pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + request.applymarker(mark) + elif not isinstance(mask, np.ndarray): + mark = pytest.mark.xfail(reason="Issues unwanted DeprecationWarning") + request.applymarker(mark) + super().test_setitem_mask(data, mask, box_in_series) + + def test_setitem_mask_raises(self, data, box_in_series, request): + if not box_in_series: + mark = pytest.mark.xfail(reason="Fails to raise") + request.applymarker(mark) + + super().test_setitem_mask_raises(data, box_in_series) + + @pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): + super().test_setitem_mask_boolean_array_with_na(data, box_in_series) + + @pytest.mark.parametrize( + "idx", + [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])], + ids=["list", "integer-array", "numpy-array"], + ) + def test_setitem_integer_array(self, data, idx, box_in_series, request): + if box_in_series: + mark = pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + request.applymarker(mark) + super().test_setitem_integer_array(data, idx, box_in_series) + + @pytest.mark.xfail(reason="list indices must be integers or slices, not NAType") + @pytest.mark.parametrize( + "idx, box_in_series", + [ + ([0, 1, 2, pd.NA], False), + pytest.param( + [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948") + ), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), + (pd.array([0, 1, 2, pd.NA], dtype="Int64"), True), + ], + ids=["list-False", "list-True", "integer-array-False", "integer-array-True"], + ) + def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): + 
super().test_setitem_integer_with_missing_raises(data, idx, box_in_series) + + @pytest.mark.xfail(reason="Fails to raise") + def test_setitem_scalar_key_sequence_raise(self, data): + super().test_setitem_scalar_key_sequence_raise(data) + + def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request): + if "full_slice" in request.node.name: + mark = pytest.mark.xfail(reason="slice is not iterable") + request.applymarker(mark) + super().test_setitem_with_expansion_dataframe_column(data, full_indexer) + + @pytest.mark.xfail(reason="slice is not iterable") + def test_setitem_frame_2d_values(self, data): + super().test_setitem_frame_2d_values(data) + + @pytest.mark.xfail( + reason="cannot set using a list-like indexer with a different length" + ) + @pytest.mark.parametrize("setter", ["loc", None]) + def test_setitem_mask_broadcast(self, data, setter): + super().test_setitem_mask_broadcast(data, setter) + + @pytest.mark.xfail( + reason="cannot set using a slice indexer with a different length" + ) + def test_setitem_slice(self, data, box_in_series): + super().test_setitem_slice(data, box_in_series) + + @pytest.mark.xfail(reason="slice object is not iterable") + def test_setitem_loc_iloc_slice(self, data): + super().test_setitem_loc_iloc_slice(data) + + @pytest.mark.xfail(reason="slice object is not iterable") + def test_setitem_slice_mismatch_length_raises(self, data): + super().test_setitem_slice_mismatch_length_raises(data) + + @pytest.mark.xfail(reason="slice object is not iterable") + def test_setitem_slice_array(self, data): + super().test_setitem_slice_array(data) + + @pytest.mark.xfail(reason="Fail to raise") + def test_setitem_invalid(self, data, invalid_scalar): + super().test_setitem_invalid(data, invalid_scalar) + + @pytest.mark.xfail(reason="only integer scalar arrays can be converted") + def test_setitem_2d_values(self, data): + super().test_setitem_2d_values(data) + + @pytest.mark.xfail(reason="data type 'json' not understood") + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + +def custom_assert_series_equal(left, right, *args, **kwargs): + # NumPy doesn't handle an array of equal-length UserDicts. + # The default assert_series_equal eventually does a + # Series.values, which raises. We work around it by + # converting the UserDicts to dicts. 
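    # The dtype registered above is named "dbjson" (see JSONDtype.name), so
    # the check below keys off that name rather than pandas' upstream "json".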
    if left.dtype.name == "dbjson":
        assert left.dtype == right.dtype
        left = pd.Series(
            JSONArray(left.values.astype(object)), index=left.index, name=left.name
        )
        right = pd.Series(
            JSONArray(right.values.astype(object)),
            index=right.index,
            name=right.name,
        )
    tm.assert_series_equal(left, right, *args, **kwargs)


def custom_assert_frame_equal(left, right, *args, **kwargs):
    obj_type = kwargs.get("obj", "DataFrame")
    tm.assert_index_equal(
        left.columns,
        right.columns,
        exact=kwargs.get("check_column_type", "equiv"),
        check_names=kwargs.get("check_names", True),
        check_exact=kwargs.get("check_exact", False),
        check_categorical=kwargs.get("check_categorical", True),
        obj=f"{obj_type}.columns",
    )

    # Select only the columns whose dtype matches the registered "dbjson" name.
    jsons = left.dtypes[left.dtypes == "dbjson"].index

    for col in jsons:
        custom_assert_series_equal(left[col], right[col], *args, **kwargs)

    left = left.drop(columns=jsons)
    right = right.drop(columns=jsons)
    tm.assert_frame_equal(left, right, *args, **kwargs)


def test_custom_asserts():
    # This would always trigger the KeyError from trying to put
    # an array of equal-length UserDicts inside an ndarray.
    data = JSONArray(
        [
            dict({"a": 1}),
            dict({"b": 2}),
            dict({"c": 3}),
        ]
    )
    a = pd.Series(data)
    custom_assert_series_equal(a, a)
    custom_assert_frame_equal(a.to_frame(), a.to_frame())

    b = pd.Series(data.take([0, 0, 1]))
    with pytest.raises(AssertionError):
        custom_assert_series_equal(a, b)

    with pytest.raises(AssertionError):
        custom_assert_frame_equal(a.to_frame(), b.to_frame())
diff --git a/tests/compliance/json/test_json_compliance_1_5.py b/tests/compliance/json/test_json_compliance_1_5.py
new file mode 100644
index 0000000..ee2d878
--- /dev/null
+++ b/tests/compliance/json/test_json_compliance_1_5.py
@@ -0,0 +1,31 @@
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Tests for extension interface compliance, inherited from pandas.
+ +See: +https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/decimal/test_decimal.py +and +https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/test_period.py +""" + +from pandas.tests.extension import base +import pytest + +# NDArrayBacked2DTests suite added in https://github.com/pandas-dev/pandas/pull/44974 +pytest.importorskip("pandas", minversion="1.5.0dev") + + +# class Test2DCompat(base.NDArrayBacked2DTests): +# pass From 1d33703a908445b1f5679568272fca9470e06288 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 9 Jul 2024 17:32:21 +0000 Subject: [PATCH 02/28] formatting --- db_dtypes/json.py | 4 +++- tests/compliance/json/conftest.py | 18 ++++++++---------- tests/compliance/json/test_json_compliance.py | 5 ++--- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 72f4c2c..2ae4dc5 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -102,7 +102,9 @@ class JSONArray(pd.api.extensions.ExtensionArray): def __init__(self, values, dtype=None, copy=False) -> None: for val in values: if not isinstance(val, self.dtype.type): - raise TypeError(f"All values must be of type {str(self.dtype.type)}: actual {type(val)}") + raise TypeError( + f"All values must be of type {str(self.dtype.type)}: actual {type(val)}" + ) self.data = values # Some aliases for common attribute names to ensure pandas supports diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py index 67e5fb6..775a302 100644 --- a/tests/compliance/json/conftest.py +++ b/tests/compliance/json/conftest.py @@ -15,27 +15,22 @@ import collections from collections import UserDict, abc import operator +import string import sys - -import numpy as np -import pytest from typing import TYPE_CHECKING, Any +import numpy as np import pandas as pd import pandas._testing as tm -import string from pandas.tests.extension import base +import pytest from db_dtypes import JSONArray, JSONDtype -from collections import ( - UserDict, - abc, -) - if TYPE_CHECKING: from collections.abc import Mapping + def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer rng = np.random.default_rng(2) @@ -128,6 +123,7 @@ def data_for_grouping(): ] ) + @pytest.fixture def data_repeated(data): """ @@ -153,6 +149,7 @@ def gen(count): _all_numeric_accumulations = ["cumsum", "cumprod", "cummin", "cummax"] + @pytest.fixture(params=_all_numeric_accumulations) def all_numeric_accumulations(request): """ @@ -203,6 +200,7 @@ def all_arithmetic_operators(request): """ return request.param + @pytest.fixture def na_value(): """ @@ -210,6 +208,7 @@ def na_value(): """ return UserDict() + @pytest.fixture(params=["data", "data_missing"]) def all_data(request, data, data_missing): """Parametrized fixture returning 'data' or 'data_missing' integer arrays. 
@@ -220,4 +219,3 @@ def all_data(request, data, data_missing): return data elif request.param == "data_missing": return data_missing - diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index 359430d..e0be6d7 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -22,15 +22,14 @@ import collections import operator +import string import sys import numpy as np -import pytest - import pandas as pd import pandas._testing as tm -import string from pandas.tests.extension import base +import pytest from db_dtypes import JSONArray, JSONDtype From de3120ad4556f32eb86b061ca9b821b823a7034d Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 16 Jul 2024 20:36:58 +0000 Subject: [PATCH 03/28] converts to ArrowStringArray --- db_dtypes/json.py | 400 ++++++++---------- docs/conf.py | 2 +- samples/snippets/noxfile.py | 1 - tests/compliance/json/conftest.py | 96 ++--- tests/compliance/json/test_json_compliance.py | 381 +++-------------- .../json/test_json_compliance_1_5.py | 31 -- 6 files changed, 290 insertions(+), 621 deletions(-) delete mode 100644 tests/compliance/json/test_json_compliance_1_5.py diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 2ae4dc5..814e8d6 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -14,261 +14,237 @@ from __future__ import annotations -from collections import UserDict, abc -import itertools -import numbers -import string -import sys -from typing import TYPE_CHECKING, Any +import typing import numpy as np import pandas as pd -from pandas.api.extensions import ExtensionArray, ExtensionDtype -from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike -from pandas.core.dtypes.common import is_bool_dtype, is_list_like, pandas_dtype -from pandas.core.indexers import unpack_tuple_and_ellipses - -if TYPE_CHECKING: - from collections.abc import Mapping - - from pandas._typing import type_t +from pandas._libs import lib +from pandas.core.arrays.arrow.array import ArrowExtensionArray +from pandas.core.arrays.numeric import NumericDtype +from pandas.core.dtypes.common import is_integer, is_scalar, pandas_dtype +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses +import pyarrow as pa +import pyarrow.compute as pc @pd.api.extensions.register_extension_dtype class JSONDtype(pd.api.extensions.ExtensionDtype): """Extension dtype for JSON data.""" - # type = str - - type = abc.Mapping name = "dbjson" - # na_value = pd.NA # TODO: StringDtype is libmissing.NA - na_value: Mapping[str, Any] = UserDict() - # _is_numeric = False - # _is_boolean = False + @property + def na_value(self) -> pd.NA: + return pd.NA + + @property + def type(self) -> type[str]: + return str + + @property + def _is_numeric(self) -> bool: + return False + + @property + def _is_boolean(self) -> bool: + return False @classmethod def construct_array_type(cls): """Return the array type associated with this dtype.""" return JSONArray - # @staticmethod - # def __from_arrow__( - # array: Union[pyarrow.Array, pyarrow.ChunkedArray] - # ) -> "JSONArray": - # """Convert to JSONArray from an Arrow array. 
- - # See: - # https://pandas.pydata.org/pandas-docs/stable/development/extending.html#compatibility-with-apache-arrow - # """ - # if isinstance(array, pyarrow.Array): - # chunks = [array] - # else: - # chunks = array.chunks - - # results = [] - # for arr in chunks: - # # convert chunk by chunk to numpy and concatenate then, to avoid - # # overflow for large string data when concatenating the pyarrow arrays - # arr = arr.to_numpy(zero_copy_only=False) - # arr = ensure_string_array(arr, na_value=pandas.NA) - # results.append(arr) - - # if len(chunks) == 0: - # arr = numpy.array([], dtype=str) - # else: - # arr = numpy.concatenate(results) - - # return JSONArray(arr) - - # # TODO: codes from StringDtype - # # # Bypass validation inside StringArray constructor, see GH#47781 - # # new_string_array = StringArray.__new__(StringArray) - # # NDArrayBacked.__init__( - # # new_string_array, - # # arr, - # # StringDtype(storage="python"), - # # ) - # # return new_string_array - - -class JSONArray(pd.api.extensions.ExtensionArray): + @staticmethod + def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray: + """Convert to JSONArray from an Arrow array.""" + return JSONArray(array) + + +class JSONArray(ArrowExtensionArray): """Extension array containing JSON data.""" - dtype = JSONDtype() - __array_priority__ = 1000 + _dtype = JSONDtype() def __init__(self, values, dtype=None, copy=False) -> None: - for val in values: - if not isinstance(val, self.dtype.type): - raise TypeError( - f"All values must be of type {str(self.dtype.type)}: actual {type(val)}" - ) - self.data = values + if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( + values.type + ): + values = pc.cast(values, pa.large_string()) + + super().__init__(values) + self._dtype = JSONDtype() + + if not pa.types.is_large_string(self._pa_array.type) and not ( + pa.types.is_dictionary(self._pa_array.type) + and pa.types.is_large_string(self._pa_array.type.value_type) + ): + raise ValueError( + "ArrowStringArray requires a PyArrow (chunked) array of " + "large_string type" + ) + + @classmethod + def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: + pa_scalar = super()._box_pa_scalar(value, pa_type) + if pa.types.is_string(pa_scalar.type) and pa_type is None: + pa_scalar = pc.cast(pa_scalar, pa.large_string()) + return pa_scalar - # Some aliases for common attribute names to ensure pandas supports - # these - self._items = self._data = self.data - # those aliases are currently not working due to assumptions - # in internal code (GH-20735) - # self._values = self.values = self.data + @classmethod + def _box_pa_array( + cls, value, pa_type: pa.DataType | None = None, copy: bool = False + ) -> pa.Array | pa.ChunkedArray: + pa_array = super()._box_pa_array(value, pa_type) + if pa.types.is_string(pa_array.type) and pa_type is None: + pa_array = pc.cast(pa_array, pa.large_string()) + return pa_array @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): - return cls(scalars) + from pandas.core.arrays.masked import BaseMaskedArray + + if isinstance(scalars, BaseMaskedArray): + # avoid costly conversion to object dtype in ensure_string_array and + # numerical issues with Float32Dtype + na_values = scalars._mask + result = scalars._data + result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + return cls(pa.array(result, mask=na_values, type=pa.large_string())) + elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): + return 
cls(pc.cast(scalars, pa.large_string())) + + # convert non-na-likes to str + result = lib.ensure_string_array(scalars, copy=copy) + return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) + + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: ExtensionDtype, copy: bool = False + ) -> JSONArray: + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + @property + def dtype(self) -> JSONDtype: + """An instance of JSONDtype""" + return self._dtype + + def insert(self, loc: int, item) -> JSONArray: + if not isinstance(item, str) and not pd.isna(item): + raise TypeError("Scalar must be NA or str") + return super().insert(loc, item) + + def astype(self, dtype, copy: bool = True): + dtype = pandas_dtype(dtype) + + if dtype == self.dtype: + if copy: + return self.copy() + return self + elif isinstance(dtype, NumericDtype): + data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) + return dtype.__from_arrow__(data) + elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating): + return self.to_numpy(dtype=dtype, na_value=np.nan) + + return super().astype(dtype, copy=copy) @classmethod def _from_factorized(cls, values, original): - return cls([UserDict(x) for x in values if x != ()]) + return cls._from_sequence(values, dtype=original.dtype) def __getitem__(self, item): - if isinstance(item, tuple): + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, np.ndarray): + if not len(item): + return type(self)(pa.chunked_array([], type=pa.string())) + elif item.dtype.kind in "iu": + return self.take(item) + elif item.dtype.kind == "b": + return type(self)(self._pa_array.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + elif isinstance(item, tuple): item = unpack_tuple_and_ellipses(item) - if isinstance(item, numbers.Integral): - return self.data[item] - elif isinstance(item, slice) and item == slice(None): - # Make sure we get a view - return type(self)(self.data) - elif isinstance(item, slice): - # slice - return type(self)(self.data[item]) - elif not is_list_like(item): + if is_scalar(item) and not is_integer(item): # e.g. "foo" or 2.5 # exception message copied from numpy raise IndexError( r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " r"(`None`) and integer or boolean arrays are valid indices" ) + # We are not an array indexer, so maybe e.g. a slice or integer + # indexer. We dispatch to pyarrow. 
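        # The slice normalization below works around a pyarrow quirk: with a
        # negative step, a stop beyond -len(self) would otherwise produce an
        # empty result instead of slicing through to the start of the array.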
+ if isinstance(item, slice): + # Arrow bug https://github.com/apache/arrow/issues/38768 + if item.start == item.stop: + pass + elif ( + item.stop is not None + and item.stop < -len(self) + and item.step is not None + and item.step < 0 + ): + item = slice(item.start, None, item.step) + + value = self._pa_array[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) else: - item = pd.api.indexers.check_array_indexer(self, item) - if is_bool_dtype(item.dtype): - return type(self)._from_sequence( - [x for x, m in zip(self, item) if m], dtype=self.dtype - ) - # integer - return type(self)([self.data[i] for i in item]) - - def __setitem__(self, key, value) -> None: - if isinstance(key, numbers.Integral): - self.data[key] = value - else: - if not isinstance(value, (type(self), abc.Sequence)): - # broadcast value - value = itertools.cycle([value]) - - if isinstance(key, np.ndarray) and key.dtype == "bool": - # masking - for i, (k, v) in enumerate(zip(key, value)): - if k: - assert isinstance(v, self.dtype.type) - self.data[i] = v + scalar = value.as_py() + if scalar is None: + return self._dtype.na_value else: - for k, v in zip(key, value): - assert isinstance(v, self.dtype.type) - self.data[k] = v - - def __len__(self) -> int: - return len(self.data) - - def __eq__(self, other): - return NotImplemented + return scalar - def __ne__(self, other): - return NotImplemented - - def __array__(self, dtype=None, copy=None): - if dtype is None: - dtype = object - if dtype == object: - # on py38 builds it looks like numpy is inferring to a non-1D array - return construct_1d_object_array_from_listlike(list(self)) - return np.asarray(self.data, dtype=dtype) - - @property - def nbytes(self) -> int: - return sys.getsizeof(self.data) - - def isna(self): - return np.array([x == self.dtype.na_value for x in self.data], dtype=bool) - - def take(self, indexer, allow_fill=False, fill_value=None): - # re-implement here, since NumPy has trouble setting - # sized objects like UserDicts into scalar slots of - # an ndarary. - indexer = np.asarray(indexer) - msg = ( - "Index is out of bounds or cannot do a " - "non-empty take from an empty array." - ) - - if allow_fill: - # Do not allow any custom na_value - if fill_value is None: - fill_value = self.dtype.na_value - # bounds check - if (indexer < -1).any(): - raise ValueError - try: - output = [ - self.data[loc] if loc != -1 else fill_value for loc in indexer - ] - except IndexError as err: - raise IndexError(msg) from err - else: - try: - output = [self.data[loc] for loc in indexer] - except IndexError as err: - raise IndexError(msg) from err - - return type(self)._from_sequence(output, dtype=self.dtype) - - def copy(self): - return type(self)(self.data[:]) - - def astype(self, dtype, copy=True): - # NumPy has issues when all the dicts are the same length. - # np.array([UserDict(...), UserDict(...)]) fails, - # but np.array([{...}, {...}]) works, so cast. 
- from pandas.core.arrays.string_ import StringDtype - - dtype = pandas_dtype(dtype) - # needed to add this check for the Series constructor - if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: - if copy: - return self.copy() - return self - elif isinstance(dtype, StringDtype): - value = self.astype(str) # numpy doesn't like nested dicts - arr_cls = dtype.construct_array_type() - return arr_cls._from_sequence(value, dtype=dtype, copy=False) - elif not copy: - return np.asarray([dict(x) for x in self], dtype=dtype) - else: - return np.array([dict(x) for x in self], dtype=dtype, copy=copy) - - def unique(self): - # Parent method doesn't work since np.array will try to infer - # a 2-dim object. - return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}]) + @classmethod + def _result_converter(cls, values, na=None): + return pd.BooleanDtype().__from_arrow__(values) @classmethod - def _concat_same_type(cls, to_concat): - data = list(itertools.chain.from_iterable(x.data for x in to_concat)) - return cls(data) - - def _values_for_factorize(self): - frozen = self._values_for_argsort() - if len(frozen) == 0: - # factorize_array expects 1-d array, this is a len-0 2-d array. - frozen = frozen.ravel() - return frozen, () - - def _values_for_argsort(self): - # Bypass NumPy's shape inference to get a (N,) array of tuples. - frozen = [tuple(x.items()) for x in self] - return construct_1d_object_array_from_listlike(frozen) + def _concat_same_type(cls, to_concat) -> JSONArray: + """ + Concatenate multiple JSONArray. + + Parameters + ---------- + to_concat : sequence of JSONArray + + Returns + ------- + JSONArray + """ + chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] + arr = pa.chunked_array(chunks, type=pa.large_string()) + return cls(arr) def _pad_or_backfill(self, *, method, limit=None, copy=True): # GH#56616 - test EA method without limit_area argument diff --git a/docs/conf.py b/docs/conf.py index 00e0013..672daff 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,9 +24,9 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys import os import shlex +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py index 3b71359..c36d5f2 100644 --- a/samples/snippets/noxfile.py +++ b/samples/snippets/noxfile.py @@ -22,7 +22,6 @@ import nox - # WARNING - WARNING - WARNING - WARNING - WARNING # WARNING - WARNING - WARNING - WARNING - WARNING # DO NOT EDIT THIS FILE EVER! diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py index 775a302..f323f65 100644 --- a/tests/compliance/json/conftest.py +++ b/tests/compliance/json/conftest.py @@ -12,42 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. -import collections -from collections import UserDict, abc -import operator -import string -import sys -from typing import TYPE_CHECKING, Any + +import json import numpy as np import pandas as pd import pandas._testing as tm -from pandas.tests.extension import base import pytest from db_dtypes import JSONArray, JSONDtype -if TYPE_CHECKING: - from collections.abc import Mapping - def make_data(): - # TODO: Use a regular dict. 
See _NDFrameIndexer._setitem_with_indexer - rng = np.random.default_rng(2) - return [ - UserDict( - [ - (rng.choice(list(string.ascii_letters)), rng.integers(0, 100)) - for _ in range(rng.integers(0, 10)) - ] - ) - for _ in range(100) + # Sample data with varied lengths. + samples = [ + {"id": 1, "bool_value": True}, # Boolean + {"id": 2, "float_num": 3.14159}, # Floating + {"id": 3, "date": "2024-07-16"}, # Dates (as strings) + {"id": 4, "null_field": None}, # Null + {"list_data": [10, 20, 30]}, # Lists + {"person": {"name": "Alice", "age": 35}}, # Nested objects + {"address": {"street": "123 Main St", "city": "Anytown"}}, + {"order": {"items": ["book", "pen"], "total": 15.99}}, ] - - -# We intentionally don't run base.BaseSetitemTests because pandas' -# internals has trouble setting sequences of values into scalar positions. -unhashable = pytest.mark.xfail(reason="Unhashable") + return np.random.default_rng(2).choice(samples, size=100) @pytest.fixture @@ -67,9 +55,10 @@ def data(): # the first two elements, so that's what we'll check. while len(data[0]) == len(data[1]): + print(data) data = make_data() - return JSONArray(data) + return JSONArray._from_sequence(data) @pytest.fixture @@ -79,47 +68,56 @@ def data_for_twos(dtype): Call pytest.skip in your fixture if the dtype does not support divmod. """ - if not (dtype._is_numeric or dtype.kind == "m"): - # Object-dtypes may want to allow this, but for the most part - # only numeric and timedelta-like dtypes will need to implement this. - pytest.skip(f"{dtype} is not a numeric dtype") - - raise NotImplementedError + pytest.skip(f"{dtype} is not a numeric dtype") @pytest.fixture def data_missing(): """Length 2 array with [NA, Valid]""" - return JSONArray([{}, {"a": 10}]) + return JSONArray._from_sequence([None, {"a": 10}]) @pytest.fixture def data_for_sorting(): - return JSONArray([{"b": 1}, {"c": 4}, {"a": 2, "c": 3}]) + return JSONArray._from_sequence( + [json.dumps({"b": 1}), json.dumps({"c": 4}), json.dumps({"a": 2, "c": 3})] + ) @pytest.fixture def data_missing_for_sorting(): - return JSONArray([{"b": 1}, {}, {"a": 4}]) + return JSONArray._from_sequence([json.dumps({"b": 1}), None, json.dumps({"a": 4})]) @pytest.fixture def na_cmp(): - return operator.eq + """ + Binary operator for comparing NA values. + + Should return a function of two arguments that returns + True if both arguments are (scalar) NA for your type. + + By default, uses ``operator.is_`` + """ + + def cmp(a, b): + return lambda left, right: pd.isna(left) and pd.isna(right) + + return cmp @pytest.fixture def data_for_grouping(): - return JSONArray( + return JSONArray._from_sequence( [ - {"b": 1}, - {"b": 1}, - {}, - {}, - {"a": 0, "c": 2}, - {"a": 0, "c": 2}, - {"b": 1}, - {"c": 2}, + json.dumps({"b": 1}), + json.dumps({"b": 1}), + None, + None, + json.dumps({"a": 0, "c": 2}), + json.dumps({"a": 0, "c": 2}), + json.dumps({"b": 1}), + json.dumps({"c": 2}), ] ) @@ -201,14 +199,6 @@ def all_arithmetic_operators(request): return request.param -@pytest.fixture -def na_value(): - """ - The scalar missing value for this type. Default 'None'. - """ - return UserDict() - - @pytest.fixture(params=["data", "data_missing"]) def all_data(request, data, data_missing): """Parametrized fixture returning 'data' or 'data_missing' integer arrays. 
diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index e0be6d7..d46b935 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -20,18 +20,15 @@ https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/test_period.py """ -import collections -import operator -import string -import sys +import typing -import numpy as np import pandas as pd import pandas._testing as tm from pandas.tests.extension import base +import pyarrow as pa import pytest -from db_dtypes import JSONArray, JSONDtype +from db_dtypes import JSONArray # We intentionally don't run base.BaseSetitemTests because pandas' # internals has trouble setting sequences of values into scalar positions. @@ -39,86 +36,6 @@ class TestJSONArray(base.ExtensionTests): - @pytest.mark.xfail( - reason="comparison method not implemented for JSONArray (GH-37867)" - ) - def test_contains(self, data): - # GH-37867 - super().test_contains(data) - - @pytest.mark.xfail(reason="not implemented constructor from dtype") - def test_from_dtype(self, data): - # construct from our dtype & string dtype - super().test_from_dtype(data) - - @pytest.mark.xfail(reason="RecursionError, GH-33900") - def test_series_constructor_no_data_with_index(self, dtype, na_value): - # RecursionError: maximum recursion depth exceeded in comparison - rec_limit = sys.getrecursionlimit() - try: - # Limit to avoid stack overflow on Windows CI - sys.setrecursionlimit(100) - super().test_series_constructor_no_data_with_index(dtype, na_value) - finally: - sys.setrecursionlimit(rec_limit) - - @pytest.mark.xfail(reason="RecursionError, GH-33900") - def test_series_constructor_scalar_na_with_index(self, dtype, na_value): - # RecursionError: maximum recursion depth exceeded in comparison - rec_limit = sys.getrecursionlimit() - try: - # Limit to avoid stack overflow on Windows CI - sys.setrecursionlimit(100) - super().test_series_constructor_scalar_na_with_index(dtype, na_value) - finally: - sys.setrecursionlimit(rec_limit) - - @pytest.mark.xfail(reason="collection as scalar, GH-33901") - def test_series_constructor_scalar_with_index(self, data, dtype): - # TypeError: All values must be of type - rec_limit = sys.getrecursionlimit() - try: - # Limit to avoid stack overflow on Windows CI - sys.setrecursionlimit(100) - super().test_series_constructor_scalar_with_index(data, dtype) - finally: - sys.setrecursionlimit(rec_limit) - - @pytest.mark.xfail(reason="Different definitions of NA") - def test_stack(self): - """ - The test does .astype(object).stack(). If we happen to have - any missing values in `data`, then we'll end up with different - rows since we consider `{}` NA, but `.astype(object)` doesn't. - """ - super().test_stack() - - @pytest.mark.xfail(reason="dict for NA") - def test_unstack(self, data, index): - # The base test has NaN for the expected NA value. 
- # this matches otherwise - return super().test_unstack(data, index) - - @pytest.mark.xfail(reason="Setting a dict as a scalar") - def test_fillna_series(self): - """We treat dictionaries as a mapping in fillna, not a scalar.""" - super().test_fillna_series() - - @pytest.mark.xfail(reason="Setting a dict as a scalar") - def test_fillna_frame(self): - """We treat dictionaries as a mapping in fillna, not a scalar.""" - super().test_fillna_frame() - - @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path") - def test_fillna_limit_frame(self, data_missing): - # GH#58001 - super().test_fillna_limit_frame(data_missing) - - @pytest.mark.xfail(reason="fill value is a dictionary, takes incorrect code path") - def test_fillna_limit_series(self, data_missing): - # GH#58001 - super().test_fillna_limit_frame(data_missing) - @pytest.mark.parametrize( "limit_area, input_ilocs, expected_ilocs", [ @@ -142,19 +59,10 @@ def test_ffill_limit_area( data_missing, limit_area, input_ilocs, expected_ilocs ) - @unhashable - def test_value_counts(self, all_data, dropna): - super().test_value_counts(all_data, dropna) - @unhashable def test_value_counts_with_normalize(self, data): super().test_value_counts_with_normalize(data) - @unhashable - def test_sort_values_frame(self): - # TODO (EA.factorize): see if _values_for_factorize allows this. - super().test_sort_values_frame() - @pytest.mark.xfail(reason="combine for JSONArray not supported") def test_combine_le(self, data_repeated): super().test_combine_le(data_repeated) @@ -168,75 +76,17 @@ def test_combine_le(self, data_repeated): def test_combine_first(self, data): super().test_combine_first(data) - @pytest.mark.xfail(reason="broadcasting error") - def test_where_series(self, data, na_value): - # Fails with - # *** ValueError: operands could not be broadcast together - # with shapes (4,) (4,) (0,) - super().test_where_series(data, na_value) - - @pytest.mark.xfail(reason="Can't compare dicts.") - def test_searchsorted(self, data_for_sorting): - super().test_searchsorted(data_for_sorting) - - @pytest.mark.xfail(reason="Can't compare dicts.") - def test_equals(self, data, na_value, as_series): - super().test_equals(data, na_value, as_series) - - @pytest.mark.skip("fill-value is interpreted as a dict of values") - def test_fillna_copy_frame(self, data_missing): - super().test_fillna_copy_frame(data_missing) + @pytest.mark.skip(reason="2D support not implemented for JSONArray") + def test_view(self, data): + super().test_view(data) - @pytest.mark.xfail(reason="Fails with CoW") - def test_equals_same_data_different_object(self, data): - super().test_equals_same_data_different_object(data) + @pytest.mark.skip(reason="2D support not implemented for JSONArray") + def test_setitem_preserves_views(self, data): + super().test_setitem_preserves_views(data) - @pytest.mark.xfail(reason="failing on np.array(self, dtype=str)") - def test_astype_str(self): - """This currently fails in NumPy on np.array(self, dtype=str) with - - *** ValueError: setting an array element with a sequence - """ - super().test_astype_str() - - @unhashable - def test_groupby_extension_transform(self): - """ - This currently fails in Series.name.setter, since the - name must be hashable, but the value is a dictionary. - I think this is what we want, i.e. `.name` should be the original - values, and not the values for factorization. 
- """ - super().test_groupby_extension_transform() - - @unhashable - def test_groupby_extension_apply(self): - """ - This fails in Index._do_unique_check with - - > hash(val) - E TypeError: unhashable type: 'UserDict' with - - I suspect that once we support Index[ExtensionArray], - we'll be able to dispatch unique. - """ - super().test_groupby_extension_apply() - - @unhashable - def test_groupby_extension_agg(self): - """ - This fails when we get to tm.assert_series_equal when left.index - contains dictionaries, which are not hashable. - """ - super().test_groupby_extension_agg() - - @unhashable - def test_groupby_extension_no_sort(self): - """ - This fails when we get to tm.assert_series_equal when left.index - contains dictionaries, which are not hashable. - """ - super().test_groupby_extension_no_sort() + @pytest.mark.skip(reason="2D support not implemented for JSONArray") + def test_transpose(self, data): + super().test_transpose(data) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): if len(data[0]) != 1: @@ -244,159 +94,46 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): request.applymarker(mark) super().test_arith_frame_with_scalar(data, all_arithmetic_operators) - def test_compare_array(self, data, comparison_op, request): - if comparison_op.__name__ in ["eq", "ne"]: - mark = pytest.mark.xfail(reason="Comparison methods not implemented") - request.applymarker(mark) - super().test_compare_array(data, comparison_op) - - @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") - def test_setitem_loc_scalar_mixed(self, data): - super().test_setitem_loc_scalar_mixed(data) - - @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") - def test_setitem_loc_scalar_multiple_homogoneous(self, data): - super().test_setitem_loc_scalar_multiple_homogoneous(data) - - @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") - def test_setitem_iloc_scalar_mixed(self, data): - super().test_setitem_iloc_scalar_mixed(data) - - @pytest.mark.xfail(reason="ValueError: Must have equal len keys and value") - def test_setitem_iloc_scalar_multiple_homogoneous(self, data): - super().test_setitem_iloc_scalar_multiple_homogoneous(data) - - @pytest.mark.parametrize( - "mask", - [ - np.array([True, True, True, False, False]), - pd.array([True, True, True, False, False], dtype="boolean"), - pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"), - ], - ids=["numpy-array", "boolean-array", "boolean-array-na"], - ) - def test_setitem_mask(self, data, mask, box_in_series, request): - if box_in_series: - mark = pytest.mark.xfail( - reason="cannot set using a list-like indexer with a different length" - ) - request.applymarker(mark) - elif not isinstance(mask, np.ndarray): - mark = pytest.mark.xfail(reason="Issues unwanted DeprecationWarning") - request.applymarker(mark) - super().test_setitem_mask(data, mask, box_in_series) - - def test_setitem_mask_raises(self, data, box_in_series, request): - if not box_in_series: - mark = pytest.mark.xfail(reason="Fails to raise") - request.applymarker(mark) - - super().test_setitem_mask_raises(data, box_in_series) - - @pytest.mark.xfail( - reason="cannot set using a list-like indexer with a different length" - ) - def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): - super().test_setitem_mask_boolean_array_with_na(data, box_in_series) - - @pytest.mark.parametrize( - "idx", - [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 
1, 2])], - ids=["list", "integer-array", "numpy-array"], - ) - def test_setitem_integer_array(self, data, idx, box_in_series, request): - if box_in_series: - mark = pytest.mark.xfail( - reason="cannot set using a list-like indexer with a different length" - ) - request.applymarker(mark) - super().test_setitem_integer_array(data, idx, box_in_series) - - @pytest.mark.xfail(reason="list indices must be integers or slices, not NAType") - @pytest.mark.parametrize( - "idx, box_in_series", - [ - ([0, 1, 2, pd.NA], False), - pytest.param( - [0, 1, 2, pd.NA], True, marks=pytest.mark.xfail(reason="GH-31948") - ), - (pd.array([0, 1, 2, pd.NA], dtype="Int64"), False), - (pd.array([0, 1, 2, pd.NA], dtype="Int64"), True), - ], - ids=["list-False", "list-True", "integer-array-False", "integer-array-True"], - ) - def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series): - super().test_setitem_integer_with_missing_raises(data, idx, box_in_series) - - @pytest.mark.xfail(reason="Fails to raise") - def test_setitem_scalar_key_sequence_raise(self, data): - super().test_setitem_scalar_key_sequence_raise(data) - - def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request): - if "full_slice" in request.node.name: - mark = pytest.mark.xfail(reason="slice is not iterable") + def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): + if len(data[0]) != 1: + mark = pytest.mark.xfail(reason="raises in coercing to Series") request.applymarker(mark) - super().test_setitem_with_expansion_dataframe_column(data, full_indexer) - - @pytest.mark.xfail(reason="slice is not iterable") - def test_setitem_frame_2d_values(self, data): - super().test_setitem_frame_2d_values(data) - - @pytest.mark.xfail( - reason="cannot set using a list-like indexer with a different length" - ) - @pytest.mark.parametrize("setter", ["loc", None]) - def test_setitem_mask_broadcast(self, data, setter): - super().test_setitem_mask_broadcast(data, setter) - - @pytest.mark.xfail( - reason="cannot set using a slice indexer with a different length" - ) - def test_setitem_slice(self, data, box_in_series): - super().test_setitem_slice(data, box_in_series) - - @pytest.mark.xfail(reason="slice object is not iterable") - def test_setitem_loc_iloc_slice(self, data): - super().test_setitem_loc_iloc_slice(data) - - @pytest.mark.xfail(reason="slice object is not iterable") - def test_setitem_slice_mismatch_length_raises(self, data): - super().test_setitem_slice_mismatch_length_raises(data) - - @pytest.mark.xfail(reason="slice object is not iterable") - def test_setitem_slice_array(self, data): - super().test_setitem_slice_array(data) - - @pytest.mark.xfail(reason="Fail to raise") - def test_setitem_invalid(self, data, invalid_scalar): - super().test_setitem_invalid(data, invalid_scalar) - - @pytest.mark.xfail(reason="only integer scalar arrays can be converted") - def test_setitem_2d_values(self, data): - super().test_setitem_2d_values(data) - - @pytest.mark.xfail(reason="data type 'json' not understood") - @pytest.mark.parametrize("engine", ["c", "python"]) - def test_EA_types(self, engine, data, request): - super().test_EA_types(engine, data, request) - - -def custom_assert_series_equal(left, right, *args, **kwargs): - # NumPy doesn't handle an array of equal-length UserDicts. - # The default assert_series_equal eventually does a - # Series.values, which raises. We work around it by - # converting the UserDicts to dicts. 
- if left.dtype.name == "json": - assert left.dtype == right.dtype - left = pd.Series( - JSONArray(left.values.astype(object)), index=left.index, name=left.name - ) - right = pd.Series( - JSONArray(right.values.astype(object)), - index=right.index, - name=right.name, - ) - tm.assert_series_equal(left, right, *args, **kwargs) + super().test_arith_series_with_scalar(data, all_arithmetic_operators) + + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: + return op_name in ["min", "max"] + + def _get_expected_exception( + self, op_name: str, obj, other + ) -> type[Exception] | None: + if op_name in ["__divmod__", "__rdivmod__"]: + if isinstance(obj, pd.Series) or isinstance(other, pd.Series): + return NotImplementedError + return TypeError + elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: + return NotImplementedError + elif op_name in ["__mul__", "__rmul__"]: + # Can only multiply strings by integers + return TypeError + elif op_name in [ + "__truediv__", + "__rtruediv__", + "__floordiv__", + "__rfloordiv__", + "__sub__", + "__rsub__", + ]: + return pa.ArrowNotImplementedError + + return None + + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): + dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj)) + if op_name in ["__add__", "__radd__"]: + cast_to = dtype + else: + cast_to = "boolean[pyarrow]" # type: ignore[assignment] + return pointwise_result.astype(cast_to) def custom_assert_frame_equal(left, right, *args, **kwargs): @@ -414,7 +151,7 @@ def custom_assert_frame_equal(left, right, *args, **kwargs): jsons = (left.dtypes == "json").index for col in jsons: - custom_assert_series_equal(left[col], right[col], *args, **kwargs) + tm.assert_series_equal(left[col], right[col], *args, **kwargs) left = left.drop(columns=jsons) right = right.drop(columns=jsons) @@ -422,22 +159,20 @@ def custom_assert_frame_equal(left, right, *args, **kwargs): def test_custom_asserts(): - # This would always trigger the KeyError from trying to put - # an array of equal-length UserDicts inside an ndarray. - data = JSONArray( + data = JSONArray._from_sequence( [ - dict({"a": 1}), - dict({"b": 2}), - dict({"c": 3}), + {"a": 1}, + {"b": 2}, + {"c": 3}, ] ) a = pd.Series(data) - custom_assert_series_equal(a, a) + tm.assert_series_equal(a, a) custom_assert_frame_equal(a.to_frame(), a.to_frame()) b = pd.Series(data.take([0, 0, 1])) with pytest.raises(AssertionError): - custom_assert_series_equal(a, b) + tm.assert_series_equal(a, b) with pytest.raises(AssertionError): custom_assert_frame_equal(a.to_frame(), b.to_frame()) diff --git a/tests/compliance/json/test_json_compliance_1_5.py b/tests/compliance/json/test_json_compliance_1_5.py deleted file mode 100644 index ee2d878..0000000 --- a/tests/compliance/json/test_json_compliance_1_5.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Tests for extension interface compliance, inherited from pandas. 
- -See: -https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/decimal/test_decimal.py -and -https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/test_period.py -""" - -from pandas.tests.extension import base -import pytest - -# NDArrayBacked2DTests suite added in https://github.com/pandas-dev/pandas/pull/44974 -pytest.importorskip("pandas", minversion="1.5.0dev") - - -# class Test2DCompat(base.NDArrayBacked2DTests): -# pass From 8bd13ccf8e2e70e7557abca7d3cf57d513c63d61 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 22 Jul 2024 21:39:25 +0000 Subject: [PATCH 04/28] box and unbox between string(storage) and dict(getitem) --- db_dtypes/json.py | 148 +++++-- tests/compliance/json/test_json_compliance.py | 360 +++++++++++++++--- 2 files changed, 428 insertions(+), 80 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 814e8d6..390cd36 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -14,14 +14,14 @@ from __future__ import annotations +import json import typing import numpy as np import pandas as pd -from pandas._libs import lib from pandas.core.arrays.arrow.array import ArrowExtensionArray -from pandas.core.arrays.numeric import NumericDtype -from pandas.core.dtypes.common import is_integer, is_scalar, pandas_dtype +from pandas.core.arrays.masked import BaseMaskedArray +from pandas.core.dtypes.common import is_dict_like, is_integer, is_list_like, is_scalar from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses import pyarrow as pa @@ -84,8 +84,43 @@ def __init__(self, values, dtype=None, copy=False) -> None: "large_string type" ) + @classmethod + def _box_pa( + cls, value, pa_type: pa.DataType | None = None + ) -> pa.Array | pa.ChunkedArray | pa.Scalar: + """ + Box value into a pyarrow Array, ChunkedArray or Scalar. + + Parameters + ---------- + value : any + pa_type : pa.DataType | None + + Returns + ------- + pa.Array or pa.ChunkedArray or pa.Scalar + """ + if isinstance(value, pa.Scalar) or not ( + is_list_like(value) and not is_dict_like(value) + ): + return cls._box_pa_scalar(value, pa_type) + return cls._box_pa_array(value, pa_type) + @classmethod def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: + """ + Box value into a pyarrow Scalar. + + Parameters + ---------- + value : any + pa_type : pa.DataType | None + + Returns + ------- + pa.Scalar + """ + value = JSONArray._seralizate_json(value) pa_scalar = super()._box_pa_scalar(value, pa_type) if pa.types.is_string(pa_scalar.type) and pa_type is None: pa_scalar = pc.cast(pa_scalar, pa.large_string()) @@ -95,6 +130,24 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: def _box_pa_array( cls, value, pa_type: pa.DataType | None = None, copy: bool = False ) -> pa.Array | pa.ChunkedArray: + """ + Box value into a pyarrow Array or ChunkedArray. 
+ + Parameters + ---------- + value : Sequence + pa_type : pa.DataType | None + + Returns + ------- + pa.Array or pa.ChunkedArray + """ + if ( + not isinstance(value, cls) + and not isinstance(value, (pa.Array, pa.ChunkedArray)) + and not isinstance(value, BaseMaskedArray) + ): + value = [JSONArray._seralizate_json(x) for x in value] pa_array = super()._box_pa_array(value, pa_type) if pa.types.is_string(pa_array.type) and pa_type is None: pa_array = pc.cast(pa_array, pa.large_string()) @@ -102,20 +155,21 @@ def _box_pa_array( @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): - from pandas.core.arrays.masked import BaseMaskedArray - - if isinstance(scalars, BaseMaskedArray): - # avoid costly conversion to object dtype in ensure_string_array and - # numerical issues with Float32Dtype - na_values = scalars._mask - result = scalars._data - result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - return cls(pa.array(result, mask=na_values, type=pa.large_string())) - elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): - return cls(pc.cast(scalars, pa.large_string())) - - # convert non-na-likes to str - result = lib.ensure_string_array(scalars, copy=copy) + # TODO: check _from_arrow APIs etc. + # from pandas.core.arrays.masked import BaseMaskedArray + + # if isinstance(scalars, BaseMaskedArray): + # # avoid costly conversion to object dtype in ensure_string_array and + # # numerical issues with Float32Dtype + # na_values = scalars._mask + # result = scalars._data + # # result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + # return cls(pa.array(result, mask=na_values, type=pa.large_string())) + # elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): + # return cls(pc.cast(scalars, pa.large_string())) + result = [] + for scalar in scalars: + result.append(JSONArray._seralizate_json(scalar)) return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) @classmethod @@ -124,30 +178,45 @@ def _from_sequence_of_strings( ) -> JSONArray: return cls._from_sequence(strings, dtype=dtype, copy=copy) + @staticmethod + def _seralizate_json(value): + if isinstance(value, str) or pd.isna(value): + return value + else: + # `sort_keys=True` sorts dictionary keys before serialization, making + # JSON comparisons deterministic. + return json.dumps(value, sort_keys=True) + + @staticmethod + def _deserialize_json(value): + if not pd.isna(value): + return json.loads(value) + else: + return value + @property def dtype(self) -> JSONDtype: """An instance of JSONDtype""" return self._dtype - def insert(self, loc: int, item) -> JSONArray: - if not isinstance(item, str) and not pd.isna(item): - raise TypeError("Scalar must be NA or str") - return super().insert(loc, item) + def __contains__(self, key) -> bool: + return super().__contains__(JSONArray._seralizate_json(key)) + + # def __contains__(self, key) -> bool: + # # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604 + # if pd.isna(key) and key is not self.dtype.na_value: + # if self.dtype.kind == "f" and lib.is_float(key): + # return pc.any(pc.is_nan(self._pa_array)).as_py() - def astype(self, dtype, copy: bool = True): - dtype = pandas_dtype(dtype) + # # e.g. date or timestamp types we do not allow None here to match pd.NA + # return False + # # TODO: maybe complex? object? 
- if dtype == self.dtype: - if copy: - return self.copy() - return self - elif isinstance(dtype, NumericDtype): - data = self._pa_array.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) - return dtype.__from_arrow__(data) - elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating): - return self.to_numpy(dtype=dtype, na_value=np.nan) + # return bool(super().__contains__(key)) - return super().astype(dtype, copy=copy) + def insert(self, loc: int, item) -> JSONArray: + val = JSONArray._seralizate_json(item) + return super().insert(loc, val) @classmethod def _from_factorized(cls, values, original): @@ -219,12 +288,23 @@ def __getitem__(self, item): if isinstance(value, pa.ChunkedArray): return type(self)(value) else: - scalar = value.as_py() + scalar = JSONArray._deserialize_json(value.as_py()) if scalar is None: return self._dtype.na_value else: return scalar + def __iter__(self): + """ + Iterate over elements of the array. + """ + for value in self._pa_array: + val = JSONArray._deserialize_json(value.as_py()) + if val is None: + yield self._dtype.na_value + else: + yield val + @classmethod def _result_converter(cls, values, na=None): return pd.BooleanDtype().__from_arrow__(values) diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index d46b935..b4d55a6 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -11,29 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -Tests for extension interface compliance, inherited from pandas. - -See: -https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/decimal/test_decimal.py -and -https://github.com/pandas-dev/pandas/blob/main/pandas/tests/extension/test_period.py -""" +import json import typing +import numpy as np import pandas as pd import pandas._testing as tm +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.tests.extension import base -import pyarrow as pa import pytest from db_dtypes import JSONArray -# We intentionally don't run base.BaseSetitemTests because pandas' -# internals has trouble setting sequences of values into scalar positions. -unhashable = pytest.mark.xfail(reason="Unhashable") - class TestJSONArray(base.ExtensionTests): @pytest.mark.parametrize( @@ -59,10 +49,35 @@ def test_ffill_limit_area( data_missing, limit_area, input_ilocs, expected_ilocs ) - @unhashable + @pytest.mark.xfail(reason="Unhashable") def test_value_counts_with_normalize(self, data): super().test_value_counts_with_normalize(data) + @pytest.mark.xfail(reason="Unhashable") + def test_groupby_extension_transform(self): + """ + This currently fails in Series.name.setter, since the + name must be hashable, but the value is a dictionary. + I think this is what we want, i.e. `.name` should be the original + values, and not the values for factorization. + """ + super().test_groupby_extension_transform() + + @pytest.mark.xfail(reason="Unhashable") + def test_groupby_extension_apply(self): + """ + This fails in Index._do_unique_check with + > hash(val) + E TypeError: unhashable type: 'dict' with + I suspect that once we support Index[ExtensionArray], + we'll be able to dispatch unique. 
+ """ + super().test_groupby_extension_apply() + + @pytest.mark.xfail(reason="Unhashable") + def test_sort_values_frame(self): + super().test_sort_values_frame() + @pytest.mark.xfail(reason="combine for JSONArray not supported") def test_combine_le(self, data_repeated): super().test_combine_le(data_repeated) @@ -88,45 +103,45 @@ def test_setitem_preserves_views(self, data): def test_transpose(self, data): super().test_transpose(data) - def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): - if len(data[0]) != 1: - mark = pytest.mark.xfail(reason="raises in coercing to Series") - request.applymarker(mark) + @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") + def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): super().test_arith_frame_with_scalar(data, all_arithmetic_operators) - def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request): - if len(data[0]) != 1: - mark = pytest.mark.xfail(reason="raises in coercing to Series") - request.applymarker(mark) + @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + super().test_arith_series_with_scalar(data, all_arithmetic_operators) + + @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): super().test_arith_series_with_scalar(data, all_arithmetic_operators) + @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") + def test_arith_series_with_array(self, data, all_arithmetic_operators): + super().test_arith_series_with_array(data, all_arithmetic_operators) + + @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") + def test_add_series_with_extension_array(self, data): + super().test_add_series_with_extension_array(data, data) + + @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") + def test_divmod(self, data): + super().test_divmod(data, data) + + def test_compare_array(self, data, comparison_op, request): + if comparison_op.__name__ not in ["eq", "ne"]: + mark = pytest.mark.xfail(reason="Comparison methods not implemented") + request.applymarker(mark) + super().test_compare_array(data, comparison_op) + + def test_compare_scalar(self, data, comparison_op, request): + if comparison_op.__name__ not in ["eq", "ne"]: + mark = pytest.mark.xfail(reason="Comparison methods not implemented") + request.applymarker(mark) + super().test_compare_scalar(data, comparison_op) + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return op_name in ["min", "max"] - def _get_expected_exception( - self, op_name: str, obj, other - ) -> type[Exception] | None: - if op_name in ["__divmod__", "__rdivmod__"]: - if isinstance(obj, pd.Series) or isinstance(other, pd.Series): - return NotImplementedError - return TypeError - elif op_name in ["__mod__", "__rmod__", "__pow__", "__rpow__"]: - return NotImplementedError - elif op_name in ["__mul__", "__rmul__"]: - # Can only multiply strings by integers - return TypeError - elif op_name in [ - "__truediv__", - "__rtruediv__", - "__floordiv__", - "__rfloordiv__", - "__sub__", - "__rsub__", - ]: - return pa.ArrowNotImplementedError - - return None - def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj)) if op_name in ["__add__", "__radd__"]: @@ -135,6 +150,259 @@ def 
_cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         cast_to = "boolean[pyarrow]"  # type: ignore[assignment]
         return pointwise_result.astype(cast_to)
 
+    @pytest.mark.skip(reason="'<' not supported between instances of 'dict' and 'dict'")
+    def test_searchsorted(self, data_for_sorting, as_series):
+        super().test_searchsorted(data_for_sorting, as_series)
+
+    def test_astype_str(self, data):
+        # Use `json.dumps(x, sort_keys=True)` instead of the `str(x)` the super method uses.
+        result = pd.Series(data[:5]).astype(str)
+        expected = pd.Series(
+            [json.dumps(x, sort_keys=True) for x in data[:5]], dtype=str
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "nullable_string_dtype",
+        [
+            "string[python]",
+            "string[pyarrow]",
+        ],
+    )
+    def test_astype_string(self, data, nullable_string_dtype):
+        # Use `json.dumps(x, sort_keys=True)` instead of the `str(x)` the super method uses.
+        result = pd.Series(data[:5]).astype(nullable_string_dtype)
+        expected = pd.Series(
+            [json.dumps(x, sort_keys=True) for x in data[:5]],
+            dtype=nullable_string_dtype,
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_array_interface(self, data):
+        result = np.array(data)
+        # Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method.
+        assert result[0] == json.dumps(data[0])
+
+        result = np.array(data, dtype=object)
+        # Use `json.dumps(x)` instead of passing `x` directly to the super method.
+        expected = np.array([json.dumps(x) for x in data], dtype=object)
+        if expected.ndim > 1:
+            # nested data, explicitly construct as 1D
+            expected = construct_1d_object_array_from_listlike(list(data))
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.xfail(reason="Setting a dict as a scalar")
+    def test_fillna_series(self):
+        """We treat dictionaries as a mapping in fillna, not a scalar."""
+        super().test_fillna_series()
+
+    @pytest.mark.xfail(reason="Setting a dict as a scalar")
+    def test_fillna_frame(self):
+        """We treat dictionaries as a mapping in fillna, not a scalar."""
+        super().test_fillna_frame()
+
+    @pytest.mark.skip("fill-value is interpreted as a dict of values")
+    def test_fillna_copy_frame(self, data_missing):
+        super().test_fillna_copy_frame(data_missing)
+
+    def test_from_dtype(self, data):
+        # construct from our dtype & string dtype
+        dtype = data.dtype
+
+        expected = pd.Series(data)
+        result = pd.Series(list(data), dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+        result = pd.Series(list(data), dtype=str(dtype))
+        tm.assert_series_equal(result, expected)
+
+        # Use `{"col1": data}` instead of passing `data` directly to the super method.
+        # This prevents the DataFrame constructor from attempting to interpret the
+        # dictionary as column headers.
+
+        # gh-30280
+        expected = pd.DataFrame({"col1": data}).astype(dtype)
+        result = pd.DataFrame({"col1": list(data)}, dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+        result = pd.DataFrame({"col1": list(data)}, dtype=str(dtype))
+        tm.assert_frame_equal(result, expected)
+
+    def test_series_constructor_scalar_with_index(self, data, dtype):
+        # Use json.dumps(data[0]) instead of passing data[0] directly to the super method.
+        # This prevents the Series constructor from interpreting the dict's keys
+        # as index labels.
+        scalar = json.dumps(data[0])
+        result = pd.Series(scalar, index=[1, 2, 3], dtype=dtype)
+        expected = pd.Series([scalar] * 3, index=[1, 2, 3], dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+        result = pd.Series(scalar, index=["foo"], dtype=dtype)
+        expected = pd.Series([scalar], index=["foo"], dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+    # We patch `json.dumps` into the base.BaseSetitemTests methods below because
+    # pandas' internals has trouble setting sequences of values into scalar positions.
+
+    @pytest.mark.parametrize(
+        "idx",
+        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
+        ids=["list", "integer-array", "numpy-array"],
+    )
+    def test_setitem_integer_array(self, data, idx, box_in_series):
+        arr = data[:5].copy()
+        expected = data.take([0, 0, 0, 3, 4])
+
+        if box_in_series:
+            arr = pd.Series(arr)
+            expected = pd.Series(expected)
+
+        # Use json.dumps(arr[0]) instead of passing arr[0] directly to the super method.
+        arr[idx] = json.dumps(arr[0])
+        tm.assert_equal(arr, expected)
+
+    @pytest.mark.parametrize("setter", ["loc", None])
+    def test_setitem_mask_broadcast(self, data, setter):
+        ser = pd.Series(data)
+        mask = np.zeros(len(data), dtype=bool)
+        mask[:2] = True
+
+        if setter:  # loc
+            target = getattr(ser, setter)
+        else:  # __setitem__
+            target = ser
+
+        # Use json.dumps(data[10]) instead of passing data[10] directly to the super method.
+        target[mask] = json.dumps(data[10])
+        assert ser[0] == data[10]
+        assert ser[1] == data[10]
+
+    def test_setitem_loc_scalar_mixed(self, data):
+        df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
+        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
+        df.loc[0, "B"] = json.dumps(data[1])
+        assert df.loc[0, "B"] == data[1]
+
+    def test_setitem_loc_scalar_single(self, data):
+        df = pd.DataFrame({"B": data})
+        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
+        df.loc[10, "B"] = json.dumps(data[1])
+        assert df.loc[10, "B"] == data[1]
+
+    def test_setitem_loc_iloc_slice(self, data):
+        arr = data[:5].copy()
+        s = pd.Series(arr, index=["a", "b", "c", "d", "e"])
+        expected = pd.Series(data.take([0, 0, 0, 3, 4]), index=s.index)
+
+        result = s.copy()
+        # Use json.dumps(data[0]) instead of passing data[0] directly to the super method.
+        result.iloc[:3] = json.dumps(data[0])
+        tm.assert_equal(result, expected)
+
+        result = s.copy()
+        result.loc[:"c"] = json.dumps(data[0])
+        tm.assert_equal(result, expected)
+
+    def test_setitem_iloc_scalar_single(self, data):
+        df = pd.DataFrame({"B": data})
+        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
+        df.iloc[10, 0] = json.dumps(data[1])
+        assert df.loc[10, "B"] == data[1]
+
+    def test_setitem_iloc_scalar_mixed(self, data):
+        df = pd.DataFrame({"A": np.arange(len(data)), "B": data})
+        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
+        df.iloc[0, 1] = json.dumps(data[1])
+        assert df.loc[0, "B"] == data[1]
+
+    @pytest.mark.xfail(reason="eq not implemented for ")
+    def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
+        super().test_setitem_mask_boolean_array_with_na(data, box_in_series)
+
+    @pytest.mark.parametrize("setter", ["loc", "iloc"])
+    def test_setitem_scalar(self, data, setter):
+        arr = pd.Series(data)
+        setter = getattr(arr, setter)
+        # Use json.dumps(data[1]) instead of passing data[1] directly to the super method.
+ setter[0] = json.dumps(data[1]) + assert arr[0] == data[1] + + @pytest.mark.parametrize( + "mask", + [ + np.array([True, True, True, False, False]), + pd.array([True, True, True, False, False], dtype="boolean"), + pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"), + ], + ids=["numpy-array", "boolean-array", "boolean-array-na"], + ) + def test_setitem_mask(self, data, mask, box_in_series): + arr = data[:5].copy() + expected = arr.take([0, 0, 0, 3, 4]) + if box_in_series: + arr = pd.Series(arr) + expected = pd.Series(expected) + # Use json.dumps(data[0]) instead of passing data[0] directly to the super method. + arr[mask] = json.dumps(data[0]) + tm.assert_equal(expected, arr) + + def test_setitem_with_expansion_row(self, data, na_value): + df = pd.DataFrame({"data": data[:1]}) + + # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. + df.loc[1, "data"] = json.dumps(data[1]) + expected = pd.DataFrame({"data": data[:2]}) + tm.assert_frame_equal(df, expected) + + # https://github.com/pandas-dev/pandas/issues/47284 + df.loc[2, "data"] = na_value + expected = pd.DataFrame( + {"data": pd.Series([data[0], data[1], na_value], dtype=data.dtype)} + ) + tm.assert_frame_equal(df, expected) + + def test_setitem_iloc_scalar_multiple_homogoneous(self, data): + df = pd.DataFrame({"A": data, "B": data}) + # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. + df.iloc[10, 1] = json.dumps(data[1]) + assert df.loc[10, "B"] == data[1] + + def test_setitem_loc_scalar_multiple_homogoneous(self, data): + df = pd.DataFrame({"A": data, "B": data}) + # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. + df.loc[10, "B"] = json.dumps(data[1]) + assert df.loc[10, "B"] == data[1] + + def test_setitem_slice(self, data, box_in_series): + arr = data[:5].copy() + expected = data.take([0, 0, 0, 3, 4]) + if box_in_series: + arr = pd.Series(arr) + expected = pd.Series(expected) + + # Use json.dumps(data[0]) instead of passing data[0] directly to the super method. + arr[:3] = json.dumps(data[0]) + tm.assert_equal(arr, expected) + + @pytest.mark.xfail(reason="only integer scalar arrays can be converted") + def test_setitem_2d_values(self, data): + super().test_setitem_2d_values(data) + + @pytest.mark.xfail(reason="data type 'json' not understood") + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + def test_getitem_scalar(self, data): + result = data[0] + # While JSONDtype internally stores data as pyarrow strings + # (equivalent to data.dtype.type), it is deliberately designed to return a + # dictionary as the result. 
+ assert isinstance(result, dict) + + result = pd.Series(data)[0] + assert isinstance(result, dict) + def custom_assert_frame_equal(left, right, *args, **kwargs): obj_type = kwargs.get("obj", "DataFrame") From e29585de76ee05351a16cd5e8900998d770c975b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 22 Jul 2024 21:42:00 +0000 Subject: [PATCH 05/28] minor --- db_dtypes/json.py | 12 ------ tests/compliance/json/test_json_compliance.py | 42 ------------------- 2 files changed, 54 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 390cd36..431da43 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -202,18 +202,6 @@ def dtype(self) -> JSONDtype: def __contains__(self, key) -> bool: return super().__contains__(JSONArray._seralizate_json(key)) - # def __contains__(self, key) -> bool: - # # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604 - # if pd.isna(key) and key is not self.dtype.na_value: - # if self.dtype.kind == "f" and lib.is_float(key): - # return pc.any(pc.is_nan(self._pa_array)).as_py() - - # # e.g. date or timestamp types we do not allow None here to match pd.NA - # return False - # # TODO: maybe complex? object? - - # return bool(super().__contains__(key)) - def insert(self, loc: int, item) -> JSONArray: val = JSONArray._seralizate_json(item) return super().insert(loc, val) diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index b4d55a6..30a25d6 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -402,45 +402,3 @@ def test_getitem_scalar(self, data): result = pd.Series(data)[0] assert isinstance(result, dict) - - -def custom_assert_frame_equal(left, right, *args, **kwargs): - obj_type = kwargs.get("obj", "DataFrame") - tm.assert_index_equal( - left.columns, - right.columns, - exact=kwargs.get("check_column_type", "equiv"), - check_names=kwargs.get("check_names", True), - check_exact=kwargs.get("check_exact", False), - check_categorical=kwargs.get("check_categorical", True), - obj=f"{obj_type}.columns", - ) - - jsons = (left.dtypes == "json").index - - for col in jsons: - tm.assert_series_equal(left[col], right[col], *args, **kwargs) - - left = left.drop(columns=jsons) - right = right.drop(columns=jsons) - tm.assert_frame_equal(left, right, *args, **kwargs) - - -def test_custom_asserts(): - data = JSONArray._from_sequence( - [ - {"a": 1}, - {"b": 2}, - {"c": 3}, - ] - ) - a = pd.Series(data) - tm.assert_series_equal(a, a) - custom_assert_frame_equal(a.to_frame(), a.to_frame()) - - b = pd.Series(data.take([0, 0, 1])) - with pytest.raises(AssertionError): - tm.assert_series_equal(a, b) - - with pytest.raises(AssertionError): - custom_assert_frame_equal(a.to_frame(), b.to_frame()) From 84690ee2f7f5a7db6b6e3eb59a09b596c81209ec Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 22 Jul 2024 22:22:37 +0000 Subject: [PATCH 06/28] fix test_getitem_scalar test --- db_dtypes/json.py | 2 +- tests/compliance/json/test_json_compliance.py | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 431da43..43336bf 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -40,7 +40,7 @@ def na_value(self) -> pd.NA: @property def type(self) -> type[str]: - return str + return dict @property def _is_numeric(self) -> bool: diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index 30a25d6..f434f6e 100644 --- 
a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -392,13 +392,3 @@ def test_setitem_2d_values(self, data): @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data, request): super().test_EA_types(engine, data, request) - - def test_getitem_scalar(self, data): - result = data[0] - # While JSONDtype internally stores data as pyarrow strings - # (equivalent to data.dtype.type), it is deliberately designed to return a - # dictionary as the result. - assert isinstance(result, dict) - - result = pd.Series(data)[0] - assert isinstance(result, dict) From d11cc873756f859d55933e8539c52b89a089050d Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 22 Jul 2024 22:45:24 +0000 Subject: [PATCH 07/28] add docstring and remove unused functions --- db_dtypes/json.py | 117 ++++-------------- tests/compliance/json/test_json_compliance.py | 23 ---- 2 files changed, 22 insertions(+), 118 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 43336bf..ce89b0d 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -30,16 +30,18 @@ @pd.api.extensions.register_extension_dtype class JSONDtype(pd.api.extensions.ExtensionDtype): - """Extension dtype for JSON data.""" + """Extension dtype for BigQuery JSON data.""" name = "dbjson" @property def na_value(self) -> pd.NA: + """Default NA value to use for this type.""" return pd.NA @property def type(self) -> type[str]: + """Return the scalar type for the array, e.g. int.""" return dict @property @@ -62,7 +64,9 @@ def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray: class JSONArray(ArrowExtensionArray): - """Extension array containing JSON data.""" + """Extension array that handles BigQuery JSON data, leveraging a string-based + pyarrow array for storage. It enables seamless conversion to JSON objects when + accessing individual elements.""" _dtype = JSONDtype() @@ -88,18 +92,7 @@ def __init__(self, values, dtype=None, copy=False) -> None: def _box_pa( cls, value, pa_type: pa.DataType | None = None ) -> pa.Array | pa.ChunkedArray | pa.Scalar: - """ - Box value into a pyarrow Array, ChunkedArray or Scalar. - - Parameters - ---------- - value : any - pa_type : pa.DataType | None - - Returns - ------- - pa.Array or pa.ChunkedArray or pa.Scalar - """ + """Box value into a pyarrow Array, ChunkedArray or Scalar.""" if isinstance(value, pa.Scalar) or not ( is_list_like(value) and not is_dict_like(value) ): @@ -108,18 +101,7 @@ def _box_pa( @classmethod def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: - """ - Box value into a pyarrow Scalar. - - Parameters - ---------- - value : any - pa_type : pa.DataType | None - - Returns - ------- - pa.Scalar - """ + """Box value into a pyarrow Scalar.""" value = JSONArray._seralizate_json(value) pa_scalar = super()._box_pa_scalar(value, pa_type) if pa.types.is_string(pa_scalar.type) and pa_type is None: @@ -130,18 +112,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: def _box_pa_array( cls, value, pa_type: pa.DataType | None = None, copy: bool = False ) -> pa.Array | pa.ChunkedArray: - """ - Box value into a pyarrow Array or ChunkedArray. 
- - Parameters - ---------- - value : Sequence - pa_type : pa.DataType | None - - Returns - ------- - pa.Array or pa.ChunkedArray - """ + """Box value into a pyarrow Array or ChunkedArray.""" if ( not isinstance(value, cls) and not isinstance(value, (pa.Array, pa.ChunkedArray)) @@ -155,18 +126,7 @@ def _box_pa_array( @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): - # TODO: check _from_arrow APIs etc. - # from pandas.core.arrays.masked import BaseMaskedArray - - # if isinstance(scalars, BaseMaskedArray): - # # avoid costly conversion to object dtype in ensure_string_array and - # # numerical issues with Float32Dtype - # na_values = scalars._mask - # result = scalars._data - # # result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) - # return cls(pa.array(result, mask=na_values, type=pa.large_string())) - # elif isinstance(scalars, (pa.Array, pa.ChunkedArray)): - # return cls(pc.cast(scalars, pa.large_string())) + """Construct a new ExtensionArray from a sequence of scalars.""" result = [] for scalar in scalars: result.append(JSONArray._seralizate_json(scalar)) @@ -176,10 +136,12 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False): def _from_sequence_of_strings( cls, strings, *, dtype: ExtensionDtype, copy: bool = False ) -> JSONArray: + """Construct a new ExtensionArray from a sequence of strings.""" return cls._from_sequence(strings, dtype=dtype, copy=copy) @staticmethod def _seralizate_json(value): + """A static method that converts a JSON value into a string representation.""" if isinstance(value, str) or pd.isna(value): return value else: @@ -189,6 +151,7 @@ def _seralizate_json(value): @staticmethod def _deserialize_json(value): + """A static method that converts a JSON string back into its original value.""" if not pd.isna(value): return json.loads(value) else: @@ -200,40 +163,24 @@ def dtype(self) -> JSONDtype: return self._dtype def __contains__(self, key) -> bool: + """Return for `item in self`.""" return super().__contains__(JSONArray._seralizate_json(key)) def insert(self, loc: int, item) -> JSONArray: + """ + Make new ExtensionArray inserting new item at location. Follows Python + list.append semantics for negative values. + """ val = JSONArray._seralizate_json(item) return super().insert(loc, val) @classmethod def _from_factorized(cls, values, original): + """Reconstruct an ExtensionArray after factorization.""" return cls._from_sequence(values, dtype=original.dtype) def __getitem__(self, item): - """Select a subset of self. - - Parameters - ---------- - item : int, slice, or ndarray - * int: The position in 'self' to get. - * slice: A slice object, where 'start', 'stop', and 'step' are - integers or None - * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' - - Returns - ------- - item : scalar or ExtensionArray - - Notes - ----- - For scalar ``item``, return a scalar value suitable for the array's - type. This should be an instance of ``self.dtype.type``. - For slice ``key``, return an instance of ``ExtensionArray``, even - if the slice is length 0 or 1. - For a boolean mask, return an instance of ``ExtensionArray``, filtered - to the values where ``item`` is True. - """ + """Select a subset of self.""" item = check_array_indexer(self, item) if isinstance(item, np.ndarray): @@ -283,9 +230,7 @@ def __getitem__(self, item): return scalar def __iter__(self): - """ - Iterate over elements of the array. 
- """ + """Iterate over elements of the array.""" for value in self._pa_array: val = JSONArray._deserialize_json(value.as_py()) if val is None: @@ -293,27 +238,9 @@ def __iter__(self): else: yield val - @classmethod - def _result_converter(cls, values, na=None): - return pd.BooleanDtype().__from_arrow__(values) - @classmethod def _concat_same_type(cls, to_concat) -> JSONArray: - """ - Concatenate multiple JSONArray. - - Parameters - ---------- - to_concat : sequence of JSONArray - - Returns - ------- - JSONArray - """ + """Concatenate multiple JSONArray.""" chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] arr = pa.chunked_array(chunks, type=pa.large_string()) return cls(arr) - - def _pad_or_backfill(self, *, method, limit=None, copy=True): - # GH#56616 - test EA method without limit_area argument - return super()._pad_or_backfill(method=method, limit=limit, copy=copy) diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index f434f6e..89c13ec 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -26,29 +26,6 @@ class TestJSONArray(base.ExtensionTests): - @pytest.mark.parametrize( - "limit_area, input_ilocs, expected_ilocs", - [ - ("outside", [1, 0, 0, 0, 1], [1, 0, 0, 0, 1]), - ("outside", [1, 0, 1, 0, 1], [1, 0, 1, 0, 1]), - ("outside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 1]), - ("outside", [0, 1, 0, 1, 0], [0, 1, 0, 1, 1]), - ("inside", [1, 0, 0, 0, 1], [1, 1, 1, 1, 1]), - ("inside", [1, 0, 1, 0, 1], [1, 1, 1, 1, 1]), - ("inside", [0, 1, 1, 1, 0], [0, 1, 1, 1, 0]), - ("inside", [0, 1, 0, 1, 0], [0, 1, 1, 1, 0]), - ], - ) - def test_ffill_limit_area( - self, data_missing, limit_area, input_ilocs, expected_ilocs - ): - # GH#56616 - msg = "JSONArray does not implement limit_area" - with pytest.raises(NotImplementedError, match=msg): - super().test_ffill_limit_area( - data_missing, limit_area, input_ilocs, expected_ilocs - ) - @pytest.mark.xfail(reason="Unhashable") def test_value_counts_with_normalize(self, data): super().test_value_counts_with_normalize(data) From 60da5700f7c2353c185d9f641bbef93c8e67d70b Mon Sep 17 00:00:00 2001 From: Owl Bot Date: Mon, 22 Jul 2024 22:54:13 +0000 Subject: [PATCH 08/28] =?UTF-8?q?=F0=9F=A6=89=20Updates=20from=20OwlBot=20?= =?UTF-8?q?post-processor?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --- docs/conf.py | 2 +- samples/snippets/noxfile.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 672daff..00e0013 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,9 +24,9 @@ # All configuration values have a default; values that are commented out # serve to show the default. +import sys import os import shlex -import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py index c36d5f2..3b71359 100644 --- a/samples/snippets/noxfile.py +++ b/samples/snippets/noxfile.py @@ -22,6 +22,7 @@ import nox + # WARNING - WARNING - WARNING - WARNING - WARNING # WARNING - WARNING - WARNING - WARNING - WARNING # DO NOT EDIT THIS FILE EVER! 
From 48ee67db1f010e89f391b1b7ed58ea79b6b112a8 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 22 Jul 2024 22:54:26 +0000 Subject: [PATCH 09/28] fix lint --- db_dtypes/json.py | 24 +++++++++---------- tests/compliance/json/test_json_compliance.py | 6 ----- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index ce89b0d..62c44e8 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -139,6 +139,18 @@ def _from_sequence_of_strings( """Construct a new ExtensionArray from a sequence of strings.""" return cls._from_sequence(strings, dtype=dtype, copy=copy) + @classmethod + def _concat_same_type(cls, to_concat) -> JSONArray: + """Concatenate multiple JSONArray.""" + chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] + arr = pa.chunked_array(chunks, type=pa.large_string()) + return cls(arr) + + @classmethod + def _from_factorized(cls, values, original): + """Reconstruct an ExtensionArray after factorization.""" + return cls._from_sequence(values, dtype=original.dtype) + @staticmethod def _seralizate_json(value): """A static method that converts a JSON value into a string representation.""" @@ -174,11 +186,6 @@ def insert(self, loc: int, item) -> JSONArray: val = JSONArray._seralizate_json(item) return super().insert(loc, val) - @classmethod - def _from_factorized(cls, values, original): - """Reconstruct an ExtensionArray after factorization.""" - return cls._from_sequence(values, dtype=original.dtype) - def __getitem__(self, item): """Select a subset of self.""" item = check_array_indexer(self, item) @@ -237,10 +244,3 @@ def __iter__(self): yield self._dtype.na_value else: yield val - - @classmethod - def _concat_same_type(cls, to_concat) -> JSONArray: - """Concatenate multiple JSONArray.""" - chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] - arr = pa.chunked_array(chunks, type=pa.large_string()) - return cls(arr) diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index 89c13ec..02185fc 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -22,8 +22,6 @@ from pandas.tests.extension import base import pytest -from db_dtypes import JSONArray - class TestJSONArray(base.ExtensionTests): @pytest.mark.xfail(reason="Unhashable") @@ -88,10 +86,6 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): def test_arith_series_with_scalar(self, data, all_arithmetic_operators): super().test_arith_series_with_scalar(data, all_arithmetic_operators) - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") - def test_arith_series_with_scalar(self, data, all_arithmetic_operators): - super().test_arith_series_with_scalar(data, all_arithmetic_operators) - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") def test_arith_series_with_array(self, data, all_arithmetic_operators): super().test_arith_series_with_array(data, all_arithmetic_operators) From 91d5016a2642b66f5a7fc817c592a19b6b59cd7d Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 24 Jul 2024 21:52:48 +0000 Subject: [PATCH 10/28] address some comments --- db_dtypes/json.py | 88 +++++++++++++++++++++++------------------------ 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 62c44e8..8fcfbd1 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -19,13 +19,10 @@ import numpy as np import pandas as pd -from 
pandas.core.arrays.arrow.array import ArrowExtensionArray -from pandas.core.arrays.masked import BaseMaskedArray -from pandas.core.dtypes.common import is_dict_like, is_integer, is_list_like, is_scalar -from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.indexers import check_array_indexer, unpack_tuple_and_ellipses +import pandas.arrays as arrays +import pandas.core.dtypes.common as common +import pandas.core.indexers as indexers import pyarrow as pa -import pyarrow.compute as pc @pd.api.extensions.register_extension_dtype @@ -63,7 +60,7 @@ def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray: return JSONArray(array) -class JSONArray(ArrowExtensionArray): +class JSONArray(arrays.ArrowExtensionArray): """Extension array that handles BigQuery JSON data, leveraging a string-based pyarrow array for storage. It enables seamless conversion to JSON objects when accessing individual elements.""" @@ -71,22 +68,13 @@ class JSONArray(ArrowExtensionArray): _dtype = JSONDtype() def __init__(self, values, dtype=None, copy=False) -> None: - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( - values.type - ): - values = pc.cast(values, pa.large_string()) - - super().__init__(values) self._dtype = JSONDtype() - - if not pa.types.is_large_string(self._pa_array.type) and not ( - pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_large_string(self._pa_array.type.value_type) - ): - raise ValueError( - "ArrowStringArray requires a PyArrow (chunked) array of " - "large_string type" - ) + if isinstance(values, pa.Array): + self._pa_array = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self._pa_array = values + else: + raise ValueError(f"Unsupported type '{type(values)}' for JSONArray") @classmethod def _box_pa( @@ -94,7 +82,7 @@ def _box_pa( ) -> pa.Array | pa.ChunkedArray | pa.Scalar: """Box value into a pyarrow Array, ChunkedArray or Scalar.""" if isinstance(value, pa.Scalar) or not ( - is_list_like(value) and not is_dict_like(value) + common.is_list_like(value) and not common.is_dict_like(value) ): return cls._box_pa_scalar(value, pa_type) return cls._box_pa_array(value, pa_type) @@ -102,10 +90,16 @@ def _box_pa( @classmethod def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar: """Box value into a pyarrow Scalar.""" - value = JSONArray._seralizate_json(value) - pa_scalar = super()._box_pa_scalar(value, pa_type) - if pa.types.is_string(pa_scalar.type) and pa_type is None: - pa_scalar = pc.cast(pa_scalar, pa.large_string()) + if isinstance(value, pa.Scalar): + pa_scalar = value + if pd.isna(value): + pa_scalar = pa.scalar(None, type=pa_type) + else: + value = JSONArray._serialize_json(value) + pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True) + + if pa_type is not None and pa_scalar.type != pa_type: + pa_scalar = pa_scalar.cast(pa_type) return pa_scalar @classmethod @@ -113,15 +107,21 @@ def _box_pa_array( cls, value, pa_type: pa.DataType | None = None, copy: bool = False ) -> pa.Array | pa.ChunkedArray: """Box value into a pyarrow Array or ChunkedArray.""" - if ( - not isinstance(value, cls) - and not isinstance(value, (pa.Array, pa.ChunkedArray)) - and not isinstance(value, BaseMaskedArray) - ): - value = [JSONArray._seralizate_json(x) for x in value] - pa_array = super()._box_pa_array(value, pa_type) - if pa.types.is_string(pa_array.type) and pa_type is None: - pa_array = pc.cast(pa_array, pa.large_string()) + if isinstance(value, cls): + pa_array = 
value._pa_array + elif isinstance(value, (pa.Array, pa.ChunkedArray)): + pa_array = value + else: + try: + value = [JSONArray._serialize_json(x) for x in value] + pa_array = pa.array(value, type=pa_type, from_pandas=True) + except (pa.ArrowInvalid, pa.ArrowTypeError): + # GH50430: let pyarrow infer type, then cast + pa_array = pa.array(value, from_pandas=True) + + if pa_type is not None and pa_array.type != pa_type: + pa_array = pa_array.cast(pa_type) + return pa_array @classmethod @@ -129,12 +129,12 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False): """Construct a new ExtensionArray from a sequence of scalars.""" result = [] for scalar in scalars: - result.append(JSONArray._seralizate_json(scalar)) + result.append(JSONArray._serialize_json(scalar)) return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) @classmethod def _from_sequence_of_strings( - cls, strings, *, dtype: ExtensionDtype, copy: bool = False + cls, strings, *, dtype, copy: bool = False ) -> JSONArray: """Construct a new ExtensionArray from a sequence of strings.""" return cls._from_sequence(strings, dtype=dtype, copy=copy) @@ -152,7 +152,7 @@ def _from_factorized(cls, values, original): return cls._from_sequence(values, dtype=original.dtype) @staticmethod - def _seralizate_json(value): + def _serialize_json(value): """A static method that converts a JSON value into a string representation.""" if isinstance(value, str) or pd.isna(value): return value @@ -176,19 +176,19 @@ def dtype(self) -> JSONDtype: def __contains__(self, key) -> bool: """Return for `item in self`.""" - return super().__contains__(JSONArray._seralizate_json(key)) + return super().__contains__(JSONArray._serialize_json(key)) def insert(self, loc: int, item) -> JSONArray: """ Make new ExtensionArray inserting new item at location. Follows Python list.append semantics for negative values. """ - val = JSONArray._seralizate_json(item) + val = JSONArray._serialize_json(item) return super().insert(loc, val) def __getitem__(self, item): """Select a subset of self.""" - item = check_array_indexer(self, item) + item = indexers.check_array_indexer(self, item) if isinstance(item, np.ndarray): if not len(item): @@ -203,9 +203,9 @@ def __getitem__(self, item): "boolean arrays are valid indices." ) elif isinstance(item, tuple): - item = unpack_tuple_and_ellipses(item) + item = indexers.unpack_tuple_and_ellipses(item) - if is_scalar(item) and not is_integer(item): + if common.is_scalar(item) and not common.is_integer(item): # e.g. 
"foo" or 2.5 # exception message copied from numpy raise IndexError( From 191deef76a6e587e274a0448490b3f8aad1d8701 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 30 Jul 2024 20:09:47 +0000 Subject: [PATCH 11/28] supports all types except Array --- db_dtypes/json.py | 75 ++++++++--- tests/compliance/json/test_json_compliance.py | 117 +++++------------- 2 files changed, 93 insertions(+), 99 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 8fcfbd1..e19a357 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -23,7 +23,16 @@ import pandas.core.dtypes.common as common import pandas.core.indexers as indexers import pyarrow as pa +import pyarrow.compute +ARROW_CMP_FUNCS = { + "eq": pyarrow.compute.equal, + "ne": pyarrow.compute.not_equal, + "lt": pyarrow.compute.less, + "gt": pyarrow.compute.greater, + "le": pyarrow.compute.less_equal, + "ge": pyarrow.compute.greater_equal, +} @pd.api.extensions.register_extension_dtype class JSONDtype(pd.api.extensions.ExtensionDtype): @@ -130,7 +139,7 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False): result = [] for scalar in scalars: result.append(JSONArray._serialize_json(scalar)) - return cls(pa.array(result, type=pa.large_string(), from_pandas=True)) + return cls(pa.array(result, type=pa.string(), from_pandas=True)) @classmethod def _from_sequence_of_strings( @@ -143,7 +152,7 @@ def _from_sequence_of_strings( def _concat_same_type(cls, to_concat) -> JSONArray: """Concatenate multiple JSONArray.""" chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()] - arr = pa.chunked_array(chunks, type=pa.large_string()) + arr = pa.chunked_array(chunks, type=pa.string()) return cls(arr) @classmethod @@ -154,7 +163,7 @@ def _from_factorized(cls, values, original): @staticmethod def _serialize_json(value): """A static method that converts a JSON value into a string representation.""" - if isinstance(value, str) or pd.isna(value): + if pd.isna(value): return value else: # `sort_keys=True` sorts dictionary keys before serialization, making @@ -174,17 +183,10 @@ def dtype(self) -> JSONDtype: """An instance of JSONDtype""" return self._dtype - def __contains__(self, key) -> bool: - """Return for `item in self`.""" - return super().__contains__(JSONArray._serialize_json(key)) - - def insert(self, loc: int, item) -> JSONArray: - """ - Make new ExtensionArray inserting new item at location. Follows Python - list.append semantics for negative values. 
-        """
-        val = JSONArray._serialize_json(item)
-        return super().insert(loc, val)
+    def _cmp_method(self, other, op):
+        pc_func = ARROW_CMP_FUNCS[op.__name__]
+        result = pc_func(self._pa_array, self._box_pa(other))
+        return arrays.ArrowExtensionArray(result)
 
     def __getitem__(self, item):
         """Select a subset of self."""
@@ -244,3 +246,48 @@ def __iter__(self):
                 yield self._dtype.na_value
             else:
                 yield val
+
+    def _reduce(
+        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
+    ):
+        """Return a scalar result of performing the reduction operation."""
+        if name in ["min", "max"]:
+            raise TypeError("JSONArray does not support min/max reduction.")
+        return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
+
+    def __array__(
+        self, dtype = None, copy = None
+    ) -> np.ndarray:
+        """Correctly construct numpy arrays when passed to `np.asarray()`."""
+        return self.to_numpy(dtype=dtype)
+
+    def to_numpy(self, dtype = None, copy = False, na_value = pd.NA) -> np.ndarray:
+        dtype, na_value = self._to_numpy_dtype_inference(dtype, na_value, self._hasna)
+        pa_type = self._pa_array.type
+        if not self._hasna or pd.isna(na_value) or pa.types.is_null(pa_type):
+            data = self
+        else:
+            data = self.fillna(na_value)
+        result = np.array(list(data), dtype=dtype)
+
+        if data._hasna:
+            result[data.isna()] = na_value
+        return result
+
+    def _to_numpy_dtype_inference(
+        self, dtype, na_value, hasna
+    ):
+        if dtype is not None:
+            dtype = np.dtype(dtype)
+
+        if dtype is None or not hasna:
+            na_value = self.dtype.na_value
+        elif dtype.kind == "f":  # type: ignore[union-attr]
+            na_value = np.nan
+        elif dtype.kind == "M":  # type: ignore[union-attr]
+            na_value = np.datetime64("nat")
+        elif dtype.kind == "m":  # type: ignore[union-attr]
+            na_value = np.timedelta64("nat")
+        else:
+            na_value = self.dtype.na_value
+        return dtype, na_value
\ No newline at end of file
diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py
index 02185fc..af7e543 100644
--- a/tests/compliance/json/test_json_compliance.py
+++ b/tests/compliance/json/test_json_compliance.py
@@ -21,6 +21,7 @@
 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
 from pandas.tests.extension import base
 import pytest
+import db_dtypes
 
 
 class TestJSONArray(base.ExtensionTests):
@@ -111,7 +112,7 @@ def test_compare_scalar(self, data, comparison_op, request):
         super().test_compare_scalar(data, comparison_op)
 
     def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
-        return op_name in ["min", "max"]
+        return False
 
     def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
         dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj))
@@ -125,43 +126,6 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
     def test_searchsorted(self, data_for_sorting, as_series):
         super().test_searchsorted(self, data_for_sorting, as_series)
 
-    def test_astype_str(self, data):
-        # Use `json.dumps(str)` instead of passing `str(obj)` directly to the super method.
-        result = pd.Series(data[:5]).astype(str)
-        expected = pd.Series(
-            [json.dumps(x, sort_keys=True) for x in data[:5]], dtype=str
-        )
-        tm.assert_series_equal(result, expected)
-
-    @pytest.mark.parametrize(
-        "nullable_string_dtype",
-        [
-            "string[python]",
-            "string[pyarrow]",
-        ],
-    )
-    def test_astype_string(self, data, nullable_string_dtype):
-        # Use `json.dumps(str)` instead of passing `str(obj)` directly to the super method. 
- result = pd.Series(data[:5]).astype(nullable_string_dtype) - expected = pd.Series( - [json.dumps(x, sort_keys=True) for x in data[:5]], - dtype=nullable_string_dtype, - ) - tm.assert_series_equal(result, expected) - - def test_array_interface(self, data): - result = np.array(data) - # Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method. - assert result[0] == json.dumps(data[0]) - - result = np.array(data, dtype=object) - # Use `json.dumps(x)` instead of passing `x` directly to the super method. - expected = np.array([json.dumps(x) for x in data], dtype=object) - if expected.ndim > 1: - # nested data, explicitly construct as 1D - expected = construct_1d_object_array_from_listlike(list(data)) - tm.assert_numpy_array_equal(result, expected) - @pytest.mark.xfail(reason="Setting a dict as a scalar") def test_fillna_series(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" @@ -212,7 +176,7 @@ def test_series_constructor_scalar_with_index(self, data, dtype): expected = pd.Series([scalar], index=["foo"], dtype=dtype) tm.assert_series_equal(result, expected) - # Patching `json.dumps` to base.BaseSetitemTests because pandas' internals has + # Patching `[....] * len()` to base.BaseSetitemTests because pandas' internals # has trouble setting sequences of values into scalar positions. @pytest.mark.parametrize( @@ -228,8 +192,8 @@ def test_setitem_integer_array(self, data, idx, box_in_series): arr = pd.Series(arr) expected = pd.Series(expected) - # Use json.dumps(arr[0]) instead of passing arr[0] directly to the super method. - arr[idx] = json.dumps(arr[0]) + # Use `[arr[0]] * len()` instead of passing `arr[0]` directly to the super method. + arr[idx] = [arr[0]] * len(arr[idx]) tm.assert_equal(arr, expected) @pytest.mark.parametrize("setter", ["loc", None]) @@ -243,22 +207,20 @@ def test_setitem_mask_broadcast(self, data, setter): else: # __setitem__ target = ser - # Use json.dumps(data[10]) instead of passing data[10] directly to the super method. - target[mask] = json.dumps(data[10]) + # Use `[data[10]] * len()` instead of passing `data[10]` directly to the super method. + target[mask] = [data[10]] * len(target[mask]) assert ser[0] == data[10] assert ser[1] == data[10] def test_setitem_loc_scalar_mixed(self, data): df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.loc[0, "B"] = json.dumps(data[1]) + # Use `[data[1]]` instead of passing `data[1]` directly to the super method. + df.loc[0, "B"] = [data[1]] assert df.loc[0, "B"] == data[1] + @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") def test_setitem_loc_scalar_single(self, data): - df = pd.DataFrame({"B": data}) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.loc[10, "B"] = json.dumps(data[1]) - assert df.loc[10, "B"] == data[1] + super().test_setitem_loc_scalar_single(data) def test_setitem_loc_iloc_slice(self, data): arr = data[:5].copy() @@ -266,37 +228,33 @@ def test_setitem_loc_iloc_slice(self, data): expected = pd.Series(data.take([0, 0, 0, 3, 4]), index=s.index) result = s.copy() - # Use json.dumps(data[0]) instead of passing data[0] directly to the super method. - result.iloc[:3] = json.dumps(data[0]) + # Use `[data[0]] * len()` instead of passing `data[0]` directly to the super method. 
+ result.iloc[:3] = [data[0]] * len(result.iloc[:3]) tm.assert_equal(result, expected) result = s.copy() - result.loc[:"c"] = json.dumps(data[0]) + result.loc[:"c"] = [data[0]] * len(result.loc[:"c"]) tm.assert_equal(result, expected) + @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") def test_setitem_iloc_scalar_single(self, data): - df = pd.DataFrame({"B": data}) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.iloc[10, 0] = json.dumps(data[1]) - assert df.loc[10, "B"] == data[1] + super().test_setitem_iloc_scalar_single(data) def test_setitem_iloc_scalar_mixed(self, data): df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.iloc[0, 1] = json.dumps(data[1]) + # Use `[data[1]] * len()` instead of passing `data[1]` directly to the super method. + df.iloc[0, 1] = [data[1]] * len(df.iloc[0, 1]) assert df.loc[0, "B"] == data[1] - @pytest.mark.xfail(reaons="eq not implemented for ") + @pytest.mark.xfail(reason="eq not implemented for ") def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): super().test_setitem_mask_boolean_array_with_na(data, box_in_series) @pytest.mark.parametrize("setter", ["loc", "iloc"]) + + @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") def test_setitem_scalar(self, data, setter): - arr = pd.Series(data) - setter = getattr(arr, setter) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - setter[0] = json.dumps(data[1]) - assert arr[0] == data[1] + super().test_setitem_scalar(data, setter) @pytest.mark.parametrize( "mask", @@ -313,35 +271,24 @@ def test_setitem_mask(self, data, mask, box_in_series): if box_in_series: arr = pd.Series(arr) expected = pd.Series(expected) - # Use json.dumps(data[0]) instead of passing data[0] directly to the super method. - arr[mask] = json.dumps(data[0]) + # Use `[data[0]] * len()` instead of passing `data[0]` directly to the super method. + arr[mask] = [data[0]] * len(arr[mask]) tm.assert_equal(expected, arr) + @pytest.mark.xfail(reasons="Setting a `dict` to an expansion row is not supported") def test_setitem_with_expansion_row(self, data, na_value): - df = pd.DataFrame({"data": data[:1]}) - - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.loc[1, "data"] = json.dumps(data[1]) - expected = pd.DataFrame({"data": data[:2]}) - tm.assert_frame_equal(df, expected) - - # https://github.com/pandas-dev/pandas/issues/47284 - df.loc[2, "data"] = na_value - expected = pd.DataFrame( - {"data": pd.Series([data[0], data[1], na_value], dtype=data.dtype)} - ) - tm.assert_frame_equal(df, expected) + super().test_setitem_with_expansion_row(data, na_value) def test_setitem_iloc_scalar_multiple_homogoneous(self, data): df = pd.DataFrame({"A": data, "B": data}) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.iloc[10, 1] = json.dumps(data[1]) + # Use `[data[1]]` instead of passing `data[1]` directly to the super method. + df.iloc[10, 1] = [data[1]] assert df.loc[10, "B"] == data[1] def test_setitem_loc_scalar_multiple_homogoneous(self, data): df = pd.DataFrame({"A": data, "B": data}) - # Use json.dumps(data[1]) instead of passing data[1] directly to the super method. - df.loc[10, "B"] = json.dumps(data[1]) + # Use `[data[1]]` instead of passing `data[1]` directly to the super method. 
+        df.loc[10, "B"] = [data[1]]
         assert df.loc[10, "B"] == data[1]
 
     def test_setitem_slice(self, data, box_in_series):
@@ -351,8 +298,8 @@ def test_setitem_slice(self, data, box_in_series):
         arr = pd.Series(arr)
         expected = pd.Series(expected)
 
-        # Use json.dumps(data[0]) instead of passing data[0] directly to the super method.
-        arr[:3] = json.dumps(data[0])
+        # Use `[data[0]] * 3` instead of passing `data[0]` directly to the super method.
+        arr[:3] = [data[0]] * 3
         tm.assert_equal(arr, expected)
 
     @pytest.mark.xfail(reason="only integer scalar arrays can be converted")

From 7422f7aab80b7a6ed205f94e18a02228eec7e8a9 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Tue, 30 Jul 2024 23:36:49 +0000
Subject: [PATCH 12/28] support array type

---
 db_dtypes/json.py                             | 41 +-----------
 tests/compliance/json/conftest.py             | 38 ++++++------
 tests/compliance/json/test_json_compliance.py | 62 ++++++++++++++++++-
 3 files changed, 83 insertions(+), 58 deletions(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index e19a357..72d5fa9 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -34,6 +34,7 @@
     "ge": pyarrow.compute.greater_equal,
 }
 
+
 @pd.api.extensions.register_extension_dtype
 class JSONDtype(pd.api.extensions.ExtensionDtype):
     """Extension dtype for BigQuery JSON data."""
@@ -90,6 +91,7 @@ def _box_pa(
         cls, value, pa_type: pa.DataType | None = None
     ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
         """Box value into a pyarrow Array, ChunkedArray or Scalar."""
+
         if isinstance(value, pa.Scalar) or not (
             common.is_list_like(value) and not common.is_dict_like(value)
         ):
@@ -163,7 +165,7 @@ def _from_factorized(cls, values, original):
     @staticmethod
     def _serialize_json(value):
         """A static method that converts a JSON value into a string representation."""
-        if pd.isna(value):
+        if not common.is_list_like(value) and pd.isna(value):
             return value
         else:
             # `sort_keys=True` sorts dictionary keys before serialization, making
@@ -254,40 +256,3 @@ def _reduce(
         self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
     ):
         """Return a scalar result of performing the reduction operation."""
         if name in ["min", "max"]:
             raise TypeError("JSONArray does not support min/max reduction.")
         return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
-
-    def __array__(
-        self, dtype = None, copy = None
-    ) -> np.ndarray:
-        """Correctly construct numpy arrays when passed to `np.asarray()`."""
-        return self.to_numpy(dtype=dtype)
-
-    def to_numpy(self, dtype = None, copy = False, na_value = pd.NA) -> np.ndarray:
-        dtype, na_value = self._to_numpy_dtype_inference(dtype, na_value, self._hasna)
-        pa_type = self._pa_array.type
-        if not self._hasna or pd.isna(na_value) or pa.types.is_null(pa_type):
-            data = self
-        else:
-            data = self.fillna(na_value)
-        result = np.array(list(data), dtype=dtype)
-
-        if data._hasna:
-            result[data.isna()] = na_value
-        return result
-
-    def _to_numpy_dtype_inference(
-        self, dtype, na_value, hasna
-    ):
-        if dtype is not None:
-            dtype = np.dtype(dtype)
-
-        if dtype is None or not hasna:
-            na_value = self.dtype.na_value
-        elif dtype.kind == "f":  # type: ignore[union-attr]
-            na_value = np.nan
-        elif dtype.kind == "M":  # type: ignore[union-attr]
-            na_value = np.datetime64("nat")
-        elif dtype.kind == "m":  # type: ignore[union-attr]
-            na_value = np.timedelta64("nat")
-        else:
-            na_value = self.dtype.na_value
-        return dtype, na_value
\ No newline at end of file
diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py
index f323f65..6e98650 100644
--- a/tests/compliance/json/conftest.py
+++ b/tests/compliance/json/conftest.py
@@ -14,6 +14,7 @@
 
 
 import json
+import random
 
 import numpy as np
 import pandas as pd
@@ -24,18 
+25,29 @@ def make_data():
-    # Sample data with varied lengths.
+    # Since the `np.array` constructor needs a consistent shape after the first
+    # dimension, the sample data in this instance doesn't include the array type.
     samples = [
-        {"id": 1, "bool_value": True},  # Boolean
-        {"id": 2, "float_num": 3.14159},  # Floating
-        {"id": 3, "date": "2024-07-16"},  # Dates (as strings)
-        {"id": 4, "null_field": None},  # Null
-        {"list_data": [10, 20, 30]},  # Lists
-        {"person": {"name": "Alice", "age": 35}},  # Nested objects
+        True,  # Boolean
+        100,  # Int
+        0.98,  # Float
+        "str",  # String
+        {"bool_value": True},  # Dict with a boolean
+        {"float_num": 3.14159},  # Dict with a float
+        {"date": "2024-07-16"},  # Dict with a date (as strings)
+        {"null_field": None},  # Dict with a null
+        {"list_data": [10, 20, 30]},  # Dict with a list
+        {"person": {"name": "Alice", "age": 35}},  # Dict with nested objects
         {"address": {"street": "123 Main St", "city": "Anytown"}},
         {"order": {"items": ["book", "pen"], "total": 15.99}},
     ]
-    return np.random.default_rng(2).choice(samples, size=100)
+    data = np.random.default_rng(2).choice(samples, size=100)
+    # This replaces a single data item with an array. We skip the first two
+    # items to avoid failures in some `setitem` tests, where setting with a
+    # list is ambiguous in this context.
+    id = random.randint(3, 99)
+    data[id] = [0.1, 0.2]  # Array
+    return data
 
 
 @pytest.fixture
@@ -48,16 +60,6 @@ def data():
     """Length-100 PeriodArray for semantics test."""
     data = make_data()
 
-    # Why the while loop? NumPy is unable to construct an ndarray from
-    # equal-length ndarrays. Many of our operations involve coercing the
-    # EA to an ndarray of objects. To avoid random test failures, we ensure
-    # that our data is coercible to an ndarray. Several tests deal with only
-    # the first two elements, so that's what we'll check.
-
-    while len(data[0]) == len(data[1]):
-        print(data)
-        data = make_data()
-
     return JSONArray._from_sequence(data)
 
 
diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py
index af7e543..69425c8 100644
--- a/tests/compliance/json/test_json_compliance.py
+++ b/tests/compliance/json/test_json_compliance.py
@@ -21,7 +21,6 @@
 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
 from pandas.tests.extension import base
 import pytest
-import db_dtypes
 
 
 class TestJSONArray(base.ExtensionTests):
@@ -126,6 +125,43 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
     def test_searchsorted(self, data_for_sorting, as_series):
         super().test_searchsorted(self, data_for_sorting, as_series)
 
+    def test_astype_str(self, data):
+        # Use `json.dumps(str)` instead of passing `str(obj)` directly to the super method. 
+ result = pd.Series(data[:5]).astype(nullable_string_dtype) + expected = pd.Series( + [json.dumps(x, sort_keys=True) for x in data[:5]], + dtype=nullable_string_dtype, + ) + tm.assert_series_equal(result, expected) + + def test_array_interface(self, data): + result = np.array(data) + # Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method. + assert result[0] == json.dumps(data[0]) + + result = np.array(data, dtype=object) + # Use `json.dumps(x)` instead of passing `x` directly to the super method. + expected = np.array([json.dumps(x) for x in data], dtype=object) + if expected.ndim > 1: + # nested data, explicitly construct as 1D + expected = construct_1d_object_array_from_listlike(list(data)) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.xfail(reason="Setting a dict as a scalar") def test_fillna_series(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" @@ -251,7 +287,6 @@ def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): super().test_setitem_mask_boolean_array_with_na(data, box_in_series) @pytest.mark.parametrize("setter", ["loc", "iloc"]) - @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") def test_setitem_scalar(self, data, setter): super().test_setitem_scalar(data, setter) @@ -310,3 +345,26 @@ def test_setitem_2d_values(self, data): @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data, request): super().test_EA_types(engine, data, request) + + @pytest.mark.xfail( + reason="`to_numpy` returns serialized JSON, " + + "while `__getitem__` returns JSON objects." + ) + def test_setitem_frame_2d_values(self, data): + super().test_setitem_frame_2d_values(data) + + @pytest.mark.xfail( + reason="`to_numpy` returns serialized JSON, " + + "while `__getitem__` returns JSON objects." + ) + def test_transpose_frame(self, data): + # `DataFrame.T` calls `to_numpy` to get results. + super().test_transpose_frame(data) + + @pytest.mark.xfail( + reason="`to_numpy` returns serialized JSON, " + + "while `__getitem__` returns JSON objects." + ) + def test_where_series(self, data, na_value, as_frame): + # `Series.where` calls `to_numpy` to get results. + super().test_where_series(data, na_value, as_frame) From 22a099b9fb34f1af6d33ef36b857156c7a312e4f Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Fri, 2 Aug 2024 20:40:09 +0000 Subject: [PATCH 13/28] only import when pandas version is higher than 1.5.0 --- db_dtypes/__init__.py | 10 +++++++-- db_dtypes/json.py | 9 ++++++-- tests/compliance/json/test_json_compliance.py | 21 +++++++++++++------ 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 076270f..4cb45c5 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -28,7 +28,6 @@ import pyarrow.compute from db_dtypes import core -from db_dtypes.json import JSONArray, JSONDtype from db_dtypes.version import __version__ date_dtype_name = "dbdate" @@ -44,7 +43,14 @@ # nanosecond precision when boxing scalars. _NP_BOX_DTYPE = "datetime64[us]" -pandas_release = packaging.version.parse(pandas.__version__).release + +# To use JSONArray and JSONDtype, you'll need Pandas 1.5.0 or later. With the removal +# of Python 3.7 compatibility, the minimum Pandas version will be updated to 1.5.0. 
+if packaging.version.Version(pandas.__version__) >= packaging.version.Version("1.5.0"):
+    from db_dtypes.json import JSONArray, JSONDtype
+else:
+    JSONArray = None
+    JSONDtype = None
 
 
 @pandas.api.extensions.register_extension_dtype
diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 72d5fa9..0cf88d6 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -48,8 +48,13 @@ def na_value(self) -> pd.NA:
 
     @property
     def type(self) -> type[str]:
-        """Return the scalar type for the array, e.g. int."""
-        return dict
+        """
+        Return the scalar type for the array elements.
+        The standard JSON data types can be one of `dict`, `list`, `str`, `int`, `float`,
+        `bool` and `None`. However, this method returns a `str` type to indicate its
+        storage type, because a union of multiple types is not well supported in pandas.
+        """
+        return str
 
     @property
     def _is_numeric(self) -> bool:
diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py
index 69425c8..faa2a20 100644
--- a/tests/compliance/json/test_json_compliance.py
+++ b/tests/compliance/json/test_json_compliance.py
@@ -18,12 +18,11 @@
 import numpy as np
 import pandas as pd
 import pandas._testing as tm
-from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
-from pandas.tests.extension import base
+import pandas.tests.extension.base
 import pytest
 
 
-class TestJSONArray(base.ExtensionTests):
+class TestJSONArray(pandas.tests.extension.base.ExtensionTests):
@@ -157,9 +156,9 @@ def test_array_interface(self, data):
 
     result = np.array(data, dtype=object)
     # Use `json.dumps(x)` instead of passing `x` directly to the super method.
     expected = np.array([json.dumps(x) for x in data], dtype=object)
-    if expected.ndim > 1:
-        # nested data, explicitly construct as 1D
-        expected = construct_1d_object_array_from_listlike(list(data))
+    # if expected.ndim > 1:
+    #     # nested data, explicitly construct as 1D
+    #     expected = construct_1d_object_array_from_listlike(list(data))
     tm.assert_numpy_array_equal(result, expected)
 
 @pytest.mark.xfail(reason="Setting a dict as a scalar")
@@ -212,6 +211,16 @@ def test_series_constructor_scalar_with_index(self, data, dtype):
     expected = pd.Series([scalar], index=["foo"], dtype=dtype)
     tm.assert_series_equal(result, expected)
 
+    @pytest.mark.xfail(reason="Unhashable")
+    def test_getitem_scalar(self, data):
+        """
+        `_getitem_` can return any JSON-types objects while `data.dtype.type` returns
+        a string to indicate its storage type.
+        > assert isinstance(result, data.dtype.type)
+        E AssertionError
+        """
+        super().test_getitem_scalar()
+
     # Patching `[....] * len()` to base.BaseSetitemTests because pandas' internals
     # has trouble setting sequences of values into scalar positions. 
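For orientation between patches: the guarded import above makes JSONDtype and
JSONArray available only on pandas 1.5.0 and newer, binding both names to None
otherwise. A minimal usage sketch of the behavior established so far follows
(illustrative only, not part of any patch in this series; it assumes an
environment with pandas >= 1.5.0 so the import succeeds):

    import pandas as pd

    import db_dtypes

    if db_dtypes.JSONDtype is not None:  # bound to None on pandas < 1.5.0
        data = db_dtypes.JSONArray._from_sequence([{"a": 1}, "str", None])
        s = pd.Series(data)
        assert s.dtype.name == "dbjson"  # the registered extension dtype name
        assert s[0] == {"a": 1}          # scalar access returns Python objects
        assert pd.isna(s[2])             # missing values surface as pandas NA
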
From 77339a0f91c13fcba3e8de8766a3e378220cc259 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Sat, 3 Aug 2024 05:44:56 +0000 Subject: [PATCH 14/28] exclude groupby and other tests --- tests/compliance/json/conftest.py | 24 -- tests/compliance/json/test_json_compliance.py | 406 ++++++++---------- 2 files changed, 186 insertions(+), 244 deletions(-) diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py index 6e98650..20fe2f6 100644 --- a/tests/compliance/json/conftest.py +++ b/tests/compliance/json/conftest.py @@ -108,22 +108,6 @@ def cmp(a, b): return cmp -@pytest.fixture -def data_for_grouping(): - return JSONArray._from_sequence( - [ - json.dumps({"b": 1}), - json.dumps({"b": 1}), - None, - None, - json.dumps({"a": 0, "c": 2}), - json.dumps({"a": 0, "c": 2}), - json.dumps({"b": 1}), - json.dumps({"c": 2}), - ] - ) - - @pytest.fixture def data_repeated(data): """ @@ -193,14 +177,6 @@ def all_numeric_reductions(request): return request.param -@pytest.fixture(params=tm.arithmetic_dunder_methods) -def all_arithmetic_operators(request): - """ - Fixture for dunder names for common arithmetic operations. - """ - return request.param - - @pytest.fixture(params=["data", "data_missing"]) def all_data(request, data, data_missing): """Parametrized fixture returning 'data' or 'data_missing' integer arrays. diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index faa2a20..443dc60 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -18,112 +18,15 @@ import numpy as np import pandas as pd import pandas._testing as tm -import pandas.tests.extension.base +import pandas.tests.extension.base as base import pytest -class TestJSONArray(pandas.tests.extension.base.ExtensionTests): - @pytest.mark.xfail(reason="Unhashable") - def test_value_counts_with_normalize(self, data): - super().test_value_counts_with_normalize(data) - - @pytest.mark.xfail(reason="Unhashable") - def test_groupby_extension_transform(self): - """ - This currently fails in Series.name.setter, since the - name must be hashable, but the value is a dictionary. - I think this is what we want, i.e. `.name` should be the original - values, and not the values for factorization. - """ - super().test_groupby_extension_transform() - - @pytest.mark.xfail(reason="Unhashable") - def test_groupby_extension_apply(self): - """ - This fails in Index._do_unique_check with - > hash(val) - E TypeError: unhashable type: 'dict' with - I suspect that once we support Index[ExtensionArray], - we'll be able to dispatch unique. 
- """ - super().test_groupby_extension_apply() - - @pytest.mark.xfail(reason="Unhashable") - def test_sort_values_frame(self): - super().test_sort_values_frame() - - @pytest.mark.xfail(reason="combine for JSONArray not supported") - def test_combine_le(self, data_repeated): - super().test_combine_le(data_repeated) - - @pytest.mark.xfail( - reason="combine for JSONArray not supported - " - "may pass depending on random data", - strict=False, - raises=AssertionError, - ) - def test_combine_first(self, data): - super().test_combine_first(data) - - @pytest.mark.skip(reason="2D support not implemented for JSONArray") - def test_view(self, data): - super().test_view(data) - - @pytest.mark.skip(reason="2D support not implemented for JSONArray") - def test_setitem_preserves_views(self, data): - super().test_setitem_preserves_views(data) - - @pytest.mark.skip(reason="2D support not implemented for JSONArray") - def test_transpose(self, data): - super().test_transpose(data) - - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") - def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): - super().test_arith_frame_with_scalar(data, all_arithmetic_operators) - - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") - def test_arith_series_with_scalar(self, data, all_arithmetic_operators): - super().test_arith_series_with_scalar(data, all_arithmetic_operators) - - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") - def test_arith_series_with_array(self, data, all_arithmetic_operators): - super().test_arith_series_with_array(data, all_arithmetic_operators) - - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") - def test_add_series_with_extension_array(self, data): - super().test_add_series_with_extension_array(data, data) - - @pytest.mark.xfail(reason="Arithmetic functions is not supported for json") - def test_divmod(self, data): - super().test_divmod(data, data) +class TestJSONArrayAccumulate(base.BaseAccumulateTests): + pass - def test_compare_array(self, data, comparison_op, request): - if comparison_op.__name__ not in ["eq", "ne"]: - mark = pytest.mark.xfail(reason="Comparison methods not implemented") - request.applymarker(mark) - super().test_compare_array(data, comparison_op) - - def test_compare_scalar(self, data, comparison_op, request): - if comparison_op.__name__ not in ["eq", "ne"]: - mark = pytest.mark.xfail(reason="Comparison methods not implemented") - request.applymarker(mark) - super().test_compare_scalar(data, comparison_op) - - def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: - return False - - def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): - dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj)) - if op_name in ["__add__", "__radd__"]: - cast_to = dtype - else: - cast_to = "boolean[pyarrow]" # type: ignore[assignment] - return pointwise_result.astype(cast_to) - - @pytest.mark.skip(reason="'<' not supported between instances of 'dict' and 'dict'") - def test_searchsorted(self, data_for_sorting, as_series): - super().test_searchsorted(self, data_for_sorting, as_series) +class TestJSONArrayCasting(base.BaseCastingTests): def test_astype_str(self, data): # Use `json.dumps(str)` instead of passing `str(obj)` directly to the super method. 
result = pd.Series(data[:5]).astype(str) @@ -148,33 +51,8 @@ def test_astype_string(self, data, nullable_string_dtype): ) tm.assert_series_equal(result, expected) - def test_array_interface(self, data): - result = np.array(data) - # Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method. - assert result[0] == json.dumps(data[0]) - - result = np.array(data, dtype=object) - # Use `json.dumps(x)` instead of passing `x` directly to the super method. - expected = np.array([json.dumps(x) for x in data], dtype=object) - # if expected.ndim > 1: - # # nested data, explicitly construct as 1D - # expected = construct_1d_object_array_from_listlike(list(data)) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.xfail(reason="Setting a dict as a scalar") - def test_fillna_series(self): - """We treat dictionaries as a mapping in fillna, not a scalar.""" - super().test_fillna_series() - - @pytest.mark.xfail(reason="Setting a dict as a scalar") - def test_fillna_frame(self): - """We treat dictionaries as a mapping in fillna, not a scalar.""" - super().test_fillna_frame() - - @pytest.mark.skip("fill-value is interpreted as a dict of values") - def test_fillna_copy_frame(self, data_missing): - super().test_fillna_copy_frame(data_missing) +class TestJSONArrayConstructors(base.BaseConstructorsTests): def test_from_dtype(self, data): # construct from our dtype & string dtype dtype = data.dtype @@ -211,7 +89,18 @@ def test_series_constructor_scalar_with_index(self, data, dtype): expected = pd.Series([scalar], index=["foo"], dtype=dtype) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(reason="Unhashable") + +@pytest.mark.skip(reason="BigQuery does not allow group by a JSON-type column.") +class TestJSONArrayGroupby(base.BaseGroupbyTests): + pass + + +class TestJSONArrayDtype(base.BaseDtypeTests): + pass + + +class TestJSONArrayGetitem(base.BaseGetitemTests): + @pytest.mark.xfail(reason="JSONDtype's type returns its storage type.") def test_getitem_scalar(self, data): """ `_getitem_` can return any JSON-types objects while `data.dtype.type` returns @@ -219,8 +108,139 @@ def test_getitem_scalar(self, data): > assert isinstance(result, data.dtype.type) E AssertionError """ - super().test_getitem_scalar() + super().test_getitem_scalar(data) + + +class TestJSONArrayIndex(base.BaseIndexTests): + pass + + +class TestJSONArrayInterface(base.BaseInterfaceTests): + def test_array_interface(self, data): + result = np.array(data) + # Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method. + assert result[0] == json.dumps(data[0]) + + result = np.array(data, dtype=object) + # Use `json.dumps(x)` instead of passing `x` directly to the super method. 
+ expected = np.array([json.dumps(x) for x in data], dtype=object) + # if expected.ndim > 1: + # # nested data, explicitly construct as 1D + # expected = construct_1d_object_array_from_listlike(list(data)) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.skip(reason="2D support not implemented for JSONArray") + def test_view(self, data): + super().test_view(data) + + +class TestJSONArrayParsing(base.BaseParsingTests): + @pytest.mark.xfail(reason="data type 'json' not understood") + @pytest.mark.parametrize("engine", ["c", "python"]) + def test_EA_types(self, engine, data, request): + super().test_EA_types(engine, data, request) + + +class TestJSONArrayMethods(base.BaseMethodsTests): + @pytest.mark.xfail(reason="Unhashable") + def test_value_counts_with_normalize(self, data): + super().test_value_counts_with_normalize(data) + + @pytest.mark.skip("fill-value is interpreted as a dict of values") + def test_fillna_copy_frame(self, data_missing): + super().test_fillna_copy_frame(data_missing) + + @pytest.mark.xfail(reason="combine for JSONArray not supported") + def test_combine_le(self, data_repeated): + super().test_combine_le(data_repeated) + + @pytest.mark.skip(reason="'<' not supported between instances of 'dict' and 'dict'") + def test_searchsorted(self, data_for_sorting, as_series): + super().test_searchsorted(self, data_for_sorting, as_series) + + @pytest.mark.xfail( + reason="`to_numpy` returns serialized JSON, " + + "while `__getitem__` returns JSON objects." + ) + def test_where_series(self, data, na_value, as_frame): + # `Series.where` calls `to_numpy` to get results. + super().test_where_series(data, na_value, as_frame) + + @pytest.mark.skip(reason="BigQuery does not allow group by a JSON-type column.") + def test_factorize(self, data_for_grouping): + super().test_factorize(data_for_grouping) + + @pytest.mark.skip(reason="BigQuery does not allow group by a JSON-type column.") + def test_factorize_equivalence(self, data_for_grouping): + super().test_factorize_equivalence(data_for_grouping) + +class TestJSONArrayMissing(base.BaseMissingTests): + @pytest.mark.xfail(reason="Setting a dict as a scalar") + def test_fillna_series(self): + """We treat dictionaries as a mapping in fillna, not a scalar.""" + super().test_fillna_series() + + @pytest.mark.xfail(reason="Setting a dict as a scalar") + def test_fillna_frame(self): + """We treat dictionaries as a mapping in fillna, not a scalar.""" + super().test_fillna_frame() + + +@pytest.mark.skip(reason="BigQuery JSON does not allow Arithmetic Ops.") +class TestJSONArrayArithmeticOps(base.BaseArithmeticOpsTests): + pass + + +class TestJSONArrayComparisonOps(base.BaseComparisonOpsTests): + def test_compare_array(self, data, comparison_op, request): + if comparison_op.__name__ not in ["eq", "ne"]: + mark = pytest.mark.xfail(reason="Comparison methods not implemented") + request.applymarker(mark) + super().test_compare_array(data, comparison_op) + + def test_compare_scalar(self, data, comparison_op, request): + if comparison_op.__name__ not in ["eq", "ne"]: + mark = pytest.mark.xfail(reason="Comparison methods not implemented") + request.applymarker(mark) + super().test_compare_scalar(data, comparison_op) + + def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result): + dtype = typing.cast(pd.StringDtype, tm.get_dtype(obj)) + if op_name in ["__add__", "__radd__"]: + cast_to = dtype + else: + cast_to = "boolean[pyarrow]" # type: ignore[assignment] + return pointwise_result.astype(cast_to) + + +class 
TestJSONArrayUnaryOps(base.BaseUnaryOpsTests): + pass + + +class TestJSONArrayPrinting(base.BasePrintingTests): + pass + + +class TestJSONArrayReduce(base.BaseReduceTests): + pass + + +class TestJSONArrayReshaping(base.BaseReshapingTests): + @pytest.mark.skip(reason="2D support not implemented for JSONArray") + def test_transpose(self, data): + super().test_transpose(data) + + @pytest.mark.xfail( + reason="`to_numpy` returns serialized JSON, " + + "while `__getitem__` returns JSON objects." + ) + def test_transpose_frame(self, data): + # `DataFrame.T` calls `to_numpy` to get results. + super().test_transpose_frame(data) + + +class TestJSONArraySetitem(base.BaseSetitemTests): # Patching `[....] * len()` to base.BaseSetitemTests because pandas' internals # has trouble setting sequences of values into scalar positions. @@ -241,65 +261,6 @@ def test_setitem_integer_array(self, data, idx, box_in_series): arr[idx] = [arr[0]] * len(arr[idx]) tm.assert_equal(arr, expected) - @pytest.mark.parametrize("setter", ["loc", None]) - def test_setitem_mask_broadcast(self, data, setter): - ser = pd.Series(data) - mask = np.zeros(len(data), dtype=bool) - mask[:2] = True - - if setter: # loc - target = getattr(ser, setter) - else: # __setitem__ - target = ser - - # Use `[data[10]] * len()` instead of passing `data[10]` directly to the super method. - target[mask] = [data[10]] * len(target[mask]) - assert ser[0] == data[10] - assert ser[1] == data[10] - - def test_setitem_loc_scalar_mixed(self, data): - df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - # Use `[data[1]]` instead of passing `data[1]` directly to the super method. - df.loc[0, "B"] = [data[1]] - assert df.loc[0, "B"] == data[1] - - @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") - def test_setitem_loc_scalar_single(self, data): - super().test_setitem_loc_scalar_single(data) - - def test_setitem_loc_iloc_slice(self, data): - arr = data[:5].copy() - s = pd.Series(arr, index=["a", "b", "c", "d", "e"]) - expected = pd.Series(data.take([0, 0, 0, 3, 4]), index=s.index) - - result = s.copy() - # Use `[data[0]] * len()` instead of passing `data[0]` directly to the super method. - result.iloc[:3] = [data[0]] * len(result.iloc[:3]) - tm.assert_equal(result, expected) - - result = s.copy() - result.loc[:"c"] = [data[0]] * len(result.loc[:"c"]) - tm.assert_equal(result, expected) - - @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") - def test_setitem_iloc_scalar_single(self, data): - super().test_setitem_iloc_scalar_single(data) - - def test_setitem_iloc_scalar_mixed(self, data): - df = pd.DataFrame({"A": np.arange(len(data)), "B": data}) - # Use `[data[1]] * len()` instead of passing `data[1]` directly to the super method. 
- df.iloc[0, 1] = [data[1]] * len(df.iloc[0, 1]) - assert df.loc[0, "B"] == data[1] - - @pytest.mark.xfail(reason="eq not implemented for ") - def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): - super().test_setitem_mask_boolean_array_with_na(data, box_in_series) - - @pytest.mark.parametrize("setter", ["loc", "iloc"]) - @pytest.mark.xfail(reason="TODO: open an issue for ArrowExtentionArray") - def test_setitem_scalar(self, data, setter): - super().test_setitem_scalar(data, setter) - @pytest.mark.parametrize( "mask", [ @@ -319,21 +280,19 @@ def test_setitem_mask(self, data, mask, box_in_series): arr[mask] = [data[0]] * len(arr[mask]) tm.assert_equal(expected, arr) - @pytest.mark.xfail(reasons="Setting a `dict` to an expansion row is not supported") - def test_setitem_with_expansion_row(self, data, na_value): - super().test_setitem_with_expansion_row(data, na_value) + def test_setitem_loc_iloc_slice(self, data): + arr = data[:5].copy() + s = pd.Series(arr, index=["a", "b", "c", "d", "e"]) + expected = pd.Series(data.take([0, 0, 0, 3, 4]), index=s.index) - def test_setitem_iloc_scalar_multiple_homogoneous(self, data): - df = pd.DataFrame({"A": data, "B": data}) - # Use `[data[1]]` instead of passing `data[1]` directly to the super method. - df.iloc[10, 1] = [data[1]] - assert df.loc[10, "B"] == data[1] + result = s.copy() + # Use `[data[0]] * len()` instead of passing `data[0]` directly to the super method. + result.iloc[:3] = [data[0]] * len(result.iloc[:3]) + tm.assert_equal(result, expected) - def test_setitem_loc_scalar_multiple_homogoneous(self, data): - df = pd.DataFrame({"A": data, "B": data}) - # Use `[data[1]]` instead of passing `data[1]` directly to the super method. - df.loc[10, "B"] = [data[1]] - assert df.loc[10, "B"] == data[1] + result = s.copy() + result.loc[:"c"] = [data[0]] * len(result.loc[:"c"]) + tm.assert_equal(result, expected) def test_setitem_slice(self, data, box_in_series): arr = data[:5].copy() @@ -350,11 +309,6 @@ def test_setitem_slice(self, data, box_in_series): def test_setitem_2d_values(self, data): super().test_setitem_2d_values(data) - @pytest.mark.xfail(reason="data type 'json' not understood") - @pytest.mark.parametrize("engine", ["c", "python"]) - def test_EA_types(self, engine, data, request): - super().test_EA_types(engine, data, request) - @pytest.mark.xfail( reason="`to_numpy` returns serialized JSON, " + "while `__getitem__` returns JSON objects." @@ -362,18 +316,30 @@ def test_EA_types(self, engine, data, request): def test_setitem_frame_2d_values(self, data): super().test_setitem_frame_2d_values(data) - @pytest.mark.xfail( - reason="`to_numpy` returns serialized JSON, " - + "while `__getitem__` returns JSON objects." - ) - def test_transpose_frame(self, data): - # `DataFrame.T` calls `to_numpy` to get results. - super().test_transpose_frame(data) + @pytest.mark.parametrize("setter", ["loc", None]) + def test_setitem_mask_broadcast(self, data, setter): + ser = pd.Series(data) + mask = np.zeros(len(data), dtype=bool) + mask[:2] = True - @pytest.mark.xfail( - reason="`to_numpy` returns serialized JSON, " - + "while `__getitem__` returns JSON objects." - ) - def test_where_series(self, data, na_value, as_frame): - # `Series.where` calls `to_numpy` to get results. - super().test_where_series(data, na_value, as_frame) + if setter: # loc + target = getattr(ser, setter) + else: # __setitem__ + target = ser + + # Use `[data[10]] * len()` instead of passing `data[10]` directly to the super method. 
+ target[mask] = [data[10]] * len(target[mask]) + assert ser[0] == data[10] + assert ser[1] == data[10] + + @pytest.mark.xfail(reason="eq not implemented for ") + def test_setitem_mask_boolean_array_with_na(self, data, box_in_series): + super().test_setitem_mask_boolean_array_with_na(data, box_in_series) + + @pytest.mark.skip(reason="2D support not implemented for JSONArray") + def test_setitem_preserves_views(self, data): + super().test_setitem_preserves_views(data) + + +class TestJSONArrayDim2Compat(base.Dim2CompatTests): + pass From 279882508ab2aee9f8c81746d2d03efb75d5328b Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Mon, 5 Aug 2024 17:59:49 +0000 Subject: [PATCH 15/28] others --- tests/compliance/json/conftest.py | 8 -------- tests/compliance/json/test_json_compliance.py | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/tests/compliance/json/conftest.py b/tests/compliance/json/conftest.py index 20fe2f6..74870c4 100644 --- a/tests/compliance/json/conftest.py +++ b/tests/compliance/json/conftest.py @@ -18,7 +18,6 @@ import numpy as np import pandas as pd -import pandas._testing as tm import pytest from db_dtypes import JSONArray, JSONDtype @@ -79,13 +78,6 @@ def data_missing(): return JSONArray._from_sequence([None, {"a": 10}]) -@pytest.fixture -def data_for_sorting(): - return JSONArray._from_sequence( - [json.dumps({"b": 1}), json.dumps({"c": 4}), json.dumps({"a": 2, "c": 3})] - ) - - @pytest.fixture def data_missing_for_sorting(): return JSONArray._from_sequence([json.dumps({"b": 1}), None, json.dumps({"a": 4})]) diff --git a/tests/compliance/json/test_json_compliance.py b/tests/compliance/json/test_json_compliance.py index 443dc60..18610a0 100644 --- a/tests/compliance/json/test_json_compliance.py +++ b/tests/compliance/json/test_json_compliance.py @@ -174,6 +174,22 @@ def test_factorize(self, data_for_grouping): def test_factorize_equivalence(self, data_for_grouping): super().test_factorize_equivalence(data_for_grouping) + @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.") + def test_argsort(self, data_for_sorting): + super().test_argsort(data_for_sorting) + + @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.") + def test_argmin_argmax(self, data_for_sorting): + super().test_argmin_argmax(data_for_sorting) + + @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.") + def test_sort_values(self, data_for_sorting): + super().test_sort_values(data_for_sorting) + + @pytest.mark.skip(reason="BigQuery does not allow sort by a JSON-type column.") + def test_sort_values_frame(self, data_for_sorting): + super().test_sort_values_frame(data_for_sorting) + class TestJSONArrayMissing(base.BaseMissingTests): @pytest.mark.xfail(reason="Setting a dict as a scalar") From efe72cceb5b346e8d74e4ab54e47d78a6205bdb1 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 6 Aug 2024 18:49:30 +0000 Subject: [PATCH 16/28] skip jsondtype and jsonarray --- db_dtypes/__init__.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index 4cb45c5..dd17fb1 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -343,13 +343,21 @@ def __sub__(self, other): return super().__sub__(other) - -__all__ = [ - "__version__", - "DateArray", - "DateDtype", - "JSONDtype", - "JSONArray", - "TimeArray", - "TimeDtype", -] +if not JSONArray or not JSONDtype: + __all__ = [ + "__version__", + "DateArray", + 
"DateDtype", + "TimeArray", + "TimeDtype", + ] +else: + __all__ = [ + "__version__", + "DateArray", + "DateDtype", + "JSONDtype", + "JSONArray", + "TimeArray", + "TimeDtype", + ] From 98adb5a1cbb96542f2d1be6ca56f2d80328e6df7 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 6 Aug 2024 18:50:59 +0000 Subject: [PATCH 17/28] fixing --- db_dtypes/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/db_dtypes/__init__.py b/db_dtypes/__init__.py index dd17fb1..d27e93e 100644 --- a/db_dtypes/__init__.py +++ b/db_dtypes/__init__.py @@ -343,6 +343,7 @@ def __sub__(self, other): return super().__sub__(other) + if not JSONArray or not JSONDtype: __all__ = [ "__version__", From 790f2577601865c2d922d462445ce3576ee5acc1 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 6 Aug 2024 18:57:21 +0000 Subject: [PATCH 18/28] fix coverage file name --- .github/workflows/unittest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml index 81ff447..0c2dca0 100644 --- a/.github/workflows/unittest.yml +++ b/.github/workflows/unittest.yml @@ -76,7 +76,7 @@ jobs: python -m pip install nox - name: Run compliance tests env: - COVERAGE_FILE: .coverage-${{ matrix.python }} + COVERAGE_FILE: .coverage-compliance-${{ matrix.python }} run: | nox -s compliance-${{ matrix.python }} - name: Upload coverage results From 8800b6bc11f4600e5ec8b6ba5864d151672d63ec Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 6 Aug 2024 23:01:11 +0000 Subject: [PATCH 19/28] add a simple unit test --- tests/unit/test_json.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 tests/unit/test_json.py diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py new file mode 100644 index 0000000..538fb4d --- /dev/null +++ b/tests/unit/test_json.py @@ -0,0 +1,34 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import datetime as dt +from typing import Optional + +import pandas +import pandas.api.extensions +import pandas.testing +import pyarrow +import pytest + +import packaging.version + +import db_dtypes + +is_supported_version = packaging.version.Version(pandas.__version__) >= packaging.version.Version("1.5.0") + +@pytest.mark.skipif(not is_supported_version, reason="requires Pandas 1.5.0 and above") +def test_constructor_from_sequence(): + json_obj = [0, "str", {"a": 0, "b": 1}] + data = db_dtypes.JSONArray._from_sequence(json_obj) From b4cfcd91d10003f41f242df965f8d957e68a4eb5 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 7 Aug 2024 21:50:41 +0000 Subject: [PATCH 20/28] unit-test for some functionalities --- db_dtypes/json.py | 20 +++-------- tests/unit/test_json.py | 74 ++++++++++++++++++++++++++++++++++------- 2 files changed, 67 insertions(+), 27 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index 0cf88d6..a8a6caa 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -15,7 +15,6 @@ from __future__ import annotations import json -import typing import numpy as np import pandas as pd @@ -69,10 +68,10 @@ def construct_array_type(cls): """Return the array type associated with this dtype.""" return JSONArray - @staticmethod - def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray: - """Convert to JSONArray from an Arrow array.""" - return JSONArray(array) + # @staticmethod + # def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray: + # """Convert to JSONArray from an Arrow array.""" + # return JSONArray(array) class JSONArray(arrays.ArrowExtensionArray): @@ -143,18 +142,9 @@ def _box_pa_array( @classmethod def _from_sequence(cls, scalars, *, dtype=None, copy=False): """Construct a new ExtensionArray from a sequence of scalars.""" - result = [] - for scalar in scalars: - result.append(JSONArray._serialize_json(scalar)) + result = [JSONArray._serialize_json(scalar) for scalar in scalars] return cls(pa.array(result, type=pa.string(), from_pandas=True)) - @classmethod - def _from_sequence_of_strings( - cls, strings, *, dtype, copy: bool = False - ) -> JSONArray: - """Construct a new ExtensionArray from a sequence of strings.""" - return cls._from_sequence(strings, dtype=dtype, copy=copy) - @classmethod def _concat_same_type(cls, to_concat) -> JSONArray: """Concatenate multiple JSONArray.""" diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py index 538fb4d..d18586a 100644 --- a/tests/unit/test_json.py +++ b/tests/unit/test_json.py @@ -13,22 +13,72 @@ # limitations under the License. -import datetime as dt -from typing import Optional +import json -import pandas -import pandas.api.extensions +import pandas as pd import pandas.testing -import pyarrow import pytest -import packaging.version - import db_dtypes -is_supported_version = packaging.version.Version(pandas.__version__) >= packaging.version.Version("1.5.0") +# Check for minimum Pandas version. 
+pytest.importorskip("pandas", minversion="1.5.0") + + +# # Python data types mirroring all standard JSON types +# https://json-schema.org/understanding-json-schema/reference/type +JSON_DATA = { + "boolean": True, + "int": 100, + "float": 0.98, + "string": "hello world", + "array": [0.1, 0.2], + "dict": { + "null_field": None, + "order": { + "items": ["book", "pen", "computer"], + "total": 15.99, + "address": {"street": "123 Main St", "city": "Anytown"}, + }, + }, + "null": None, +} + + +def test_get_items(): + data = db_dtypes.JSONArray._from_sequence(JSON_DATA.values()) + for id, key in enumerate(JSON_DATA.keys()): + if key == "null": + assert pd.isna(data[id]) + else: + assert data[id] == JSON_DATA[key] + + +def test_get_items_unbox_object(): + data = db_dtypes.JSONArray._from_sequence([JSON_DATA["dict"]]) + assert len(data[0]) == 2 + + assert data[0]["null_field"] is None + assert data[0]["order"]["address"]["city"] == "Anytown" + assert len(data[0]["order"]["items"]) == 3 + assert data[0]["order"]["items"][0] == "book" + + with pytest.raises(KeyError): + data[0]["unknown"] + + +def test_to_numpy(): + s = pd.Series(db_dtypes.JSONArray._from_sequence(JSON_DATA.values())) + data = s.to_numpy() + for id, key in enumerate(JSON_DATA.keys()): + if key == "null": + assert pd.isna(data[id]) + else: + assert data[id] == json.dumps(JSON_DATA[key], sort_keys=True) + -@pytest.mark.skipif(not is_supported_version, reason="requires Pandas 1.5.0 and above") -def test_constructor_from_sequence(): - json_obj = [0, "str", {"a": 0, "b": 1}] - data = db_dtypes.JSONArray._from_sequence(json_obj) +def test_deterministic_json_serialization(): + x = {"a": 0, "b": 1} + y = {"b": 1, "a": 0} + data = db_dtypes.JSONArray._from_sequence([x]) + assert y in data From 17f560e6414c9e7ff4380099558723aa37790b76 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 7 Aug 2024 22:17:37 +0000 Subject: [PATCH 21/28] address comments --- db_dtypes/json.py | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/db_dtypes/json.py b/db_dtypes/json.py index a8a6caa..3a2b0ee 100644 --- a/db_dtypes/json.py +++ b/db_dtypes/json.py @@ -24,15 +24,6 @@ import pyarrow as pa import pyarrow.compute -ARROW_CMP_FUNCS = { - "eq": pyarrow.compute.equal, - "ne": pyarrow.compute.not_equal, - "lt": pyarrow.compute.less, - "gt": pyarrow.compute.greater, - "le": pyarrow.compute.less_equal, - "ge": pyarrow.compute.greater_equal, -} - @pd.api.extensions.register_extension_dtype class JSONDtype(pd.api.extensions.ExtensionDtype): @@ -68,11 +59,6 @@ def construct_array_type(cls): """Return the array type associated with this dtype.""" return JSONArray - # @staticmethod - # def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray: - # """Convert to JSONArray from an Arrow array.""" - # return JSONArray(array) - class JSONArray(arrays.ArrowExtensionArray): """Extension array that handles BigQuery JSON data, leveraging a string-based @@ -95,26 +81,26 @@ def _box_pa( cls, value, pa_type: pa.DataType | None = None ) -> pa.Array | pa.ChunkedArray | pa.Scalar: """Box value into a pyarrow Array, ChunkedArray or Scalar.""" + if pa_type is not None and pa_type != pa.string(): + raise ValueError(f"Unsupported type '{pa_type}' for JSONArray") if isinstance(value, pa.Scalar) or not ( common.is_list_like(value) and not common.is_dict_like(value) ): - return cls._box_pa_scalar(value, pa_type) - return cls._box_pa_array(value, pa_type) + return cls._box_pa_scalar(value) + return 

From 7add79219a23f435a3a515a798363c02a6e8e003 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 8 Aug 2024 00:17:56 +0000
Subject: [PATCH 22/28] fix test coverage

---
 db_dtypes/json.py       | 50 ++++++++---------------------------
 tests/unit/test_json.py | 25 ++++++++++++++++++---
 2 files changed, 31 insertions(+), 44 deletions(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 3a2b0ee..9db42d6 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -81,8 +81,7 @@ def _box_pa(
         cls, value, pa_type: pa.DataType | None = None
     ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
         """Box value into a pyarrow Array, ChunkedArray or Scalar."""
-        if pa_type is not None and pa_type != pa.string():
-            raise ValueError(f"Unsupported type '{pa_type}' for JSONArray")
+        assert pa_type is None or pa_type == pa.string()
 
         if isinstance(value, pa.Scalar) or not (
             common.is_list_like(value) and not common.is_dict_like(value)
@@ -93,8 +92,6 @@ def _box_pa_scalar(cls, value) -> pa.Scalar:
         """Box value into a pyarrow Scalar."""
-        if isinstance(value, pa.Scalar):
-            pa_scalar = value
         if pd.isna(value):
             pa_scalar = pa.scalar(None, type=pa.string())
         else:
             value = JSONArray._serialize_json(value)
             pa_scalar = pa.scalar(value, type=pa.string(), from_pandas=True)
@@ -104,33 +101,21 @@ def _box_pa_scalar(cls, value) -> pa.Scalar:
         return pa_scalar
 
     @classmethod
-    def _box_pa_array(
-        cls, value, pa_type: pa.DataType | None = None, copy: bool = False
-    ) -> pa.Array | pa.ChunkedArray:
+    def _box_pa_array(cls, value, copy: bool = False) -> pa.Array | pa.ChunkedArray:
         """Box value into a pyarrow Array or ChunkedArray."""
         if isinstance(value, cls):
             pa_array = value._pa_array
-        elif isinstance(value, (pa.Array, pa.ChunkedArray)):
-            pa_array = value
         else:
-            try:
-                value = [JSONArray._serialize_json(x) for x in value]
-                pa_array = pa.array(value, type=pa_type, from_pandas=True)
-            except (pa.ArrowInvalid, pa.ArrowTypeError):
-                # https://github.com/pandas-dev/pandas/pull/50430:
-                # let pyarrow infer type, then cast
-                pa_array = pa.array(value, from_pandas=True)
-
-            if pa_type is not None and pa_array.type != pa_type:
-                pa_array = pa_array.cast(pa_type)
-
+            value = [JSONArray._serialize_json(x) for x in value]
+            pa_array = pa.array(value, type=pa.string(), from_pandas=True)
         return pa_array
 
     @classmethod
     def _from_sequence(cls, scalars, *, dtype=None, copy=False):
         """Construct a new ExtensionArray from a sequence of scalars."""
-        result = [JSONArray._serialize_json(scalar) for scalar in scalars]
-        return cls(pa.array(result, type=pa.string(), from_pandas=True))
+        pa_array = cls._box_pa(scalars)
+        arr = cls(pa_array)
+        return arr
 
     @classmethod
     def _concat_same_type(cls, to_concat) -> JSONArray:
@@ -137,13 +122,8 @@ def _concat_same_type(cls, to_concat) -> JSONArray:
         """Concatenate multiple JSONArray."""
         chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
         arr = pa.chunked_array(chunks, type=pa.string())
         return cls(arr)
 
-    @classmethod
-    def _from_factorized(cls, values, original):
-        """Reconstruct an ExtensionArray after factorization."""
-        return cls._from_sequence(values, dtype=original.dtype)
-
     @staticmethod
     def _serialize_json(value):
         """A static method that converts a JSON value into a string representation."""
@@ -202,19 +182,6 @@ def __getitem__(self, item):
                 r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                 r"(`None`) and integer or boolean arrays are valid indices"
             )
-        # We are not an array indexer, so maybe e.g. a slice or integer
-        # indexer. We dispatch to pyarrow.
-        if isinstance(item, slice):
-            # Arrow bug https://github.com/apache/arrow/issues/38768
-            if item.start == item.stop:
-                pass
-            elif (
-                item.stop is not None
-                and item.stop < -len(self)
-                and item.step is not None
-                and item.step < 0
-            ):
-                item = slice(item.start, None, item.step)
 
         value = self._pa_array[item]
         if isinstance(value, pa.ChunkedArray):
@@ -229,7 +196,8 @@ def __iter__(self):
 
         """Iterate over elements of the array."""
         for value in self._pa_array:
-            val = JSONArray._deserialize_json(value.as_py())
+            val = value.as_py()
+            # val = JSONArray._deserialize_json(value.as_py())
             if val is None:
                 yield self._dtype.na_value
             else:
diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py
index d18586a..5e389a2 100644
--- a/tests/unit/test_json.py
+++ b/tests/unit/test_json.py
@@ -15,8 +15,8 @@
 
 import json
 
+import numpy as np
 import pandas as pd
-import pandas.testing
 import pytest
 
 import db_dtypes
@@ -45,7 +45,12 @@
 }
 
 
-def test_get_items():
+def test_construct_w_unsupported_types():
+    with pytest.raises(ValueError):
+        db_dtypes.JSONArray(100)
+
+
+def test_getitems_return_json_objects():
     data = db_dtypes.JSONArray._from_sequence(JSON_DATA.values())
     for id, key in enumerate(JSON_DATA.keys()):
         if key == "null":
@@ -54,7 +59,7 @@ def test_get_items():
             assert data[id] == JSON_DATA[key]
 
 
-def test_get_items_unbox_object():
+def test_getitems_w_unboxed_dict():
     data = db_dtypes.JSONArray._from_sequence([JSON_DATA["dict"]])
     assert len(data[0]) == 2
 
@@ -67,6 +72,20 @@ def test_get_items_unbox_object():
         data[0]["unknown"]
 
 
+def test_getitems_w_invalid_numpy_array():
+    data = db_dtypes.JSONArray._from_sequence(JSON_DATA.values())
+    idx = np.array(["str"])
+    with pytest.raises(IndexError):
+        data[idx]
+
+
+def test_getitems_when_iter_with_null():
+    data = db_dtypes.JSONArray._from_sequence([JSON_DATA["null"]])
+    s = pd.Series(data)
+    result = s[:1].item()
+    assert pd.isna(result)
+
+
 def test_to_numpy():
     s = pd.Series(db_dtypes.JSONArray._from_sequence(JSON_DATA.values()))
     data = s.to_numpy()
     for id, key in enumerate(JSON_DATA.keys()):
         if key == "null":
             assert pd.isna(data[id])
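
[Note on PATCH 22]
`_from_sequence` now funnels through `_box_pa`, so a plain Python sequence, an
existing `JSONArray`, or a scalar all take the same boxing path. An illustrative
sketch (assumes db_dtypes with this patch applied):

    import db_dtypes

    data = db_dtypes.JSONArray._from_sequence([True, 100, {"a": 0}, None])

    # Membership compares against the deserialized values, as in
    # test_deterministic_json_serialization above.
    assert {"a": 0} in data
    assert len(data) == 4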

From ba516c71862069fbecf1ef098a44d0f2da577f3c Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 8 Aug 2024 00:22:41 +0000
Subject: [PATCH 23/28] fixing

---
 db_dtypes/json.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 9db42d6..6045b07 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -196,8 +196,7 @@ def __getitem__(self, item):
 
     def __iter__(self):
         """Iterate over elements of the array."""
         for value in self._pa_array:
-            val = value.as_py()
-            # val = JSONArray._deserialize_json(value.as_py())
+            val = JSONArray._deserialize_json(value.as_py())
             if val is None:
                 yield self._dtype.na_value

From 0185f0847341976e3261b84705db7c5c6585d168 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?=
Date: Thu, 8 Aug 2024 10:03:27 -0500
Subject: [PATCH 24/28] Update db_dtypes/json.py

---
 db_dtypes/json.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 6045b07..27fdc15 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -153,7 +153,8 @@ def _cmp_method(self, other, op):
         elif op.__name__ == "ne":
             result = pyarrow.compute.not_equal(self._pa_array, self._box_pa(other))
         else:
-            raise NotImplementedError(f"{op.__name__} not implemented for JSONArray")
+            # Comparison is not a meaningful one. We don't want to support sorting by JSON columns.
+            raise TypeError(f"{op.__name__} not supported for JSONArray")
         return arrays.ArrowExtensionArray(result)
 
     def __getitem__(self, item):
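
[Note on PATCH 24]
Ordering operators now fail fast with `TypeError` instead of `NotImplementedError`,
which matches how pandas signals an unsupported comparison. A sketch of the expected
failure mode (assumes db_dtypes with this patch applied):

    import pytest

    import db_dtypes

    data = db_dtypes.JSONArray._from_sequence([10, 20])

    # lt/gt/le/ge are rejected: sorting by JSON columns is deliberately unsupported.
    with pytest.raises(TypeError):
        data < data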

From dac34315969da0a8f930a19f242bfbf55b22f15d Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 8 Aug 2024 17:09:36 +0000
Subject: [PATCH 25/28] fixing

---
 db_dtypes/json.py       | 2 +-
 tests/unit/test_json.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 27fdc15..9f92134 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -171,7 +171,7 @@ def __getitem__(self, item):
             else:
                 raise IndexError(
                     "Only integers, slices and integer or "
-                    "boolean arrays are valid indices."
+                    + "boolean arrays are valid indices."
                 )
         elif isinstance(item, tuple):
             item = indexers.unpack_tuple_and_ellipses(item)
diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py
index 5e389a2..dc76d6d 100644
--- a/tests/unit/test_json.py
+++ b/tests/unit/test_json.py
@@ -25,7 +25,7 @@
 pytest.importorskip("pandas", minversion="1.5.0")
 
 
-# # Python data types mirroring all standard JSON types
+# Python data types mirroring all standard JSON types:
 # https://json-schema.org/understanding-json-schema/reference/type
 JSON_DATA = {
     "boolean": True,
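
[Note on PATCH 25]
The `IndexError` message itself is unchanged; adjacent string literals already
concatenate implicitly, and the added `+` only makes the continuation explicit.
A two-line check of that equivalence:

    implicit = ("Only integers, slices and integer or "
                "boolean arrays are valid indices.")
    explicit = ("Only integers, slices and integer or "
                + "boolean arrays are valid indices.")
    assert implicit == explicit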

From 780024237cc11f8a291b56472f8b868235470f5c Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 8 Aug 2024 17:43:15 +0000
Subject: [PATCH 26/28] fixing

---
 db_dtypes/json.py       | 15 ++++++++-------
 tests/unit/test_json.py |  7 -------
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 9f92134..0192e66 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -120,7 +120,11 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False):
 
     @classmethod
     def _concat_same_type(cls, to_concat) -> JSONArray:
         """Concatenate multiple JSONArray."""
-        chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
+        chunks = [
+            pa_array_chunks
+            for item in to_concat
+            for pa_array_chunks in item._pa_array.iterchunks()
+        ]
         arr = pa.chunked_array(chunks, type=pa.string())
         return cls(arr)
@@ -166,13 +170,10 @@ def __getitem__(self, item):
             if not len(item):
                 return type(self)(pa.chunked_array([], type=pa.string()))
             elif item.dtype.kind in "iu":
                 return self.take(item)
-            elif item.dtype.kind == "b":
-                return type(self)(self._pa_array.filter(item))
             else:
-                raise IndexError(
-                    "Only integers, slices and integer or "
-                    + "boolean arrays are valid indices."
-                )
+                # `check_array_indexer` should verify that the assertion holds true.
+                assert item.dtype.kind == "b"
+                return type(self)(self._pa_array.filter(item))
         elif isinstance(item, tuple):
             item = indexers.unpack_tuple_and_ellipses(item)
diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py
index dc76d6d..ea2be7a 100644
--- a/tests/unit/test_json.py
+++ b/tests/unit/test_json.py
@@ -72,13 +72,6 @@ def test_getitems_w_unboxed_dict():
     with pytest.raises(KeyError):
         data[0]["unknown"]
 
 
-def test_getitems_w_invalid_numpy_array():
-    data = db_dtypes.JSONArray._from_sequence(JSON_DATA.values())
-    idx = np.array(["str"])
-    with pytest.raises(IndexError):
-        data[idx]
-
-
 def test_getitems_when_iter_with_null():
     data = db_dtypes.JSONArray._from_sequence([JSON_DATA["null"]])
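
[Note on PATCH 26]
After this patch `__getitem__` keeps two ndarray paths: integer arrays dispatch to
`take`, and anything else is asserted to be a boolean mask and filtered through
pyarrow. A sketch of both paths (assumes db_dtypes with this patch applied):

    import numpy as np

    import db_dtypes

    data = db_dtypes.JSONArray._from_sequence([{"a": 0}, 100, "hello"])

    taken = data[np.array([0, 2])]                 # integer indexer -> take()
    masked = data[np.array([True, False, True])]   # boolean mask -> filter()
    assert len(taken) == len(masked) == 2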

From 913d0bc977c7e2f9893dc4b4070cb2c5fca48035 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 8 Aug 2024 18:05:50 +0000
Subject: [PATCH 27/28] add pyarrow_dtypes

---
 db_dtypes/json.py | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/db_dtypes/json.py b/db_dtypes/json.py
index 0192e66..ed04b72 100644
--- a/db_dtypes/json.py
+++ b/db_dtypes/json.py
@@ -46,6 +46,11 @@ def type(self) -> type[str]:
         """
         return str
 
+    @property
+    def pyarrow_dtype(self):
+        """Return the pyarrow data type used for storing data in the pyarrow array."""
+        return pa.string()
+
     @property
     def _is_numeric(self) -> bool:
         return False
@@ -81,7 +86,7 @@ def _box_pa(
         cls, value, pa_type: pa.DataType | None = None
     ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
         """Box value into a pyarrow Array, ChunkedArray or Scalar."""
-        assert pa_type is None or pa_type == pa.string()
+        assert pa_type is None or pa_type == cls._dtype.pyarrow_dtype
 
         if isinstance(value, pa.Scalar) or not (
             common.is_list_like(value) and not common.is_dict_like(value)
@@ -93,10 +98,12 @@ def _box_pa_scalar(cls, value) -> pa.Scalar:
         """Box value into a pyarrow Scalar."""
         if pd.isna(value):
-            pa_scalar = pa.scalar(None, type=pa.string())
+            pa_scalar = pa.scalar(None, type=cls._dtype.pyarrow_dtype)
         else:
             value = JSONArray._serialize_json(value)
-            pa_scalar = pa.scalar(value, type=pa.string(), from_pandas=True)
+            pa_scalar = pa.scalar(
+                value, type=cls._dtype.pyarrow_dtype, from_pandas=True
+            )
 
         return pa_scalar
@@ -107,7 +114,7 @@ def _box_pa_array(cls, value, copy: bool = False) -> pa.Array | pa.ChunkedArray:
         """Box value into a pyarrow Array or ChunkedArray."""
         if isinstance(value, cls):
             pa_array = value._pa_array
         else:
             value = [JSONArray._serialize_json(x) for x in value]
-            pa_array = pa.array(value, type=pa.string(), from_pandas=True)
+            pa_array = pa.array(value, type=cls._dtype.pyarrow_dtype, from_pandas=True)
         return pa_array
@@ -117,17 +124,6 @@ def _from_sequence(cls, scalars, *, dtype=None, copy=False):
         """Construct a new ExtensionArray from a sequence of scalars."""
         pa_array = cls._box_pa(scalars)
         arr = cls(pa_array)
         return arr
 
-    @classmethod
-    def _concat_same_type(cls, to_concat) -> JSONArray:
-        """Concatenate multiple JSONArray."""
-        chunks = [
-            pa_array_chunks
-            for item in to_concat
-            for pa_array_chunks in item._pa_array.iterchunks()
-        ]
-        arr = pa.chunked_array(chunks, type=pa.string())
-        return cls(arr)
-
     @staticmethod
@@ -174,7 +170,7 @@ def __getitem__(self, item):
 
         if isinstance(item, np.ndarray):
             if not len(item):
-                return type(self)(pa.chunked_array([], type=pa.string()))
+                return type(self)(pa.chunked_array([], type=self.dtype.pyarrow_dtype))
             elif item.dtype.kind in "iu":
                 return self.take(item)
             else:
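
[Note on PATCH 27]
`pyarrow_dtype` gives the storage type a single definition instead of repeating
`pa.string()` at each call site. A quick check of the new property (assumes
db_dtypes with this patch applied):

    import pyarrow as pa

    import db_dtypes

    dtype = db_dtypes.JSONDtype()
    assert dtype.pyarrow_dtype == pa.string()  # JSON is stored as Arrow strings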

From 01eef453e17bcce2c1760ec40baa2ce32587cd97 Mon Sep 17 00:00:00 2001
From: Chelsea Lin
Date: Thu, 8 Aug 2024 18:14:12 +0000
Subject: [PATCH 28/28] fixing

---
 tests/unit/test_json.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/unit/test_json.py b/tests/unit/test_json.py
index ea2be7a..c48635d 100644
--- a/tests/unit/test_json.py
+++ b/tests/unit/test_json.py
@@ -15,7 +15,6 @@
 
 import json
 
-import numpy as np
 import pandas as pd
 import pytest
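
[Note on PATCH 28]
With the now-unused `numpy` import removed, the series is complete. An end-to-end
sketch of the dtype as it stands after these 28 patches (assumes db_dtypes is
installed):

    import pandas as pd

    import db_dtypes  # noqa: F401  (registers the "dbjson" extension dtype)

    s = pd.Series([{"a": 0}, None, [1, 2]], dtype="dbjson")
    assert pd.isna(s[1])   # nulls come back as a missing value
    assert s[0]["a"] == 0  # dicts are unboxed back into Python objects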