Skip to content

Commit 8cda5e8

Browse files
committed
support array type
1 parent 1baecd5 commit 8cda5e8

File tree

3 files changed

+83
-58
lines changed

3 files changed

+83
-58
lines changed

db_dtypes/json.py

+3-38
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
"ge": pyarrow.compute.greater_equal,
3535
}
3636

37+
3738
@pd.api.extensions.register_extension_dtype
3839
class JSONDtype(pd.api.extensions.ExtensionDtype):
3940
"""Extension dtype for BigQuery JSON data."""
@@ -90,6 +91,7 @@ def _box_pa(
9091
cls, value, pa_type: pa.DataType | None = None
9192
) -> pa.Array | pa.ChunkedArray | pa.Scalar:
9293
"""Box value into a pyarrow Array, ChunkedArray or Scalar."""
94+
9395
if isinstance(value, pa.Scalar) or not (
9496
common.is_list_like(value) and not common.is_dict_like(value)
9597
):
@@ -163,7 +165,7 @@ def _from_factorized(cls, values, original):
163165
@staticmethod
164166
def _serialize_json(value):
165167
"""A static method that converts a JSON value into a string representation."""
166-
if pd.isna(value):
168+
if not common.is_list_like(value) and pd.isna(value):
167169
return value
168170
else:
169171
# `sort_keys=True` sorts dictionary keys before serialization, making
@@ -254,40 +256,3 @@ def _reduce(
254256
if name in ["min", "max"]:
255257
raise TypeError("JSONArray does not support min/max reduction.")
256258
super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
257-
258-
def __array__(
259-
self, dtype = None, copy = None
260-
) -> np.ndarray:
261-
"""Correctly construct numpy arrays when passed to `np.asarray()`."""
262-
return self.to_numpy(dtype=dtype)
263-
264-
def to_numpy(self, dtype = None, copy = False, na_value = pd.NA) -> np.ndarray:
265-
dtype, na_value = self._to_numpy_dtype_inference(dtype, na_value, self._hasna)
266-
pa_type = self._pa_array.type
267-
if not self._hasna or pd.isna(na_value) or pa.types.is_null(pa_type):
268-
data = self
269-
else:
270-
data = self.fillna(na_value)
271-
result = np.array(list(data), dtype=dtype)
272-
273-
if data._hasna:
274-
result[data.isna()] = na_value
275-
return result
276-
277-
def _to_numpy_dtype_inference(
278-
self, dtype, na_value, hasna
279-
):
280-
if dtype is not None:
281-
dtype = np.dtype(dtype)
282-
283-
if dtype is None or not hasna:
284-
na_value = self.dtype.na_value
285-
elif dtype.kind == "f": # type: ignore[union-attr]
286-
na_value = np.nan
287-
elif dtype.kind == "M": # type: ignore[union-attr]
288-
na_value = np.datetime64("nat")
289-
elif dtype.kind == "m": # type: ignore[union-attr]
290-
na_value = np.timedelta64("nat")
291-
else:
292-
na_value = self.dtype.na_value
293-
return dtype, na_value

tests/compliance/json/conftest.py

+20-18
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515

1616
import json
17+
import random
1718

1819
import numpy as np
1920
import pandas as pd
@@ -24,18 +25,29 @@
2425

2526

2627
def make_data():
27-
# Sample data with varied lengths.
28+
# Since the `np.array` constructor needs a consistent shape after the first
29+
# dimension, the sample data in this instance doesn't include the array type.
2830
samples = [
29-
{"id": 1, "bool_value": True}, # Boolean
30-
{"id": 2, "float_num": 3.14159}, # Floating
31-
{"id": 3, "date": "2024-07-16"}, # Dates (as strings)
32-
{"id": 4, "null_field": None}, # Null
33-
{"list_data": [10, 20, 30]}, # Lists
34-
{"person": {"name": "Alice", "age": 35}}, # Nested objects
31+
True, # Boolean
32+
100, # Int
33+
0.98, # Float
34+
"str", # String
35+
{"bool_value": True}, # Dict with a boolean
36+
{"float_num": 3.14159}, # Dict with a float
37+
{"date": "2024-07-16"}, # Dict with a date (as strings)
38+
{"null_field": None}, # Dict with a null
39+
{"list_data": [10, 20, 30]}, # Dict with a list
40+
{"person": {"name": "Alice", "age": 35}}, # Dict with nested objects
3541
{"address": {"street": "123 Main St", "city": "Anytown"}},
3642
{"order": {"items": ["book", "pen"], "total": 15.99}},
3743
]
38-
return np.random.default_rng(2).choice(samples, size=100)
44+
data = np.random.default_rng(2).choice(samples, size=100)
45+
# This replaces a single data item with an array. We are skipping the first two
46+
# items to avoid some `setitem` tests failing, because setting with a list is
47+
# ambiguous in this context.
48+
id = random.randint(3, 99)
49+
data[id] = [0.1, 0.2] # Array
50+
return data
3951

4052

4153
@pytest.fixture
@@ -48,16 +60,6 @@ def data():
4860
"""Length-100 PeriodArray for semantics test."""
4961
data = make_data()
5062

51-
# Why the while loop? NumPy is unable to construct an ndarray from
52-
# equal-length ndarrays. Many of our operations involve coercing the
53-
# EA to an ndarray of objects. To avoid random test failures, we ensure
54-
# that our data is coercible to an ndarray. Several tests deal with only
55-
# the first two elements, so that's what we'll check.
56-
57-
while len(data[0]) == len(data[1]):
58-
print(data)
59-
data = make_data()
60-
6163
return JSONArray._from_sequence(data)
6264

6365

tests/compliance/json/test_json_compliance.py

+60-2
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
2222
from pandas.tests.extension import base
2323
import pytest
24-
import db_dtypes
2524

2625

2726
class TestJSONArray(base.ExtensionTests):
@@ -126,6 +125,43 @@ def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
126125
def test_searchsorted(self, data_for_sorting, as_series):
127126
super().test_searchsorted(self, data_for_sorting, as_series)
128127

128+
def test_astype_str(self, data):
129+
# Use `json.dumps(str)` instead of passing `str(obj)` directly to the super method.
130+
result = pd.Series(data[:5]).astype(str)
131+
expected = pd.Series(
132+
[json.dumps(x, sort_keys=True) for x in data[:5]], dtype=str
133+
)
134+
tm.assert_series_equal(result, expected)
135+
136+
@pytest.mark.parametrize(
137+
"nullable_string_dtype",
138+
[
139+
"string[python]",
140+
"string[pyarrow]",
141+
],
142+
)
143+
def test_astype_string(self, data, nullable_string_dtype):
144+
# Use `json.dumps(str)` instead of passing `str(obj)` directly to the super method.
145+
result = pd.Series(data[:5]).astype(nullable_string_dtype)
146+
expected = pd.Series(
147+
[json.dumps(x, sort_keys=True) for x in data[:5]],
148+
dtype=nullable_string_dtype,
149+
)
150+
tm.assert_series_equal(result, expected)
151+
152+
def test_array_interface(self, data):
153+
result = np.array(data)
154+
# Use `json.dumps(data[0])` instead of passing `data[0]` directly to the super method.
155+
assert result[0] == json.dumps(data[0])
156+
157+
result = np.array(data, dtype=object)
158+
# Use `json.dumps(x)` instead of passing `x` directly to the super method.
159+
expected = np.array([json.dumps(x) for x in data], dtype=object)
160+
if expected.ndim > 1:
161+
# nested data, explicitly construct as 1D
162+
expected = construct_1d_object_array_from_listlike(list(data))
163+
tm.assert_numpy_array_equal(result, expected)
164+
129165
@pytest.mark.xfail(reason="Setting a dict as a scalar")
130166
def test_fillna_series(self):
131167
"""We treat dictionaries as a mapping in fillna, not a scalar."""
@@ -251,7 +287,6 @@ def test_setitem_mask_boolean_array_with_na(self, data, box_in_series):
251287
super().test_setitem_mask_boolean_array_with_na(data, box_in_series)
252288

253289
@pytest.mark.parametrize("setter", ["loc", "iloc"])
254-
255290
@pytest.mark.xfail(reason="TODO: open an issue for ArrowExtensionArray")
256291
def test_setitem_scalar(self, data, setter):
257292
super().test_setitem_scalar(data, setter)
@@ -310,3 +345,26 @@ def test_setitem_2d_values(self, data):
310345
@pytest.mark.parametrize("engine", ["c", "python"])
311346
def test_EA_types(self, engine, data, request):
312347
super().test_EA_types(engine, data, request)
348+
349+
@pytest.mark.xfail(
350+
reason="`to_numpy` returns serialized JSON, "
351+
+ "while `__getitem__` returns JSON objects."
352+
)
353+
def test_setitem_frame_2d_values(self, data):
354+
super().test_setitem_frame_2d_values(data)
355+
356+
@pytest.mark.xfail(
357+
reason="`to_numpy` returns serialized JSON, "
358+
+ "while `__getitem__` returns JSON objects."
359+
)
360+
def test_transpose_frame(self, data):
361+
# `DataFrame.T` calls `to_numpy` to get results.
362+
super().test_transpose_frame(data)
363+
364+
@pytest.mark.xfail(
365+
reason="`to_numpy` returns serialized JSON, "
366+
+ "while `__getitem__` returns JSON objects."
367+
)
368+
def test_where_series(self, data, na_value, as_frame):
369+
# `Series.where` calls `to_numpy` to get results.
370+
super().test_where_series(data, na_value, as_frame)

0 commit comments

Comments
 (0)