Skip to content

feat: create db_dtypes JSONDtype and JSONArray #284

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 28 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d0e94d5
Copy JSONDtype and JSONArray from tests/extension/json and their tests
chelsea-lin Jul 8, 2024
1d33703
formatting
chelsea-lin Jul 9, 2024
de3120a
converts to ArrowStringArray
chelsea-lin Jul 16, 2024
8bd13cc
box and unbox between string(storage) and dict(getitem)
chelsea-lin Jul 22, 2024
e29585d
minor
chelsea-lin Jul 22, 2024
84690ee
fix test_getitem_scalar test
chelsea-lin Jul 22, 2024
d11cc87
add docstring and remove unused functions
chelsea-lin Jul 22, 2024
60da570
🦉 Updates from OwlBot post-processor
gcf-owl-bot[bot] Jul 22, 2024
48ee67d
fix lint
chelsea-lin Jul 22, 2024
91d5016
address some comments
chelsea-lin Jul 24, 2024
191deef
supports all types except Array
chelsea-lin Jul 30, 2024
7422f7a
support array type
chelsea-lin Jul 30, 2024
22a099b
only import when pandas version is higher than 1.5.0
chelsea-lin Aug 2, 2024
77339a0
exclude groupby and other tests
chelsea-lin Aug 3, 2024
2798825
others
chelsea-lin Aug 5, 2024
efe72cc
skip jsondtype and jsonarray
chelsea-lin Aug 6, 2024
98adb5a
fixing
chelsea-lin Aug 6, 2024
790f257
fix coverage file name
chelsea-lin Aug 6, 2024
8800b6b
add a simple unit test
chelsea-lin Aug 6, 2024
b4cfcd9
unit-test for some functionalities
chelsea-lin Aug 7, 2024
17f560e
address comments
chelsea-lin Aug 7, 2024
7add792
fix test cover
chelsea-lin Aug 8, 2024
ba516c7
fixing
chelsea-lin Aug 8, 2024
0185f08
Update db_dtypes/json.py
tswast Aug 8, 2024
dac3431
fixing
chelsea-lin Aug 8, 2024
7800242
fixing
chelsea-lin Aug 8, 2024
913d0bc
add pyarrow_dtypes
chelsea-lin Aug 8, 2024
01eef45
fixing
chelsea-lin Aug 8, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ jobs:
python -m pip install nox
- name: Run compliance tests
env:
COVERAGE_FILE: .coverage-${{ matrix.python }}
COVERAGE_FILE: .coverage-compliance-${{ matrix.python }}
run: |
nox -s compliance-${{ matrix.python }}
- name: Upload coverage results
Expand Down
34 changes: 26 additions & 8 deletions db_dtypes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,14 @@
# nanosecond precision when boxing scalars.
_NP_BOX_DTYPE = "datetime64[us]"

pandas_release = packaging.version.parse(pandas.__version__).release

# JSONArray and JSONDtype require pandas 1.5.0 or later. Once Python 3.7
# compatibility is dropped, the minimum supported pandas version will be
# raised to 1.5.0 and this guard can be removed. On older pandas the names
# are still bound, but to None, so callers can feature-test them cheaply
# (e.g. ``if db_dtypes.JSONArray is not None: ...``).
if packaging.version.Version(pandas.__version__) >= packaging.version.Version("1.5.0"):
    from db_dtypes.json import JSONArray, JSONDtype
else:
    JSONArray = None
    JSONDtype = None


@pandas.api.extensions.register_extension_dtype
Expand Down Expand Up @@ -337,10 +344,21 @@ def __sub__(self, other):
return super().__sub__(other)


__all__ = [
"__version__",
"DateArray",
"DateDtype",
"TimeArray",
"TimeDtype",
]
# Public API. The JSON names are advertised only when the running pandas
# supports them; on pandas < 1.5.0 both are bound to None (see the version
# gate above) and are therefore excluded.
if JSONArray and JSONDtype:
    __all__ = [
        "__version__",
        "DateArray",
        "DateDtype",
        "JSONDtype",
        "JSONArray",
        "TimeArray",
        "TimeDtype",
    ]
else:
    __all__ = [
        "__version__",
        "DateArray",
        "DateDtype",
        "TimeArray",
        "TimeDtype",
    ]
263 changes: 263 additions & 0 deletions db_dtypes/json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,263 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import json
import typing

import numpy as np
import pandas as pd
import pandas.arrays as arrays
import pandas.core.dtypes.common as common
import pandas.core.indexers as indexers
import pyarrow as pa
import pyarrow.compute

ARROW_CMP_FUNCS = {
"eq": pyarrow.compute.equal,
"ne": pyarrow.compute.not_equal,
"lt": pyarrow.compute.less,
"gt": pyarrow.compute.greater,
"le": pyarrow.compute.less_equal,
"ge": pyarrow.compute.greater_equal,
}


@pd.api.extensions.register_extension_dtype
class JSONDtype(pd.api.extensions.ExtensionDtype):
"""Extension dtype for BigQuery JSON data."""

name = "dbjson"

@property
def na_value(self) -> pd.NA:
"""Default NA value to use for this type."""
return pd.NA

@property
def type(self) -> type[str]:
"""
Return the scalar type for the array elements.
The standard JSON data types can be one of `dict`, `list`, `str`, `int`, `float`,
`bool` and `None`. However, this method returns a `str` type to indicate its
storage type, because the union of multiple types are not supported well in pandas.
"""
return str

@property
def _is_numeric(self) -> bool:
return False

@property
def _is_boolean(self) -> bool:
return False

@classmethod
def construct_array_type(cls):
"""Return the array type associated with this dtype."""
return JSONArray

@staticmethod
def __from_arrow__(array: typing.Union[pa.Array, pa.ChunkedArray]) -> JSONArray:
"""Convert to JSONArray from an Arrow array."""
return JSONArray(array)


class JSONArray(arrays.ArrowExtensionArray):
"""Extension array that handles BigQuery JSON data, leveraging a string-based
pyarrow array for storage. It enables seamless conversion to JSON objects when
accessing individual elements."""

_dtype = JSONDtype()

def __init__(self, values, dtype=None, copy=False) -> None:
self._dtype = JSONDtype()
if isinstance(values, pa.Array):
self._pa_array = pa.chunked_array([values])
elif isinstance(values, pa.ChunkedArray):
self._pa_array = values
else:
raise ValueError(f"Unsupported type '{type(values)}' for JSONArray")

@classmethod
def _box_pa(
cls, value, pa_type: pa.DataType | None = None
) -> pa.Array | pa.ChunkedArray | pa.Scalar:
"""Box value into a pyarrow Array, ChunkedArray or Scalar."""

if isinstance(value, pa.Scalar) or not (
common.is_list_like(value) and not common.is_dict_like(value)
):
return cls._box_pa_scalar(value, pa_type)
return cls._box_pa_array(value, pa_type)

@classmethod
def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
"""Box value into a pyarrow Scalar."""
if isinstance(value, pa.Scalar):
pa_scalar = value
if pd.isna(value):
pa_scalar = pa.scalar(None, type=pa_type)
else:
value = JSONArray._serialize_json(value)
pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)

if pa_type is not None and pa_scalar.type != pa_type:
pa_scalar = pa_scalar.cast(pa_type)
return pa_scalar

@classmethod
def _box_pa_array(
cls, value, pa_type: pa.DataType | None = None, copy: bool = False
) -> pa.Array | pa.ChunkedArray:
"""Box value into a pyarrow Array or ChunkedArray."""
if isinstance(value, cls):
pa_array = value._pa_array
elif isinstance(value, (pa.Array, pa.ChunkedArray)):
pa_array = value
else:
try:
value = [JSONArray._serialize_json(x) for x in value]
pa_array = pa.array(value, type=pa_type, from_pandas=True)
except (pa.ArrowInvalid, pa.ArrowTypeError):
# GH50430: let pyarrow infer type, then cast
pa_array = pa.array(value, from_pandas=True)

if pa_type is not None and pa_array.type != pa_type:
pa_array = pa_array.cast(pa_type)

return pa_array

@classmethod
def _from_sequence(cls, scalars, *, dtype=None, copy=False):
"""Construct a new ExtensionArray from a sequence of scalars."""
result = []
for scalar in scalars:
result.append(JSONArray._serialize_json(scalar))
return cls(pa.array(result, type=pa.string(), from_pandas=True))

@classmethod
def _from_sequence_of_strings(
cls, strings, *, dtype, copy: bool = False
) -> JSONArray:
"""Construct a new ExtensionArray from a sequence of strings."""
return cls._from_sequence(strings, dtype=dtype, copy=copy)

@classmethod
def _concat_same_type(cls, to_concat) -> JSONArray:
"""Concatenate multiple JSONArray."""
chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
arr = pa.chunked_array(chunks, type=pa.string())
return cls(arr)

@classmethod
def _from_factorized(cls, values, original):
"""Reconstruct an ExtensionArray after factorization."""
return cls._from_sequence(values, dtype=original.dtype)

@staticmethod
def _serialize_json(value):
"""A static method that converts a JSON value into a string representation."""
if not common.is_list_like(value) and pd.isna(value):
return value
else:
# `sort_keys=True` sorts dictionary keys before serialization, making
# JSON comparisons deterministic.
return json.dumps(value, sort_keys=True)

@staticmethod
def _deserialize_json(value):
"""A static method that converts a JSON string back into its original value."""
if not pd.isna(value):
return json.loads(value)
else:
return value

@property
def dtype(self) -> JSONDtype:
"""An instance of JSONDtype"""
return self._dtype

def _cmp_method(self, other, op):
pc_func = ARROW_CMP_FUNCS[op.__name__]
result = pc_func(self._pa_array, self._box_pa(other))
return arrays.ArrowExtensionArray(result)

def __getitem__(self, item):
"""Select a subset of self."""
item = indexers.check_array_indexer(self, item)

if isinstance(item, np.ndarray):
if not len(item):
return type(self)(pa.chunked_array([], type=pa.string()))
elif item.dtype.kind in "iu":
return self.take(item)
elif item.dtype.kind == "b":
return type(self)(self._pa_array.filter(item))
else:
raise IndexError(
"Only integers, slices and integer or "
"boolean arrays are valid indices."
)
elif isinstance(item, tuple):
item = indexers.unpack_tuple_and_ellipses(item)

if common.is_scalar(item) and not common.is_integer(item):
# e.g. "foo" or 2.5
# exception message copied from numpy
raise IndexError(
r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
r"(`None`) and integer or boolean arrays are valid indices"
)
# We are not an array indexer, so maybe e.g. a slice or integer
# indexer. We dispatch to pyarrow.
if isinstance(item, slice):
# Arrow bug https://github.com/apache/arrow/issues/38768
if item.start == item.stop:
pass
elif (
item.stop is not None
and item.stop < -len(self)
and item.step is not None
and item.step < 0
):
item = slice(item.start, None, item.step)

value = self._pa_array[item]
if isinstance(value, pa.ChunkedArray):
return type(self)(value)
else:
scalar = JSONArray._deserialize_json(value.as_py())
if scalar is None:
return self._dtype.na_value
else:
return scalar

def __iter__(self):
"""Iterate over elements of the array."""
for value in self._pa_array:
val = JSONArray._deserialize_json(value.as_py())
if val is None:
yield self._dtype.na_value
else:
yield val

def _reduce(
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
):
"""Return a scalar result of performing the reduction operation."""
if name in ["min", "max"]:
raise TypeError("JSONArray does not support min/max reducntion.")
super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
Loading
Loading