From d651395143cc4cadbb7d18fae2f98ef2354710ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Florian=20R=2E=20H=C3=B6lzlwimmer?=
Date: Mon, 31 Jan 2022 22:30:26 +0100
Subject: [PATCH] initial commit for StructDtype and StructArray

---
 pandas/core/arrays/__init__.py         |    3 +
 pandas/core/arrays/struct.py           | 1117 ++++++++++++++++++++++++
 pandas/tests/dtypes/test_structtype.py |  119 +++
 3 files changed, 1239 insertions(+)
 create mode 100644 pandas/core/arrays/struct.py
 create mode 100644 pandas/tests/dtypes/test_structtype.py

diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
index e301e82a0ee75..6ad38e9d3da51 100644
--- a/pandas/core/arrays/__init__.py
+++ b/pandas/core/arrays/__init__.py
@@ -20,6 +20,8 @@
 from pandas.core.arrays.string_arrow import ArrowStringArray
 from pandas.core.arrays.timedeltas import TimedeltaArray
 
+from pandas.core.arrays.struct import StructArray, StructDtype
+
 __all__ = [
     "ExtensionArray",
     "ExtensionOpsMixin",
@@ -37,5 +39,6 @@
     "period_array",
     "SparseArray",
     "StringArray",
+    "StructArray",
     "TimedeltaArray",
 ]
diff --git a/pandas/core/arrays/struct.py b/pandas/core/arrays/struct.py
new file mode 100644
index 0000000000000..86958e07e4744
--- /dev/null
+++ b/pandas/core/arrays/struct.py
@@ -0,0 +1,1117 @@
+from __future__ import annotations
+
+import logging
+import operator
+import re
+from collections import OrderedDict, namedtuple
+from copy import deepcopy
+from typing import Any, Dict, Iterable, List, NamedTuple, Sequence, Tuple, Type, TypeVar, Union
+
+import numpy as np
+
+import pandas as pd
+from pandas.api.extensions import (
+    ExtensionArray,
+    ExtensionDtype,
+    register_extension_dtype,
+)
+
+# from pandas.core.construction import extract_array
+
+try:
+    import pyarrow as pa
+except ImportError:
+    pa = None
+
+log = logging.getLogger(__name__)
+
+NoneType = type(None)
+
+StructArrayT = TypeVar("StructArrayT", bound="StructArray")
+
+
+__all__ = [
+    "StructDtype",
+    "StructArray",
+    # "ArrowStructType",
+]
+
+
+def _ZERO_VALUE_FOR_TYPE(dtype):
+    # convert to a pandas dtype first
+    dtype = pd.api.types.pandas_dtype(dtype)
+    if hasattr(dtype, "zero_value"):
+        return dtype.zero_value
+    elif (
+        pd.api.types.is_numeric_dtype(dtype)
+        or pd.api.types.is_datetime64_dtype(dtype)
+        or pd.api.types.is_timedelta64_dtype(dtype)
+        or pd.api.types.is_categorical_dtype(dtype)
+    ):
+        return 0
+    elif pd.api.types.is_bool_dtype(dtype):
+        return False
+    elif pd.api.types.is_string_dtype(dtype):
+        return ""
+    else:
+        raise ValueError(f"Unknown type: '{dtype}'")
+
+
+def _EMPTY_ARRAY(size, dtype):
+    # allocate `size` placeholder values that can be overwritten in-place
+    zero_value = _ZERO_VALUE_FOR_TYPE(dtype)
+    return pd.array([zero_value], dtype=dtype).repeat(size)
+
+
+def _INFER_DTYPE_FROM_SCALAR(value):
+    # this does not work:
+    # return pd.api.types.pandas_dtype(pd.api.types.infer_dtype([value]))
+
+    # ugly hack abusing pd.array:
+    return pd.array([value]).dtype
+
+
+# def _ARRAY_CLASS_FROM_TYPE(dtype):
+#     if pd.api.types.is_extension_type(dtype):
+#         cls = dtype.construct_array_type()
+#
+#     return cls
+
+
+def _CONCAT_SAME_TYPE_FN(dtype):
+    if hasattr(dtype, "construct_array_type"):
+        cls = dtype.construct_array_type()
+        if hasattr(cls, "_concat_same_type"):
+            return cls._concat_same_type
+
+    # fallback: concatenate as NumPy arrays and re-wrap in the target dtype
+    return lambda values: pd.array(np.concatenate(values), dtype=dtype)
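+
+
+# Illustrative behavior of the helpers above (a sketch; output assumes
+# pandas' nullable extension dtypes):
+#
+# >>> _ZERO_VALUE_FOR_TYPE("Int64")
+# 0
+# >>> _EMPTY_ARRAY(3, "Int64")
+# <IntegerArray>
+# [0, 0, 0]
+# Length: 3, dtype: Int64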
+
+
+@register_extension_dtype
+class StructDtype(ExtensionDtype):
+    name = "Struct"
+
+    kind = "O"
+    str = "|O08"
+    base = np.dtype("O")
+    num = 103
+    _metadata = ("fields", "nullable")
+    # e.g. "Struct([x: int64, y: int64], nullable=True)"
+    _match = re.compile(r"[Ss]truct\(\[(?P<fields>.+)\](,(?P<nullable>.*))?\)")
+    _cache: Dict[str, ExtensionDtype] = {}
+
+    _fields: OrderedDict[str, ExtensionDtype]
+    _type: NamedTuple
+
+    def __new__(cls, fields: Union[List[Tuple[str, Type]], Dict[str, Type], str], nullable=None):
+        if isinstance(fields, StructDtype):
+            return fields
+        elif pa is not None and isinstance(fields, pa.StructType):
+            pa_structtype: pa.StructType = fields
+            fields = OrderedDict()
+            for idx in range(pa_structtype.num_fields):
+                field = pa_structtype[idx]
+                fields[field.name] = field.type.to_pandas_dtype()
+        elif fields is None:
+            raise ValueError("Missing field types")
+        elif isinstance(fields, str):
+            m = cls._match.search(fields)
+            if m is None:
+                raise ValueError(f"Cannot construct a 'StructDtype' from '{fields}'")
+
+            # parse field types
+            parsed_fields = []
+            for field in m.group("fields").split(","):
+                s = field.split(":")
+                if len(s) != 2:
+                    raise ValueError(f"Wrong field description: {field}")
+                field_name: str = s[0].strip()
+                field_type = pd.api.types.pandas_dtype(s[1].strip())
+
+                parsed_fields.append((field_name, field_type))
+            fields = parsed_fields
+
+            # get nullability of the struct, e.g. from a trailing ", nullable=True"
+            nullable_str = m.group("nullable")
+            if nullable_str is not None:
+                nullable = nullable_str.split("=")[-1].strip() == "True"
+
+        # make sure that every field type is actually a pandas type
+        fields = OrderedDict(fields)
+        pd_type_fields = OrderedDict()
+        for f_name, f_type in fields.items():
+            pd_type_fields[f_name] = pd.api.types.pandas_dtype(f_type)
+        fields = pd_type_fields
+
+        # default for struct is nullable=True
+        if nullable is None:
+            nullable = True
+
+        # deduplication of dtype instances
+        cache_key = str(fields) + str(nullable)
+        try:
+            return cls._cache[cache_key]
+        except KeyError:
+            u = object.__new__(cls)
+            # set dtype properties
+            u._fields = fields
+            u._nullable = nullable
+            u._type = None
+
+            cls._cache[cache_key] = u
+            return u
+
+    @property
+    def type(self) -> Type[NamedTuple]:
+        if self._type is None:
+            # the scalar type of this struct: a namedtuple over the field names
+            self._type: NamedTuple = namedtuple("Struct", self.fields.keys())
+            self._type.dtype = self
+
+        return self._type
+
+    @property
+    def fields(self) -> OrderedDict[str, ExtensionDtype]:
+        """
+        The fields contained in this struct.
+        """
+        return self._fields
+
+    @property
+    def field_names(self):
+        return self._fields.keys()
+
+    @property
+    def field_dtypes(self):
+        return self._fields.values()
+
+    @property
+    def nullable(self):
+        return self._nullable
+
+    @property
+    def zero_value(self):
+        return self.type(*[_ZERO_VALUE_FOR_TYPE(x) for x in self.field_dtypes])
+
+    @classmethod
+    def construct_array_type(cls) -> Type["StructArray"]:
+        """
+        Return the array type associated with this dtype.
+
+        Returns
+        -------
+        type
+        """
+        return StructArray
+
+    def __str__(self) -> str:
+        field_str_reprs = [
+            f"{f_name}: {str(f_type)}" for f_name, f_type in self.fields.items()
+        ]
+        return f'struct([{", ".join(field_str_reprs)}], nullable={self.nullable})'
+
+    def __hash__(self) -> int:
+        # make myself hashable
+        return hash(str(self))
+
+    def __repr__(self) -> str:
+        return f"dtype('{str(self)}')"
+
+    def __from_arrow__(
+        self, array: Union["pa.Array", "pa.ChunkedArray"]
+    ) -> "StructArray":
+        """
+        Construct a StructArray from a pyarrow Array/ChunkedArray.
+        """
+        import pyarrow as pa
+
+        if isinstance(array, pa.Array):
+            chunks = [array]
+        else:
+            # pyarrow.ChunkedArray
+            chunks = array.chunks
+
+        results = []
+        for arr in chunks:
+            data = OrderedDict()
+            for f_name, f_type in self.fields.items():
+                pd_array = pd.array(arr.field(f_name), dtype=f_type)
+                data[f_name] = pd_array
+
+            if self.nullable:
+                # set missing values correctly
+                mask = ~np.asarray(arr.is_valid())
+            else:
+                mask = None
+
+            iarr = StructArray(data, mask, dtype=self)
+
+            results.append(iarr)
+
+        return StructArray._concat_same_type(results)
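+
+
+# Illustrative usage of StructDtype (a sketch; the string form follows
+# __str__ above):
+#
+# >>> dt = StructDtype({"x": "int64", "y": "float64"})
+# >>> str(dt)
+# 'struct([x: int64, y: float64], nullable=True)'
+# >>> # instances are cached, so parsing the string form yields the same object
+# >>> StructDtype("struct([x: int64, y: float64])") is dt
+# True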
+ """ + import pyarrow as pa + + if isinstance(array, pa.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + data = OrderedDict() + for f_name, f_type in self.fields: + pd_array = pd.array(arr.storage.field(f_name), dtype=f_type) + data[f_name] = pd_array + + if self.nullable: + # set missing values correctly + mask = ~ np.asarray(arr.is_valid()) + else: + mask = None + + iarr = StructArray(data, mask, dtype=self) + + results.append(iarr) + + return StructArray._concat_same_type(results) + + +# # x = StructDtype({"a": int, "b": int}) +# class ArrowStructType(pa.ExtensionType): +# def __init__(self, storage_type: pa.StructType): +# pa.ExtensionType.__init__(self, storage_type, "struct") +# +# def __arrow_ext_serialize__(self): +# # metadata = {"subtype": str(self.subtype), "closed": self.closed} +# # return json.dumps(metadata).encode() +# return b'{}' +# +# @classmethod +# def __arrow_ext_deserialize__(cls, storage_type: pa.StructType, serialized): +# # metadata = json.loads(serialized.decode()) +# # subtype = pyarrow.type_for_alias(metadata["subtype"]) +# # closed = metadata["closed"] +# return ArrowStructType(storage_type) +# +# def __eq__(self, other): +# if isinstance(other, pa.BaseExtensionType): +# return type(self) == type(other) +# else: +# return NotImplemented +# +# def __hash__(self): +# return hash(str(self)) +# +# def to_pandas_dtype(self): +# return StructDtype(self.storage_type) +# +# +# import pyarrow as pa +# # register the type with a dummy instance +# _arrow_variant_type = ArrowVariantType() +# pa.register_extension_type(_arrow_variant_type) + +class StructArray(ExtensionArray): + """ + TODO + Array representation of a list of Variant objects. + + Internally, stores the variants in a column-based representation. + + Attributes: + - chrom: array of chromosome names + - start: array of variant starts (0-based, inclusive) + - end: array of variant ends (1-based, exclusive) + - ref: array of reference sequences + - alt: array of alternative sequences + + In addition, can be converted from/to VCF-formatted (chrom, pos, ref, alt) representations. 
+ """ + _fields: OrderedDict[str, ExtensionArray] + "fields that this StructArray holds" + # _field_names: List[str] + # "ordered list of field names that this StructArray holds" + _mask: Union[NoneType, np.ndarray] + "boolean array that denotes missing values" + _dtype: StructDtype + "data type of this StructArray" + + na_value = pd.NA + + def __init__(self, data, mask=None, dtype=None, copy=False, validate=True): + if isinstance(data, StructArray): + if dtype is not None and dtype != data.dtype: + raise ValueError(f"Passed StructArray but dtypes do not match: '{dtype}' != '{data.dtype}'") + else: + dtype = data.dtype + + fields = data.fields + if dtype.nullable and mask is None: + mask = data._mask + + if copy: + fields = deepcopy(fields) + mask = deepcopy(mask) + elif isinstance(data, OrderedDict) and not copy: + fields = data + if dtype is None: + field_types = self._infer_field_dtypes_from_arrays(fields.values, names=fields.keys()) + nullable = (mask is not None) + dtype = StructDtype(field_types, nullable=nullable) + + if copy: + fields = deepcopy(fields) + mask = deepcopy(mask) + elif isinstance(data, dict) or isinstance(data, pd.DataFrame): + fields = OrderedDict([(k, pd.array(v, copy=copy)) for k, v in data.items()]) + if dtype is None: + field_types = self._infer_field_dtypes_from_arrays(fields.values, names=fields.keys()) + nullable = (mask is not None) + dtype = StructDtype(field_types, nullable=nullable) + + if copy: + # no need to copy fields as this will already be done by pd.array if necessary + mask = deepcopy(mask) + elif self._is_valid_scalar(data): + # if dtype is None: + # field_types = self._infer_field_dtypes_from_scalar(data) + # nullable = (mask is not None) + # dtype = StructDtype(field_types, nullable=nullable) + # fields = self._parse_scalar(data, dtype=dtype) + raise ValueError(f"Cannot pass scalar '{data}' to '{self}'.") + else: + # assuming list of scalars + if dtype is None: + dtype = self._infer_dtype_from_list_of_scalars(data) + + data = self._parse_listlike(data, dtype) + fields = data._fields + # no need to copy 'fields' since it is a copy anyways + + if mask is None: + mask = data._mask + else: + if copy: + # need to copy the specified mask + mask = deepcopy(mask) + + # TODO: Copy-on-write when copy=False? 
+
+    def __getattr__(self, item):
+        # NOTE: guard against recursion during construction/unpickling,
+        # before '_fields' exists
+        if item.startswith("_"):
+            raise AttributeError(item)
+        if item in self.fields:
+            return self.fields[item]
+        else:
+            raise AttributeError(item)
+
+    @property
+    def fields(self):
+        return self._fields
+
+    @property
+    def field_names(self):
+        return self._fields.keys()
+
+    @property
+    def field_values(self):
+        return self._fields.values()
+
+    @property
+    def field_types(self):
+        return self._dtype.fields
+
+    @classmethod
+    def _validate_dtypes(cls, data: Iterable[Tuple[str, Any]], dtype: Iterable[Tuple[str, Type]]):
+        """
+        Ensure correct array types.
+        """
+        return [
+            (k, pd.array(v, dtype=v_dtype))
+            for (k, v), (k2, v_dtype) in zip(data, dtype)
+        ]
+
+    def _filter(self, cond, inplace=False):
+        # handle scalar case
+        if pd.api.types.is_scalar(cond):
+            if not inplace:
+                if self.dtype.nullable and self._mask[cond]:
+                    return self.na_value
+                else:
+                    return self.dtype.type(*[v[cond] for v in self.field_values])
+            else:
+                cond = [cond]
+
+        if inplace:
+            retval = self
+        else:
+            retval = self.copy()
+
+        for f_name, f_data in retval._fields.items():
+            retval._fields[f_name] = f_data[cond]
+        if retval.dtype.nullable:
+            retval._mask = retval._mask[cond]
+
+        return retval
+
+    @classmethod
+    def _parse_listlike(cls, values: Iterable, dtype: StructDtype, validate=True) -> StructArrayT:
+        # list-like of structs
+        if not isinstance(values, list):
+            values = list(values)
+        length = len(values)
+
+        casted_values = [cls._parse_scalar(v, dtype) for v in values]
+
+        # NOTE: pd.isna() is applied elementwise to tuples, so only call it
+        # for values that are not struct-like
+        is_na = [not cls._is_valid_scalar(v) and bool(pd.isna(v)) for v in values]
+        if dtype.nullable:
+            mask = np.array(is_na, dtype=bool)
+        else:
+            # validate nullability
+            if validate and any(is_na):
+                raise ValueError(f"Data type '{dtype}' is not nullable but passed values contain NA!")
+
+            mask = None
+
+        # transpose List[Struct] to Tuple[Array]
+        fields_list = tuple(_EMPTY_ARRAY(size=length, dtype=d) for f, d in dtype.fields.items())
+        for idx, scalar in enumerate(casted_values):
+            for field_idx, scalar_field in enumerate(scalar):
+                fields_list[field_idx][idx] = scalar_field
+
+        data = OrderedDict()
+        for f, v in zip(dtype.field_names, fields_list):
+            data[f] = v
+
+        return cls(data, mask=mask, dtype=dtype, copy=False)
+
+    @classmethod
+    def _parse_scalar(cls, value, dtype: StructDtype) -> NamedTuple:
+        """
+        Cast a single value to the scalar type of `dtype`.
+
+        Args:
+            value: tuple, dict or NA-like scalar
+            dtype: target struct dtype
+
+        Returns: a `dtype.type` object, or `dtype.zero_value` if the element is NA
+        """
+        if isinstance(value, tuple):
+            retval = dtype.type(*value)
+        elif isinstance(value, dict):
+            retval = dtype.type(**value)
+        elif pd.isna(value):
+            retval = dtype.zero_value
+        else:
+            raise TypeError("can only parse Struct-like objects")
+
+        # TODO: validate types
+
+        return retval
+
+    # @classmethod
+    # def _parse_fill_value(cls, value) -> Tuple:
+    #     return cls._parse_scalar(value)
+
+    @classmethod
+    def _parse_setitem_value(cls, value, dtype: StructDtype):
+        if cls._is_valid_scalar(value):
+            return cls._parse_scalar(value, dtype)
+        else:
+            return cls._parse_listlike(value, dtype)
+
+    @staticmethod
+    def _is_valid_scalar(value):
+        return isinstance(value, (tuple, dict))
+
+    @classmethod
+    def _infer_field_dtypes_from_scalar(cls, scalar) -> OrderedDict[str, Union[np.dtype, ExtensionDtype]]:
+        if hasattr(scalar, "dtype"):
+            # scalar is already typed with some StructDtype
+            return scalar.dtype.fields
+
+        if isinstance(scalar, tuple) and hasattr(scalar, "_fields"):
+            # namedtuple
+            field_names = scalar._fields
+            values = scalar
+        elif isinstance(scalar, tuple):
+            # simply enumerate fields
+            field_names = [f"f_{idx}" for idx in range(len(scalar))]
+            values = scalar
+        elif isinstance(scalar, dict):
+            field_names = scalar.keys()
+            values = scalar.values()
+        else:
+            raise ValueError(f"Unable to infer field dtypes from '{scalar}'")
+
+        field_dtypes = OrderedDict()
+        for f, v in zip(field_names, values):
+            field_dtypes[str(f)] = _INFER_DTYPE_FROM_SCALAR(v)
+
+        return field_dtypes
+
+    @classmethod
+    def _infer_field_dtypes_from_arrays(cls, arrays, names=None) -> OrderedDict[str, Union[np.dtype, ExtensionDtype]]:
+        if names is not None:
+            names = list(names)
+
+        field_types = OrderedDict()
+        for field_idx, array in enumerate(arrays):
+            # use a pandas Series to infer the dtype
+            if not isinstance(array, pd.Series):
+                array = pd.Series(array)
+
+            if names is not None:
+                field_name = names[field_idx]
+            elif array.name is not None:
+                # if 'array' is already a Series, this also allows keeping its name as field name
+                field_name = array.name
+            else:
+                # just use the field index as name
+                field_name = f"f_{field_idx}"
+
+            # make sure that field_name is a string
+            field_name = str(field_name)
+            # get the dtype from the Series
+            field_dtype = array.dtype
+
+            if field_name in field_types:
+                raise ValueError(f"Duplicate field name: '{field_name}'!")
+
+            field_types[field_name] = field_dtype
+
+        return field_types
+
+    @classmethod
+    def _infer_dtype_from_list_of_scalars(cls, values: List[Tuple]) -> StructDtype:
+        if not isinstance(values, list):
+            values = list(values)
+
+        # NOTE: pd.isna() is applied elementwise to tuples, so only call it
+        # for values that are not struct-like
+        mask = np.array(
+            [not cls._is_valid_scalar(v) and bool(pd.isna(v)) for v in values],
+            dtype=bool,
+        )
+        nullable = bool(np.any(mask))
+
+        nonnull_values = [v for v, is_na in zip(values, mask) if not is_na]
+        df = pd.DataFrame.from_records(nonnull_values)
+
+        def _col_name(c):
+            # integer columns come from plain tuples; give them positional names
+            return f"f_{c}" if isinstance(c, int) else str(c)
+
+        return StructDtype(
+            [(_col_name(c), df.dtypes[c]) for c in df.columns],
+            nullable=nullable,
+        )
+
+    @classmethod
+    def from_arrays(
+        cls: type[StructArrayT],
+        list_of_fields,
+        dtype: StructDtype = None,
+        mask=None,
+        verify=True,
+        copy: bool = False,
+    ) -> StructArrayT:
+        if dtype is None:
+            field_types = cls._infer_field_dtypes_from_arrays(list_of_fields)
+            nullable = mask is not None
+
+            dtype = StructDtype(field_types, nullable=nullable)
+
+        data = OrderedDict()
+        for f, v in zip(dtype.field_names, list_of_fields):
+            data[f] = v
+
+        return cls(data=data, mask=mask, dtype=dtype, copy=copy)
+
+    @classmethod
+    def from_tuples(
+        cls: type[StructArrayT],
+        data: Iterable[Tuple],
+        dtype: StructDtype = None,
+    ) -> StructArrayT:
+        if dtype is None:
+            # infer dtype
+            if not isinstance(data, list):
+                data = list(data)
+            if len(data) == 0:
+                raise ValueError("Empty list specified!")
+
+            dtype = cls._infer_dtype_from_list_of_scalars(data)
+
+        return cls._parse_listlike(data, dtype=dtype)
+
+    @classmethod
+    def from_df(cls, df: pd.DataFrame, is_null_column="is_null", dtype: StructDtype = None) -> StructArrayT:
+        """
+        Creates a new StructArray from a DataFrame.
+
+        :param df: Pandas DataFrame with one column per struct field
+        :param is_null_column: name of the boolean column that marks missing structs
+        :param dtype: optional StructDtype
+        :return: StructArray
+        """
+        cols = df.columns
+
+        if dtype is not None:
+            if not isinstance(dtype, StructDtype):
+                raise ValueError(f"Invalid dtype: '{dtype}' is no instance of StructDtype")
+            nullable = dtype.nullable
+        else:
+            nullable = is_null_column in cols
+            dtype = StructDtype(
+                [(field_name, df.dtypes[field_name]) for field_name in cols if field_name != is_null_column],
+                nullable=nullable,
+            )
+
+        if nullable:
+            # validate the is_null column
+            if is_null_column not in cols:
+                raise ValueError(f"dtype is nullable but column '{is_null_column}' is missing!")
+            elif not pd.api.types.is_bool_dtype(df.dtypes[is_null_column]):
+                raise ValueError(f"dtype of '{is_null_column}' column is not boolean")
+
+            mask = np.asarray(df[is_null_column])
+        else:
+            mask = None
+
+        return StructArray(
+            data={k: df[k].astype(v) for k, v in dtype.fields.items()},
+            mask=mask,
+            dtype=dtype,
+            copy=False,
+        )
+
+    def as_frame(self, is_null_column="is_null"):
+        retval = pd.DataFrame({k: v for k, v in self.fields.items()})
+        if self.dtype.nullable:
+            if is_null_column in self.field_names:
+                raise ValueError(
+                    f"is_null_column='{is_null_column}' cannot be used since there is an equally-named struct field!"
+                )
+            retval[is_null_column] = self.isna()
+        return retval
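+
+    # Illustrative DataFrame round-trip (a sketch, continuing the
+    # construction example above; mirrors the accompanying tests):
+    #
+    # >>> df = arr.as_frame()          # one column per field, plus "is_null"
+    # >>> all(StructArray.from_df(df) == arr)
+    # True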
+
+    # ---------------------------------------------------------------------
+    # ExtensionArray interface
+
+    @classmethod
+    def _concat_same_type(
+        cls: Type[StructArrayT], to_concat: Sequence[StructArrayT]
+    ) -> StructArrayT:
+        """
+        Concatenate multiple arrays of this dtype.
+
+        Parameters
+        ----------
+        to_concat : sequence of this type
+
+        Returns
+        -------
+        ExtensionArray
+        """
+        dtype = to_concat[0].dtype
+        data = OrderedDict()
+        for fname, ftype in dtype.fields.items():
+            # get the concatenation function for this field's array type
+            concat_fn = _CONCAT_SAME_TYPE_FN(ftype)
+
+            # concatenate same types
+            data[fname] = concat_fn([v.fields[fname] for v in to_concat])
+
+        if dtype.nullable:
+            mask = np.concatenate([pd.isna(v) for v in to_concat])
+        else:
+            mask = None
+
+        return cls(data, mask=mask, dtype=dtype, copy=False)
+
+    @classmethod
+    def _from_sequence(
+        cls: type[StructArrayT],
+        scalars,
+        *,
+        dtype=None,
+        copy: bool = False,
+    ) -> StructArrayT:
+        """
+        Construct a new ExtensionArray from a sequence of scalars.
+
+        Parameters
+        ----------
+        scalars : Sequence
+            Each element will be an instance of the scalar type for this
+            array, ``cls.dtype.type``.
+        dtype : dtype, optional
+            Construct for this particular dtype. This should be a Dtype
+            compatible with the ExtensionArray.
+        copy : bool, default False
+            If True, copy the underlying data.
+
+        Returns
+        -------
+        ExtensionArray
+        """
+        if isinstance(scalars, cls) and (dtype is None or scalars.dtype == dtype):
+            return cls(scalars, dtype=dtype, copy=copy)
+
+        if dtype is None:
+            dtype = cls._infer_dtype_from_list_of_scalars(scalars)
+        return cls._parse_listlike(scalars, dtype=dtype)
+
+    def _values_for_factorize(self):
+        # type: () -> Tuple[np.ndarray, Any]
+        """
+        Return an array and missing value suitable for factorization.
+
+        Returns
+        -------
+        values : ndarray
+            An array suitable for factorization. This should maintain order
+            and be a supported dtype (Float64, Int64, UInt64, String, Object).
+            By default, the extension array is cast to object dtype.
+        na_value : object
+            The value in `values` to consider missing. This will be treated
+            as NA in the factorization routines, so it will be coded as
+            `na_sentinel` and not included in `uniques`. By default,
+            ``np.nan`` is used.
+
+        Notes
+        -----
+        The values returned by this method are also used in
+        :func:`pandas.util.hash_pandas_object`.
+        """
+        return np.asarray(self), self.na_value
+
+    @classmethod
+    def _from_factorized(
+        cls: type[StructArrayT], values: np.ndarray, original: StructArrayT
+    ) -> StructArrayT:
+        """
+        Reconstruct an ExtensionArray after factorization.
+
+        Parameters
+        ----------
+        values : ndarray
+            An integer ndarray with the factorized values.
+        original : ExtensionArray
+            The original ExtensionArray that factorize was called on.
+
+        See Also
+        --------
+        factorize : Top-level factorize method that dispatches here.
+        ExtensionArray.factorize : Encode the extension array as an enumerated type.
+        """
+        return cls._from_sequence(values, dtype=original.dtype)
+
+    @classmethod
+    def _from_sequence_of_strings(cls, strings, *, dtype=None, copy=False):
+        # NOTE: there is no string format defined for structs (yet),
+        # so parsing from strings is not supported
+        raise NotImplementedError("Cannot construct a 'StructArray' from strings")
+
+    def __array__(self, dtype=None) -> np.ndarray:
+        """
+        Return the StructArray's data as a numpy array of struct scalars
+        (with dtype='object').
+        """
+        mask = self.isna()
+
+        result = np.empty(len(self), dtype=object)
+        for i in range(len(self)):
+            if mask[i]:
+                result[i] = pd.NA
+            else:
+                result[i] = self.take(i)
+
+        if dtype is not None:
+            result = result.astype(dtype)
+        return result
+
+    def __arrow_array__(self, type=None):
+        """
+        Convert myself into a pyarrow Array.
+        """
+        import pyarrow as pa
+
+        storage_array = pa.StructArray.from_arrays(
+            arrays=[pa.array(f, from_pandas=True) for f in self.field_values],
+            names=list(self.field_names),
+        )
+
+        mask = self.isna()
+        if mask.any():
+            # if there are missing values, set the validity bitmap also on the array level
+            null_bitmap = pa.array(~mask).buffers()[1]
+            storage_array = pa.StructArray.from_buffers(
+                storage_array.type,
+                len(storage_array),
+                [null_bitmap],
+                children=[storage_array.field(i) for i in range(len(self.fields))],
+            )
+
+        if type is not None:
+            if type.equals(storage_array.type):
+                return storage_array
+            else:
+                raise TypeError(f"Not supported to convert StructArray to '{type}' type")
+
+        return storage_array
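+
+    # Illustrative pyarrow interop (a sketch; requires pyarrow and continues
+    # the example above):
+    #
+    # >>> import pyarrow as pa
+    # >>> pa_arr = pa.array(arr)       # dispatches to __arrow_array__
+    # >>> pa.types.is_struct(pa_arr.type)
+    # True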
+ """ + return StructArray(self, copy=True) + + @property + def dtype(self) -> StructDtype: + return self._dtype + + @property + def nbytes(self) -> int: + size = sum(f.nbytes for f in self.fields.values()) + if self.dtype.nullable: + size += self._mask.nbytes + return size + + @property + def size(self) -> int: + # Avoid materializing self.values + return self._fields.values()[0].size + + def __iter__(self): + return iter(np.asarray(self)) + + def __len__(self) -> int: + return len(next(iter(self.fields.values()))) + + def __getitem__(self, key): + key = pd.api.indexers.check_array_indexer(self, key) + return self._filter(key) + + def __setitem__(self, key, value): + key = pd.api.indexers.check_array_indexer(self, key) + value_array = self._parse_setitem_value(value, self.dtype) + + for f, v in value_array.fields.items(): + self._fields[f][key] = v + + def take(self, indices, allow_fill=False, fill_value=None): + from pandas.api.extensions import take + + if allow_fill: + if fill_value is None or pd.isna(fill_value): + fill_value = None + elif self._is_valid_scalar(fill_value): + fill_value = self._parse_scalar(fill_value, dtype=self.dtype) + else: + raise TypeError(f"'{type(fill_value)}' is not a valid fill value!") + + # scalar case + if pd.api.types.is_scalar(indices): + if self._mask[indices]: # missing value at 'indices' + if allow_fill and fill_value is not None: + return fill_value + else: + return self.na_value + else: + # return Struct of this type + return self.dtype.type(*[v[indices] for v in self.field_values]) + + fields = OrderedDict() + for f, v in self.fields.items(): + field_fill_value = fill_value[f] if fill_value is not None else None + fields[f] = take(v, indices, allow_fill=allow_fill, fill_value=field_fill_value) + + if self.dtype.nullable: + mask = take(self._mask, indices) + else: + mask = None + + return type(self)(fields, mask=mask, dtype=self.dtype, copy=False) + + def isna(self): + """ + Returns boolean NumPy array indicating if eachvalue is missing + """ + return self._mask + + def unique(self): + """ + Compute the ExtensionArray of unique values. 
+
+    def unique(self):
+        """
+        Compute the ExtensionArray of unique values.
+
+        Returns
+        -------
+        uniques : ExtensionArray
+        """
+        # TODO: test alternatives, e.g.:
+        #   uniques = StructArray(self.as_frame().drop_duplicates())
+
+        factors, uniques = pd.factorize(self)
+        if np.any(factors < 0):
+            # NA values are coded as -1 by factorize; re-append a single NA
+            uniques = self._concat_same_type(
+                [uniques, StructArray._from_sequence([self.na_value], dtype=self.dtype)]
+            )
+
+        return uniques
+
+    def astype(self, dtype, copy=True):
+        dtype = pd.api.types.pandas_dtype(dtype)
+
+        # NOTE: check for equality first; is_string_dtype() also matches
+        # object-kind dtypes such as StructDtype itself
+        if dtype == self.dtype:
+            if copy:
+                return self.copy()
+            else:
+                return self
+        elif pd.api.types.is_string_dtype(dtype):
+            # there is no dedicated string format; fall back to str() of the scalars
+            return pd.array(
+                [None if v is pd.NA else str(v) for v in np.asarray(self)],
+                dtype=dtype,
+            )
+        else:
+            raise TypeError(f"Cannot cast '{self.dtype}' to '{dtype}'")
+        # return super().astype(dtype, copy)
+
+    def min(self, axis=None, skipna: bool = True, **kwargs) -> NamedTuple:
+        raise NotImplementedError()
+
+    def max(self, axis=None, skipna: bool = True, **kwargs) -> NamedTuple:
+        raise NotImplementedError()
+
+    def value_counts(self, dropna=False):
+        return pd.value_counts(np.asarray(self), dropna=dropna).astype("Int64")
+
+    def _cmp_method(self, other, op, fail_on_missing=True):
+        # ensure pandas array for list-likes and eliminate non-compatible scalars
+        if self._is_valid_scalar(other):
+            other = self._parse_scalar(other, dtype=self.dtype)
+            # directly use 'other' as iterable tuple
+            other_field_values = other
+            # a successfully parsed scalar is never missing
+            other_isna = np.array(False)
+        else:
+            # ensure same dtype
+            other = pd.array(other, dtype=self.dtype)
+
+            if len(self) != len(other):
+                raise ValueError("Lengths must match to compare")
+
+            other_field_values = other.field_values
+            other_isna = pd.isna(other)
+
+        if op is operator.eq or op is operator.ne:
+            is_eq = np.array(True)
+            for this_f, other_f in zip(self.field_values, other_field_values):
+                # an element is equal if all of its fields are equal.
+                # Has to be cast to a NumPy boolean array; otherwise we get
+                # `TypeError: unsupported operand type(s) for &: 'PandasArray' and 'PandasArray'`
+                # TODO: switch to pandas boolean arrays once these support all binary ops
+                is_eq = np.asarray(this_f == other_f) & is_eq
+
+            if self.dtype.nullable:
+                # return True if both values are NA
+                is_eq = is_eq | (pd.isna(self) & other_isna)
+
+            if op is operator.ne:
+                return ~is_eq
+            else:
+                return is_eq
+        elif op in {operator.gt, operator.ge, operator.le, operator.lt}:
+            # lexicographic comparison over the fields, in order
+            prev_is_eq = np.array(True)
+            self_is_gt_other = np.array(False)
+            for this_f, other_f in zip(self.field_values, other_field_values):
+                # TODO: switch to pandas boolean arrays once these support all binary ops
+                cur_is_eq = np.asarray(this_f == other_f)
+                cur_is_gt = np.asarray(this_f > other_f)
+
+                # self > other if the first non-equal field is greater
+                self_is_gt_other = (cur_is_gt & prev_is_eq) | self_is_gt_other
+                # all fields up to and including the current one are equal
+                prev_is_eq = prev_is_eq & cur_is_eq
+            if op is operator.gt:
+                retval = self_is_gt_other
+            elif op is operator.ge:
+                retval = self_is_gt_other | prev_is_eq
+            elif op is operator.lt:
+                retval = ~(self_is_gt_other | prev_is_eq)
+            else:  # op is operator.le
+                retval = ~self_is_gt_other
+
+            if self.dtype.nullable:
+                any_of_both_missing = pd.isna(self) | other_isna
+                if fail_on_missing and np.any(any_of_both_missing):
+                    raise TypeError("boolean value of NA is ambiguous")
+
+                # return False if any of the two values is NA
+                return retval & ~any_of_both_missing
+            else:
+                return retval
+        else:
+            raise ValueError(f"Unknown op {op}")
+
+    _arith_method = _cmp_method
+
+    # @unpack_zerodim_and_defer("__eq__")
+    def __eq__(self, other):
+        return self._cmp_method(other, operator.eq)
+
+    # @unpack_zerodim_and_defer("__ne__")
+    def __ne__(self, other):
+        return self._cmp_method(other, operator.ne)
+
+    # @unpack_zerodim_and_defer("__gt__")
+    def __gt__(self, other):
+        return self._cmp_method(other, operator.gt)
+
+    # @unpack_zerodim_and_defer("__ge__")
+    def __ge__(self, other):
+        return self._cmp_method(other, operator.ge)
+
+    # @unpack_zerodim_and_defer("__lt__")
+    def __lt__(self, other):
+        return self._cmp_method(other, operator.lt)
+
+    # @unpack_zerodim_and_defer("__le__")
+    def __le__(self, other):
+        return self._cmp_method(other, operator.le)
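+
+    # Comparisons are lexicographic over the fields (a sketch; mirrors the
+    # accompanying tests):
+    #
+    # >>> x = StructArray([(0, 3), (1, 2)], dtype=StructDtype({"x": "int64", "y": "int64"}))
+    # >>> (x < (1, 0)).tolist()
+    # [True, False]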
+
+    # ---------------------------------------------------------------------
+    # Rendering Methods
+
+    def _format_data(self) -> str:
+        n = len(self)
+
+        max_seq_items = min((pd.get_option("display.max_seq_items") or n) // 10, 10)
+
+        formatter = str
+
+        if n == 0:
+            summary = "[]"
+        elif n == 1:
+            first = formatter(self[0])
+            summary = f"[{first}]"
+        elif n == 2:
+            first = formatter(self[0])
+            last = formatter(self[-1])
+            summary = f"[{first}, {last}]"
+        else:
+            if n > max_seq_items:
+                n = min(max_seq_items // 2, 10)
+                head = [formatter(x) for x in self[:n]]
+                tail = [formatter(x) for x in self[-n:]]
+                head_str = ", ".join(head)
+                tail_str = ", ".join(tail)
+                summary = f"[{head_str} ... {tail_str}]"
+            else:
+                tail = [formatter(x) for x in self]
+                tail_str = ", ".join(tail)
+                summary = f"[{tail_str}]"
+
+        return summary
+
+    def __repr__(self) -> str:
+        # the short repr has no trailing newline, while the truncated
+        # repr does. So we include a newline in our template, and strip
+        # any trailing newlines from format_object_summary
+        data = self._format_data()
+        class_name = f"<{type(self).__name__}>\n"
+
+        template = f"{class_name}{data}\nLength: {len(self)}, dtype: {self.dtype}"
+        return template
+
+    def _format_space(self) -> str:
+        space = " " * (len(type(self).__name__) + 1)
+
+        return f"\n{space}"
+
+    def __str__(self):
+        return self.__repr__()
diff --git a/pandas/tests/dtypes/test_structtype.py b/pandas/tests/dtypes/test_structtype.py
new file mode 100644
index 0000000000000..8c6214922b7a3
--- /dev/null
+++ b/pandas/tests/dtypes/test_structtype.py
@@ -0,0 +1,119 @@
+import numpy as np
+import pandas as pd
+
+from pandas.core.arrays import StructArray, StructDtype
+
+
+def test_structdtype():
+    var_dtype = StructDtype({
+        "chrom": "string",
+        "start": "int32",
+        "end": "int32",
+        "ref": "string",
+        "alt": "string",
+    }, nullable=True)
+    str(var_dtype)
+
+    var_scalar_type = var_dtype.type
+
+    var1 = var_scalar_type("chr1", 10, 11, "A", "G")
+    var2 = var_scalar_type("chr1", 12, 15, "AAT", "C")
+
+    assert tuple(var1) == var1
+    assert var2 == (
+        var2.chrom,
+        var2.start,
+        var2.end,
+        var2.ref,
+        var2.alt,
+    )
+
+    # test array
+    var_array = StructArray([
+        var1,
+        ("chr1", 10, 11, "A", "G"),
+        var2,
+        None,
+    ], dtype=var_dtype)
+    list(var_array)
+
+    assert len(var_array) == 4
+    assert all(var_array[:2] == StructArray([var1, var1]))
+
+    scalar = var_array[0]
+    assert isinstance(scalar, var_scalar_type)
+    assert scalar == var1
+
+    assert all(var_array.isna() == [False, False, False, True])
+    assert all(
+        (np.asarray(var_array) == np.array([var1, var1, var2, None], dtype=object))
+        | pd.isna(var_array)
+    )
+
+    # TODO: proper test values
+    assert all(
+        StructArray.from_df(
+            var_array.as_frame()
+        ) == var_array
+    )
+
+    assert len(var_array.unique()) == 3
+    assert set(var_array.unique()) == {var1, var2, pd.NA}
+
+    # don't explicitly check the repr, only test that it works
+    str(var_array)
+
+    assert all(pd.array(var_array) == var_array)
+
+    # # test pyarrow
+    # arrow_array = var_array.__arrow_array__()
+    # arrow_array.to_pandas()
+
+
+def test_structarray():
+    dtype = StructDtype({"x": "int", "y": "float"}, nullable=True)
+    x = StructArray([(0, 3), (1, 2)], dtype=dtype)
+
+    assert type(x[0]) == dtype.type
+    assert x[0] == (0, 3)
+
+    assert all(StructArray.from_df(x.as_frame()) == x)
+
+    # x == y
+    assert all((x == (0, 1)) == [False, False])
+    assert all((x == (0, 3)) == [True, False])
+    # x != y
+    assert all((x != (0, 1)) == [True, True])
+    assert all((x != (0, 3)) == [False, True])
+    # x < y
+    assert all((x < (0, 3)) == [False, False])
+    assert all((x < (0, 5)) == [True, False])
+    assert all((x < (3, 5)) == [True, True])
+    # x <= y
+    assert all((x <= (0, 3)) == [True, False])
+    assert all((x <= (0, 5)) == [True, False])
+    assert all((x <= (3, 5)) == [True, True])
+    # x >= y
+    assert all((x >= (0, 3)) == [True, True])
+    assert all((x >= (0, 5)) == [False, True])
+    assert all((x >= (3, 5)) == [False, False])
+    # x > y
+    assert all((x > (0, 3)) == [False, True])
+    assert all((x > (0, 5)) == [False, True])
+    assert all((x > (3, 5)) == [False, False])
+
+    # test type inference
+    assert pd.api.types.is_dtype_equal(
+        StructArray([(0, 3), (1, 2.), None]).dtype,
+        StructDtype({"f_0": "int64", "f_1": "float64"}, nullable=True),
+    )
+    assert not pd.api.types.is_dtype_equal(
+        StructArray([(0, 3), (1, 2.), None]).dtype,
+        StructDtype({"f_0": "int64", "f_1": "float64"}, nullable=False),
+    )
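+
+
+def test_structarray_take_sketch():
+    # illustrative sketch (not part of the original test set): `take` with
+    # allow_fill and a struct-valued fill_value
+    dtype = StructDtype({"x": "int64", "y": "float64"}, nullable=True)
+    x = StructArray([(0, 3.0), (1, 2.0)], dtype=dtype)
+
+    filled = x.take([0, -1], allow_fill=True, fill_value=(9, 9.0))
+    assert filled[0] == (0, 3.0)
+    assert filled[1] == (9, 9.0)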