From 238a44e1f54b768a520bb4c0213cc93c6d2e277f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 13 Apr 2022 21:54:20 -0700 Subject: [PATCH 1/4] ENH: Create BaseArrowDtype & NumericArrowDtype --- pandas/core/arrays/arrow/base.py | 30 +++++++++++++ pandas/core/arrays/arrow/numeric.py | 68 +++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 pandas/core/arrays/arrow/base.py create mode 100644 pandas/core/arrays/arrow/numeric.py diff --git a/pandas/core/arrays/arrow/base.py b/pandas/core/arrays/arrow/base.py new file mode 100644 index 0000000000000..b9e795cc09a50 --- /dev/null +++ b/pandas/core/arrays/arrow/base.py @@ -0,0 +1,30 @@ +from __future__ import annotations + +import numpy as np +import pyarrow as pa + +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.dtypes import BaseMaskedDtype + + +class BaseArrowDtype(BaseMaskedDtype): + """ + Base class for dtypes for BaseArrowArray subclasses. + """ + + type: pa.DataType + na_value = pa.NA + + @cache_readonly + def numpy_dtype(self) -> np.dtype: + """Return an instance of our numpy dtype""" + return self.type.to_pandas_dtype() + + @classmethod + def from_numpy_dtype(cls, dtype: np.dtype) -> BaseArrowDtype: + """ + Construct the ArrowDtype corresponding to the given numpy dtype. + """ + # TODO: Fill when the other ArrowDtyes are created + raise NotImplementedError(dtype) diff --git a/pandas/core/arrays/arrow/numeric.py b/pandas/core/arrays/arrow/numeric.py new file mode 100644 index 0000000000000..d7872910ac7c9 --- /dev/null +++ b/pandas/core/arrays/arrow/numeric.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from typing import ( + Any, + Callable, +) + +import numpy as np +import pyarrow as pa + +from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly + +from pandas.core.arrays.arrow.base import BaseArrowDtype +from pandas.core.arrays.masked import BaseMaskedArray + + +class NumericArrowDtype(BaseArrowDtype): + + _default_pa_dtype: pa.DataType + _checker: Callable[[Any], bool] # is_foo_dtype + + @cache_readonly + def is_signed_integer(self) -> bool: + return pa.types.is_signed_integer(self.type) + + @cache_readonly + def is_unsigned_integer(self) -> bool: + return pa.types.is_unsigned_integer(self.type) + + @property + def _is_numeric(self) -> bool: + return True + + def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> BaseMaskedArray: + """ + Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray. + """ + array_class = self.construct_array_type() + return array_class(array) + + @classmethod + def _str_to_dtype_mapping(cls): + raise AbstractMethodError(cls) + + # TODO: Might not need this for pyarrow. Only used in _coerce_to_data_and_mask + # which is easily retrievable from pa.ChunkedArray + @classmethod + def _standardize_dtype(cls, dtype) -> NumericArrowDtype: + """ + Convert a string representation or a pyarrow dtype to NumericArrowDtype. + """ + if not issubclass(type(dtype), cls): + mapping = cls._str_to_dtype_mapping() + try: + dtype = mapping[str(pa.type_for_alias(dtype))] + except KeyError as err: + raise ValueError(f"invalid dtype specified {dtype}") from err + return dtype + + @classmethod + def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: + """ + Safely cast the values to the given dtype. + + "safe" in this context means the casting is lossless. + """ + raise AbstractMethodError(cls) From f681d5e36e6736747735520d0b8fabd3b9a537fd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 17 Apr 2022 21:05:59 -0700 Subject: [PATCH 2/4] Simplfy inheritance --- pandas/core/arrays/arrow/base.py | 69 ++++++++++++++++++++++++++--- pandas/core/arrays/arrow/numeric.py | 68 ---------------------------- 2 files changed, 64 insertions(+), 73 deletions(-) delete mode 100644 pandas/core/arrays/arrow/numeric.py diff --git a/pandas/core/arrays/arrow/base.py b/pandas/core/arrays/arrow/base.py index b9e795cc09a50..4f7b1eb288999 100644 --- a/pandas/core/arrays/arrow/base.py +++ b/pandas/core/arrays/arrow/base.py @@ -3,28 +3,87 @@ import numpy as np import pyarrow as pa +from pandas._typing import DtypeObj from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.dtypes import BaseMaskedDtype +from pandas.core.dtypes.base import StorageExtensionDtype +from pandas.core.arrays.arrow import ArrowExtensionArray -class BaseArrowDtype(BaseMaskedDtype): + +class ArrowDtype(StorageExtensionDtype): """ Base class for dtypes for BaseArrowArray subclasses. + Modeled after BaseMaskedDtype """ + name: str + base = None type: pa.DataType + na_value = pa.NA + def __init__(self, storage="pyarrow") -> None: + super().__init__(storage) + @cache_readonly def numpy_dtype(self) -> np.dtype: - """Return an instance of our numpy dtype""" + """Return an instance of the related numpy dtype""" return self.type.to_pandas_dtype() + @cache_readonly + def kind(self) -> str: + return self.numpy_dtype.kind + + @cache_readonly + def itemsize(self) -> int: + """Return the number of bytes in this dtype""" + return self.numpy_dtype.itemsize + @classmethod - def from_numpy_dtype(cls, dtype: np.dtype) -> BaseArrowDtype: + def construct_array_type(cls): + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return ArrowExtensionArray + + @classmethod + def from_numpy_dtype(cls, dtype: np.dtype) -> ArrowDtype: """ Construct the ArrowDtype corresponding to the given numpy dtype. """ - # TODO: Fill when the other ArrowDtyes are created + # TODO: This may be incomplete + pa_dtype = pa.from_numpy_dtype(dtype) + if pa_dtype is cls.type: + return cls() raise NotImplementedError(dtype) + + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + # We unwrap any masked dtypes, find the common dtype we would use + # for that, then re-mask the result. + from pandas.core.dtypes.cast import find_common_type + + new_dtype = find_common_type( + [ + dtype.numpy_dtype if isinstance(dtype, ArrowDtype) else dtype + for dtype in dtypes + ] + ) + if not isinstance(new_dtype, np.dtype): + # If we ever support e.g. Masked[DatetimeArray] then this will change + return None + try: + return type(self).from_numpy_dtype(new_dtype) + except (KeyError, NotImplementedError): + return None + + def __from_arrow__(self, array: pa.Array | pa.ChunkedArray): + """ + Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray. + """ + array_class = self.construct_array_type() + return array_class(array) diff --git a/pandas/core/arrays/arrow/numeric.py b/pandas/core/arrays/arrow/numeric.py deleted file mode 100644 index d7872910ac7c9..0000000000000 --- a/pandas/core/arrays/arrow/numeric.py +++ /dev/null @@ -1,68 +0,0 @@ -from __future__ import annotations - -from typing import ( - Any, - Callable, -) - -import numpy as np -import pyarrow as pa - -from pandas.errors import AbstractMethodError -from pandas.util._decorators import cache_readonly - -from pandas.core.arrays.arrow.base import BaseArrowDtype -from pandas.core.arrays.masked import BaseMaskedArray - - -class NumericArrowDtype(BaseArrowDtype): - - _default_pa_dtype: pa.DataType - _checker: Callable[[Any], bool] # is_foo_dtype - - @cache_readonly - def is_signed_integer(self) -> bool: - return pa.types.is_signed_integer(self.type) - - @cache_readonly - def is_unsigned_integer(self) -> bool: - return pa.types.is_unsigned_integer(self.type) - - @property - def _is_numeric(self) -> bool: - return True - - def __from_arrow__(self, array: pa.Array | pa.ChunkedArray) -> BaseMaskedArray: - """ - Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray. - """ - array_class = self.construct_array_type() - return array_class(array) - - @classmethod - def _str_to_dtype_mapping(cls): - raise AbstractMethodError(cls) - - # TODO: Might not need this for pyarrow. Only used in _coerce_to_data_and_mask - # which is easily retrievable from pa.ChunkedArray - @classmethod - def _standardize_dtype(cls, dtype) -> NumericArrowDtype: - """ - Convert a string representation or a pyarrow dtype to NumericArrowDtype. - """ - if not issubclass(type(dtype), cls): - mapping = cls._str_to_dtype_mapping() - try: - dtype = mapping[str(pa.type_for_alias(dtype))] - except KeyError as err: - raise ValueError(f"invalid dtype specified {dtype}") from err - return dtype - - @classmethod - def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: - """ - Safely cast the values to the given dtype. - - "safe" in this context means the casting is lossless. - """ - raise AbstractMethodError(cls) From 3fbbdba4297302857ffb4044c876fbe4c67cdd69 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 30 Apr 2022 22:59:12 -0700 Subject: [PATCH 3/4] add construct from string --- pandas/core/arrays/arrow/base.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/core/arrays/arrow/base.py b/pandas/core/arrays/arrow/base.py index 4f7b1eb288999..c0ecb0856f27f 100644 --- a/pandas/core/arrays/arrow/base.py +++ b/pandas/core/arrays/arrow/base.py @@ -51,6 +51,23 @@ def construct_array_type(cls): """ return ArrowExtensionArray + @classmethod + def construct_from_string(cls, string: str): + """ + Construct this type from a string. + + Parameters + ---------- + string : str + """ + if not isinstance(string, str): + raise TypeError( + f"'construct_from_string' expects a string, got {type(string)}" + ) + if string == f"{cls.name}[pyarrow]": + return cls(storage="pyarrow") + raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") + @classmethod def from_numpy_dtype(cls, dtype: np.dtype) -> ArrowDtype: """ From e4aa49cd20bb0c18815e2b24db82806bb401b7ac Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 30 Apr 2022 23:02:08 -0700 Subject: [PATCH 4/4] Rename --- pandas/core/arrays/arrow/{base.py => dtype.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pandas/core/arrays/arrow/{base.py => dtype.py} (100%) diff --git a/pandas/core/arrays/arrow/base.py b/pandas/core/arrays/arrow/dtype.py similarity index 100% rename from pandas/core/arrays/arrow/base.py rename to pandas/core/arrays/arrow/dtype.py