Skip to content

WIP: ENH Add int[pyarrow] dtype #46972

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@

from pandas.core.api import (
# dtype
Int8ArrowDtype,
Int16ArrowDtype,
Int32ArrowDtype,
Int64ArrowDtype,
UInt8ArrowDtype,
UInt16ArrowDtype,
UInt32ArrowDtype,
UInt64ArrowDtype,
Int8Dtype,
Int16Dtype,
Int32Dtype,
Expand Down
10 changes: 10 additions & 0 deletions pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,16 @@
value_counts,
)
from pandas.core.arrays import Categorical
from pandas.core.arrays.arrow.integer import (
Int8ArrowDtype,
Int16ArrowDtype,
Int32ArrowDtype,
Int64ArrowDtype,
UInt8ArrowDtype,
UInt16ArrowDtype,
UInt32ArrowDtype,
UInt64ArrowDtype,
)
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.floating import (
Float32Dtype,
Expand Down
150 changes: 150 additions & 0 deletions pandas/core/arrays/arrow/integer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
from __future__ import annotations

import pyarrow as pa

from pandas.core.dtypes.base import register_extension_dtype

from pandas.core.arrays.arrow.numeric import (
NumericArrowArray,
NumericArrowDtype,
)


class IntegerArrowDtype(NumericArrowDtype):
    """
    Common base for the pyarrow-backed integer extension dtypes.

    This class is not meant to be used directly. Each concrete subclass
    (for example ``Int8ArrowDtype`` for signed 8-bit integers) pins a single
    size and kind of integer by setting its own ``name`` and ``type``
    class attributes.
    """

    # Predicate applied to a pyarrow type to decide whether it is an integer.
    _dtype_checker = pa.types.is_integer
    # pyarrow type associated with this family when none is requested.
    _default_pa_dtype = pa.int64()

    @classmethod
    def _str_to_dtype_mapping(cls):
        """Return the module-level ``INT_STR_TO_DTYPE`` lookup table."""
        return INT_STR_TO_DTYPE

    @classmethod
    def construct_array_type(cls) -> type[IntegerArrowArray]:
        """
        Return the array class paired with this dtype.

        Returns
        -------
        type
            ``IntegerArrowArray``.
        """
        return IntegerArrowArray


class IntegerArrowArray(NumericArrowArray):
    """
    Extension array holding pyarrow integer values.

    To construct an IntegerArrowArray from generic array-like input, use
    :func:`pandas.array` with one of the integer pyarrow dtypes.

    Parameters
    ----------
    values : pa.ChunkedArray
        A one-dimensional chunked array with an integer pyarrow type.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    IntegerArrowArray
    """

    # Dtype family this array class is paired with.
    _dtype_cls = IntegerArrowDtype


# Shared docstring template: each concrete dtype class below fills in
# ``{dtype}`` via ``str.format`` and assigns the result to its ``__doc__``.
_dtype_docstring = """
An ExtensionDtype for {dtype} integer pyarrow data.

Attributes
----------
None

Methods
-------
None
"""

# create the Dtype


@register_extension_dtype
class Int8ArrowDtype(IntegerArrowDtype):
    # Docstring is rendered from the shared module-level template.
    __doc__ = _dtype_docstring.format(dtype="int8")
    name = "int8"
    type = pa.int8()


@register_extension_dtype
class Int16ArrowDtype(IntegerArrowDtype):
    # Docstring is rendered from the shared module-level template.
    __doc__ = _dtype_docstring.format(dtype="int16")
    name = "int16"
    type = pa.int16()


@register_extension_dtype
class Int32ArrowDtype(IntegerArrowDtype):
    # Docstring is rendered from the shared module-level template.
    __doc__ = _dtype_docstring.format(dtype="int32")
    name = "int32"
    type = pa.int32()


@register_extension_dtype
class Int64ArrowDtype(IntegerArrowDtype):
    # Docstring is rendered from the shared module-level template.
    __doc__ = _dtype_docstring.format(dtype="int64")
    name = "int64"
    type = pa.int64()


@register_extension_dtype
class UInt8ArrowDtype(IntegerArrowDtype):
    # Docstring is rendered from the shared module-level template.
    __doc__ = _dtype_docstring.format(dtype="uint8")
    name = "uint8"
    type = pa.uint8()


@register_extension_dtype
class UInt16ArrowDtype(IntegerArrowDtype):
    # Docstring is rendered from the shared module-level template.
    __doc__ = _dtype_docstring.format(dtype="uint16")
    name = "uint16"
    type = pa.uint16()


@register_extension_dtype
class UInt32ArrowDtype(IntegerArrowDtype):
    # Docstring is rendered from the shared module-level template.
    __doc__ = _dtype_docstring.format(dtype="uint32")
    name = "uint32"
    type = pa.uint32()


@register_extension_dtype
class UInt64ArrowDtype(IntegerArrowDtype):
    # Docstring is rendered from the shared module-level template.
    __doc__ = _dtype_docstring.format(dtype="uint64")
    name = "uint64"
    type = pa.uint64()


# Lookup table from dtype name to a ready-made dtype instance, one entry per
# concrete integer dtype class, in ascending width with signed types first.
INT_STR_TO_DTYPE: dict[str, IntegerArrowDtype] = {
    dtype_cls.name: dtype_cls()
    for dtype_cls in (
        Int8ArrowDtype,
        Int16ArrowDtype,
        Int32ArrowDtype,
        Int64ArrowDtype,
        UInt8ArrowDtype,
        UInt16ArrowDtype,
        UInt32ArrowDtype,
        UInt64ArrowDtype,
    )
}
75 changes: 75 additions & 0 deletions pandas/core/arrays/arrow/numeric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from __future__ import annotations

from typing import (
Any,
Callable,
TypeVar,
)

import pyarrow as pa

from pandas.errors import AbstractMethodError
from pandas.util._decorators import cache_readonly

from pandas.core.arrays.arrow.array import ArrowExtensionArray
from pandas.core.arrays.arrow.dtype import ArrowDtype

T = TypeVar("T", bound="NumericArrowArray")


class NumericArrowDtype(ArrowDtype):
    """
    Shared base for numeric pyarrow-backed extension dtypes.

    Subclasses are expected to define ``_default_pa_dtype`` and
    ``_dtype_checker`` and to override ``_str_to_dtype_mapping``.
    """

    # pyarrow type used when no explicit dtype is requested.  Annotated with
    # the pyarrow type class; a call such as ``pa.null()`` is an instance,
    # not a valid annotation.
    _default_pa_dtype: pa.DataType
    # Predicate over a pyarrow type, e.g. one of the ``pa.types.is_<type>``
    # helpers.
    _dtype_checker: Callable[[Any], bool]

    @property
    def _is_numeric(self) -> bool:
        """Always ``True`` for dtypes derived from this class."""
        return True

    @cache_readonly
    def is_signed_integer(self) -> bool:
        """Whether ``self.kind`` equals ``"i"``."""
        return self.kind == "i"

    @cache_readonly
    def is_unsigned_integer(self) -> bool:
        """Whether ``self.kind`` equals ``"u"``."""
        return self.kind == "u"

    @classmethod
    def _str_to_dtype_mapping(cls):
        """
        Return the table used to look up a dtype instance from a string.

        Raises
        ------
        AbstractMethodError
            Always in this base class; subclasses must override.
        """
        raise AbstractMethodError(cls)


class NumericArrowArray(ArrowExtensionArray):
    """
    Base class for Integer and Floating and Boolean dtypes.

    Subclasses set ``_dtype_cls`` to the dtype family they are paired with;
    that class supplies the pyarrow type checker, the default pyarrow type
    and the string-to-dtype lookup table used below.
    """

    _dtype_cls: type[NumericArrowDtype]

    def __init__(self, values: pa.ChunkedArray) -> None:
        """
        Validate and wrap ``values``.

        Parameters
        ----------
        values : pa.ChunkedArray
            Data whose pyarrow type must satisfy ``_dtype_cls._dtype_checker``.

        Raises
        ------
        TypeError
            If ``values`` is not a ``pa.ChunkedArray`` or its type is
            rejected by the checker.
        """
        checker = self._dtype_cls._dtype_checker
        if not (isinstance(values, pa.ChunkedArray) and checker(values.type)):
            descr = (
                "floating"
                if self._dtype_cls.kind == "f"  # type: ignore[comparison-overlap]
                else "integer"
            )
            raise TypeError(f"values should be {descr} arrow array.")
        super().__init__(values)

    @cache_readonly
    def dtype(self) -> NumericArrowDtype:
        """Dtype instance looked up from the string form of the pyarrow type."""
        mapping = self._dtype_cls._str_to_dtype_mapping()
        return mapping[str(self._data.type)]

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
        """
        Build an array from a sequence of scalars.

        Parameters
        ----------
        scalars : sequence
            Values handed to ``pa.chunked_array`` as a single chunk.
        dtype : optional
            ``None`` selects ``_dtype_cls._default_pa_dtype``.  A
            ``pa.DataType`` is used as-is.  Any other object is expected to
            expose the pyarrow type as its ``type`` attribute.
        copy : bool, default False
            Accepted for interface compatibility; not used here.
        """
        # The default is already a pyarrow type, so it must not go through
        # the ``.type`` lookup: ``pa.DataType`` has no such attribute.
        if dtype is None:
            pa_type = cls._dtype_cls._default_pa_dtype
        elif isinstance(dtype, pa.DataType):
            pa_type = dtype
        else:
            pa_type = dtype.type
        return cls(pa.chunked_array([scalars], type=pa_type))

    @classmethod
    def _from_sequence_of_strings(cls, strings, *, dtype=None, copy: bool = False):
        """
        Build an array from strings by parsing them with ``to_numeric``.

        Parsing uses ``errors="raise"``; the parsed values are then passed
        to ``_from_sequence`` together with ``dtype`` and ``copy``.
        """
        # Imported here rather than at module level, as in the original.
        from pandas.core.tools.numeric import to_numeric

        scalars = to_numeric(strings, errors="raise")
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)