Skip to content

ENH: Create ArrowDtype #46774

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 2, 2022
Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions pandas/core/arrays/arrow/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from __future__ import annotations

import numpy as np
import pyarrow as pa

from pandas._typing import DtypeObj
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.base import StorageExtensionDtype

from pandas.core.arrays.arrow import ArrowExtensionArray


class ArrowDtype(StorageExtensionDtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are the dtypes here? Shouldn't these be in pandas/core/dtypes?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking of consolidating all Arrow-related functionality under /arrays/arrow/. Thoughts, @jbrockmendel?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No objection. It might be worth keeping something in core.dtypes.dtypes for dependency-structure purposes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, fair. I'm looking to keep these separate as much as possible; maybe rename this to _dtype.py at some point.

"""
Base class for dtypes for BaseArrowArray subclasses.
Modeled after BaseMaskedDtype
"""

# Annotated only; no value is assigned to ``name`` in this class body.
name: str
base = None
# pyarrow data type of the dtype; read by ``numpy_dtype`` and compared
# against in ``from_numpy_dtype``. Annotated only, no value assigned here.
type: pa.DataType

# Missing-value marker: pyarrow's NA object.
na_value = pa.NA

def __init__(self, storage="pyarrow") -> None:
    """
    Initialise the dtype, forwarding ``storage`` to the parent initialiser.

    Parameters
    ----------
    storage : default "pyarrow"
        Passed through unchanged to ``super().__init__``.
    """
    super().__init__(storage)

@cache_readonly
def numpy_dtype(self) -> np.dtype:
    """
    Return an instance of the related numpy dtype.

    The value comes from ``self.type.to_pandas_dtype()``. pyarrow documents
    that call as returning a numpy scalar type (for example ``numpy.int64``)
    rather than an ``np.dtype`` instance, so the result is normalised with
    ``np.dtype(...)``. That makes attributes such as ``kind`` and
    ``itemsize`` available and lets ``isinstance(..., np.dtype)`` checks
    succeed.

    Returns
    -------
    np.dtype
        The normalised numpy dtype. If numpy cannot interpret the value
        returned by pyarrow (``TypeError``), that value is returned
        unchanged, preserving the previous pass-through behaviour.
    """
    pandas_dtype = self.type.to_pandas_dtype()
    try:
        return np.dtype(pandas_dtype)
    except TypeError:
        # Not something numpy understands; hand it back as-is rather than
        # turning a previously working access into an error.
        return pandas_dtype

@cache_readonly
def kind(self) -> str:
    """Return the ``kind`` attribute of ``self.numpy_dtype``."""
    np_dtype = self.numpy_dtype
    return np_dtype.kind

@cache_readonly
def itemsize(self) -> int:
    """Return the number of bytes in this dtype, as reported by ``numpy_dtype``."""
    np_dtype = self.numpy_dtype
    return np_dtype.itemsize

@classmethod
def construct_array_type(cls):
    """
    Return the array class that pairs with this dtype.

    Returns
    -------
    type
        Always ``ArrowExtensionArray``.
    """
    return ArrowExtensionArray

@classmethod
def from_numpy_dtype(cls, dtype: np.dtype) -> ArrowDtype:
    """
    Construct the ArrowDtype corresponding to the given numpy dtype.

    Parameters
    ----------
    dtype : np.dtype
        numpy dtype to translate via ``pa.from_numpy_dtype``.

    Returns
    -------
    ArrowDtype
        A new instance of ``cls`` when the translated pyarrow type equals
        ``cls.type``.

    Raises
    ------
    NotImplementedError
        If the translated pyarrow type does not match ``cls.type``.
    """
    # TODO: This may be incomplete
    pa_dtype = pa.from_numpy_dtype(dtype)
    # Compare by value, not identity: the DataType object returned by
    # ``pa.from_numpy_dtype`` is not guaranteed to be the same Python object
    # as the one stored on the class, even when both describe the same type.
    if pa_dtype == cls.type:
        return cls()
    raise NotImplementedError(dtype)

def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
    """
    Find a common dtype for ``dtypes``, expressed as this dtype class.

    Every ``ArrowDtype`` in ``dtypes`` is swapped for its ``numpy_dtype``,
    the common type of the resulting list is resolved with
    ``find_common_type``, and the answer is converted back through
    ``from_numpy_dtype``.

    Returns
    -------
    DtypeObj or None
        The converted dtype, or ``None`` when the common type is not an
        ``np.dtype`` or ``from_numpy_dtype`` raises ``KeyError`` /
        ``NotImplementedError``.
    """
    from pandas.core.dtypes.cast import find_common_type

    # Replace Arrow dtypes with their numpy equivalents; leave others as-is.
    unwrapped = []
    for candidate in dtypes:
        if isinstance(candidate, ArrowDtype):
            candidate = candidate.numpy_dtype
        unwrapped.append(candidate)

    common = find_common_type(unwrapped)

    # Only a plain numpy dtype can be mapped back to an Arrow dtype here.
    if isinstance(common, np.dtype):
        try:
            return type(self).from_numpy_dtype(common)
        except (KeyError, NotImplementedError):
            pass
    return None

def __from_arrow__(self, array: pa.Array | pa.ChunkedArray):
    """
    Wrap a pyarrow Array/ChunkedArray in this dtype's array type.

    The array class is obtained from ``construct_array_type`` and called
    with ``array`` as its only argument.
    """
    return self.construct_array_type()(array)