BUG: PyArrow dtypes were not supported in the interchange protocol #57764
Changes from 4 commits
**pandas/core/interchange/column.py**
```diff
@@ -1,6 +1,9 @@
 from __future__ import annotations
 
-from typing import Any
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
 
 import numpy as np
 
@@ -9,15 +12,18 @@
 from pandas.errors import NoBufferPresent
 from pandas.util._decorators import cache_readonly
 
-from pandas.core.dtypes.dtypes import (
-    ArrowDtype,
-    BaseMaskedDtype,
-    DatetimeTZDtype,
-)
+from pandas.core.dtypes.dtypes import BaseMaskedDtype
 
 import pandas as pd
+from pandas import (
+    ArrowDtype,
+    DatetimeTZDtype,
+)
 from pandas.api.types import is_string_dtype
-from pandas.core.interchange.buffer import PandasBuffer
+from pandas.core.interchange.buffer import (
+    PandasBuffer,
+    PandasBufferPyarrow,
+)
 from pandas.core.interchange.dataframe_protocol import (
     Column,
     ColumnBuffers,
@@ -30,6 +36,9 @@
     dtype_to_arrow_c_fmt,
 )
 
+if TYPE_CHECKING:
+    from pandas.core.interchange.dataframe_protocol import Buffer
+
 _NP_KINDS = {
     "i": DtypeKind.INT,
     "u": DtypeKind.UINT,
```
```diff
@@ -157,6 +166,14 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
         else:
             byteorder = dtype.byteorder
 
+        if dtype == "bool[pyarrow]":
+            return (
+                kind,
+                dtype.itemsize,  # pyright: ignore[reportAttributeAccessIssue]
+                ArrowCTypes.BOOL,
+                byteorder,
+            )
+
         return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder
 
     @property
```
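The early return above matters because Arrow stores booleans bit-packed, so the bit width reported through the protocol is 1 rather than `itemsize * 8`. A minimal illustration (assuming pyarrow is installed; not part of the diff):

```python
import pyarrow as pa

arr = pa.array([True, False, True])
# Arrow packs booleans one bit per value, unlike NumPy's one byte per value,
# so the interchange dtype must report a bit width of 1.
print(arr.type.bit_width)  # 1
```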
```diff
@@ -194,6 +211,13 @@ def describe_null(self):
             column_null_dtype = ColumnNullType.USE_BYTEMASK
             null_value = 1
             return column_null_dtype, null_value
+        if isinstance(self._col.dtype, ArrowDtype):
+            if all(
+                chunk.buffers()[0] is None
+                for chunk in self._col.array._pa_array.chunks  # type: ignore[attr-defined]
+            ):
+                return ColumnNullType.NON_NULLABLE, None
+            return ColumnNullType.USE_BITMASK, 0
         kind = self.dtype[0]
         try:
             null, value = _NULL_DESCRIPTION[kind]
```

**WillAyd:** Is there a real need to iterate the chunks like this and check null / not-null? Arrow leaves it implementation-defined as to whether or not there is a bitmask (https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps). I just wonder if there is value in us trying to dictate that through the interchange protocol versus letting consumers handle that.

**Reply (author):** I've changed things round a bit to just rechunk upfront, so once we get here, we just need to check if the (single) chunk's validity buffer is `None`. The issue with just returning […]
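For background on the check above: the Arrow format makes the validity bitmap optional, and pyarrow omits it for chunks that contain no nulls. A quick sketch (assuming pyarrow is installed; not part of the diff):

```python
import pyarrow as pa

no_nulls = pa.chunked_array([[1, 2, 3]])
with_nulls = pa.chunked_array([[1, None, 3]])

# buffers()[0] is the optional validity bitmap.
print(all(chunk.buffers()[0] is None for chunk in no_nulls.chunks))    # True
print(all(chunk.buffers()[0] is None for chunk in with_nulls.chunks))  # False
```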
```diff
@@ -278,7 +302,7 @@ def get_buffers(self) -> ColumnBuffers:
 
     def _get_data_buffer(
         self,
-    ) -> tuple[PandasBuffer, Any]:  # Any is for self.dtype tuple
+    ) -> tuple[Buffer, tuple[DtypeKind, int, str, str]]:
         """
         Return the buffer containing the data and the buffer's associated dtype.
         """
@@ -289,7 +313,7 @@ def _get_data_buffer(
             np_arr = self._col.dt.tz_convert(None).to_numpy()
         else:
             np_arr = self._col.to_numpy()
-        buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
+        buffer: Buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
         dtype = (
             DtypeKind.INT,
             64,
@@ -302,15 +326,27 @@ def _get_data_buffer(
             DtypeKind.FLOAT,
             DtypeKind.BOOL,
         ):
+            dtype = self.dtype
             arr = self._col.array
+            if isinstance(self._col.dtype, ArrowDtype):
+                buffer = PandasBufferPyarrow(
+                    arr._pa_array,  # type: ignore[attr-defined]
+                    is_validity=False,
+                    allow_copy=self._allow_copy,
+                )
+                if self.dtype[0] == DtypeKind.BOOL:
+                    dtype = (
+                        DtypeKind.BOOL,
+                        1,
+                        ArrowCTypes.BOOL,
+                        Endianness.NATIVE,
+                    )
+                return buffer, dtype
             if isinstance(self._col.dtype, BaseMaskedDtype):
                 np_arr = arr._data  # type: ignore[attr-defined]
-            elif isinstance(self._col.dtype, ArrowDtype):
-                raise NotImplementedError("ArrowDtype not handled yet")
             else:
                 np_arr = arr._ndarray  # type: ignore[attr-defined]
             buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
-            dtype = self.dtype
         elif self.dtype[0] == DtypeKind.CATEGORICAL:
             codes = self._col.values._codes
             buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
```
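With the `NotImplementedError` branch gone, pyarrow-backed numeric and boolean columns can hand their data buffer to consumers. A hedged usage sketch of the protocol surface this enables (behavior as of this PR; requires pyarrow):

```python
import pandas as pd
from pandas.core.interchange.dataframe_protocol import DtypeKind

df = pd.DataFrame({"a": [1, 2, None]}, dtype="int64[pyarrow]")
col = df.__dataframe__().get_column_by_name("a")
buffers = col.get_buffers()

print(col.dtype[0] == DtypeKind.INT)  # True
print(buffers["data"])      # (PandasBufferPyarrow, dtype tuple) for the data buffer
print(buffers["validity"])  # (buffer, dtype) for the validity bitmask, as "a" has a null
```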
```diff
@@ -343,14 +379,29 @@ def _get_data_buffer(
 
         return buffer, dtype
 
-    def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
+    def _get_validity_buffer(self) -> tuple[Buffer, Any] | None:
         """
         Return the buffer containing the mask values indicating missing data and
         the buffer's associated dtype.
         Raises NoBufferPresent if null representation is not a bit or byte mask.
         """
         null, invalid = self.describe_null
 
+        if isinstance(self._col.dtype, ArrowDtype):
+            arr = self._col.array
+            dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE)
+            if all(
+                chunk.buffers()[0] is None
+                for chunk in arr._pa_array.chunks  # type: ignore[attr-defined]
+            ):
+                return None
+            buffer: Buffer = PandasBufferPyarrow(
+                arr._pa_array,  # type: ignore[attr-defined]
+                is_validity=True,
+                allow_copy=self._allow_copy,
+            )
+            return buffer, dtype
+
         if isinstance(self._col.dtype, BaseMaskedDtype):
             mask = self._col.array._mask  # type: ignore[attr-defined]
             buffer = PandasBuffer(mask)
```
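Returning `None` here (instead of raising `NoBufferPresent`) tells consumers that no validity bitmap exists at all, which is a legal state for Arrow data. A sketch under the same assumptions as above:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]}, dtype="int64[pyarrow]")
col = df.__dataframe__().get_column_by_name("a")

# No nulls, so pyarrow allocates no validity bitmap and the interchange
# column reports no validity buffer either.
print(col.describe_null)              # (ColumnNullType.NON_NULLABLE, None)
print(col.get_buffers()["validity"])  # None
```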
**pandas/core/interchange/from_dataframe.py**
```diff
@@ -298,13 +298,14 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
 
     null_pos = None
     if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
-        assert buffers["validity"], "Validity buffers cannot be empty for masks"
-        valid_buff, valid_dtype = buffers["validity"]
-        null_pos = buffer_to_ndarray(
-            valid_buff, valid_dtype, offset=col.offset, length=col.size()
-        )
-        if sentinel_val == 0:
-            null_pos = ~null_pos
+        validity = buffers["validity"]
+        if validity is not None:
+            valid_buff, valid_dtype = validity
+            null_pos = buffer_to_ndarray(
+                valid_buff, valid_dtype, offset=col.offset, length=col.size()
+            )
+            if sentinel_val == 0:
+                null_pos = ~null_pos
 
     # Assemble the strings from the code units
     str_list: list[None | float | str] = [None] * col.size()
```

**Comment (on the removed assert):** this isn't true for pyarrow dtypes, right? They use bitmasks, but their validity buffer can indeed be `None`.
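A round-trip sketch of exactly the case the old assert rejected: a pyarrow-backed string column with no nulls ships no validity buffer (assuming pyarrow is installed):

```python
import pandas as pd
from pandas.api.interchange import from_dataframe

df = pd.DataFrame({"s": ["x", "y"]}, dtype="large_string[pyarrow]")

# Before this change the string path asserted that bit/byte-masked
# columns always carry a validity buffer, so this raised.
print(from_dataframe(df.__dataframe__()))
```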
```diff
@@ -516,6 +517,8 @@ def set_nulls(
     np.ndarray or pd.Series
         Data with the nulls being set.
     """
+    if validity is None:
+        return data
     null_kind, sentinel_val = col.describe_null
     null_pos = None
 
```
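The early return lets `set_nulls` pass data through untouched when a column ships no validity buffer. An end-to-end sketch that also exercises the bitmask path, with a null present (requires pyarrow):

```python
import pandas as pd
from pandas.api.interchange import from_dataframe

df = pd.DataFrame({"a": [1, None, 3]}, dtype="int64[pyarrow]")

# The null travels as a validity bitmask and is re-applied by set_nulls
# on the consumer side.
print(from_dataframe(df.__dataframe__()))
```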