
BUG: PyArrow dtypes were not supported in the interchange protocol #57764


Merged · 14 commits · Mar 20, 2024
4 changes: 3 additions & 1 deletion doc/source/whatsnew/v2.2.2.rst
@@ -14,14 +14,16 @@ including other versions of pandas.
Fixed regressions
~~~~~~~~~~~~~~~~~
- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable dtype with missing values (:issue:`56702`)
- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pyarrow nullable dtype with missing values (:issue:`57664`)
-

.. ---------------------------------------------------------------------------
.. _whatsnew_222.bug_fixes:

Bug fixes
~~~~~~~~~
-
- :meth:`DataFrame.__dataframe__` was using a bytemask instead of a bitmask for the ``'string[pyarrow]'`` validity buffer (:issue:`57762`)
- :meth:`DataFrame.__dataframe__` was returning a non-``None`` validity buffer (instead of ``None``) for ``'string[pyarrow]'`` columns without missing values (:issue:`57761`)
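For illustration, the two ``'string[pyarrow]'`` fixes can be checked like this. A sketch assuming this patch plus pyarrow installed, using ``pd.ArrowDtype(pa.string())``, whose name displays as ``'string[pyarrow]'``:

import pandas as pd
import pyarrow as pa

dtype = pd.ArrowDtype(pa.string())

# 57761: with no missing values, the validity buffer is now None.
col = pd.DataFrame({"a": pd.array(["x", "y"], dtype=dtype)}).__dataframe__().get_column(0)
print(col.get_buffers()["validity"])  # None

# 57762: with missing values, nulls are described via a bitmask, not a bytemask.
col = pd.DataFrame({"a": pd.array(["x", None], dtype=dtype)}).__dataframe__().get_column(0)
print(col.describe_null)  # (ColumnNullType.USE_BITMASK, 0)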

.. ---------------------------------------------------------------------------
.. _whatsnew_222.other:
76 changes: 76 additions & 0 deletions pandas/core/interchange/buffer.py
@@ -12,6 +12,7 @@

if TYPE_CHECKING:
import numpy as np
import pyarrow as pa


class PandasBuffer(Buffer):
@@ -76,3 +77,78 @@ def __repr__(self) -> str:
)
+ ")"
)


class PandasBufferPyarrow(Buffer):
"""
Data in the buffer is guaranteed to be contiguous in memory.
"""

def __init__(
self,
chunked_array: pa.ChunkedArray,
*,
is_validity: bool,
allow_copy: bool = True,
) -> None:
"""
Handle pyarrow chunked arrays.
"""
if len(chunked_array.chunks) == 1:
arr = chunked_array.chunks[0]
else:
if not allow_copy:
raise RuntimeError(
"Found multi-chunk pyarrow array, but `allow_copy` is False"
)
arr = chunked_array.combine_chunks()
if is_validity:
self._buffer = arr.buffers()[0]
else:
self._buffer = arr.buffers()[1]
self._length = len(arr)
self._dlpack = getattr(arr, "__dlpack__", None)
self._is_validity = is_validity

@property
def bufsize(self) -> int:
"""
Buffer size in bytes.
"""
return self._buffer.size

@property
def ptr(self) -> int:
"""
Pointer to start of the buffer as an integer.
"""
return self._buffer.address

def __dlpack__(self) -> Any:
"""
Represent this structure as DLPack interface.
"""
if self._dlpack is None:
raise NotImplementedError(
"pyarrow>=15.0.0 required for DLPack support for pyarrow-backed buffers"
)
return self._dlpack()

def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
"""
Device type and device ID for where the data in the buffer resides.
"""
return (DlpackDeviceType.CPU, None)

def __repr__(self) -> str:
return (
"PandasBuffer[pyarrow]("
+ str(
{
"bufsize": self.bufsize,
"ptr": self.ptr,
"device": "CPU",
}
)
+ ")"
)
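A minimal usage sketch of the new class. Direct construction like this is hypothetical (in practice the interchange Column builds these) and assumes pyarrow is installed:

import pyarrow as pa

from pandas.core.interchange.buffer import PandasBufferPyarrow

chunked = pa.chunked_array([[1, None, 3]], type=pa.int64())

data_buf = PandasBufferPyarrow(chunked, is_validity=False)  # wraps buffers()[1]
print(data_buf.bufsize, data_buf.ptr)                       # byte size, address
print(data_buf.__dlpack_device__())                         # (DlpackDeviceType.CPU, None)

mask_buf = PandasBufferPyarrow(chunked, is_validity=True)   # wraps buffers()[0]
print(mask_buf)  # PandasBuffer[pyarrow]({'bufsize': ..., 'ptr': ..., 'device': 'CPU'})

# Multi-chunk input has to be combined (a copy), so allow_copy=False raises.
try:
    PandasBufferPyarrow(pa.chunked_array([[1], [2]]), is_validity=False, allow_copy=False)
except RuntimeError as exc:
    print(exc)  # Found multi-chunk pyarrow array, but `allow_copy` is False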
75 changes: 63 additions & 12 deletions pandas/core/interchange/column.py
@@ -1,6 +1,9 @@
from __future__ import annotations

from typing import Any
from typing import (
TYPE_CHECKING,
Any,
)

import numpy as np

@@ -9,15 +12,18 @@
from pandas.errors import NoBufferPresent
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.dtypes import (
from pandas.core.dtypes.dtypes import BaseMaskedDtype

import pandas as pd
from pandas import (
ArrowDtype,
BaseMaskedDtype,
DatetimeTZDtype,
)

import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.core.interchange.buffer import PandasBuffer
from pandas.core.interchange.buffer import (
PandasBuffer,
PandasBufferPyarrow,
)
from pandas.core.interchange.dataframe_protocol import (
Column,
ColumnBuffers,
@@ -30,6 +36,9 @@
dtype_to_arrow_c_fmt,
)

if TYPE_CHECKING:
from pandas.core.interchange.dataframe_protocol import Buffer

_NP_KINDS = {
"i": DtypeKind.INT,
"u": DtypeKind.UINT,
@@ -157,6 +166,14 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
else:
byteorder = dtype.byteorder

if dtype == "bool[pyarrow]":
return (
kind,
dtype.itemsize, # pyright: ignore[reportAttributeAccessIssue]
ArrowCTypes.BOOL,
byteorder,
)

return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder

@property
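Arrow bit-packs booleans, so the special case above reports a width of 1 bit rather than ``itemsize * 8``. A quick check, as a sketch assuming this patch is applied:

import pandas as pd
import pyarrow as pa

ser = pd.Series([True, False], dtype=pd.ArrowDtype(pa.bool_()))  # 'bool[pyarrow]'
col = ser.to_frame().__dataframe__().get_column(0)
print(col.dtype)  # (DtypeKind.BOOL, 1, 'b', ...): width 1, not 8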
@@ -194,6 +211,13 @@ def describe_null(self):
column_null_dtype = ColumnNullType.USE_BYTEMASK
null_value = 1
return column_null_dtype, null_value
if isinstance(self._col.dtype, ArrowDtype):
if all(
chunk.buffers()[0] is None
for chunk in self._col.array._pa_array.chunks # type: ignore[attr-defined]
):
return ColumnNullType.NON_NULLABLE, None
return ColumnNullType.USE_BITMASK, 0

Review comment (Member):
Is there a real need to iterate the chunks like this and check null / not-null? Arrow leaves it implementation-defined whether a validity bitmask is present:
https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
I just wonder if there is value in us trying to dictate that through the interchange protocol versus letting consumers handle it.

Reply (Member, author):
I've changed things around a bit to just rechunk upfront (if allow_copy allows), so once we get here, we just need to check whether buffers()[0] is None.
The issue with just returning ColumnNullType.USE_BITMASK, 0 in all cases, even if there's no validity mask, is that pyarrow.interchange.from_dataframe would then raise.
kind = self.dtype[0]
try:
null, value = _NULL_DESCRIPTION[kind]
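In short, with the rechunk-upfront approach discussed above, a pyarrow-backed column only advertises a bitmask when a validity buffer actually exists, so consumers such as pyarrow.interchange.from_dataframe never reach for a buffer that is not there. A sketch, assuming this patch:

import pandas as pd

df = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="int64[pyarrow]")})
print(df.__dataframe__().get_column(0).describe_null)
# (ColumnNullType.NON_NULLABLE, None): no validity buffer was allocated

df = pd.DataFrame({"a": pd.array([1, None, 3], dtype="int64[pyarrow]")})
print(df.__dataframe__().get_column(0).describe_null)
# (ColumnNullType.USE_BITMASK, 0): bitmask present, 0 marks missing slots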
@@ -278,7 +302,7 @@ def get_buffers(self) -> ColumnBuffers:

def _get_data_buffer(
self,
) -> tuple[PandasBuffer, Any]: # Any is for self.dtype tuple
) -> tuple[Buffer, tuple[DtypeKind, int, str, str]]:
"""
Return the buffer containing the data and the buffer's associated dtype.
"""
@@ -289,7 +313,7 @@ def _get_data_buffer(
np_arr = self._col.dt.tz_convert(None).to_numpy()
else:
np_arr = self._col.to_numpy()
buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
buffer: Buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
dtype = (
DtypeKind.INT,
64,
@@ -302,15 +326,27 @@
DtypeKind.FLOAT,
DtypeKind.BOOL,
):
dtype = self.dtype
arr = self._col.array
if isinstance(self._col.dtype, ArrowDtype):
buffer = PandasBufferPyarrow(
arr._pa_array, # type: ignore[attr-defined]
is_validity=False,
allow_copy=self._allow_copy,
)
if self.dtype[0] == DtypeKind.BOOL:
dtype = (
DtypeKind.BOOL,
1,
ArrowCTypes.BOOL,
Endianness.NATIVE,
)
return buffer, dtype
if isinstance(self._col.dtype, BaseMaskedDtype):
np_arr = arr._data # type: ignore[attr-defined]
elif isinstance(self._col.dtype, ArrowDtype):
raise NotImplementedError("ArrowDtype not handled yet")
else:
np_arr = arr._ndarray # type: ignore[attr-defined]
buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
dtype = self.dtype
elif self.dtype[0] == DtypeKind.CATEGORICAL:
codes = self._col.values._codes
buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
@@ -343,14 +379,29 @@ def _get_data_buffer(

return buffer, dtype

def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
def _get_validity_buffer(self) -> tuple[Buffer, Any] | None:
"""
Return the buffer containing the mask values indicating missing data and
the buffer's associated dtype.
Raises NoBufferPresent if null representation is not a bit or byte mask.
"""
null, invalid = self.describe_null

if isinstance(self._col.dtype, ArrowDtype):
arr = self._col.array
dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE)
if all(
chunk.buffers()[0] is None
for chunk in arr._pa_array.chunks # type: ignore[attr-defined]
):
return None
buffer: Buffer = PandasBufferPyarrow(
arr._pa_array, # type: ignore[attr-defined]
is_validity=True,
allow_copy=self._allow_copy,
)
return buffer, dtype

if isinstance(self._col.dtype, BaseMaskedDtype):
mask = self._col.array._mask # type: ignore[attr-defined]
buffer = PandasBuffer(mask)
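Taken together, these column changes let pyarrow-backed frames pass through the protocol end to end; the old code path raised NotImplementedError("ArrowDtype not handled yet") in _get_data_buffer. A round-trip sketch, assuming a pyarrow version that ships pyarrow.interchange.from_dataframe:

import pandas as pd
import pyarrow.interchange

df = pd.DataFrame(
    {
        "ints": pd.array([1, None, 3], dtype="int64[pyarrow]"),
        "bools": pd.array([True, False, None], dtype="bool[pyarrow]"),
    }
)
table = pyarrow.interchange.from_dataframe(df)  # consumes df.__dataframe__()
print(table)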
17 changes: 10 additions & 7 deletions pandas/core/interchange/from_dataframe.py
@@ -298,13 +298,14 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:

null_pos = None
if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
assert buffers["validity"], "Validity buffers cannot be empty for masks"
Review comment (Member, author):
This isn't true for pyarrow dtypes, right? They use bitmasks, but their validity buffer can indeed be None, whereas pandas nullables always seem to have the mask allocated:

In [7]: pd.Series([1, 2, 3], dtype='Int64').array._mask
Out[7]: array([False, False, False])

In [8]: pd.Series([1, 2, 3], dtype='Int64[pyarrow]').array._pa_array.chunks[0].buffers()[0] is None
Out[8]: True
valid_buff, valid_dtype = buffers["validity"]
null_pos = buffer_to_ndarray(
valid_buff, valid_dtype, offset=col.offset, length=col.size()
)
if sentinel_val == 0:
null_pos = ~null_pos
validity = buffers["validity"]
if validity is not None:
valid_buff, valid_dtype = validity
null_pos = buffer_to_ndarray(
valid_buff, valid_dtype, offset=col.offset, length=col.size()
)
if sentinel_val == 0:
null_pos = ~null_pos

# Assemble the strings from the code units
str_list: list[None | float | str] = [None] * col.size()
@@ -516,6 +517,8 @@ def set_nulls(
np.ndarray or pd.Series
Data with the nulls being set.
"""
if validity is None:
return data
null_kind, sentinel_val = col.describe_null
null_pos = None
