Skip to content

Commit fd557f2

Browse files
committed
fix pyarrow interchange
1 parent dc19148 commit fd557f2

File tree

5 files changed

+277
-46
lines changed

5 files changed

+277
-46
lines changed

doc/source/whatsnew/v2.2.2.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,16 @@ including other versions of pandas.
1414
Fixed regressions
1515
~~~~~~~~~~~~~~~~~
1616
- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable one with missing values (:issue:`56702`)
17+
- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pyarrow nullable one with missing values (:issue:`57664`)
1718
-
1819

1920
.. ---------------------------------------------------------------------------
2021
.. _whatsnew_222.bug_fixes:
2122

2223
Bug fixes
2324
~~~~~~~~~
24-
-
25+
- :meth:`DataFrame.__dataframe__` was showing bytemask instead of bitmask for ``'string[pyarrow]'`` validity buffer (:issue:`57762`)
26+
- :meth:`DataFrame.__dataframe__` was showing a non-null validity buffer (instead of ``None``) for ``'string[pyarrow]'`` without missing values (:issue:`57761`)
2527

2628
.. ---------------------------------------------------------------------------
2729
.. _whatsnew_222.other:

pandas/core/interchange/buffer.py

+76
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
if TYPE_CHECKING:
1414
import numpy as np
15+
import pyarrow as pa
1516

1617

1718
class PandasBuffer(Buffer):
@@ -76,3 +77,78 @@ def __repr__(self) -> str:
7677
)
7778
+ ")"
7879
)
80+
81+
82+
class PandasBufferPyarrow(Buffer):
    """
    Interchange-protocol buffer backed by a pyarrow chunked array.

    Data in the buffer is guaranteed to be contiguous in memory.
    """

    def __init__(
        self,
        chunked_array: pa.ChunkedArray,
        *,
        is_validity: bool,
        allow_copy: bool = True,
    ) -> None:
        """
        Wrap a pyarrow ChunkedArray as a single contiguous buffer.

        Parameters
        ----------
        chunked_array : pa.ChunkedArray
            Source data; must be single-chunk, or copying must be allowed.
        is_validity : bool
            If True, expose the validity (null) bitmap buffer; otherwise
            expose the values buffer.
        allow_copy : bool, default True
            Whether combining multiple chunks (which copies) is permitted.

        Raises
        ------
        RuntimeError
            If the array has more than one chunk and ``allow_copy`` is False.
        """
        if len(chunked_array.chunks) != 1:
            # Multiple chunks are not contiguous in memory; combining
            # them requires a copy, which the caller must have allowed.
            if not allow_copy:
                raise RuntimeError(
                    "Found multi-chunk pyarrow array, but `allow_copy` is False"
                )
            contiguous = chunked_array.combine_chunks()
        else:
            contiguous = chunked_array.chunks[0]
        # Arrow layout: buffers()[0] is the validity bitmap,
        # buffers()[1] holds the values.
        buffer_index = 0 if is_validity else 1
        self._buffer = contiguous.buffers()[buffer_index]
        self._length = len(contiguous)
        # `__dlpack__` is only present on pyarrow>=15; keep None otherwise
        # so __dlpack__ below can raise a helpful error.
        self._dlpack = getattr(contiguous, "__dlpack__", None)
        self._is_validity = is_validity

    @property
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """
        return self._buffer.size

    @property
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """
        return self._buffer.address

    def __dlpack__(self) -> Any:
        """
        Represent this structure as DLPack interface.
        """
        if self._dlpack is None:
            raise NotImplementedError(
                "pyarrow>=15.0.0 is required for DLPack support for pyarrow-backed buffers"
            )
        return self._dlpack()

    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """
        Device type and device ID for where the data in the buffer resides.
        """
        return (DlpackDeviceType.CPU, None)

    def __repr__(self) -> str:
        # Produces the same text as the original str(dict) formatting.
        details = {
            "bufsize": self.bufsize,
            "ptr": self.ptr,
            "device": "CPU",
        }
        return f"PandasBuffer[pyarrow]({details})"

pandas/core/interchange/column.py

+63-12
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
from __future__ import annotations
22

3-
from typing import Any
3+
from typing import (
4+
TYPE_CHECKING,
5+
Any,
6+
)
47

58
import numpy as np
69

@@ -9,15 +12,18 @@
912
from pandas.errors import NoBufferPresent
1013
from pandas.util._decorators import cache_readonly
1114

12-
from pandas.core.dtypes.dtypes import (
15+
from pandas.core.dtypes.dtypes import BaseMaskedDtype
16+
17+
import pandas as pd
18+
from pandas import (
1319
ArrowDtype,
14-
BaseMaskedDtype,
1520
DatetimeTZDtype,
1621
)
17-
18-
import pandas as pd
1922
from pandas.api.types import is_string_dtype
20-
from pandas.core.interchange.buffer import PandasBuffer
23+
from pandas.core.interchange.buffer import (
24+
PandasBuffer,
25+
PandasBufferPyarrow,
26+
)
2127
from pandas.core.interchange.dataframe_protocol import (
2228
Column,
2329
ColumnBuffers,
@@ -30,6 +36,9 @@
3036
dtype_to_arrow_c_fmt,
3137
)
3238

39+
if TYPE_CHECKING:
40+
from pandas.core.interchange.dataframe_protocol import Buffer
41+
3342
_NP_KINDS = {
3443
"i": DtypeKind.INT,
3544
"u": DtypeKind.UINT,
@@ -157,6 +166,14 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
157166
else:
158167
byteorder = dtype.byteorder
159168

169+
if dtype == "bool[pyarrow]":
170+
return (
171+
kind,
172+
dtype.itemsize, # pyright: ignore[reportAttributeAccessIssue]
173+
ArrowCTypes.BOOL,
174+
byteorder,
175+
)
176+
160177
return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder
161178

162179
@property
@@ -194,6 +211,13 @@ def describe_null(self):
194211
column_null_dtype = ColumnNullType.USE_BYTEMASK
195212
null_value = 1
196213
return column_null_dtype, null_value
214+
if isinstance(self._col.dtype, ArrowDtype):
215+
if all(
216+
chunk.buffers()[0] is None
217+
for chunk in self._col.array._pa_array.chunks # type: ignore[attr-defined]
218+
):
219+
return ColumnNullType.NON_NULLABLE, None
220+
return ColumnNullType.USE_BITMASK, 0
197221
kind = self.dtype[0]
198222
try:
199223
null, value = _NULL_DESCRIPTION[kind]
@@ -278,7 +302,7 @@ def get_buffers(self) -> ColumnBuffers:
278302

279303
def _get_data_buffer(
280304
self,
281-
) -> tuple[PandasBuffer, Any]: # Any is for self.dtype tuple
305+
) -> tuple[Buffer, tuple[DtypeKind, int, str, str]]:
282306
"""
283307
Return the buffer containing the data and the buffer's associated dtype.
284308
"""
@@ -289,7 +313,7 @@ def _get_data_buffer(
289313
np_arr = self._col.dt.tz_convert(None).to_numpy()
290314
else:
291315
np_arr = self._col.to_numpy()
292-
buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
316+
buffer: Buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
293317
dtype = (
294318
DtypeKind.INT,
295319
64,
@@ -302,15 +326,27 @@ def _get_data_buffer(
302326
DtypeKind.FLOAT,
303327
DtypeKind.BOOL,
304328
):
329+
dtype = self.dtype
305330
arr = self._col.array
331+
if isinstance(self._col.dtype, ArrowDtype):
332+
buffer = PandasBufferPyarrow(
333+
arr._pa_array, # type: ignore[attr-defined]
334+
is_validity=False,
335+
allow_copy=self._allow_copy,
336+
)
337+
if self.dtype[0] == DtypeKind.BOOL:
338+
dtype = (
339+
DtypeKind.BOOL,
340+
1,
341+
ArrowCTypes.BOOL,
342+
Endianness.NATIVE,
343+
)
344+
return buffer, dtype
306345
if isinstance(self._col.dtype, BaseMaskedDtype):
307346
np_arr = arr._data # type: ignore[attr-defined]
308-
elif isinstance(self._col.dtype, ArrowDtype):
309-
raise NotImplementedError("ArrowDtype not handled yet")
310347
else:
311348
np_arr = arr._ndarray # type: ignore[attr-defined]
312349
buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
313-
dtype = self.dtype
314350
elif self.dtype[0] == DtypeKind.CATEGORICAL:
315351
codes = self._col.values._codes
316352
buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
@@ -343,14 +379,29 @@ def _get_data_buffer(
343379

344380
return buffer, dtype
345381

346-
def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
382+
def _get_validity_buffer(self) -> tuple[Buffer, Any] | None:
347383
"""
348384
Return the buffer containing the mask values indicating missing data and
349385
the buffer's associated dtype.
350386
Raises NoBufferPresent if null representation is not a bit or byte mask.
351387
"""
352388
null, invalid = self.describe_null
353389

390+
if isinstance(self._col.dtype, ArrowDtype):
391+
arr = self._col.array
392+
dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE)
393+
if all(
394+
chunk.buffers()[0] is None
395+
for chunk in arr._pa_array.chunks # type: ignore[attr-defined]
396+
):
397+
return None
398+
buffer: Buffer = PandasBufferPyarrow(
399+
arr._pa_array, # type: ignore[attr-defined]
400+
is_validity=True,
401+
allow_copy=self._allow_copy,
402+
)
403+
return buffer, dtype
404+
354405
if isinstance(self._col.dtype, BaseMaskedDtype):
355406
mask = self._col.array._mask # type: ignore[attr-defined]
356407
buffer = PandasBuffer(mask)

pandas/core/interchange/from_dataframe.py

+16-13
Original file line numberDiff line numberDiff line change
@@ -298,13 +298,14 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
298298

299299
null_pos = None
300300
if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
301-
assert buffers["validity"], "Validity buffers cannot be empty for masks"
302-
valid_buff, valid_dtype = buffers["validity"]
303-
null_pos = buffer_to_ndarray(
304-
valid_buff, valid_dtype, offset=col.offset, length=col.size()
305-
)
306-
if sentinel_val == 0:
307-
null_pos = ~null_pos
301+
validity = buffers["validity"]
302+
if validity is not None:
303+
valid_buff, valid_dtype = validity
304+
null_pos = buffer_to_ndarray(
305+
valid_buff, valid_dtype, offset=col.offset, length=col.size()
306+
)
307+
if sentinel_val == 0:
308+
null_pos = ~null_pos
308309

309310
# Assemble the strings from the code units
310311
str_list: list[None | float | str] = [None] * col.size()
@@ -516,19 +517,21 @@ def set_nulls(
516517
np.ndarray or pd.Series
517518
Data with the nulls being set.
518519
"""
520+
if validity is None:
521+
return data
519522
null_kind, sentinel_val = col.describe_null
520523
null_pos = None
521524

522525
if null_kind == ColumnNullType.USE_SENTINEL:
523526
null_pos = pd.Series(data) == sentinel_val
524527
elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
525-
assert validity, "Expected to have a validity buffer for the mask"
526528
valid_buff, valid_dtype = validity
527-
null_pos = buffer_to_ndarray(
528-
valid_buff, valid_dtype, offset=col.offset, length=col.size()
529-
)
530-
if sentinel_val == 0:
531-
null_pos = ~null_pos
529+
if valid_buff is not None:
530+
null_pos = buffer_to_ndarray(
531+
valid_buff, valid_dtype, offset=col.offset, length=col.size()
532+
)
533+
if sentinel_val == 0:
534+
null_pos = ~null_pos
532535
elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
533536
pass
534537
else:

0 commit comments

Comments
 (0)