pandas-dev · WillAyd · Oct 24, 2023 · Oct 24, 2023 · Oct 25, 2023 · Oct 25, 2023
diff --git a/pandas/_libs/buffer.pyi b/pandas/_libs/buffer.pyi
@@ -0,0 +1 @@
+class CBuffer: ...  # stub; base class of interchange.dataframe_protocol.Buffer
diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx
@@ -0,0 +1,140 @@
+from cpython cimport (
+    Py_buffer,
+    PyLong_FromVoidPtr,
+)
+from libc.stdint cimport (
+    int8_t,
+    int16_t,
+    int32_t,
+    int64_t,
+    uint8_t,
+    uint16_t,
+    uint32_t,
+    uint64_t,
+)
+
+ctypedef fused supported_buffer_t:
+    float
+    double
+    int8_t
+    int16_t
+    int32_t
+    int64_t
+    uint8_t
+    uint16_t
+    uint32_t
+    uint64_t
+
+
+cdef class CBuffer:
+    """
+    Data in the buffer is guaranteed to be contiguous in memory.
+
+    Note that there is no dtype attribute present, a buffer can be thought of
+    as simply a block of memory. However, if the column that the buffer is
+    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
+    implemented, then that dtype information will be contained in the return
+    value from ``__dlpack__``.
+
+    This distinction is useful to support both (a) data exchange via DLPack on
+    a buffer and (b) dtypes like variable-length strings which do not have a
+    fixed number of bytes per element.
+    """
+
+    # we cannot use a fused type as a class attribute, so we instead
+    # unpack the items we need for the buffer protocol in __init__
+
+    cdef:
+        object buf  # exporter of the memory; referenced to keep ptr_ valid
+        void *ptr_
+        Py_ssize_t len_  # size in bytes
+        Py_ssize_t itemsize
+
+        int readonly
+        bytes format
+        # owned storage: Py_buffer must not point into __init__'s local view
+        Py_ssize_t shape[1]
+        Py_ssize_t strides[1]
+
+    def __init__(self, supported_buffer_t[:] buf, allow_copy: bool = True) -> None:
+        """
+        Handle only regular columns (= numpy arrays) for now.
+
+        Raises RuntimeError if ``buf`` is strided and ``allow_copy`` is False.
+        """
+        cdef Py_ssize_t itemsize = buf.itemsize
+        if buf.strides[0] and buf.strides[0] != itemsize:
+            # The protocol does not support strided buffers, so a copy is
+            # necessary. If that's not allowed, we need to raise an exception.
+            if allow_copy:
+                buf = buf.copy()
+            else:
+                raise RuntimeError(
+                    "Exports cannot be zero-copy in the case "
+                    "of a non-contiguous buffer"
+                )
+
+        # Keep the exporter referenced so the memory behind ptr_ stays valid
+        self.buf = buf
+        # &buf[0] would index out of bounds on an empty buffer
+        self.ptr_ = NULL
+        if buf.shape[0] > 0:
+            self.ptr_ = &buf[0]
+        self.itemsize = itemsize
+        # Py_buffer.len is a size in bytes, not a number of elements
+        self.len_ = buf.shape[0] * itemsize
+        self.shape[0] = buf.shape[0]
+        self.strides[0] = buf.strides[0]
+        # the builtin memoryview documents ``readonly`` and ``format``
+        view = memoryview(buf)
+        self.readonly = view.readonly
+        self.format = view.format.encode("ascii")
+
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        # NOTE: ``flags`` is not inspected; every field is always filled in
+        buffer.buf = self.ptr_
+        buffer.format = self.format
+        buffer.internal = NULL
+        buffer.itemsize = self.itemsize
+        buffer.len = self.len_
+        buffer.ndim = 1
+        # the exporter is this instance
+        buffer.obj = self
+        buffer.readonly = self.readonly
+        buffer.shape = self.shape
+        buffer.strides = self.strides
+        buffer.suboffsets = NULL
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        # nothing to free: shape and strides are stored on the instance
+        pass
+
+    @property
+    def bufsize(self) -> int:
+        """
+        Buffer size in bytes.
+        """
+        return self.len_
+
+    @property
+    def ptr(self) -> int:
+        """
+        Pointer to start of the buffer as an integer.
+        """
+        return PyLong_FromVoidPtr(self.ptr_)
+
+    def __dlpack__(self):
+        """
+        Represent this structure as DLPack interface.
+        """
+        raise NotImplementedError
+
+    def __dlpack_device__(self):
+        """
+        Device type and device ID for where the data in the buffer resides.
+        """
+        raise NotImplementedError
+
+    def __repr__(self) -> str:
+        fields = {"bufsize": self.bufsize, "ptr": self.ptr}
+        return f"{type(self).__name__}({fields})"
diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build
@@ -63,6 +63,7 @@ libs_sources = {
     # numpy include dir is implicitly included
     'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep},
     'arrays': {'sources': ['arrays.pyx']},
+    'buffer': {'sources': ['buffer.pyx']},
     'groupby': {'sources': ['groupby.pyx']},
     'hashing': {'sources': ['hashing.pyx']},
     'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep},

diff --git a/pandas/core/interchange/dataframe_protocol.py b/pandas/core/interchange/dataframe_protocol.py
@@ -21,6 +21,8 @@
         Sequence,
     )
 
+from pandas._libs.buffer import CBuffer
+
 
 class DlpackDeviceType(enum.IntEnum):
     """Integer enum for device type codes matching DLPack."""
@@ -118,7 +120,7 @@ class CategoricalDescription(TypedDict):
     categories: Column | None
 
 
-class Buffer(ABC):
+class Buffer(CBuffer, ABC):
     """
     Data in the buffer is guaranteed to be contiguous in memory.
 

diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
@@ -1,4 +1,5 @@
 from datetime import datetime
+import sys
 
 import numpy as np
 import pytest
@@ -355,6 +356,33 @@ def test_interchange_from_non_pandas_tz_aware(request):
     tm.assert_frame_equal(expected, result)
 
 
+@pytest.mark.parametrize(
+    "data,dtype,expected",
+    [
+        ([True, False, True], np.bool_, bytearray(b"\x01\x00\x01")),
+        ([0, 1, 42], np.uint8, bytearray(b"\x00\x01\x2a")),
+        ([-42, 0, 42], np.int8, bytearray(b"\xd6\x00\x2a")),
+        (
+            [-42, 0, 42],
+            np.int32,
+            bytearray(b"\xd6\xff\xff\xff\x00\x00\x00\x00\x2a\x00\x00\x00")
+            if sys.byteorder == "little"
+            else bytearray(b"\xff\xff\xff\xd6\x00\x00\x00\x00\x00\x00\x00\x2a"),
+        ),
+        (
+            ["foo", "bar", "baz"],
+            "string",
+            bytearray(b"\x66\x6f\x6f\x62\x61\x72\x62\x61\x7a"),
+        ),
+    ],
+)
+def test_buffer_buffer_protocol(data, dtype, expected):
+    df = pd.DataFrame({"col": data}, dtype=dtype)
+    col = df.__dataframe__().get_column_by_name("col")
+    result = bytearray(memoryview(col.get_buffers()["data"][0])).copy()
+    assert result == expected
+
+
 def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
     # https://github.com/pandas-dev/pandas/issues/54781
     df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__()