Skip to content

Implement Buffer Protocol for PandasBuffer #55671

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/_libs/buffer.pyi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Typing stub for the buffer extension type that
# ``pandas.core.interchange.dataframe_protocol`` imports from
# ``pandas._libs.buffer``; no members are declared here.
class CBuffer:
    ...
140 changes: 140 additions & 0 deletions pandas/_libs/buffer.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
from cpython cimport (
Py_buffer,
PyLong_FromVoidPtr,
)
from libc.stdint cimport (
int8_t,
int16_t,
int32_t,
int64_t,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
)

# Element types accepted by the typed-memoryview argument
# (``supported_buffer_t[:] buf``) of ``PandasBuffer.__init__``: the two C
# floating-point types plus the fixed-width signed/unsigned integers cimported
# from ``libc.stdint``.
# NOTE(review): no boolean or string/object element type is listed here, yet
# the accompanying test parametrization passes ``np.bool_`` and ``"string"``
# columns -- confirm how those are expected to reach this class.
ctypedef fused supported_buffer_t:
float
double
int8_t
int16_t
int32_t
int64_t
uint8_t
uint16_t
uint32_t
uint64_t


cdef class PandasBuffer:
    """
    Data in the buffer is guaranteed to be contiguous in memory.

    Note that there is no dtype attribute present, a buffer can be thought of
    as simply a block of memory. However, if the column that the buffer is
    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
    implemented, then that dtype information will be contained in the return
    value from ``__dlpack__``.

    This distinction is useful to support both (a) data exchange via DLPack on
    a buffer and (b) dtypes like variable-length strings which do not have a
    fixed number of bytes per element.
    """

    # NOTE(review): ``buffer.pyi`` and ``dataframe_protocol.py`` refer to the
    # class exported by this module as ``CBuffer`` while it is declared here
    # as ``PandasBuffer`` -- the two names need to be reconciled.

    # we cannot use a fused type as a class attribute, so we instead
    # unpack the items we need for the buffer protocol in __init__

    cdef:
        # Python-level owner of the memory. Holding a reference here is what
        # keeps ``ptr_`` valid for the lifetime of this object. Attributes of
        # a cdef class must be declared, so it is listed explicitly.
        object buf
        void *ptr_
        # Total size in BYTES, which is what ``Py_buffer.len`` expects.
        Py_ssize_t len_
        Py_ssize_t itemsize

        int readonly
        int ndim
        bytes format
        # Stored by value: pointing at the ``shape``/``strides`` members of
        # the memoryview slice passed to ``__init__`` would leave dangling
        # pointers once ``__init__`` returns.
        Py_ssize_t shape[1]
        Py_ssize_t strides[1]

    def __init__(self, supported_buffer_t[:] buf, allow_copy: bool = True) -> None:
        """
        Handle only regular columns (= numpy arrays) for now.

        Parameters
        ----------
        buf : 1-D typed memoryview of one of the ``supported_buffer_t`` types
            The data to expose.
        allow_copy : bool, default True
            Whether a contiguous copy may be made when ``buf`` is strided.

        Raises
        ------
        RuntimeError
            If ``buf`` is not contiguous and ``allow_copy`` is False.
        """
        # NOTE(review): confirm that Cython accepts a fused-type argument on
        # ``__init__`` of a cdef class; the signature is kept as submitted.
        cdef:
            Py_ssize_t itemsize = <Py_ssize_t>sizeof(supported_buffer_t)
            Py_ssize_t n_items = buf.shape[0]

        # With zero or one element the stride is irrelevant; otherwise
        # consecutive elements must be exactly one item apart.
        if n_items > 1 and buf.strides[0] != itemsize:
            # The protocol does not support strided buffers, so a copy is
            # necessary. If that's not allowed, we need to raise an exception.
            if allow_copy:
                buf = buf.copy()
            else:
                raise RuntimeError(
                    "Exports cannot be zero-copy in the case "
                    "of a non-contiguous buffer"
                )

        # Keep the (possibly copied) data alive for as long as we hand out
        # pointers into it.
        self.buf = buf

        # Cython typed memoryviews expose neither ``format`` nor ``readonly``;
        # the builtin memoryview over the same object does.
        exported = memoryview(self.buf)
        self.format = exported.format.encode("ascii")
        self.readonly = exported.readonly

        # ``&buf[0]`` is out of bounds for an empty buffer.
        if n_items > 0:
            self.ptr_ = &buf[0]
        else:
            self.ptr_ = NULL

        self.itemsize = itemsize
        self.len_ = n_items * itemsize
        self.ndim = 1
        self.shape[0] = n_items
        self.strides[0] = itemsize

    def __getbuffer__(self, Py_buffer *buffer, int flags):
        """
        Fill ``buffer`` with a 1-D contiguous view of the wrapped data.
        """
        buffer.buf = self.ptr_
        buffer.format = self.format
        buffer.internal = NULL
        buffer.itemsize = self.itemsize
        buffer.len = self.len_
        buffer.ndim = self.ndim
        # The exporting object is this instance; the consumer's reference to
        # it keeps ``self.buf`` (and therefore the memory) alive.
        buffer.obj = self
        buffer.readonly = self.readonly
        buffer.shape = self.shape
        buffer.strides = self.strides
        # Contiguous data, no indirection.
        buffer.suboffsets = NULL

    def __releasebuffer__(self, Py_buffer *buffer):
        # Nothing was allocated in __getbuffer__, so nothing to free.
        pass

    @property
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """
        return self.len_

    @property
    def ptr(self) -> int:
        """
        Pointer to start of the buffer as an integer.
        """
        return PyLong_FromVoidPtr(self.ptr_)

    def __dlpack__(self):
        """
        Represent this structure as DLPack interface.
        """
        raise NotImplementedError

    def __dlpack_device__(self):
        """
        Device type and device ID for where the data in the buffer resides.
        """
        raise NotImplementedError

    def __repr__(self) -> str:
        return (
            "PandasBuffer("
            + str(
                {
                    "bufsize": self.bufsize,
                    "ptr": self.ptr,
                }
            )
            + ")"
        )
1 change: 1 addition & 0 deletions pandas/_libs/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ libs_sources = {
# numpy include dir is implicitly included
'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep},
'arrays': {'sources': ['arrays.pyx']},
'buffer': {'sources': ['buffer.pyx']},
'groupby': {'sources': ['groupby.pyx']},
'hashing': {'sources': ['hashing.pyx']},
'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep},
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/interchange/dataframe_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
Sequence,
)

from pandas._libs.buffer import CBuffer


class DlpackDeviceType(enum.IntEnum):
"""Integer enum for device type codes matching DLPack."""
Expand Down Expand Up @@ -118,7 +120,7 @@ class CategoricalDescription(TypedDict):
categories: Column | None


class Buffer(ABC):
class Buffer(CBuffer, ABC):
"""
Data in the buffer is guaranteed to be contiguous in memory.

Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/interchange/test_impl.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import datetime
import sys

import numpy as np
import pytest
Expand Down Expand Up @@ -355,6 +356,33 @@ def test_interchange_from_non_pandas_tz_aware(request):
tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize(
    "data,dtype,expected",
    [
        ([True, False, True], np.bool_, bytearray(b"\x01\x00\x01")),
        ([0, 1, 42], np.uint8, bytearray(b"\x00\x01\x2a")),
        ([-42, 0, 42], np.int8, bytearray(b"\xd6\x00\x2a")),
        (
            [-42, 0, 42],
            np.int32,
            # Multi-byte items: the raw bytes depend on the host byte order.
            bytearray(
                b"\xd6\xff\xff\xff\x00\x00\x00\x00\x2a\x00\x00\x00"
                if sys.byteorder == "little"
                else b"\xff\xff\xff\xd6\x00\x00\x00\x00\x00\x00\x00\x2a"
            ),
        ),
        (
            ["foo", "bar", "baz"],
            "string",
            bytearray(b"\x66\x6f\x6f\x62\x61\x72\x62\x61\x7a"),
        ),
    ],
)
def test_buffer_buffer_protocol(data, dtype, expected):
    """
    The data buffer of an interchange column, viewed through ``memoryview``,
    must yield the expected raw bytes.
    """
    frame = pd.DataFrame({"col": data}, dtype=dtype)
    column = frame.__dataframe__().get_column_by_name("col")
    data_buffer = column.get_buffers()["data"][0]
    view = memoryview(data_buffer)
    raw_bytes = bytearray(view).copy()
    assert raw_bytes == expected


def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
# https://github.com/pandas-dev/pandas/issues/54781
df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__()
Expand Down