Skip to content

Implement Buffer Protocol for PandasBuffer #55671

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions pandas/_libs/buffer.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from cpython cimport Py_buffer
from libc.stdint cimport (
uint8_t,
uintptr_t,
)


cdef class CBuffer:
    """
    Expose an object's memory through the Python buffer protocol as a
    read-only, one-dimensional array of unsigned bytes.

    The object is expected to provide two attributes, which are read via
    ``self`` in ``__getbuffer__``:

    ptr : int
        Address of the first byte of the underlying memory.
    bufsize : int
        Size of the underlying memory in bytes.
    """

    def __getbuffer__(self, Py_buffer *buffer, int flags):
        """
        Fill ``buffer`` with a read-only view of ``bufsize`` bytes at ``ptr``.

        Parameters
        ----------
        buffer : Py_buffer*
            Structure supplied by the consumer, populated in place.
        flags : int
            Request flags from the consumer. They are not inspected; the
            same fully described, read-only view is always exported.
        """
        cdef Py_ssize_t itemsize = sizeof(uint8_t)
        cdef uintptr_t ptr = self.ptr
        cdef Py_ssize_t nbytes = self.bufsize

        # Review note carried over from the PR discussion: there might be a
        # better way of doing this overall, but since self.ptr returns an int
        # the cast below is an easy bolt-on to the existing design.
        buffer.buf = <void*>ptr

        # assumes sizeof(unsigned char) == sizeof(uint8_t)
        # TODO: use C11 static_assert macro in Cython
        buffer.format = "@B"
        buffer.itemsize = itemsize
        buffer.len = nbytes
        buffer.ndim = 1
        buffer.obj = self
        buffer.readonly = 1

        # shape and strides are read by the consumer after this function has
        # returned, so they must not point at locals of this call (such
        # pointers would dangle).  Because each item is exactly one byte, the
        # single shape entry equals ``len`` and the single stride equals
        # ``itemsize``; point at those fields, which live as long as the
        # Py_buffer structure itself.
        buffer.shape = &buffer.len
        buffer.strides = &buffer.itemsize
        buffer.suboffsets = NULL

    def __releasebuffer__(self, Py_buffer *buffer):
        # Nothing was allocated in __getbuffer__, so there is nothing to free.
        pass
1 change: 1 addition & 0 deletions pandas/_libs/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ libs_sources = {
# numpy include dir is implicitly included
'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper, _khash_primitive_helper]},
'arrays': {'sources': ['arrays.pyx']},
'buffer': {'sources': ['buffer.pyx']},
'groupby': {'sources': ['groupby.pyx']},
'hashing': {'sources': ['hashing.pyx']},
'hashtable': {'sources': ['hashtable.pyx', _khash_primitive_helper, _hashtable_class_helper, _hashtable_func_helper]},
Expand Down
4 changes: 3 additions & 1 deletion pandas/core/interchange/dataframe_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
Sequence,
)

from pandas._libs.buffer import CBuffer


class DlpackDeviceType(enum.IntEnum):
"""Integer enum for device type codes matching DLPack."""
Expand Down Expand Up @@ -118,7 +120,7 @@ class CategoricalDescription(TypedDict):
categories: Column | None


class Buffer(ABC):
class Buffer(ABC, CBuffer):
"""
Data in the buffer is guaranteed to be contiguous in memory.

Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/interchange/test_impl.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import datetime
import sys

import numpy as np
import pytest
Expand Down Expand Up @@ -340,3 +341,30 @@ def test_interchange_from_non_pandas_tz_aware(request):
dtype="datetime64[us, Asia/Kathmandu]",
)
tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize(
    "data,dtype,expected",
    [
        # One byte per element for the 8-bit wide dtypes.
        ([True, False, True], np.bool_, bytearray(b"\x01\x00\x01")),
        ([0, 1, 42], np.uint8, bytearray(b"\x00\x01\x2a")),
        ([-42, 0, 42], np.int8, bytearray(b"\xd6\x00\x2a")),
        # Four bytes per element; the byte order follows the host platform.
        (
            [-42, 0, 42],
            np.int32,
            bytearray(b"\xff\xff\xff\xd6\x00\x00\x00\x00\x00\x00\x00\x2a")
            if sys.byteorder == "big"
            else bytearray(b"\xd6\xff\xff\xff\x00\x00\x00\x00\x2a\x00\x00\x00"),
        ),
        # The expected bytes are those of "foo", "bar" and "baz" back to back.
        (
            ["foo", "bar", "baz"],
            "string",
            bytearray(b"\x66\x6f\x6f\x62\x61\x72\x62\x61\x7a"),
        ),
    ],
)
def test_buffer_buffer_protocol(data, dtype, expected):
    """
    Check that the "data" buffer of an interchange column can be wrapped in
    a memoryview and that its raw bytes match ``expected``.
    """
    frame = pd.DataFrame({"col": data}, dtype=dtype)
    column = frame.__dataframe__().get_column_by_name("col")

    # get_buffers()["data"] is indexed at 0 to obtain the buffer object.
    data_buffer = column.get_buffers()["data"][0]
    raw_bytes = bytearray(memoryview(data_buffer))

    assert raw_bytes == expected