diff --git a/pandas/_libs/buffer.pyi b/pandas/_libs/buffer.pyi
new file mode 100644
index 0000000000000..2c49b959e79ae
--- /dev/null
+++ b/pandas/_libs/buffer.pyi
@@ -0,0 +1 @@
+class CBuffer: ...
diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx
new file mode 100644
index 0000000000000..d444edeeedbf1
--- /dev/null
+++ b/pandas/_libs/buffer.pyx
@@ -0,0 +1,185 @@
+cimport cython
+from cpython cimport (
+    Py_buffer,
+    PyLong_FromVoidPtr,
+)
+from libc.stdint cimport (
+    int8_t,
+    int16_t,
+    int32_t,
+    int64_t,
+    uint8_t,
+    uint16_t,
+    uint32_t,
+    uint64_t,
+)
+
+ctypedef fused supported_buffer_t:
+    float
+    double
+    int8_t
+    int16_t
+    int32_t
+    int64_t
+    uint8_t
+    uint16_t
+    uint32_t
+    uint64_t
+
+
+cdef class CBuffer:
+    """
+    Data in the buffer is guaranteed to be contiguous in memory.
+
+    Note that there is no dtype attribute present, a buffer can be thought of
+    as simply a block of memory. However, if the column that the buffer is
+    attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
+    implemented, then that dtype information will be contained in the return
+    value from ``__dlpack__``.
+
+    This distinction is useful to support both (a) data exchange via DLPack on
+    a buffer and (b) dtypes like variable-length strings which do not have a
+    fixed number of bytes per element.
+    """
+
+    # we cannot use a fused type as a class attribute, so we instead
+    # unpack the items we need for the buffer protocol in __init__
+
+    cdef:
+        # Python-level view of the exported memory; holding a reference
+        # keeps the memory behind ptr_ alive as long as this object exists
+        object buf
+        void *ptr_
+        # size of the exported memory in bytes (not in elements)
+        Py_ssize_t len_
+        Py_ssize_t itemsize
+
+        bytes format
+        # stored inline: pointers into the memoryview argument of __init__
+        # would dangle as soon as __init__ returns
+        Py_ssize_t shape[1]
+        Py_ssize_t strides[1]
+
+    # with bounds checking disabled &buf[0] is plain pointer arithmetic, so
+    # taking the start address does not raise for a zero-length buffer
+    @cython.boundscheck(False)
+    def __init__(self, supported_buffer_t[:] buf, allow_copy: bool = True) -> None:
+        """
+        Handle only regular columns (= numpy arrays) for now.
+
+        Parameters
+        ----------
+        buf : 1-D memoryview of a supported numeric type
+            Memory to export through the buffer protocol.
+        allow_copy : bool, default True
+            Whether a contiguous copy may be made of a strided ``buf``.
+
+        Raises
+        ------
+        RuntimeError
+            If ``buf`` is not contiguous and ``allow_copy`` is False.
+        """
+        cdef Py_ssize_t itemsize = sizeof(supported_buffer_t)
+
+        # buffers with at most one element are contiguous whatever their stride
+        if buf.shape[0] > 1 and buf.strides[0] != itemsize:
+            # The protocol does not support strided buffers, so a copy is
+            # necessary. If that's not allowed, we need to raise an exception.
+            if allow_copy:
+                buf = buf.copy()
+            else:
+                raise RuntimeError(
+                    "Exports cannot be zero-copy in the case "
+                    "of a non-contiguous buffer"
+                )
+
+        # struct-module format code matching the element type
+        if supported_buffer_t is float:
+            self.format = b"f"
+        elif supported_buffer_t is double:
+            self.format = b"d"
+        elif supported_buffer_t is int8_t:
+            self.format = b"b"
+        elif supported_buffer_t is int16_t:
+            self.format = b"h"
+        elif supported_buffer_t is int32_t:
+            self.format = b"i"
+        elif supported_buffer_t is int64_t:
+            self.format = b"q"
+        elif supported_buffer_t is uint8_t:
+            self.format = b"B"
+        elif supported_buffer_t is uint16_t:
+            self.format = b"H"
+        elif supported_buffer_t is uint32_t:
+            self.format = b"I"
+        else:
+            self.format = b"Q"
+
+        self.buf = buf
+        self.ptr_ = <void *>&buf[0]
+        self.itemsize = itemsize
+        self.len_ = buf.shape[0] * itemsize
+        self.shape[0] = buf.shape[0]
+        self.strides[0] = itemsize
+
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        if self.buf is None:
+            # __init__ was never run, e.g. skipped by a subclass
+            raise BufferError("buffer has not been initialized")
+
+        buffer.buf = self.ptr_
+        buffer.format = self.format
+        buffer.itemsize = self.itemsize
+        buffer.len = self.len_
+        buffer.ndim = 1
+        # the exporter must own the view so that consumers keep it, and with
+        # it the underlying memory, alive
+        buffer.obj = self
+        # a non-const typed memoryview is only ever acquired from writable memory
+        buffer.readonly = 0
+        buffer.shape = self.shape
+        buffer.strides = self.strides
+        buffer.suboffsets = NULL
+        buffer.internal = NULL
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        # nothing to free: shape and strides live inside the object itself
+        pass
+
+    @property
+    def bufsize(self) -> int:
+        """
+        Buffer size in bytes.
+        """
+        return self.len_
+
+    @property
+    def ptr(self) -> int:
+        """
+        Pointer to start of the buffer as an integer.
+        """
+        return PyLong_FromVoidPtr(self.ptr_)
+
+    def __dlpack__(self):
+        """
+        Represent this structure as DLPack interface.
+        """
+        raise NotImplementedError
+
+    def __dlpack_device__(self):
+        """
+        Device type and device ID for where the data in the buffer resides.
+        """
+        raise NotImplementedError
+
+    def __repr__(self) -> str:
+        return (
+            f"{type(self).__name__}("
+            + str(
+                {
+                    "bufsize": self.bufsize,
+                    "ptr": self.ptr,
+                }
+            )
+            + ")"
+        )
diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build
index c27386743c6e9..ad55e4c50aa5d 100644
--- a/pandas/_libs/meson.build
+++ b/pandas/_libs/meson.build
@@ -63,6 +63,7 @@ libs_sources = {
     # numpy include dir is implicitly included
     'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper], 'deps': _khash_primitive_helper_dep},
     'arrays': {'sources': ['arrays.pyx']},
+    'buffer': {'sources': ['buffer.pyx']},
     'groupby': {'sources': ['groupby.pyx']},
     'hashing': {'sources': ['hashing.pyx']},
     'hashtable': {'sources': ['hashtable.pyx', _hashtable_class_helper, _hashtable_func_helper], 'deps': _khash_primitive_helper_dep},
diff --git a/pandas/core/interchange/dataframe_protocol.py b/pandas/core/interchange/dataframe_protocol.py
index 95e7b6a26f93a..4c7f9d49693dc 100644
--- a/pandas/core/interchange/dataframe_protocol.py
+++ b/pandas/core/interchange/dataframe_protocol.py
@@ -21,6 +21,8 @@
         Sequence,
     )
 
+from pandas._libs.buffer import CBuffer
+
 
 class DlpackDeviceType(enum.IntEnum):
     """Integer enum for device type codes matching DLPack."""
@@ -118,6 +120,6 @@ class CategoricalDescription(TypedDict):
     categories: Column | None
 
 
-class Buffer(ABC):
+class Buffer(CBuffer, ABC):
     """
     Data in the buffer is guaranteed to be contiguous in memory.
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
index e4fa6e4451a4c..e16ef69d7c45c 100644
--- a/pandas/tests/interchange/test_impl.py
+++ b/pandas/tests/interchange/test_impl.py
@@ -1,4 +1,5 @@
 from datetime import datetime
+import sys
 
 import numpy as np
 import pytest
@@ -355,6 +356,38 @@ def test_interchange_from_non_pandas_tz_aware(request):
     tm.assert_frame_equal(expected, result)
 
 
+@pytest.mark.parametrize(
+    "data,dtype,expected",
+    [
+        ([True, False, True], np.bool_, bytearray(b"\x01\x00\x01")),
+        ([0, 1, 42], np.uint8, bytearray(b"\x00\x01\x2a")),
+        ([-42, 0, 42], np.int8, bytearray(b"\xd6\x00\x2a")),
+        (
+            [-42, 0, 42],
+            np.int32,
+            bytearray(b"\xd6\xff\xff\xff\x00\x00\x00\x00\x2a\x00\x00\x00")
+            if sys.byteorder == "little"
+            else bytearray(b"\xff\xff\xff\xd6\x00\x00\x00\x00\x00\x00\x00\x2a"),
+        ),
+        (
+            ["foo", "bar", "baz"],
+            "string",
+            bytearray(b"\x66\x6f\x6f\x62\x61\x72\x62\x61\x7a"),
+        ),
+    ],
+)
+def test_buffer_buffer_protocol(data, dtype, expected):
+    # the data buffer of an interchange column must expose its raw bytes
+    # through the Python buffer protocol
+    df = pd.DataFrame({"col": data}, dtype=dtype)
+    col = df.__dataframe__().get_column_by_name("col")
+    data_buffer = col.get_buffers()["data"][0]
+    # bytearray() copies the bytes out, so the view can be released right away
+    with memoryview(data_buffer) as view:
+        result = bytearray(view)
+    assert result == expected
+
+
 def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
     # https://github.com/pandas-dev/pandas/issues/54781
     df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__()