From 0ac11efc20f73699292ce91820d64604f30dd70c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 24 Oct 2023 18:58:25 -0400 Subject: [PATCH 01/19] Implement Buffer Protocol for PandasBuffer --- pandas/_libs/meson.build | 1 + pandas/core/interchange/dataframe_protocol.py | 4 ++- pandas/tests/interchange/test_impl.py | 28 +++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/meson.build b/pandas/_libs/meson.build index b4662d6bf8dd2..ac30f35a94611 100644 --- a/pandas/_libs/meson.build +++ b/pandas/_libs/meson.build @@ -63,6 +63,7 @@ libs_sources = { # numpy include dir is implicitly included 'algos': {'sources': ['algos.pyx', _algos_common_helper, _algos_take_helper, _khash_primitive_helper]}, 'arrays': {'sources': ['arrays.pyx']}, + 'buffer': {'sources': ['buffer.pyx']}, 'groupby': {'sources': ['groupby.pyx']}, 'hashing': {'sources': ['hashing.pyx']}, 'hashtable': {'sources': ['hashtable.pyx', _khash_primitive_helper, _hashtable_class_helper, _hashtable_func_helper]}, diff --git a/pandas/core/interchange/dataframe_protocol.py b/pandas/core/interchange/dataframe_protocol.py index 95e7b6a26f93a..bdcee68e67e39 100644 --- a/pandas/core/interchange/dataframe_protocol.py +++ b/pandas/core/interchange/dataframe_protocol.py @@ -21,6 +21,8 @@ Sequence, ) +from pandas._libs.buffer import CBuffer + class DlpackDeviceType(enum.IntEnum): """Integer enum for device type codes matching DLPack.""" @@ -118,7 +120,7 @@ class CategoricalDescription(TypedDict): categories: Column | None -class Buffer(ABC): +class Buffer(ABC, CBuffer): """ Data in the buffer is guaranteed to be contiguous in memory. diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 5dbc0156816aa..3a8e5da5024fb 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -1,4 +1,5 @@ from datetime import datetime +import sys import numpy as np import pytest @@ -340,3 +341,30 @@ def test_interchange_from_non_pandas_tz_aware(request): dtype="datetime64[us, Asia/Kathmandu]", ) tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize( + "data,dtype,expected", + [ + ([True, False, True], np.bool_, bytearray(b"\x01\x00\x01")), + ([0, 1, 42], np.uint8, bytearray(b"\x00\x01\x2a")), + ([-42, 0, 42], np.int8, bytearray(b"\xd6\x00\x2a")), + ( + [-42, 0, 42], + np.int32, + bytearray(b"\xd6\xff\xff\xff\x00\x00\x00\x00\x2a\x00\x00\x00") + if sys.byteorder == "little" + else bytearray(b"\xff\xff\xff\xd6\x00\x00\x00\x00\x00\x00\x00\x2a"), + ), + ( + ["foo", "bar", "baz"], + "string", + bytearray(b"\x66\x6f\x6f\x62\x61\x72\x62\x61\x7a"), + ), + ], +) +def test_buffer_buffer_protocol(data, dtype, expected): + df = pd.DataFrame({"col": data}, dtype=dtype) + col = df.__dataframe__().get_column_by_name("col") + result = bytearray(memoryview(col.get_buffers()["data"][0])) + assert result == expected From 28ad6d88c359cf661984a9092ae09176c154b8d4 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 24 Oct 2023 19:00:58 -0400 Subject: [PATCH 02/19] Add buffer.pyx --- pandas/_libs/buffer.pyx | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 pandas/_libs/buffer.pyx diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx new file mode 100644 index 0000000000000..d5c97f83b88fd --- /dev/null +++ b/pandas/_libs/buffer.pyx @@ -0,0 +1,29 @@ +from cpython cimport Py_buffer +from libc.stdint cimport ( + uint8_t, + uintptr_t, +) + + +cdef class CBuffer: + def __getbuffer__(self, Py_buffer *buffer, int flags): + cdef Py_ssize_t itemsize = sizeof(uint8_t) + cdef uintptr_t ptr = self.ptr + cdef Py_ssize_t[1] shape = tuple((self.bufsize // itemsize,)) + cdef Py_ssize_t[1] strides = tuple((itemsize,)) + + buffer.buf = ptr + # assumes sizeof(unsigned char) == sizeof(uint8_t) + # TODO: use C11 static_assert macro in Cython + buffer.format = "@B" + buffer.itemsize = itemsize + buffer.len = self.bufsize + buffer.ndim = 1 + buffer.obj = self + buffer.readonly = 1 + buffer.shape = shape + buffer.strides = strides + buffer.suboffsets = NULL + + def __releasebuffer__(self, Py_buffer *buffer): + pass From 8878fae1c47e8bfc806b4238e5a70bc0738507eb Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 25 Oct 2023 14:44:32 -0400 Subject: [PATCH 03/19] switch mro --- pandas/core/interchange/dataframe_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/interchange/dataframe_protocol.py b/pandas/core/interchange/dataframe_protocol.py index bdcee68e67e39..4c7f9d49693dc 100644 --- a/pandas/core/interchange/dataframe_protocol.py +++ b/pandas/core/interchange/dataframe_protocol.py @@ -120,7 +120,7 @@ class CategoricalDescription(TypedDict): categories: Column | None -class Buffer(ABC, CBuffer): +class Buffer(CBuffer, ABC): """ Data in the buffer is guaranteed to be contiguous in memory. From 7a65b966053d5164dc94a3f7c2572b11d9915b9c Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 25 Oct 2023 16:58:13 -0400 Subject: [PATCH 04/19] try PyLong_AsVoidPtr --- pandas/_libs/buffer.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx index d5c97f83b88fd..2fa19080b6768 100644 --- a/pandas/_libs/buffer.pyx +++ b/pandas/_libs/buffer.pyx @@ -1,18 +1,17 @@ -from cpython cimport Py_buffer -from libc.stdint cimport ( - uint8_t, - uintptr_t, +from cpython cimport ( + Py_buffer, + PyLong_AsVoidPtr, ) +from libc.stdint cimport uint8_t cdef class CBuffer: def __getbuffer__(self, Py_buffer *buffer, int flags): cdef Py_ssize_t itemsize = sizeof(uint8_t) - cdef uintptr_t ptr = self.ptr cdef Py_ssize_t[1] shape = tuple((self.bufsize // itemsize,)) cdef Py_ssize_t[1] strides = tuple((itemsize,)) - buffer.buf = ptr + buffer.buf = PyLong_AsVoidPtr(self.ptr) # assumes sizeof(unsigned char) == sizeof(uint8_t) # TODO: use C11 static_assert macro in Cython buffer.format = "@B" From 7e1923e11286db40fa7f46d3f32066c229f6e64f Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 25 Oct 2023 20:05:27 -0400 Subject: [PATCH 05/19] fix pyright --- pandas/_libs/buffer.pyi | 1 + 1 file changed, 1 insertion(+) create mode 100644 pandas/_libs/buffer.pyi diff --git a/pandas/_libs/buffer.pyi b/pandas/_libs/buffer.pyi new file mode 100644 index 0000000000000..2c49b959e79ae --- /dev/null +++ b/pandas/_libs/buffer.pyi @@ -0,0 +1 @@ +class CBuffer: ... From 41ae1a783121a9d388f45f2d9d962cc57cc33c28 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 26 Oct 2023 14:16:43 -0400 Subject: [PATCH 06/19] try simpler tests --- pandas/_libs/buffer.pyx | 2 +- pandas/tests/interchange/test_impl.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx index 2fa19080b6768..8c18402eb8a76 100644 --- a/pandas/_libs/buffer.pyx +++ b/pandas/_libs/buffer.pyx @@ -14,7 +14,7 @@ cdef class CBuffer: buffer.buf = PyLong_AsVoidPtr(self.ptr) # assumes sizeof(unsigned char) == sizeof(uint8_t) # TODO: use C11 static_assert macro in Cython - buffer.format = "@B" + buffer.format = "B" buffer.itemsize = itemsize buffer.len = self.bufsize buffer.ndim = 1 diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 3a8e5da5024fb..fe5cd4d8c7040 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -1,5 +1,4 @@ from datetime import datetime -import sys import numpy as np import pytest @@ -349,6 +348,7 @@ def test_interchange_from_non_pandas_tz_aware(request): ([True, False, True], np.bool_, bytearray(b"\x01\x00\x01")), ([0, 1, 42], np.uint8, bytearray(b"\x00\x01\x2a")), ([-42, 0, 42], np.int8, bytearray(b"\xd6\x00\x2a")), + """ ( [-42, 0, 42], np.int32, @@ -361,6 +361,7 @@ def test_interchange_from_non_pandas_tz_aware(request): "string", bytearray(b"\x66\x6f\x6f\x62\x61\x72\x62\x61\x7a"), ), + """, ], ) def test_buffer_buffer_protocol(data, dtype, expected): From f003ebcdbbc5cdacfc12e51a5953b86c146d0803 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 26 Oct 2023 15:55:43 -0400 Subject: [PATCH 07/19] fix parametrization --- pandas/tests/interchange/test_impl.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index fe5cd4d8c7040..ce398af73ebaf 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -348,20 +348,6 @@ def test_interchange_from_non_pandas_tz_aware(request): ([True, False, True], np.bool_, bytearray(b"\x01\x00\x01")), ([0, 1, 42], np.uint8, bytearray(b"\x00\x01\x2a")), ([-42, 0, 42], np.int8, bytearray(b"\xd6\x00\x2a")), - """ - ( - [-42, 0, 42], - np.int32, - bytearray(b"\xd6\xff\xff\xff\x00\x00\x00\x00\x2a\x00\x00\x00") - if sys.byteorder == "little" - else bytearray(b"\xff\xff\xff\xd6\x00\x00\x00\x00\x00\x00\x00\x2a"), - ), - ( - ["foo", "bar", "baz"], - "string", - bytearray(b"\x66\x6f\x6f\x62\x61\x72\x62\x61\x7a"), - ), - """, ], ) def test_buffer_buffer_protocol(data, dtype, expected): From 572f358add924d4fc448f405842f29f1753b5d38 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Fri, 27 Oct 2023 13:38:39 -0400 Subject: [PATCH 08/19] try PyLong_Check --- pandas/_libs/buffer.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx index 8c18402eb8a76..17c27ac03b0e3 100644 --- a/pandas/_libs/buffer.pyx +++ b/pandas/_libs/buffer.pyx @@ -1,6 +1,7 @@ from cpython cimport ( Py_buffer, PyLong_AsVoidPtr, + PyLong_Check, ) from libc.stdint cimport uint8_t @@ -11,7 +12,10 @@ cdef class CBuffer: cdef Py_ssize_t[1] shape = tuple((self.bufsize // itemsize,)) cdef Py_ssize_t[1] strides = tuple((itemsize,)) - buffer.buf = PyLong_AsVoidPtr(self.ptr) + if PyLong_Check(self.ptr): + buffer.buf = PyLong_AsVoidPtr(self.ptr) + else: + raise TypeError("self.ptr must be an integer") # assumes sizeof(unsigned char) == sizeof(uint8_t) # TODO: use C11 static_assert macro in Cython buffer.format = "B" From c1fc642ff7a7a3594ca971002fa31c0839155350 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 30 Oct 2023 20:04:39 -0400 Subject: [PATCH 09/19] try intptr_t --- pandas/_libs/buffer.pyx | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx index 17c27ac03b0e3..a71604f098064 100644 --- a/pandas/_libs/buffer.pyx +++ b/pandas/_libs/buffer.pyx @@ -1,9 +1,8 @@ -from cpython cimport ( - Py_buffer, - PyLong_AsVoidPtr, - PyLong_Check, +from cpython cimport Py_buffer +from libc.stdint cimport ( + intptr_t, + uint8_t, ) -from libc.stdint cimport uint8_t cdef class CBuffer: @@ -11,11 +10,8 @@ cdef class CBuffer: cdef Py_ssize_t itemsize = sizeof(uint8_t) cdef Py_ssize_t[1] shape = tuple((self.bufsize // itemsize,)) cdef Py_ssize_t[1] strides = tuple((itemsize,)) - - if PyLong_Check(self.ptr): - buffer.buf = PyLong_AsVoidPtr(self.ptr) - else: - raise TypeError("self.ptr must be an integer") + cdef intptr_t bufaddr = self.ptr + buffer.buf = bufaddr # assumes sizeof(unsigned char) == sizeof(uint8_t) # TODO: use C11 static_assert macro in Cython buffer.format = "B" From 55fd4ea5a2e51736391faad1e3f6103ec565f659 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 30 Oct 2023 21:38:00 -0400 Subject: [PATCH 10/19] heap store --- pandas/_libs/buffer.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx index a71604f098064..89de612e566ee 100644 --- a/pandas/_libs/buffer.pyx +++ b/pandas/_libs/buffer.pyx @@ -6,12 +6,14 @@ from libc.stdint cimport ( cdef class CBuffer: + cdef intptr_t bufaddr + def __getbuffer__(self, Py_buffer *buffer, int flags): cdef Py_ssize_t itemsize = sizeof(uint8_t) cdef Py_ssize_t[1] shape = tuple((self.bufsize // itemsize,)) cdef Py_ssize_t[1] strides = tuple((itemsize,)) - cdef intptr_t bufaddr = self.ptr - buffer.buf = bufaddr + self.bufaddr = self.ptr + buffer.buf = self.bufaddr # assumes sizeof(unsigned char) == sizeof(uint8_t) # TODO: use C11 static_assert macro in Cython buffer.format = "B" From 7592e3423c0d2fdaefffb98f16f058a4248dc3df Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 31 Oct 2023 21:53:10 -0400 Subject: [PATCH 11/19] try changing buf reference to numpy array --- pandas/_libs/buffer.pyx | 11 +++-------- pandas/tests/interchange/test_impl.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx index 89de612e566ee..a8d9ed30149dc 100644 --- a/pandas/_libs/buffer.pyx +++ b/pandas/_libs/buffer.pyx @@ -6,25 +6,20 @@ from libc.stdint cimport ( cdef class CBuffer: - cdef intptr_t bufaddr - def __getbuffer__(self, Py_buffer *buffer, int flags): cdef Py_ssize_t itemsize = sizeof(uint8_t) cdef Py_ssize_t[1] shape = tuple((self.bufsize // itemsize,)) cdef Py_ssize_t[1] strides = tuple((itemsize,)) - self.bufaddr = self.ptr - buffer.buf = self.bufaddr + cdef intptr_t bufaddr = self.ptr + buffer.buf = bufaddr # assumes sizeof(unsigned char) == sizeof(uint8_t) # TODO: use C11 static_assert macro in Cython buffer.format = "B" buffer.itemsize = itemsize buffer.len = self.bufsize buffer.ndim = 1 - buffer.obj = self + buffer.obj = self._x buffer.readonly = 1 buffer.shape = shape buffer.strides = strides buffer.suboffsets = NULL - - def __releasebuffer__(self, Py_buffer *buffer): - pass diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index ce398af73ebaf..3a8e5da5024fb 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -1,4 +1,5 @@ from datetime import datetime +import sys import numpy as np import pytest @@ -348,6 +349,18 @@ def test_interchange_from_non_pandas_tz_aware(request): ([True, False, True], np.bool_, bytearray(b"\x01\x00\x01")), ([0, 1, 42], np.uint8, bytearray(b"\x00\x01\x2a")), ([-42, 0, 42], np.int8, bytearray(b"\xd6\x00\x2a")), + ( + [-42, 0, 42], + np.int32, + bytearray(b"\xd6\xff\xff\xff\x00\x00\x00\x00\x2a\x00\x00\x00") + if sys.byteorder == "little" + else bytearray(b"\xff\xff\xff\xd6\x00\x00\x00\x00\x00\x00\x00\x2a"), + ), + ( + ["foo", "bar", "baz"], + "string", + bytearray(b"\x66\x6f\x6f\x62\x61\x72\x62\x61\x7a"), + ), ], ) def test_buffer_buffer_protocol(data, dtype, expected): From f1f911c5237c315e2f80aaf8ed21efd9ab2d6dc3 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 5 Dec 2023 15:24:23 -0800 Subject: [PATCH 12/19] try as voidptr --- pandas/_libs/buffer.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx index a8d9ed30149dc..c4a54a6d5afec 100644 --- a/pandas/_libs/buffer.pyx +++ b/pandas/_libs/buffer.pyx @@ -1,8 +1,8 @@ -from cpython cimport Py_buffer -from libc.stdint cimport ( - intptr_t, - uint8_t, +from cpython cimport ( + Py_buffer, + PyLong_AsVoidPtr, ) +from libc.stdint cimport uint8_t cdef class CBuffer: @@ -10,8 +10,7 @@ cdef class CBuffer: cdef Py_ssize_t itemsize = sizeof(uint8_t) cdef Py_ssize_t[1] shape = tuple((self.bufsize // itemsize,)) cdef Py_ssize_t[1] strides = tuple((itemsize,)) - cdef intptr_t bufaddr = self.ptr - buffer.buf = bufaddr + buffer.buf = PyLong_AsVoidPtr(self.ptr) # assumes sizeof(unsigned char) == sizeof(uint8_t) # TODO: use C11 static_assert macro in Cython buffer.format = "B" From 6e5a6cd06868059bda8e8e2f60466fae82e45e4b Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Thu, 4 Jan 2024 15:27:06 -0500 Subject: [PATCH 13/19] add releasebuffer --- pandas/_libs/buffer.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx index c4a54a6d5afec..b2d8ce50ed6b1 100644 --- a/pandas/_libs/buffer.pyx +++ b/pandas/_libs/buffer.pyx @@ -6,6 +6,7 @@ from libc.stdint cimport uint8_t cdef class CBuffer: + def __getbuffer__(self, Py_buffer *buffer, int flags): cdef Py_ssize_t itemsize = sizeof(uint8_t) cdef Py_ssize_t[1] shape = tuple((self.bufsize // itemsize,)) @@ -17,8 +18,11 @@ cdef class CBuffer: buffer.itemsize = itemsize buffer.len = self.bufsize buffer.ndim = 1 - buffer.obj = self._x + buffer.obj = self buffer.readonly = 1 buffer.shape = shape buffer.strides = strides buffer.suboffsets = NULL + + def __releasebuffer__(self, Py_buffer *buffer): + pass From c45270f7e482b3cadfb8a50b154c85f2c19998f6 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 6 Jan 2024 12:11:44 -0500 Subject: [PATCH 14/19] try adding handle --- pandas/_libs/buffer.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx index b2d8ce50ed6b1..bde318afccf2f 100644 --- a/pandas/_libs/buffer.pyx +++ b/pandas/_libs/buffer.pyx @@ -6,6 +6,7 @@ from libc.stdint cimport uint8_t cdef class CBuffer: + cdef object _x_handle def __getbuffer__(self, Py_buffer *buffer, int flags): cdef Py_ssize_t itemsize = sizeof(uint8_t) @@ -23,6 +24,7 @@ cdef class CBuffer: buffer.shape = shape buffer.strides = strides buffer.suboffsets = NULL + self._x_handle = self._x def __releasebuffer__(self, Py_buffer *buffer): pass From 0324242eac5cd93724f6d048c88ab5c55e7eea6e Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Sat, 6 Jan 2024 12:54:02 -0500 Subject: [PATCH 15/19] remove handle --- pandas/_libs/buffer.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx index bde318afccf2f..04bc1de5312c9 100644 --- a/pandas/_libs/buffer.pyx +++ b/pandas/_libs/buffer.pyx @@ -6,8 +6,6 @@ from libc.stdint cimport uint8_t cdef class CBuffer: - cdef object _x_handle - def __getbuffer__(self, Py_buffer *buffer, int flags): cdef Py_ssize_t itemsize = sizeof(uint8_t) cdef Py_ssize_t[1] shape = tuple((self.bufsize // itemsize,)) @@ -24,7 +22,6 @@ cdef class CBuffer: buffer.shape = shape buffer.strides = strides buffer.suboffsets = NULL - self._x_handle = self._x def __releasebuffer__(self, Py_buffer *buffer): pass From 5e7a4de2e4c000b6f4bcbc174f85d540ed199618 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Mon, 8 Jan 2024 08:43:58 -0500 Subject: [PATCH 16/19] lifecycle hack --- pandas/tests/interchange/test_impl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index e66fcaf2211e5..d2a874ac859c8 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -360,6 +360,8 @@ def test_buffer_buffer_protocol(data, dtype, expected): col = df.__dataframe__().get_column_by_name("col") result = bytearray(memoryview(col.get_buffers()["data"][0])) assert result == expected + # hack to keep df lifecycle around + len(df) def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: From 2e86d6036abbc7e7b30f9cd05810b66d38d798fc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Wed, 10 Jan 2024 14:51:00 -0500 Subject: [PATCH 17/19] try buffer copy --- pandas/tests/interchange/test_impl.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index d2a874ac859c8..02b94dc181bc8 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -358,10 +358,8 @@ def test_interchange_from_non_pandas_tz_aware(request): def test_buffer_buffer_protocol(data, dtype, expected): df = pd.DataFrame({"col": data}, dtype=dtype) col = df.__dataframe__().get_column_by_name("col") - result = bytearray(memoryview(col.get_buffers()["data"][0])) + result = bytearray(memoryview(col.get_buffers()["data"][0])).copy() assert result == expected - # hack to keep df lifecycle around - len(df) def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: From 553438b65918575265d3eb0b35677fd649758ffc Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 27 Feb 2024 17:50:13 -0500 Subject: [PATCH 18/19] move buffer to C --- pandas/_libs/buffer.pyx | 118 ++++++++++++++++++++++++++++++++++------ 1 file changed, 102 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx index 04bc1de5312c9..c71bddd1d85ff 100644 --- a/pandas/_libs/buffer.pyx +++ b/pandas/_libs/buffer.pyx @@ -1,27 +1,113 @@ -from cpython cimport ( - Py_buffer, - PyLong_AsVoidPtr, +from cpython cimport Py_buffer +from libc.stdint cimport ( + int8_t, + int16_t, + int32_t, + int64_t, + uint8_t, + uint16_t, + uint32_t, + uint64_t, ) -from libc.stdint cimport uint8_t +ctypedef fused supported_buffer_t: + float + double + int8_t + int16_t + int32_t + int64_t + uint8_t + uint16_t + uint32_t + uint64_t + + +cdef class PandasBuffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + + def __init__(self, supported_buffer_t[:] buf, allow_copy: bool = True) -> None: + """ + Handle only regular columns (= numpy arrays) for now. + """ + if buf.strides[0] and not buf.strides == (buf.dtype.itemsize,): + # The protocol does not support strided buffers, so a copy is + # necessary. If that's not allowed, we need to raise an exception. + if allow_copy: + buf = buf.copy() + else: + raise RuntimeError( + "Exports cannot be zero-copy in the case " + "of a non-contiguous buffer" + ) + + # Store the numpy array in which the data resides as a private + # attribute, so we can use it to retrieve the public attributes + self.buf = buf -cdef class CBuffer: def __getbuffer__(self, Py_buffer *buffer, int flags): - cdef Py_ssize_t itemsize = sizeof(uint8_t) - cdef Py_ssize_t[1] shape = tuple((self.bufsize // itemsize,)) - cdef Py_ssize_t[1] strides = tuple((itemsize,)) - buffer.buf = PyLong_AsVoidPtr(self.ptr) + buffer.buf = &self.buf[0] # assumes sizeof(unsigned char) == sizeof(uint8_t) # TODO: use C11 static_assert macro in Cython - buffer.format = "B" - buffer.itemsize = itemsize - buffer.len = self.bufsize - buffer.ndim = 1 + buffer.format = self.buf.format + buffer.itemsize = self.buf.itemsize + buffer.len = len(self.buf) + buffer.ndim = self.buf.ndim buffer.obj = self buffer.readonly = 1 - buffer.shape = shape - buffer.strides = strides - buffer.suboffsets = NULL + buffer.shape = self.buf.shape + buffer.strides = self.buf.strides + buffer.suboffsets = self.buf.suboffsets def __releasebuffer__(self, Py_buffer *buffer): pass + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self.buf.size * self.buf.itemsize + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return &self.buf[0] + + def __dlpack__(self): + """ + Represent this structure as DLPack interface. + """ + raise NotImplementedError + + def __dlpack_device__(self): + """ + Device type and device ID for where the data in the buffer resides. + """ + raise NotImplementedError + + def __repr__(self) -> str: + return ( + "PandasBuffer(" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + } + ) + + ")" + ) From 9e15c5b66c2491eb2745466cb75d121e8e1db797 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 27 Feb 2024 18:32:34 -0500 Subject: [PATCH 19/19] Some hacks --- pandas/_libs/buffer.pyx | 51 +++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/buffer.pyx b/pandas/_libs/buffer.pyx index c71bddd1d85ff..d444edeeedbf1 100644 --- a/pandas/_libs/buffer.pyx +++ b/pandas/_libs/buffer.pyx @@ -1,4 +1,7 @@ -from cpython cimport Py_buffer +from cpython cimport ( + Py_buffer, + PyLong_FromVoidPtr, +) from libc.stdint cimport ( int8_t, int16_t, @@ -38,6 +41,21 @@ cdef class PandasBuffer: fixed number of bytes per element. """ + # we cannot use a fused type as a class attribute, so we instead + # unpack the items we need for the buffer protocol in __init__ + + cdef: + void *ptr_ + Py_ssize_t len_ + Py_ssize_t itemsize + + int readonly + int ndim + bytes format + Py_ssize_t *shape + Py_ssize_t *strides + Py_ssize_t *suboffsets + def __init__(self, supported_buffer_t[:] buf, allow_copy: bool = True) -> None: """ Handle only regular columns (= numpy arrays) for now. @@ -56,20 +74,29 @@ cdef class PandasBuffer: # Store the numpy array in which the data resides as a private # attribute, so we can use it to retrieve the public attributes self.buf = buf + self.ptr_ = &buf[0] + self.len_ = len(buf) + self.itemsize = buf.itemsize + self.readonly = buf.readonly + self.ndim = buf.ndim + self.format = buf.format + self.shape = buf.shape + self.strides = buf.strides + self.suboffsets = buf.suboffsets def __getbuffer__(self, Py_buffer *buffer, int flags): - buffer.buf = &self.buf[0] + buffer.buf = self.ptr_ # assumes sizeof(unsigned char) == sizeof(uint8_t) # TODO: use C11 static_assert macro in Cython - buffer.format = self.buf.format - buffer.itemsize = self.buf.itemsize - buffer.len = len(self.buf) - buffer.ndim = self.buf.ndim - buffer.obj = self - buffer.readonly = 1 - buffer.shape = self.buf.shape - buffer.strides = self.buf.strides - buffer.suboffsets = self.buf.suboffsets + buffer.format = self.format + buffer.itemsize = self.itemsize + buffer.len = self.len_ + buffer.ndim = self.ndim + buffer.obj = self.obj + buffer.readonly = self.readonly + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = self.suboffsets def __releasebuffer__(self, Py_buffer *buffer): pass @@ -86,7 +113,7 @@ cdef class PandasBuffer: """ Pointer to start of the buffer as an integer. """ - return &self.buf[0] + return PyLong_FromVoidPtr(self.ptr_) def __dlpack__(self): """