
Commit 49e3ec4

is this...it?
1 parent f1a6cee commit 49e3ec4

5 files changed: +237 −42 lines changed


pandas/core/interchange/buffer.py (+72)

@@ -12,6 +12,7 @@
 
 if TYPE_CHECKING:
     import numpy as np
+    import pyarrow as pa
 
 
 class PandasBuffer(Buffer):
@@ -76,3 +77,74 @@ def __repr__(self) -> str:
             )
             + ")"
         )
+
+
+class PandasBufferPyarrow(Buffer):
+    """
+    Data in the buffer is guaranteed to be contiguous in memory.
+    """
+
+    def __init__(
+        self,
+        chunked_array: pa.ChunkedArray,
+        *,
+        is_validity: bool,
+        allow_copy: bool = True,
+    ) -> None:
+        """
+        Handle pyarrow chunked arrays.
+        """
+        if len(chunked_array.chunks) == 1:
+            arr = chunked_array.chunks[0]
+        else:
+            if not allow_copy:
+                raise RuntimeError(
+                    "Found multi-chunk pyarrow array, but `allow_copy` is False"
+                )
+            arr = chunked_array.combine_chunks()
+        if is_validity:
+            self._buffer = arr.buffers()[0]
+        else:
+            self._buffer = arr.buffers()[1]
+        self._length = len(arr)
+        self._dlpack = arr.__dlpack__
+        self._is_validity = is_validity
+
+    @property
+    def bufsize(self) -> int:
+        """
+        Buffer size in bytes.
+        """
+        return self._buffer.size
+
+    @property
+    def ptr(self) -> int:
+        """
+        Pointer to start of the buffer as an integer.
+        """
+        return self._buffer.address
+
+    def __dlpack__(self) -> Any:
+        """
+        Represent this structure as DLPack interface.
+        """
+        return self._dlpack()
+
+    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
+        """
+        Device type and device ID for where the data in the buffer resides.
+        """
+        return (DlpackDeviceType.CPU, None)
+
+    def __repr__(self) -> str:
+        return (
+            "PandasBuffer[pyarrow]("
+            + str(
+                {
+                    "bufsize": self.bufsize,
+                    "ptr": self.ptr,
+                    "device": self.__dlpack_device__()[0].name,
+                }
+            )
+            + ")"
+        )
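
Below is a minimal usage sketch (not part of this commit) of how the new PandasBufferPyarrow class surfaces through the interchange protocol. It assumes pyarrow is installed, and the printed reprs are illustrative only:

    import pandas as pd

    # A pyarrow-backed column with a missing value
    ser = pd.Series([1, 2, None], dtype="int64[pyarrow]")
    col = ser.to_frame("a").__dataframe__().get_column_by_name("a")

    # "data" maps to a (buffer, dtype) pair; the buffer is now a PandasBufferPyarrow
    data_buf, data_dtype = col.get_buffers()["data"]
    print(data_buf)  # PandasBuffer[pyarrow]({'bufsize': ..., 'ptr': ..., 'device': 'CPU'})
    print(data_buf.bufsize, data_buf.ptr, data_buf.__dlpack_device__())

The validity bitmap, when one exists, is exposed the same way under get_buffers()["validity"].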

pandas/core/interchange/column.py (+41 −8)

@@ -9,15 +9,18 @@
 from pandas.errors import NoBufferPresent
 from pandas.util._decorators import cache_readonly
 
-from pandas.core.dtypes.dtypes import (
+from pandas.core.dtypes.dtypes import BaseMaskedDtype
+
+import pandas as pd
+from pandas import (
     ArrowDtype,
-    BaseMaskedDtype,
     DatetimeTZDtype,
 )
-
-import pandas as pd
 from pandas.api.types import is_string_dtype
-from pandas.core.interchange.buffer import PandasBuffer
+from pandas.core.interchange.buffer import (
+    PandasBuffer,
+    PandasBufferPyarrow,
+)
 from pandas.core.interchange.dataframe_protocol import (
     Column,
     ColumnBuffers,
@@ -157,6 +160,9 @@ def _dtype_from_pandasdtype(self, dtype) -> tuple[DtypeKind, int, str, str]:
         else:
             byteorder = dtype.byteorder
 
+        if dtype == "bool[pyarrow]":
+            return kind, dtype.itemsize, ArrowCTypes.BOOL, byteorder
+
         return kind, dtype.itemsize * 8, dtype_to_arrow_c_fmt(dtype), byteorder
 
     @property
@@ -194,6 +200,12 @@ def describe_null(self):
             column_null_dtype = ColumnNullType.USE_BYTEMASK
             null_value = 1
             return column_null_dtype, null_value
+        if isinstance(self._col.dtype, ArrowDtype):
+            if all(
+                chunk.buffers()[0] is None for chunk in self._col.array._pa_array.chunks
+            ):
+                return ColumnNullType.NON_NULLABLE, None
+            return ColumnNullType.USE_BITMASK, 0
         kind = self.dtype[0]
         try:
             null, value = _NULL_DESCRIPTION[kind]
@@ -302,15 +314,26 @@ def _get_data_buffer(
             DtypeKind.FLOAT,
             DtypeKind.BOOL,
         ):
+            dtype = self.dtype
             arr = self._col.array
+            if isinstance(self._col.dtype, ArrowDtype):
+                arr = self._col.array
+                buffer = PandasBufferPyarrow(
+                    arr._pa_array, is_validity=False, allow_copy=self._allow_copy
+                )
+                if self.dtype[0] == DtypeKind.BOOL:
+                    dtype = (
+                        DtypeKind.BOOL,
+                        1,
+                        ArrowCTypes.BOOL,
+                        Endianness.NATIVE,
+                    )
+                return buffer, dtype
             if isinstance(self._col.dtype, BaseMaskedDtype):
                 np_arr = arr._data  # type: ignore[attr-defined]
-            elif isinstance(self._col.dtype, ArrowDtype):
-                raise NotImplementedError("ArrowDtype not handled yet")
             else:
                 np_arr = arr._ndarray  # type: ignore[attr-defined]
             buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
-            dtype = self.dtype
         elif self.dtype[0] == DtypeKind.CATEGORICAL:
             codes = self._col.values._codes
             buffer = PandasBuffer(codes, allow_copy=self._allow_copy)
@@ -351,6 +374,16 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
         """
         null, invalid = self.describe_null
 
+        if isinstance(self._col.dtype, ArrowDtype):
+            arr = self._col.array
+            dtype = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE)
+            if all(chunk.buffers()[0] is None for chunk in arr._pa_array.chunks):
+                return None
+            buffer = PandasBufferPyarrow(
+                arr._pa_array, is_validity=True, allow_copy=self._allow_copy
+            )
+            return buffer, dtype
+
         if isinstance(self._col.dtype, BaseMaskedDtype):
             mask = self._col.array._mask  # type: ignore[attr-defined]
             buffer = PandasBuffer(mask)
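
As a rough illustration of the describe_null and validity-buffer changes above (again assuming pyarrow is installed; the exact enum reprs may differ), a pyarrow-backed column only advertises a bitmask when at least one chunk actually carries a validity buffer:

    import pandas as pd

    # With a missing value: Arrow's validity bitmask is exposed, 0 marking nulls
    with_na = pd.Series([True, None], dtype="bool[pyarrow]").to_frame("a")
    col = with_na.__dataframe__().get_column_by_name("a")
    print(col.describe_null)              # (ColumnNullType.USE_BITMASK, 0)
    print(col.get_buffers()["validity"])  # (PandasBuffer[pyarrow](...), (DtypeKind.BOOL, 1, 'b', '='))

    # Without missing values: typically no validity buffer is allocated at all
    no_na = pd.Series([True, False], dtype="bool[pyarrow]").to_frame("a")
    col = no_na.__dataframe__().get_column_by_name("a")
    print(col.describe_null)              # (ColumnNullType.NON_NULLABLE, None)
    print(col.get_buffers()["validity"])  # None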

pandas/core/interchange/from_dataframe.py (+16 −13)

@@ -298,13 +298,14 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
 
     null_pos = None
     if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
-        assert buffers["validity"], "Validity buffers cannot be empty for masks"
-        valid_buff, valid_dtype = buffers["validity"]
-        null_pos = buffer_to_ndarray(
-            valid_buff, valid_dtype, offset=col.offset, length=col.size()
-        )
-        if sentinel_val == 0:
-            null_pos = ~null_pos
+        validity = buffers["validity"]
+        if validity is not None:
+            valid_buff, valid_dtype = validity
+            null_pos = buffer_to_ndarray(
+                valid_buff, valid_dtype, offset=col.offset, length=col.size()
+            )
+            if sentinel_val == 0:
+                null_pos = ~null_pos
 
     # Assemble the strings from the code units
     str_list: list[None | float | str] = [None] * col.size()
@@ -516,19 +517,21 @@ def set_nulls(
     np.ndarray or pd.Series
         Data with the nulls being set.
     """
+    if validity is None:
+        return data
     null_kind, sentinel_val = col.describe_null
     null_pos = None
 
     if null_kind == ColumnNullType.USE_SENTINEL:
         null_pos = pd.Series(data) == sentinel_val
     elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
-        assert validity, "Expected to have a validity buffer for the mask"
         valid_buff, valid_dtype = validity
-        null_pos = buffer_to_ndarray(
-            valid_buff, valid_dtype, offset=col.offset, length=col.size()
-        )
-        if sentinel_val == 0:
-            null_pos = ~null_pos
+        if valid_buff is not None:
+            null_pos = buffer_to_ndarray(
+                valid_buff, valid_dtype, offset=col.offset, length=col.size()
+            )
+            if sentinel_val == 0:
+                null_pos = ~null_pos
     elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
         pass
     else:
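
A rough end-to-end sketch of what the relaxed validity handling enables (assuming pyarrow is installed): before this commit, pyarrow-backed columns raised NotImplementedError in _get_data_buffer, so the path below was unreachable. Exact output will depend on the pandas version:

    import pandas as pd
    from pandas.api.interchange import from_dataframe

    df = pd.DataFrame(
        {
            "a": pd.array([1, 2, None], dtype="int64[pyarrow]"),
            "b": pd.array([True, False, None], dtype="bool[pyarrow]"),
        }
    )
    # Pass the interchange object (rather than df itself) to force the conversion path
    result = from_dataframe(df.__dataframe__())
    print(result)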

pandas/core/interchange/utils.py (+5 −1)

@@ -10,8 +10,9 @@
 
 from pandas._libs import lib
 
-from pandas.core.dtypes.dtypes import (
+from pandas import (
     ArrowDtype,
+    BooleanDtype,
     CategoricalDtype,
     DatetimeTZDtype,
 )
@@ -142,6 +143,9 @@ def dtype_to_arrow_c_fmt(dtype: DtypeObj) -> str:
     elif isinstance(dtype, DatetimeTZDtype):
         return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[0], tz=dtype.tz)
 
+    elif isinstance(dtype, BooleanDtype):
+        return ArrowCTypes.BOOL
+
     raise NotImplementedError(
         f"Conversion of {dtype} to Arrow C format string is not implemented."
    )
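
The utils.py hunk can be exercised directly; dtype_to_arrow_c_fmt is a private helper, so this is an illustrative check rather than public API usage. With this change, pandas' nullable BooleanDtype maps to Arrow's boolean format string:

    import pandas as pd
    from pandas.core.interchange.utils import dtype_to_arrow_c_fmt

    print(dtype_to_arrow_c_fmt(pd.BooleanDtype()))  # "b" (ArrowCTypes.BOOL)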
