Skip to content

Commit a2b6f12

Browse files
MarcoGorelli
authored and
im-vinicius
committed
BUG: interchange bitmasks not supported in interchange/from_dataframe.py (pandas-dev#52824)
* support bitmasks in interchange
* remove dead code
* fixup for slice, add tests
* tighten typing
* reduce diff
* post-merge fixup
* add new whatsnew note
* move to 2.0.2
* revert
---------
Co-authored-by: MarcoGorelli <>
1 parent 52e8c11 commit a2b6f12

File tree

3 files changed

+62
-70
lines changed

3 files changed

+62
-70
lines changed

doc/source/whatsnew/v2.0.2.rst

+2
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ Fixed regressions
2020

2121
Bug fixes
2222
~~~~~~~~~
23+
- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`)
24+
- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
2325
-
2426

2527
.. ---------------------------------------------------------------------------

pandas/core/interchange/from_dataframe.py

+34-70
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import numpy as np
88

9+
from pandas.compat._optional import import_optional_dependency
10+
911
import pandas as pd
1012
from pandas.core.interchange.dataframe_protocol import (
1113
Buffer,
@@ -23,7 +25,7 @@
2325
DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
2426
DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
2527
DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
26-
DtypeKind.BOOL: {8: bool},
28+
DtypeKind.BOOL: {1: bool, 8: bool},
2729
}
2830

2931

@@ -154,7 +156,9 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
154156
buffers = col.get_buffers()
155157

156158
data_buff, data_dtype = buffers["data"]
157-
data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size())
159+
data = buffer_to_ndarray(
160+
data_buff, data_dtype, offset=col.offset, length=col.size()
161+
)
158162

159163
data = set_nulls(data, col, buffers["validity"])
160164
return data, buffers
@@ -192,7 +196,9 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
192196
buffers = col.get_buffers()
193197

194198
codes_buff, codes_dtype = buffers["data"]
195-
codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size())
199+
codes = buffer_to_ndarray(
200+
codes_buff, codes_dtype, offset=col.offset, length=col.size()
201+
)
196202

197203
# Doing module in order to not get ``IndexError`` for
198204
# out-of-bounds sentinel values in `codes`
@@ -252,7 +258,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
252258
Endianness.NATIVE,
253259
)
254260
# Specify zero offset as we don't want to chunk the string data
255-
data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size())
261+
data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)
256262

257263
# Retrieve the offsets buffer containing the index offsets demarcating
258264
# the beginning and the ending of each string
@@ -261,14 +267,16 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
261267
# meaning that it has more elements than in the data buffer, do `col.size() + 1`
262268
# here to pass a proper offsets buffer size
263269
offsets = buffer_to_ndarray(
264-
offset_buff, offset_dtype, col.offset, length=col.size() + 1
270+
offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
265271
)
266272

267273
null_pos = None
268274
if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
269275
assert buffers["validity"], "Validity buffers cannot be empty for masks"
270276
valid_buff, valid_dtype = buffers["validity"]
271-
null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size())
277+
null_pos = buffer_to_ndarray(
278+
valid_buff, valid_dtype, offset=col.offset, length=col.size()
279+
)
272280
if sentinel_val == 0:
273281
null_pos = ~null_pos
274282

@@ -356,8 +364,8 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
356364
getattr(ArrowCTypes, f"UINT{dtype[1]}"),
357365
Endianness.NATIVE,
358366
),
359-
col.offset,
360-
col.size(),
367+
offset=col.offset,
368+
length=col.size(),
361369
)
362370

363371
data = parse_datetime_format_str(format_str, data)
@@ -368,8 +376,9 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
368376
def buffer_to_ndarray(
369377
buffer: Buffer,
370378
dtype: tuple[DtypeKind, int, str, str],
379+
*,
380+
length: int,
371381
offset: int = 0,
372-
length: int | None = None,
373382
) -> np.ndarray:
374383
"""
375384
Build a NumPy array from the passed buffer.
@@ -406,74 +415,27 @@ def buffer_to_ndarray(
406415
# and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
407416
# it since https://github.com/numpy/numpy/pull/19083
408417
ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
409-
data_pointer = ctypes.cast(
410-
buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
411-
)
412418

413419
if bit_width == 1:
414420
assert length is not None, "`length` must be specified for a bit-mask buffer."
415-
arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,))
416-
return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8)
421+
pa = import_optional_dependency("pyarrow")
422+
arr = pa.BooleanArray.from_buffers(
423+
pa.bool_(),
424+
length,
425+
[None, pa.foreign_buffer(buffer.ptr, length)],
426+
offset=offset,
427+
)
428+
return np.asarray(arr)
417429
else:
430+
data_pointer = ctypes.cast(
431+
buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
432+
)
418433
return np.ctypeslib.as_array(
419-
data_pointer, shape=(buffer.bufsize // (bit_width // 8),)
434+
data_pointer,
435+
shape=(length,),
420436
)
421437

422438

423-
def bitmask_to_bool_ndarray(
424-
bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0
425-
) -> np.ndarray:
426-
"""
427-
Convert bit-mask to a boolean NumPy array.
428-
429-
Parameters
430-
----------
431-
bitmask : np.ndarray[uint8]
432-
NumPy array of uint8 dtype representing the bitmask.
433-
mask_length : int
434-
Number of elements in the mask to interpret.
435-
first_byte_offset : int, default: 0
436-
Number of elements to offset from the start of the first byte.
437-
438-
Returns
439-
-------
440-
np.ndarray[bool]
441-
"""
442-
bytes_to_skip = first_byte_offset // 8
443-
bitmask = bitmask[bytes_to_skip:]
444-
first_byte_offset %= 8
445-
446-
bool_mask = np.zeros(mask_length, dtype=bool)
447-
448-
# Processing the first byte separately as it has its own offset
449-
val = bitmask[0]
450-
mask_idx = 0
451-
bits_in_first_byte = min(8 - first_byte_offset, mask_length)
452-
for j in range(bits_in_first_byte):
453-
if val & (1 << (j + first_byte_offset)):
454-
bool_mask[mask_idx] = True
455-
mask_idx += 1
456-
457-
# `mask_length // 8` describes how many full bytes to process
458-
for i in range((mask_length - bits_in_first_byte) // 8):
459-
# doing `+ 1` as we already processed the first byte
460-
val = bitmask[i + 1]
461-
for j in range(8):
462-
if val & (1 << j):
463-
bool_mask[mask_idx] = True
464-
mask_idx += 1
465-
466-
if len(bitmask) > 1:
467-
# Processing reminder of last byte
468-
val = bitmask[-1]
469-
for j in range(len(bool_mask) - mask_idx):
470-
if val & (1 << j):
471-
bool_mask[mask_idx] = True
472-
mask_idx += 1
473-
474-
return bool_mask
475-
476-
477439
def set_nulls(
478440
data: np.ndarray | pd.Series,
479441
col: Column,
@@ -509,7 +471,9 @@ def set_nulls(
509471
elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
510472
assert validity, "Expected to have a validity buffer for the mask"
511473
valid_buff, valid_dtype = validity
512-
null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size())
474+
null_pos = buffer_to_ndarray(
475+
valid_buff, valid_dtype, offset=col.offset, length=col.size()
476+
)
513477
if sentinel_val == 0:
514478
null_pos = ~null_pos
515479
elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):

pandas/tests/interchange/test_impl.py

+26
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,32 @@ def test_large_string_pyarrow():
104104
assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
105105

106106

107+
@pytest.mark.parametrize(
108+
("offset", "length", "expected_values"),
109+
[
110+
(0, None, [3.3, float("nan"), 2.1]),
111+
(1, None, [float("nan"), 2.1]),
112+
(2, None, [2.1]),
113+
(0, 2, [3.3, float("nan")]),
114+
(0, 1, [3.3]),
115+
(1, 1, [float("nan")]),
116+
],
117+
)
118+
def test_bitmasks_pyarrow(offset, length, expected_values):
119+
# GH 52795
120+
pa = pytest.importorskip("pyarrow", "11.0.0")
121+
122+
arr = [3.3, None, 2.1]
123+
table = pa.table({"arr": arr}).slice(offset, length)
124+
exchange_df = table.__dataframe__()
125+
result = from_dataframe(exchange_df)
126+
expected = pd.DataFrame({"arr": expected_values})
127+
tm.assert_frame_equal(result, expected)
128+
129+
# check round-trip
130+
assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
131+
132+
107133
@pytest.mark.parametrize(
108134
"data", [int_data, uint_data, float_data, bool_data, datetime_data]
109135
)

0 commit comments

Comments
 (0)