Backport PR #52824 on branch 2.0.x (BUG: interchange bitmasks not supported in interchange/from_dataframe.py) #52907

Merged
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.2.rst
@@ -20,6 +20,8 @@ Fixed regressions

 Bug fixes
 ~~~~~~~~~
+- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`)
+- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
 -

 .. ---------------------------------------------------------------------------
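To illustrate the first entry, a minimal sketch of the slice scenario (assuming pyarrow >= 11.0 is installed; the example values are made up):

import pyarrow as pa
from pandas.api.interchange import from_dataframe

# Slicing gives the interchange column a non-zero offset; before this fix,
# from_dataframe ignored it and produced a DataFrame of the wrong size.
table = pa.table({"arr": [3.3, None, 2.1]}).slice(1, None)
df = from_dataframe(table.__dataframe__())
print(df.shape)  # expected (2, 1): only the rows remaining after the slice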
104 changes: 34 additions & 70 deletions pandas/core/interchange/from_dataframe.py
@@ -6,6 +6,8 @@

 import numpy as np

+from pandas.compat._optional import import_optional_dependency
+
 import pandas as pd
 from pandas.core.interchange.dataframe_protocol import (
     Buffer,
@@ -23,7 +25,7 @@
     DtypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
     DtypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
     DtypeKind.FLOAT: {32: np.float32, 64: np.float64},
-    DtypeKind.BOOL: {8: bool},
+    DtypeKind.BOOL: {1: bool, 8: bool},
 }
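The new 1: bool entry maps bit-width-1 buffers (Arrow-style bitmasks) to NumPy's bool alongside the existing byte-per-value case. As a rough standalone illustration of the packing involved (not pandas code), eight boolean values occupy one byte, least-significant bit first:

import numpy as np

packed = np.uint8(0b00000101)  # bits 0 and 2 set
values = [bool((packed >> i) & 1) for i in range(8)]
print(values[:3])  # [True, False, True]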


@@ -154,7 +156,9 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
     buffers = col.get_buffers()

     data_buff, data_dtype = buffers["data"]
-    data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size())
+    data = buffer_to_ndarray(
+        data_buff, data_dtype, offset=col.offset, length=col.size()
+    )

     data = set_nulls(data, col, buffers["validity"])
     return data, buffers
@@ -192,7 +196,9 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
     buffers = col.get_buffers()

     codes_buff, codes_dtype = buffers["data"]
-    codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size())
+    codes = buffer_to_ndarray(
+        codes_buff, codes_dtype, offset=col.offset, length=col.size()
+    )

     # Doing modulo in order to not get ``IndexError`` for
     # out-of-bounds sentinel values in `codes`
@@ -252,7 +258,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
         Endianness.NATIVE,
     )
     # Specify zero offset as we don't want to chunk the string data
-    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size())
+    data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=data_buff.bufsize)

     # Retrieve the offsets buffer containing the index offsets demarcating
     # the beginning and the ending of each string
@@ -261,14 +267,16 @@
     # meaning that it has more elements than in the data buffer, do `col.size() + 1`
     # here to pass a proper offsets buffer size
     offsets = buffer_to_ndarray(
-        offset_buff, offset_dtype, col.offset, length=col.size() + 1
+        offset_buff, offset_dtype, offset=col.offset, length=col.size() + 1
     )

     null_pos = None
     if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
         assert buffers["validity"], "Validity buffers cannot be empty for masks"
         valid_buff, valid_dtype = buffers["validity"]
-        null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size())
+        null_pos = buffer_to_ndarray(
+            valid_buff, valid_dtype, offset=col.offset, length=col.size()
+        )
         if sentinel_val == 0:
             null_pos = ~null_pos
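The offsets logic above follows the Arrow variable-length string layout: for N strings the offsets buffer holds N + 1 entries, and string i spans data[offsets[i]:offsets[i + 1]]. A small standalone illustration with made-up values:

import numpy as np

data = b"hiworld!"
offsets = np.array([0, 2, 7, 8])  # 3 strings -> 4 offsets
strings = [data[offsets[i]:offsets[i + 1]].decode() for i in range(3)]
print(strings)  # ['hi', 'world', '!']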

@@ -356,8 +364,8 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
             getattr(ArrowCTypes, f"UINT{dtype[1]}"),
             Endianness.NATIVE,
         ),
-        col.offset,
-        col.size(),
+        offset=col.offset,
+        length=col.size(),
     )

     data = parse_datetime_format_str(format_str, data)
@@ -368,8 +376,9 @@
 def buffer_to_ndarray(
     buffer: Buffer,
     dtype: tuple[DtypeKind, int, str, str],
+    *,
+    length: int,
     offset: int = 0,
-    length: int | None = None,
 ) -> np.ndarray:
     """
     Build a NumPy array from the passed buffer.
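The bare * in the new signature makes length a required keyword-only argument, which is why every call site in this diff now spells out offset= and length=. A generic Python sketch of that calling convention:

def f(buffer, *, length, offset=0):
    return (buffer, length, offset)

f("buf", length=3)  # OK
# f("buf", 3)       # TypeError: f() takes 1 positional argument but 2 were given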
@@ -406,74 +415,27 @@
     # and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
     # it since https://github.com/numpy/numpy/pull/19083
     ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
-    data_pointer = ctypes.cast(
-        buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
-    )

     if bit_width == 1:
         assert length is not None, "`length` must be specified for a bit-mask buffer."
-        arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,))
-        return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8)
-
-    return np.ctypeslib.as_array(
-        data_pointer, shape=(buffer.bufsize // (bit_width // 8),)
-    )
+        pa = import_optional_dependency("pyarrow")
+        arr = pa.BooleanArray.from_buffers(
+            pa.bool_(),
+            length,
+            [None, pa.foreign_buffer(buffer.ptr, length)],
+            offset=offset,
+        )
+        return np.asarray(arr)
+    else:
+        data_pointer = ctypes.cast(
+            buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
+        )
+        return np.ctypeslib.as_array(
+            data_pointer,
+            shape=(length,),
+        )
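A standalone sketch of the new pyarrow branch (assuming pyarrow is installed; the mask bytes are made up): wrap raw bitmask memory in a pyarrow buffer, build a BooleanArray over it, and materialize it with NumPy, which is what replaces the hand-rolled bit loop deleted below:

import numpy as np
import pyarrow as pa

mask = np.array([0b00000101], dtype=np.uint8)  # packs [True, False, True]
buf = pa.py_buffer(mask.tobytes())
arr = pa.BooleanArray.from_buffers(pa.bool_(), 3, [None, buf], offset=0)
print(np.asarray(arr))  # [ True False  True]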


-def bitmask_to_bool_ndarray(
-    bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0
-) -> np.ndarray:
-    """
-    Convert bit-mask to a boolean NumPy array.
-
-    Parameters
-    ----------
-    bitmask : np.ndarray[uint8]
-        NumPy array of uint8 dtype representing the bitmask.
-    mask_length : int
-        Number of elements in the mask to interpret.
-    first_byte_offset : int, default: 0
-        Number of elements to offset from the start of the first byte.
-
-    Returns
-    -------
-    np.ndarray[bool]
-    """
-    bytes_to_skip = first_byte_offset // 8
-    bitmask = bitmask[bytes_to_skip:]
-    first_byte_offset %= 8
-
-    bool_mask = np.zeros(mask_length, dtype=bool)
-
-    # Processing the first byte separately as it has its own offset
-    val = bitmask[0]
-    mask_idx = 0
-    bits_in_first_byte = min(8 - first_byte_offset, mask_length)
-    for j in range(bits_in_first_byte):
-        if val & (1 << (j + first_byte_offset)):
-            bool_mask[mask_idx] = True
-        mask_idx += 1
-
-    # `mask_length // 8` describes how many full bytes to process
-    for i in range((mask_length - bits_in_first_byte) // 8):
-        # doing `+ 1` as we already processed the first byte
-        val = bitmask[i + 1]
-        for j in range(8):
-            if val & (1 << j):
-                bool_mask[mask_idx] = True
-            mask_idx += 1
-
-    if len(bitmask) > 1:
-        # Processing reminder of last byte
-        val = bitmask[-1]
-        for j in range(len(bool_mask) - mask_idx):
-            if val & (1 << j):
-                bool_mask[mask_idx] = True
-            mask_idx += 1
-
-    return bool_mask


 def set_nulls(
     data: np.ndarray | pd.Series,
     col: Column,
@@ -509,7 +471,9 @@ def set_nulls(
     elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
         assert validity, "Expected to have a validity buffer for the mask"
         valid_buff, valid_dtype = validity
-        null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size())
+        null_pos = buffer_to_ndarray(
+            valid_buff, valid_dtype, offset=col.offset, length=col.size()
+        )
         if sentinel_val == 0:
             null_pos = ~null_pos
     elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
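For reference, the sentinel handling above in isolation (a sketch with made-up values): the mask read from the validity buffer flags valid elements, so when sentinel_val == 0 (a zero bit marks a null) it is inverted to obtain the null positions:

import numpy as np

validity = np.array([True, False, True])  # True = valid element
sentinel_val = 0                          # a 0 in the mask marks a null
null_pos = ~validity if sentinel_val == 0 else validity
print(null_pos)  # [False  True False] -> row 1 is null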
26 changes: 26 additions & 0 deletions pandas/tests/interchange/test_impl.py
@@ -104,6 +104,32 @@ def test_large_string_pyarrow():
     assert pa.Table.equals(pa.interchange.from_dataframe(result), table)


+@pytest.mark.parametrize(
+    ("offset", "length", "expected_values"),
+    [
+        (0, None, [3.3, float("nan"), 2.1]),
+        (1, None, [float("nan"), 2.1]),
+        (2, None, [2.1]),
+        (0, 2, [3.3, float("nan")]),
+        (0, 1, [3.3]),
+        (1, 1, [float("nan")]),
+    ],
+)
+def test_bitmasks_pyarrow(offset, length, expected_values):
+    # GH 52795
+    pa = pytest.importorskip("pyarrow", "11.0.0")
+
+    arr = [3.3, None, 2.1]
+    table = pa.table({"arr": arr}).slice(offset, length)
+    exchange_df = table.__dataframe__()
+    result = from_dataframe(exchange_df)
+    expected = pd.DataFrame({"arr": expected_values})
+    tm.assert_frame_equal(result, expected)
+
+    # check round-trip
+    assert pa.Table.equals(pa.interchange.from_dataframe(result), table)


 @pytest.mark.parametrize(
     "data", [int_data, uint_data, float_data, bool_data, datetime_data]
 )