6
6
7
7
import numpy as np
8
8
9
+ from pandas .compat ._optional import import_optional_dependency
10
+
9
11
import pandas as pd
10
12
from pandas .core .interchange .dataframe_protocol import (
11
13
Buffer ,
23
25
DtypeKind .INT : {8 : np .int8 , 16 : np .int16 , 32 : np .int32 , 64 : np .int64 },
24
26
DtypeKind .UINT : {8 : np .uint8 , 16 : np .uint16 , 32 : np .uint32 , 64 : np .uint64 },
25
27
DtypeKind .FLOAT : {32 : np .float32 , 64 : np .float64 },
26
- DtypeKind .BOOL : {8 : bool },
28
+ DtypeKind .BOOL : {1 : bool , 8 : bool },
27
29
}
28
30
29
31
@@ -154,7 +156,9 @@ def primitive_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
154
156
buffers = col .get_buffers ()
155
157
156
158
data_buff , data_dtype = buffers ["data" ]
157
- data = buffer_to_ndarray (data_buff , data_dtype , col .offset , col .size ())
159
+ data = buffer_to_ndarray (
160
+ data_buff , data_dtype , offset = col .offset , length = col .size ()
161
+ )
158
162
159
163
data = set_nulls (data , col , buffers ["validity" ])
160
164
return data , buffers
@@ -192,7 +196,9 @@ def categorical_column_to_series(col: Column) -> tuple[pd.Series, Any]:
192
196
buffers = col .get_buffers ()
193
197
194
198
codes_buff , codes_dtype = buffers ["data" ]
195
- codes = buffer_to_ndarray (codes_buff , codes_dtype , col .offset , col .size ())
199
+ codes = buffer_to_ndarray (
200
+ codes_buff , codes_dtype , offset = col .offset , length = col .size ()
201
+ )
196
202
197
203
# Doing module in order to not get ``IndexError`` for
198
204
# out-of-bounds sentinel values in `codes`
@@ -252,7 +258,7 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
252
258
Endianness .NATIVE ,
253
259
)
254
260
# Specify zero offset as we don't want to chunk the string data
255
- data = buffer_to_ndarray (data_buff , data_dtype , offset = 0 , length = col . size () )
261
+ data = buffer_to_ndarray (data_buff , data_dtype , offset = 0 , length = data_buff . bufsize )
256
262
257
263
# Retrieve the offsets buffer containing the index offsets demarcating
258
264
# the beginning and the ending of each string
@@ -261,14 +267,16 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
261
267
# meaning that it has more elements than in the data buffer, do `col.size() + 1`
262
268
# here to pass a proper offsets buffer size
263
269
offsets = buffer_to_ndarray (
264
- offset_buff , offset_dtype , col .offset , length = col .size () + 1
270
+ offset_buff , offset_dtype , offset = col .offset , length = col .size () + 1
265
271
)
266
272
267
273
null_pos = None
268
274
if null_kind in (ColumnNullType .USE_BITMASK , ColumnNullType .USE_BYTEMASK ):
269
275
assert buffers ["validity" ], "Validity buffers cannot be empty for masks"
270
276
valid_buff , valid_dtype = buffers ["validity" ]
271
- null_pos = buffer_to_ndarray (valid_buff , valid_dtype , col .offset , col .size ())
277
+ null_pos = buffer_to_ndarray (
278
+ valid_buff , valid_dtype , offset = col .offset , length = col .size ()
279
+ )
272
280
if sentinel_val == 0 :
273
281
null_pos = ~ null_pos
274
282
@@ -356,8 +364,8 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
356
364
getattr (ArrowCTypes , f"UINT{ dtype [1 ]} " ),
357
365
Endianness .NATIVE ,
358
366
),
359
- col .offset ,
360
- col .size (),
367
+ offset = col .offset ,
368
+ length = col .size (),
361
369
)
362
370
363
371
data = parse_datetime_format_str (format_str , data )
@@ -368,8 +376,9 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
368
376
def buffer_to_ndarray (
369
377
buffer : Buffer ,
370
378
dtype : tuple [DtypeKind , int , str , str ],
379
+ * ,
380
+ length : int ,
371
381
offset : int = 0 ,
372
- length : int | None = None ,
373
382
) -> np .ndarray :
374
383
"""
375
384
Build a NumPy array from the passed buffer.
@@ -406,74 +415,27 @@ def buffer_to_ndarray(
406
415
# and size in the buffer plus the dtype on the column. Use DLPack as NumPy supports
407
416
# it since https://github.com/numpy/numpy/pull/19083
408
417
ctypes_type = np .ctypeslib .as_ctypes_type (column_dtype )
409
- data_pointer = ctypes .cast (
410
- buffer .ptr + (offset * bit_width // 8 ), ctypes .POINTER (ctypes_type )
411
- )
412
418
413
419
if bit_width == 1 :
414
420
assert length is not None , "`length` must be specified for a bit-mask buffer."
415
- arr = np .ctypeslib .as_array (data_pointer , shape = (buffer .bufsize ,))
416
- return bitmask_to_bool_ndarray (arr , length , first_byte_offset = offset % 8 )
421
+ pa = import_optional_dependency ("pyarrow" )
422
+ arr = pa .BooleanArray .from_buffers (
423
+ pa .bool_ (),
424
+ length ,
425
+ [None , pa .foreign_buffer (buffer .ptr , length )],
426
+ offset = offset ,
427
+ )
428
+ return np .asarray (arr )
417
429
else :
430
+ data_pointer = ctypes .cast (
431
+ buffer .ptr + (offset * bit_width // 8 ), ctypes .POINTER (ctypes_type )
432
+ )
418
433
return np .ctypeslib .as_array (
419
- data_pointer , shape = (buffer .bufsize // (bit_width // 8 ),)
434
+ data_pointer ,
435
+ shape = (length ,),
420
436
)
421
437
422
438
423
- def bitmask_to_bool_ndarray (
424
- bitmask : np .ndarray , mask_length : int , first_byte_offset : int = 0
425
- ) -> np .ndarray :
426
- """
427
- Convert bit-mask to a boolean NumPy array.
428
-
429
- Parameters
430
- ----------
431
- bitmask : np.ndarray[uint8]
432
- NumPy array of uint8 dtype representing the bitmask.
433
- mask_length : int
434
- Number of elements in the mask to interpret.
435
- first_byte_offset : int, default: 0
436
- Number of elements to offset from the start of the first byte.
437
-
438
- Returns
439
- -------
440
- np.ndarray[bool]
441
- """
442
- bytes_to_skip = first_byte_offset // 8
443
- bitmask = bitmask [bytes_to_skip :]
444
- first_byte_offset %= 8
445
-
446
- bool_mask = np .zeros (mask_length , dtype = bool )
447
-
448
- # Processing the first byte separately as it has its own offset
449
- val = bitmask [0 ]
450
- mask_idx = 0
451
- bits_in_first_byte = min (8 - first_byte_offset , mask_length )
452
- for j in range (bits_in_first_byte ):
453
- if val & (1 << (j + first_byte_offset )):
454
- bool_mask [mask_idx ] = True
455
- mask_idx += 1
456
-
457
- # `mask_length // 8` describes how many full bytes to process
458
- for i in range ((mask_length - bits_in_first_byte ) // 8 ):
459
- # doing `+ 1` as we already processed the first byte
460
- val = bitmask [i + 1 ]
461
- for j in range (8 ):
462
- if val & (1 << j ):
463
- bool_mask [mask_idx ] = True
464
- mask_idx += 1
465
-
466
- if len (bitmask ) > 1 :
467
- # Processing reminder of last byte
468
- val = bitmask [- 1 ]
469
- for j in range (len (bool_mask ) - mask_idx ):
470
- if val & (1 << j ):
471
- bool_mask [mask_idx ] = True
472
- mask_idx += 1
473
-
474
- return bool_mask
475
-
476
-
477
439
def set_nulls (
478
440
data : np .ndarray | pd .Series ,
479
441
col : Column ,
@@ -509,7 +471,9 @@ def set_nulls(
509
471
elif null_kind in (ColumnNullType .USE_BITMASK , ColumnNullType .USE_BYTEMASK ):
510
472
assert validity , "Expected to have a validity buffer for the mask"
511
473
valid_buff , valid_dtype = validity
512
- null_pos = buffer_to_ndarray (valid_buff , valid_dtype , col .offset , col .size ())
474
+ null_pos = buffer_to_ndarray (
475
+ valid_buff , valid_dtype , offset = col .offset , length = col .size ()
476
+ )
513
477
if sentinel_val == 0 :
514
478
null_pos = ~ null_pos
515
479
elif null_kind in (ColumnNullType .NON_NULLABLE , ColumnNullType .USE_NAN ):
0 commit comments