Skip to content

Commit 31cbf86

Browse files
committed
Port changes from #45
This is a fresh port of the changes made to support variable-length strings, done to provide a cleaner merge.
1 parent 52abf7a commit 31cbf86

File tree

2 files changed

+256
-49
lines changed

2 files changed

+256
-49
lines changed

protocol/dataframe_protocol.py

Lines changed: 34 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -83,14 +83,14 @@ class Buffer:
8383
@property
8484
def bufsize(self) -> int:
8585
"""
86-
Buffer size in bytes
86+
Buffer size in bytes.
8787
"""
8888
pass
8989

9090
@property
9191
def ptr(self) -> int:
9292
"""
93-
Pointer to start of the buffer as an integer
93+
Pointer to start of the buffer as an integer.
9494
"""
9595
pass
9696

@@ -133,9 +133,10 @@ class Column:
133133
A column object, with only the methods and properties required by the
134134
interchange protocol defined.
135135
136-
A column can contain one or more chunks. Each chunk can contain either one
137-
or two buffers - one data buffer and (depending on null representation) it
138-
may have a mask buffer.
136+
A column can contain one or more chunks. Each chunk can contain up to three
137+
buffers - a data buffer, a mask buffer (depending on null representation),
138+
and an offsets buffer (if variable-size binary; e.g., variable-length
139+
strings).
139140
140141
TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
141142
Instead, it seems to use "children" for both columns with a bit mask,
@@ -185,7 +186,7 @@ def size(self) -> Optional[int]:
185186
@property
186187
def offset(self) -> int:
187188
"""
188-
Offset of first element
189+
Offset of first element.
189190
190191
May be > 0 if using chunks; for example for a column with N chunks of
191192
equal size M (only the last chunk may be shorter),
@@ -196,7 +197,7 @@ def offset(self) -> int:
196197
@property
197198
def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
198199
"""
199-
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``
200+
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
200201
201202
Kind :
202203
@@ -272,7 +273,9 @@ def describe_null(self) -> Tuple[int, Any]:
272273
- 3 : bit mask
273274
- 4 : byte mask
274275
275-
Value : if kind is "sentinel value", the actual value. None otherwise.
276+
Value : if kind is "sentinel value", the actual value. If kind is a bit
277+
mask or a byte mask, the value (0 or 1) indicating a missing value. None
278+
otherwise.
276279
"""
277280
pass
278281

@@ -299,24 +302,33 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]:
299302
"""
300303
pass
301304

302-
def get_data_buffer(self) -> Buffer:
305+
def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]:
303306
"""
304-
Return the buffer containing the data.
305-
"""
306-
pass
307+
Return a dictionary containing the underlying buffers.
307308
308-
def get_mask(self) -> Buffer:
309-
"""
310-
Return the buffer containing the mask values indicating missing data.
309+
The returned dictionary has the following contents:
311310
312-
Raises RuntimeError if null representation is not a bit or byte mask.
311+
- "data": a two-element tuple whose first element is a buffer
312+
containing the data and whose second
313+
element is the data buffer's associated dtype.
314+
- "validity": a two-element tuple whose first element is a buffer
315+
containing mask values
316+
indicating missing data and whose second element is
317+
the mask value buffer's associated dtype. None if the
318+
null representation is not a bit or byte mask.
319+
- "offsets": a two-element tuple whose first element is a buffer
320+
containing the offset values for
321+
variable-size binary data (e.g., variable-length
322+
strings) and whose second element is the offsets
323+
buffer's associated dtype. None if the data buffer does
324+
not have an associated offsets buffer.
313325
"""
314326
pass
315327

316328
# def get_children(self) -> Iterable[Column]:
317329
# """
318330
# Children columns underneath the column, each object in this iterator
319-
# must adhere to the column specification
331+
# must adhere to the column specification.
320332
# """
321333
# pass
322334

@@ -337,7 +349,7 @@ class DataFrame:
337349
"""
338350
def __dataframe__(self, nan_as_null : bool = False) -> dict:
339351
"""
340-
Produces a dictionary object following the dataframe protocol spec
352+
Produces a dictionary object following the dataframe protocol specification.
341353
342354
``nan_as_null`` is a keyword intended for the consumer to tell the
343355
producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
@@ -352,7 +364,7 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:
352364

353365
def num_columns(self) -> int:
354366
"""
355-
Return the number of columns in the DataFrame
367+
Return the number of columns in the DataFrame.
356368
"""
357369
pass
358370

@@ -361,13 +373,13 @@ def num_rows(self) -> Optional[int]:
361373
# why include it if it may be None - what do we expect consumers
362374
# to do here?
363375
"""
364-
Return the number of rows in the DataFrame, if available
376+
Return the number of rows in the DataFrame, if available.
365377
"""
366378
pass
367379

368380
def num_chunks(self) -> int:
369381
"""
370-
Return the number of chunks the DataFrame consists of
382+
Return the number of chunks the DataFrame consists of.
371383
"""
372384
pass
373385

@@ -397,7 +409,7 @@ def get_columns(self) -> Iterable[Column]:
397409

398410
def select_columns(self, indices: Sequence[int]) -> DataFrame:
399411
"""
400-
Create a new DataFrame by selecting a subset of columns by index
412+
Create a new DataFrame by selecting a subset of columns by index.
401413
"""
402414
pass
403415

0 commit comments

Comments
 (0)