@@ -83,14 +83,14 @@ class Buffer:
83
83
@property
84
84
def bufsize (self ) -> int :
85
85
"""
86
- Buffer size in bytes
86
+ Buffer size in bytes.
87
87
"""
88
88
pass
89
89
90
90
@property
91
91
def ptr (self ) -> int :
92
92
"""
93
- Pointer to start of the buffer as an integer
93
+ Pointer to start of the buffer as an integer.
94
94
"""
95
95
pass
96
96
@@ -133,9 +133,10 @@ class Column:
133
133
A column object, with only the methods and properties required by the
134
134
interchange protocol defined.
135
135
136
- A column can contain one or more chunks. Each chunk can contain either one
137
- or two buffers - one data buffer and (depending on null representation) it
138
- may have a mask buffer.
136
+ A column can contain one or more chunks. Each chunk can contain up to three
137
+ buffers - a data buffer, a mask buffer (depending on null representation),
138
+ and an offsets buffer (if variable-size binary; e.g., variable-length
139
+ strings).
139
140
140
141
TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
141
142
Instead, it seems to use "children" for both columns with a bit mask,
@@ -185,7 +186,7 @@ def size(self) -> Optional[int]:
185
186
@property
186
187
def offset (self ) -> int :
187
188
"""
188
- Offset of first element
189
+ Offset of first element.
189
190
190
191
May be > 0 if using chunks; for example for a column with N chunks of
191
192
equal size M (only the last chunk may be shorter),
@@ -196,7 +197,7 @@ def offset(self) -> int:
196
197
@property
197
198
def dtype (self ) -> Tuple [enum .IntEnum , int , str , str ]:
198
199
"""
199
- Dtype description as a tuple ``(kind, bit-width, format string, endianness)``
200
+ Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.
200
201
201
202
Kind :
202
203
@@ -272,7 +273,9 @@ def describe_null(self) -> Tuple[int, Any]:
272
273
- 3 : bit mask
273
274
- 4 : byte mask
274
275
275
- Value : if kind is "sentinel value", the actual value. None otherwise.
276
+ Value : if kind is "sentinel value", the actual value. If kind is a bit
277
+ mask or a byte mask, the value (0 or 1) indicating a missing value. None
278
+ otherwise.
276
279
"""
277
280
pass
278
281
@@ -299,24 +302,33 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]:
299
302
"""
300
303
pass
301
304
302
- def get_data_buffer (self ) -> Buffer :
305
+ def get_buffers (self ) -> dict [ Tuple [ Buffer , Any ], Optional [ Tuple [ Buffer , Any ]], Optional [ Tuple [ Buffer , Any ]]] :
303
306
"""
304
- Return the buffer containing the data.
305
- """
306
- pass
307
+ Return a dictionary containing the underlying buffers.
307
308
308
- def get_mask (self ) -> Buffer :
309
- """
310
- Return the buffer containing the mask values indicating missing data.
309
+ The returned dictionary has the following contents:
311
310
312
- Raises RuntimeError if null representation is not a bit or byte mask.
311
+ - "data": a two-element tuple whose first element is a buffer
312
+ containing the data and whose second element is the data
313
+ buffer's associated dtype.
314
+ - "validity": a two-element tuple whose first element is a buffer
315
+ containing mask values indicating missing data and
316
+ whose second element is the mask value buffer's
317
+ associated dtype. None if the null representation is
318
+ not a bit or byte mask.
319
+ - "offsets": a two-element tuple whose first element is a buffer
320
+ containing the offset values for variable-size binary
321
+ data (e.g., variable-length strings) and whose second
322
+ element is the offsets buffer's associated dtype. None
323
+ if the data buffer does not have an associated offsets
324
+ buffer.
313
325
"""
314
326
pass
315
327
316
328
# def get_children(self) -> Iterable[Column]:
317
329
# """
318
330
# Children columns underneath the column, each object in this iterator
319
- # must adhere to the column specification
331
+ # must adhere to the column specification.
320
332
# """
321
333
# pass
322
334
@@ -337,7 +349,7 @@ class DataFrame:
337
349
"""
338
350
def __dataframe__ (self , nan_as_null : bool = False ) -> dict :
339
351
"""
340
- Produces a dictionary object following the dataframe protocol spec
352
+ Produces a dictionary object following the dataframe protocol specification.
341
353
342
354
``nan_as_null`` is a keyword intended for the consumer to tell the
343
355
producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
@@ -352,7 +364,7 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:
352
364
353
365
def num_columns (self ) -> int :
354
366
"""
355
- Return the number of columns in the DataFrame
367
+ Return the number of columns in the DataFrame.
356
368
"""
357
369
pass
358
370
@@ -361,13 +373,13 @@ def num_rows(self) -> Optional[int]:
361
373
# why include it if it may be None - what do we expect consumers
362
374
# to do here?
363
375
"""
364
- Return the number of rows in the DataFrame, if available
376
+ Return the number of rows in the DataFrame, if available.
365
377
"""
366
378
pass
367
379
368
380
def num_chunks (self ) -> int :
369
381
"""
370
- Return the number of chunks the DataFrame consists of
382
+ Return the number of chunks the DataFrame consists of.
371
383
"""
372
384
pass
373
385
@@ -397,7 +409,7 @@ def get_columns(self) -> Iterable[Column]:
397
409
398
410
def select_columns (self , indices : Sequence [int ]) -> DataFrame :
399
411
"""
400
- Create a new DataFrame by selecting a subset of columns by index
412
+ Create a new DataFrame by selecting a subset of columns by index.
401
413
"""
402
414
pass
403
415
0 commit comments