Skip to content

Add variable-length string support #47

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Aug 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 34 additions & 22 deletions protocol/dataframe_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,14 @@ class Buffer:
@property
def bufsize(self) -> int:
"""
Buffer size in bytes
Buffer size in bytes.
"""
pass

@property
def ptr(self) -> int:
"""
Pointer to start of the buffer as an integer
Pointer to start of the buffer as an integer.
"""
pass

Expand Down Expand Up @@ -133,9 +133,10 @@ class Column:
A column object, with only the methods and properties required by the
interchange protocol defined.

A column can contain one or more chunks. Each chunk can contain either one
or two buffers - one data buffer and (depending on null representation) it
may have a mask buffer.
A column can contain one or more chunks. Each chunk can contain up to three
buffers - a data buffer, a mask buffer (depending on the null representation),
and an offsets buffer (only if the data is variable-size binary; e.g.,
variable-length strings).

TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
Instead, it seems to use "children" for both columns with a bit mask,
Expand Down Expand Up @@ -185,7 +186,7 @@ def size(self) -> Optional[int]:
@property
def offset(self) -> int:
"""
Offset of first element
Offset of first element.

May be > 0 if using chunks; for example for a column with N chunks of
equal size M (only the last chunk may be shorter),
Expand All @@ -196,7 +197,7 @@ def offset(self) -> int:
@property
def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
"""
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

Kind :

Expand Down Expand Up @@ -272,7 +273,9 @@ def describe_null(self) -> Tuple[int, Any]:
- 3 : bit mask
- 4 : byte mask

Value : if kind is "sentinel value", the actual value. None otherwise.
Value : if kind is "sentinel value", the actual value. If kind is a bit
mask or a byte mask, the mask value (0 or 1) that marks an element as
missing. None otherwise.
"""
pass

Expand All @@ -299,24 +302,33 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]:
"""
pass

def get_data_buffer(self) -> Buffer:
def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]:
"""
Return the buffer containing the data.
"""
pass
Return a dictionary containing the underlying buffers.

def get_mask(self) -> Buffer:
"""
Return the buffer containing the mask values indicating missing data.
The returned dictionary has the following contents:

Raises RuntimeError if null representation is not a bit or byte mask.
- "data": a two-element tuple whose first element is a buffer
containing the data and whose second element is the data
buffer's associated dtype.
- "validity": a two-element tuple whose first element is a buffer
containing mask values indicating missing data and
whose second element is the mask value buffer's
associated dtype. None if the null representation is
not a bit or byte mask.
- "offsets": a two-element tuple whose first element is a buffer
containing the offset values for variable-size binary
data (e.g., variable-length strings) and whose second
element is the offsets buffer's associated dtype. None
if the data buffer does not have an associated offsets
buffer.
"""
pass

# def get_children(self) -> Iterable[Column]:
# """
# Children columns underneath the column, each object in this iterator
# must adhere to the column specification
# must adhere to the column specification.
# """
# pass

Expand All @@ -337,7 +349,7 @@ class DataFrame:
"""
def __dataframe__(self, nan_as_null : bool = False) -> dict:
"""
Produces a dictionary object following the dataframe protocol spec
Produces a dictionary object following the dataframe protocol specification.

``nan_as_null`` is a keyword intended for the consumer to tell the
producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
Expand All @@ -352,7 +364,7 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:

def num_columns(self) -> int:
"""
Return the number of columns in the DataFrame
Return the number of columns in the DataFrame.
"""
pass

Expand All @@ -361,13 +373,13 @@ def num_rows(self) -> Optional[int]:
# why include it if it may be None - what do we expect consumers
# to do here?
"""
Return the number of rows in the DataFrame, if available
Return the number of rows in the DataFrame, if available.
"""
pass

def num_chunks(self) -> int:
"""
Return the number of chunks the DataFrame consists of
Return the number of chunks the DataFrame consists of.
"""
pass

Expand Down Expand Up @@ -397,7 +409,7 @@ def get_columns(self) -> Iterable[Column]:

def select_columns(self, indices: Sequence[int]) -> DataFrame:
"""
Create a new DataFrame by selecting a subset of columns by index
Create a new DataFrame by selecting a subset of columns by index.
"""
pass

Expand Down
Loading