Add variable-length string support #45

Closed
Changes from 19 commits
34 commits
5a8d030
Update protocol to support returning an offsets buffer
kgryte Jun 23, 2021
3aac477
Add punctuation
kgryte Jun 23, 2021
c7728e2
Update protocol to return dtype along with buffer
kgryte Jun 24, 2021
c8000f7
Add string support in various methods and add todos
kgryte Jun 24, 2021
040f928
Add support for resolving an offsets buffer
kgryte Jun 24, 2021
a982987
Add support for returning a data buffer for string dtypes
kgryte Jun 24, 2021
fd4d71b
Update offsets buffer accessor
kgryte Jun 24, 2021
e40f902
Add implementation to convert a string column
kgryte Jun 24, 2021
c122b3c
Add tests
kgryte Jun 24, 2021
0d04af3
Handle missing values
kgryte Jun 24, 2021
58fee89
Update typing and docs
kgryte Jun 24, 2021
2c4a846
Add comment
kgryte Jun 24, 2021
2e3914f
Requirements document for the dataframe interchange protocol (#35)
rgommers Jun 25, 2021
f9f259c
Remove outdated figures
rgommers Jun 25, 2021
a545faa
Document that strided buffers do not need to be supported
rgommers Jun 25, 2021
52abf7a
Merge pull request #38 from data-apis/protocol-impl
rgommers Jun 25, 2021
6010ae7
Add todo
kgryte Jun 28, 2021
ac1a5ca
Merge branch 'main' of https://github.com/data-apis/dataframe-api int…
kgryte Jun 28, 2021
89a7996
Remove colons
kgryte Jun 28, 2021
a3ff4e7
Fix grammar
kgryte Jul 8, 2021
ff84e8c
Rename methods
kgryte Jul 8, 2021
c954f3c
Rename methods
kgryte Jul 8, 2021
ed64fb7
Update describe_null to indicate a byte array for string dtype
kgryte Jul 8, 2021
9b9aecf
Return encoding for missing values
kgryte Jul 19, 2021
4026900
Update test
kgryte Jul 19, 2021
87d7143
Use invalid value encoding
kgryte Jul 19, 2021
56ee2da
Update copy
kgryte Jul 19, 2021
0035c90
Use Arrow format strings
kgryte Jul 19, 2021
91ed6a1
Add `get_buffers` method to the protocol
kgryte Jul 19, 2021
26fb48d
Remove individual methods
kgryte Jul 19, 2021
0d0e94b
Update copy
kgryte Jul 19, 2021
9ec830c
Refactor to return a dictionary of buffers
kgryte Jul 19, 2021
0dd4e2c
Update comments
kgryte Jul 19, 2021
ade0d76
Fix copy
kgryte Jul 19, 2021
43 changes: 26 additions & 17 deletions protocol/dataframe_protocol.py
@@ -83,14 +83,14 @@ class Buffer:
@property
def bufsize(self) -> int:
"""
Buffer size in bytes
Buffer size in bytes.
"""
pass

@property
def ptr(self) -> int:
"""
Pointer to start of the buffer as an integer
Pointer to start of the buffer as an integer.
"""
pass
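
As a rough illustration of these two properties, a producer whose data already lives in contiguous NumPy arrays could back a `Buffer` with something like the sketch below. The class name and the NumPy dependency are assumptions for the example, not part of the protocol:

```python
import numpy as np


class NumpyBackedBuffer:
    """Illustrative Buffer implementation backed by a contiguous NumPy array."""

    def __init__(self, x: np.ndarray) -> None:
        if not x.flags["C_CONTIGUOUS"]:
            raise ValueError("the protocol only deals with contiguous buffers")
        self._x = x

    @property
    def bufsize(self) -> int:
        # Size of the underlying memory block, in bytes.
        return self._x.nbytes

    @property
    def ptr(self) -> int:
        # Address of the first byte of the block, as a plain Python integer.
        return self._x.__array_interface__["data"][0]
```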

@@ -133,9 +133,10 @@ class Column:
A column object, with only the methods and properties required by the
interchange protocol defined.

A column can contain one or more chunks. Each chunk can contain either one
or two buffers - one data buffer and (depending on null representation) it
may have a mask buffer.
A column can contain one or more chunks. Each chunk can contain up to three
buffers - a data buffer, a mask buffer (depending on null representation),
and an offsets buffer (if variable-size binary; e.g., variable-length
strings).

TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
Instead, it seems to use "children" for both columns with a bit mask,
@@ -185,7 +186,7 @@ def size(self) -> Optional[int]:
@property
def offset(self) -> int:
"""
Offset of first element
Offset of first element.

May be > 0 if using chunks; for example for a column with N chunks of
equal size M (only the last chunk may be shorter),
@@ -196,7 +197,7 @@ def offset(self) -> int:
@property
def dtype(self) -> Tuple[enum.IntEnum, int, str, str]:
"""
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``
Dtype description as a tuple ``(kind, bit-width, format string, endianness)``.

Kind :
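
The kind enumeration itself is collapsed in this hunk, so purely as a hedged illustration: with an `IntEnum` along the lines of the sketch below (the names, values and bit-width convention for strings are assumptions; the format strings follow Arrow's C Data Interface conventions, cf. the "Use Arrow format strings" commit later in this PR), an int64 column and a variable-length string column might describe themselves as:

```python
import enum


class DtypeKind(enum.IntEnum):
    # Illustrative subset; a real implementation would define more kinds.
    INT = 0
    FLOAT = 2
    STRING = 21


# (kind, bit-width, format string, endianness)
int64_dtype = (DtypeKind.INT, 64, "l", "=")     # "l" is Arrow's format string for int64
string_dtype = (DtypeKind.STRING, 8, "u", "=")  # "u" = UTF-8 string; width refers to the uint8 data buffer
```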

@@ -299,24 +300,32 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]:
"""
pass

def get_data_buffer(self) -> Buffer:
def get_data_buffer(self) -> Tuple[Buffer, Any]:
"""
Return the buffer containing the data.
Return the buffer containing the data and the buffer's associated dtype.
"""
pass

def get_mask(self) -> Buffer:
def get_mask(self) -> Tuple[Buffer, Any]:
"""
Return the buffer containing the mask values indicating missing data.
Return the buffer containing the mask values indicating missing data and
the buffer's associated dtype.

Raises RuntimeError if null representation is not a bit or byte mask.
"""
pass

def get_offsets(self) -> Tuple[Buffer, Any]:
"""
Return the buffer containing the offset values for variable-size binary
data (e.g., variable-length strings) and the buffer's associated dtype.
"""
pass

# def get_children(self) -> Iterable[Column]:
# """
# Children columns underneath the column, each object in this iterator
# must adhere to the column specification
# must adhere to the column specification.
# """
# pass
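
To make the intended use of these accessors concrete, here is a rough consumer-side sketch that rebuilds Python strings from the data, offsets and mask buffers of one chunk. It assumes NumPy and ctypes on the consumer side, int64 offsets, and a byte mask in which 1 means "valid"; all of these are assumptions for illustration, not requirements of the protocol:

```python
import ctypes
from typing import List, Optional

import numpy as np


def buffer_to_ndarray(buf: "Buffer", np_dtype, count: int) -> np.ndarray:
    """Illustrative helper: view `count` elements of `np_dtype` starting at buf.ptr."""
    raw = (ctypes.c_uint8 * buf.bufsize).from_address(buf.ptr)
    return np.frombuffer(raw, dtype=np_dtype, count=count)


def convert_string_column(col: "Column") -> List[Optional[str]]:
    """Illustrative consumer: turn one chunk of a string column into a list of str/None."""
    data_buf, _ = col.get_data_buffer()
    offsets_buf, _ = col.get_offsets()
    mask_buf, _ = col.get_mask()  # assumes a byte-mask null representation

    # n + 1 offsets bound n elements; int64 offsets are assumed here.
    n = offsets_buf.bufsize // 8 - 1
    data = buffer_to_ndarray(data_buf, np.uint8, data_buf.bufsize)
    offsets = buffer_to_ndarray(offsets_buf, np.int64, n + 1)
    valid = buffer_to_ndarray(mask_buf, np.uint8, n)

    out: List[Optional[str]] = []
    for i in range(n):
        if not valid[i]:
            out.append(None)
            continue
        start, stop = offsets[i], offsets[i + 1]
        out.append(bytes(data[start:stop]).decode("utf-8"))
    return out
```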

@@ -337,7 +346,7 @@ class DataFrame:
"""
def __dataframe__(self, nan_as_null : bool = False) -> dict:
"""
Produces a dictionary object following the dataframe protocol spec
Produces a dictionary object following the dataframe protocol specification.

``nan_as_null`` is a keyword intended for the consumer to tell the
producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
@@ -352,7 +361,7 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:

def num_columns(self) -> int:
"""
Return the number of columns in the DataFrame
Return the number of columns in the DataFrame.
"""
pass

@@ -361,13 +370,13 @@ def num_rows(self) -> Optional[int]:
# why include it if it may be None - what do we expect consumers
# to do here?
"""
Return the number of rows in the DataFrame, if available
Return the number of rows in the DataFrame, if available.
"""
pass

def num_chunks(self) -> int:
"""
Return the number of chunks the DataFrame consists of
Return the number of chunks the DataFrame consists of.
"""
pass

@@ -397,7 +406,7 @@ def get_columns(self) -> Iterable[Column]:

def select_columns(self, indices: Sequence[int]) -> DataFrame:
"""
Create a new DataFrame by selecting a subset of columns by index
Create a new DataFrame by selecting a subset of columns by index.
"""
pass

73 changes: 42 additions & 31 deletions protocol/dataframe_protocol_summary.md
@@ -1,8 +1,9 @@
# `__dataframe__` protocol - summary
# The `__dataframe__` protocol

This document aims to describe the scope of the dataframe interchange protocol,
as well as its essential design requirements/principles and the functionality
it needs to support.

_We've had a lot of discussion in a couple of GitHub issues and in meetings.
This description attempts to summarize that, and extract the essential design
requirements/principles and functionality it needs to support._

## Purpose of `__dataframe__`

@@ -11,7 +12,8 @@ a way to convert one type of dataframe into another type (for example,
convert a Koalas dataframe into a Pandas dataframe, or a cuDF dataframe into
a Vaex dataframe).

Currently (Nov'20) there is no way to do this in an implementation-independent way.
Currently (June 2020) there is no way to do this in an
implementation-independent way.

The main use case this protocol intends to enable is to make it possible to
write code that can accept any type of dataframe instead of being tied to a
@@ -30,7 +32,7 @@ def somefunc(df, ...):

### Non-goals

Providing a _complete standardized dataframe API_ is not a goal of the
Providing a _complete, standardized dataframe API_ is not a goal of the
`__dataframe__` protocol. Instead, this is a goal of the full dataframe API
standard, which the Consortium for Python Data API Standards aims to provide
in the future. When that full API standard is implemented by dataframe
@@ -40,8 +42,8 @@ libraries, the example above can change to:
def get_df_module(df):
"""Utility function to support programming against a dataframe API"""
if hasattr(df, '__dataframe_namespace__'):
# Retrieve the namespace
pdx = df.__dataframe_namespace__()
# Retrieve the namespace
pdx = df.__dataframe_namespace__()
else:
# Here we can raise an exception if we only want to support compliant dataframes,
# or convert to our default choice of dataframe if we want to accept (e.g.) dicts
@@ -57,6 +59,7 @@ def somefunc(df, ...):
# From now on, use `df` methods and `pdx` functions/objects
```


### Constraints

An important constraint on the `__dataframe__` protocol is that it should not
@@ -94,13 +97,14 @@ For a protocol to exchange dataframes between libraries, we need both a model
of what we mean by "dataframe" conceptually for the purposes of the protocol,
and a model of how the data is represented in memory:

![Image of a dataframe model, containing chunks, columns and 1-D arrays](conceptual_model_df_memory.png)
![Conceptual model of a dataframe, containing chunks, columns and 1-D arrays](images/dataframe_conceptual_model.png)

The smallest building block are **1-D arrays**, which are contiguous in
memory and contain data with the same dtype. A **column** consists of one or
more 1-D arrays (if, e.g., missing data is represented with a boolean mask,
that's a separate array). A **chunk** contains a set of columns of uniform
length. A **dataframe** contains one or more chunks.
The smallest building blocks are **1-D arrays** (or "buffers"), which are
contiguous in memory and contain data with the same dtype. A **column**
consists of one or more 1-D arrays (if, e.g., missing data is represented with
a boolean mask, that's a separate array). A **dataframe** contains one or more columns.
A column or a dataframe can be "chunked"; a **chunk** is a subset of a column
or dataframe that contains a set of (neighboring) rows.
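
As a concrete illustration of how these building blocks compose for the variable-length string case this PR adds, the column `["joy", None, "hope"]` could be laid out as three 1-D arrays. The choice of int64 offsets and a byte mask below is one possible producer layout, not something the protocol mandates:

```python
import numpy as np

# One possible memory layout for the string column ["joy", None, "hope"]:
data = np.frombuffer(b"joyhope", dtype=np.uint8)  # concatenated UTF-8 bytes of the valid elements
offsets = np.array([0, 3, 3, 7], dtype=np.int64)  # element i spans data[offsets[i]:offsets[i + 1]]
mask = np.array([1, 0, 1], dtype=np.uint8)        # 1 = valid, 0 = missing

# For example, element 2 is bytes(data[offsets[2]:offsets[3]]).decode("utf-8") == "hope".
```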


## Protocol design requirements
@@ -121,7 +125,7 @@ length. A **dataframe** contains one or more chunks.
6. Must avoid device transfers by default (e.g. copy data from GPU to CPU),
and provide an explicit way to force such transfers (e.g. a `force=` or
`copy=` keyword that the caller can set to `True`).
7. Must be zero-copy if possible.
7. Must be zero-copy wherever possible.
8. Must support missing values (`NA`) for all supported dtypes.
9. Must support string, categorical and datetime dtypes.
10. Must allow the consumer to inspect the representation for missing values
Expand All @@ -141,7 +145,7 @@ length. A **dataframe** contains one or more chunks.
_Rationale: prescribing a single in-memory representation in this
protocol would lead to unnecessary copies being made if that representation
isn't the native one a library uses._
_Note: the memory layout is columnnar. Row-major dataframes can use this
_Note: the memory layout is columnar. Row-major dataframes can use this
protocol, but not in a zero-copy fashion (see requirement 2 above)._
12. Must support chunking, i.e. accessing the data in "batches" of rows.
There must be metadata the consumer can access to learn in how many
@@ -168,14 +172,21 @@ We'll also list some things that were discussed but are not requirements:
3. Extension dtypes, i.e. a way to extend the set of dtypes that is
explicitly supported, are out of scope.
_Rationale: complex to support, not used enough to justify that complexity._
4. "virtual columns", i.e. columns for which the data is not yet in memory
4. Support for strided storage in buffers.
_Rationale: this is supported by a subset of dataframes only, mainly those
that use NumPy arrays. In many real-world use cases, strided arrays will
force a copy at some point, so requiring contiguous memory layout (and hence
an extra copy at the moment `__dataframe__` is used) is considered a good
trade-off for reduced implementation complexity._
5. "virtual columns", i.e. columns for which the data is not yet in memory
because it uses lazy evaluation, are not supported other than through
letting the producer materialize the data in memory when the consumer
calls `__dataframe__`.
_Rationale: the full dataframe API will support this use case by
"programming to an interface"; this data interchange protocol is
fundamentally built around describing data in memory_.


### To be decided

_The connection between dataframe and array interchange protocols_. If we
Expand All @@ -194,7 +205,7 @@ _Should there be a standard `from_dataframe` constructor function?_ This
isn't completely necessary; however, it's expected that a full dataframe API
standard will have such a function. The array API standard also has such a
function, namely `from_dlpack`. Adding at least a recommendation on syntax
for this function would make sense, e.g., `from_dataframe(df, stream=None)`.
for this function makes sense, e.g., simply `from_dataframe(df)`.
Discussion at https://github.com/data-apis/dataframe-api/issues/29#issuecomment-685903651
is relevant.
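
A `from_dataframe` entry point for a consuming library could be as small as the sketch below; the choice of pandas as the target and the dict-like return value of `__dataframe__` (per the protocol file in this PR) are noted assumptions, and the actual per-column conversion is left out:

```python
import pandas as pd


def from_dataframe(df) -> pd.DataFrame:
    """Illustrative entry point: build a pandas.DataFrame from any object
    exposing ``__dataframe__``."""
    if isinstance(df, pd.DataFrame):
        return df
    if not hasattr(df, "__dataframe__"):
        raise TypeError("object does not support the dataframe interchange protocol")
    described = df.__dataframe__()  # a dict describing the dataframe, per the spec above
    # A real implementation would walk the described columns/chunks here and
    # convert each one to a pandas column.
    raise NotImplementedError("per-column conversion is out of scope for this sketch")
```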

@@ -209,14 +220,16 @@ except `__dataframe__` is a Python-level rather than C-level interface.
The data types format specification of that interface is something that could
be used unchanged.

The main (only?) limitation seems to be that it does not have device support
- @kkraus14 will bring this up on the Arrow dev mailing list. Also note that
that interface only talks about arrays; dataframes, chunking and the metadata
inspection can all be layered on top in this Python-level protocol, but are
not discussed in the interface itself.
The main limitation seems to be that it does not have device support
-- `@kkraus14` will bring this up on the Arrow dev mailing list. Another
identified issue is that the "deleter" on the Arrow C struct is present at the
column level, and there are use cases for having it at the buffer level
(mixed-device dataframes, more granular control over memory).

Note that categoricals are supported; Arrow uses the phrasing
"dictionary-encoded types" for categorical.
"dictionary-encoded types" for categorical. Also, what it calls "array" means
"column" in the terminology of this document (and every Python dataframe
library).

The Arrow C Data Interface says specifically it was inspired by [Python's
buffer protocol](https://docs.python.org/3/c-api/buffer.html), which is also
@@ -245,7 +258,7 @@ library that implements `__array__` must depend (optionally at least) on
NumPy, and call a NumPy `ndarray` constructor itself from within `__array__`.


### What is wrong with `.to_numpy()` and `.to_arrow()`?

Such a method asks the object it is attached to to turn itself into a NumPy or
Arrow array, which means each library must have at least an optional
@@ -261,7 +274,7 @@ constructor it needs. For example, `x = np.asarray(df['colname'])` (where

### Does an interface describing memory work for virtual columns?

Vaex is an example of a library that can have "virtual columns" (see @maartenbreddels
Vaex is an example of a library that can have "virtual columns" (see `@maartenbreddels`
[comment here](https://github.com/data-apis/dataframe-api/issues/29#issuecomment-686373569)).
If the protocol includes a description of data layout in memory, does that
work for such a virtual column?
@@ -285,17 +298,15 @@ computational graph approach like Dask uses, etc.)._

## Possible direction for implementation

### Rough prototypes
### Rough initial prototypes (historical)

The `cuDFDataFrame`, `cuDFColumn` and `cuDFBuffer` sketched out by @kkraus14
[here](https://github.com/data-apis/dataframe-api/issues/29#issuecomment-685123386)
seems to be in the right direction.
looked like they were in the right direction.

[This prototype](https://github.com/wesm/dataframe-protocol/pull/1) by Wes
McKinney was the first attempt, and has some useful features.

TODO: work this out after making sure we're all on the same page regarding requirements.


### Relevant existing protocols

@@ -363,4 +374,4 @@ The `=`, `<`, `>` are denoting endianness; Arrow only supports native endianness
- [`__array_interface__` protocol](https://numpy.org/devdocs/reference/arrays.interface.html)
- [Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html)
- [DLPack](https://github.com/dmlc/dlpack)
- [Array data interchange in API standard](https://data-apis.github.io/array-api/latest/design_topics/data_interchange.html)
- [Array data interchange in API standard](https://data-apis.github.io/array-api/latest/design_topics/data_interchange.html)
Binary file added protocol/images/dataframe_conceptual_model.png