|
5 | 5 |
|
6 | 6 | For guiding requirements, see https://github.com/data-apis/dataframe-api/pull/35
|
7 | 7 |
|
| 8 | +
|
| 9 | +Concepts in this design |
| 10 | +----------------------- |
| 11 | +
|
| 12 | +1. A `Buffer` class. A *buffer* is a contiguous block of memory - this is the |
| 13 | + only thing that actually maps to a 1-D array in the sense that it could be |
| 14 | + converted to NumPy, CuPy, et al. |
| 15 | +2. A `Column` class. A *column* has a name and a single dtype. It can consist |
| 16 | + of multiple *chunks*. A single chunk of a column (which may be the whole |
| 17 | + column if ``num_chunks == 1``) is again modeled as a `Column` instance, and |
| 18 | + contains 1 data *buffer* and (optionally) one *mask* for missing data. |
| 19 | +3. A `DataFrame` class. A *data frame* is an ordered collection of *columns*. |
| 20 | + It has a single device, and all its rows are the same length. It can consist |
| 21 | + of multiple *chunks*. A single chunk of a data frame is again modeled as |
| 22 | + a `DataFrame` instance. |
| 23 | +4. A *mask* concept. A *mask* of a single-chunk column is a *buffer*. |
| 24 | +5. A *chunk* concept. A *chunk* is a sub-dividing element that can be applied |
| 25 | + to a *data frame* or a *column*. |
| 26 | +
|
| 27 | +Note that the only way to access these objects is through a call to |
| 28 | +``__dataframe__`` on a data frame object. This is NOT meant as public API; |
| 29 | +the classes here only serve to describe the API of |
| 30 | +what is returned by a call to ``__dataframe__``. They are the concepts needed |
| 31 | +to capture the memory layout and data access of a data frame. |
| 32 | +
|
| 33 | +
|
8 | 34 | Design decisions
|
9 | 35 | ----------------
|
10 | 36 |
|
|
31 | 57 | (see cuDF experience, forced to add because pandas has them).
|
32 | 58 | Requiring row names seems worse than leaving them out.
|
33 | 59 |
|
| 60 | +Note that row labels could be added in the future - right now there are no clear |
| 61 | +requirements for more complex row labels that cannot be represented by a single |
| 62 | +column. Such labels do exist; for example, Modin has table and tree-based row |
| 63 | +labels. |
| 64 | +
|
34 | 65 | """
|
35 | 66 |
|
36 | 67 |
|
37 | 68 | class Buffer:
|
38 | 69 | """
|
39 | 70 | Data in the buffer is guaranteed to be contiguous in memory.
|
| 71 | +
|
| 72 | + Note that there is no dtype attribute present, a buffer can be thought of |
| 73 | + as simply a block of memory. However, if the column that the buffer is |
| 74 | + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is |
| 75 | + implemented, then that dtype information will be contained in the return |
| 76 | + value from ``__dlpack__``. |
| 77 | +
|
| 78 | + This distinction is useful to support both (a) data exchange via DLPack on a |
| 79 | + buffer and (b) dtypes like variable-length strings which do not have a |
| 80 | + fixed number of bytes per element. |
40 | 81 | """
|
41 | 82 |
|
42 | 83 | @property
|
@@ -67,6 +108,25 @@ def __dlpack__(self):
|
67 | 108 | """
|
68 | 109 | raise NotImplementedError("__dlpack__")
|
69 | 110 |
|
| 111 | + def __dlpack_device__(self) -> Tuple[enum.IntEnum, int]: |
| 112 | + """ |
| 113 | + Device type and device ID for where the data in the buffer resides. |
| 114 | +
|
| 115 | + Uses device type codes matching DLPack. Enum members are:: |
| 116 | +
|
| 117 | + - CPU = 1 |
| 118 | + - CUDA = 2 |
| 119 | + - CPU_PINNED = 3 |
| 120 | + - OPENCL = 4 |
| 121 | + - VULKAN = 7 |
| 122 | + - METAL = 8 |
| 123 | + - VPI = 9 |
| 124 | + - ROCM = 10 |
| 125 | +
|
| 126 | + Note: must be implemented even if ``__dlpack__`` is not. |
| 127 | + """ |
| 128 | + pass |
| 129 | + |
70 | 130 |
|
71 | 131 | class Column:
|
72 | 132 | """
|
@@ -279,6 +339,11 @@ class DataFrame:
|
279 | 339 | def __dataframe__(self, nan_as_null : bool = False) -> dict:
|
280 | 340 | """
|
281 | 341 | Produces a dictionary object following the dataframe protocol spec
|
| 342 | +
|
| 343 | + ``nan_as_null`` is a keyword intended for the consumer to tell the |
| 344 | + producer to overwrite null values in the data with ``NaN`` (or ``NaT``). |
| 345 | + It is intended for cases where the consumer does not support the bit |
| 346 | + mask or byte mask that is the producer's native representation. |
282 | 347 | """
|
283 | 348 | self._nan_as_null = nan_as_null
|
284 | 349 | return {
|
@@ -354,20 +419,3 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[DataFrame]:
|
354 | 419 | """
|
355 | 420 | pass
|
356 | 421 |
|
357 |
| - @property |
358 |
| - def device(self) -> int: |
359 |
| - """ |
360 |
| - Device type the dataframe resides on. |
361 |
| -
|
362 |
| - Uses device type codes matching DLPack: |
363 |
| -
|
364 |
| - - 1 : CPU |
365 |
| - - 2 : CUDA |
366 |
| - - 3 : CPU pinned |
367 |
| - - 4 : OpenCL |
368 |
| - - 7 : Vulkan |
369 |
| - - 8 : Metal |
370 |
| - - 9 : Verilog |
371 |
| - - 10 : ROCm |
372 |
| - """ |
373 |
| - pass |
|
0 commit comments