"""
Specification for objects to be accessed, for the purpose of dataframe
interchange between libraries, via the ``__dataframe__`` method on a
library's data frame object.

For guiding requirements, see https://github.com/data-apis/dataframe-api/pull/35

Design decisions
----------------

**1. Use a separate column abstraction in addition to a dataframe interface.**

Rationales:
- This is how it works in R, Julia and Apache Arrow.
- Semantically, most existing applications and users treat a column
  similarly to a 1-D array.
- We should be able to connect a column to the array data interchange
  mechanism(s).

Note that this does not imply a library must have such a public user-facing
abstraction (ex. ``pandas.Series``) - it may be accessible only via
``__dataframe__``.

**2. Use methods and properties on an opaque object rather than returning
hierarchical dictionaries describing memory.**

This is better for implementations that may rely on, for example, lazy
computation.

**3. No row names. If a library uses row names, use a regular column for them.**

See discussion at https://github.com/wesm/dataframe-protocol/pull/1/files#r394316241
Optional row names are not a good idea, because people will assume they're
present (see the cuDF experience: cuDF was forced to add them because pandas
has them). Requiring row names seems worse than leaving them out.
"""

from __future__ import annotations

from typing import Any, Dict, Iterable, Optional, Sequence, Tuple


class Buffer:
    """
    Data in the buffer is guaranteed to be contiguous in memory.
    """

    @property
    def bufsize(self) -> int:
        """
        Buffer size in bytes.
        """
        pass

    @property
    def ptr(self) -> int:
        """
        Pointer to the start of the buffer, as an integer.
        """
        pass

    def __dlpack__(self):
        """
        Produce a DLPack capsule (see the array API standard).

        Raises:

        - TypeError : if the buffer contains unsupported dtypes.
        - NotImplementedError : if DLPack support is not implemented.

        Useful to have in order to connect to array libraries. Support is
        optional, because it's not completely trivial to implement for a
        Python-only library.
        """
        raise NotImplementedError("__dlpack__")

    def __array_interface__(self):
        """
        TBD: implement or not? Would work for all dtypes except bit masks.
        """
        raise NotImplementedError("__array_interface__")
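

# A minimal, hypothetical sketch (not part of the spec) of a producer-side
# ``Buffer`` backed by a NumPy array, showing how ``bufsize`` and ``ptr``
# can be derived from an existing contiguous allocation. ``_NumPyBuffer``
# and the NumPy dependency are illustrative assumptions only.
import numpy as np


class _NumPyBuffer(Buffer):
    def __init__(self, arr: "np.ndarray") -> None:
        if not arr.flags.c_contiguous:  # the spec requires contiguous memory
            raise ValueError("buffer data must be contiguous")
        self._arr = arr

    @property
    def bufsize(self) -> int:
        # total size of the allocation, in bytes
        return self._arr.nbytes

    @property
    def ptr(self) -> int:
        # address of the first element, as a plain integer
        return self._arr.__array_interface__["data"][0]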


class Column:
    """
    A column object, with only the methods and properties required by the
    interchange protocol defined.

    A column can contain one or more chunks. Each chunk can contain either
    one or two buffers - one data buffer and (depending on the null
    representation) possibly a mask buffer.

    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
         Instead, it seems to use "children" for both columns with a bit mask,
         and for nested dtypes. Unclear whether this is elegant or confusing.
         This design requires checking the null representation explicitly.

         The Arrow design requires checking:
         1. the ARROW_FLAG_NULLABLE flag (for sentinel values)
         2. if a column has two children, combined with one of those children
            having a null dtype.

         Making the mask concept explicit seems useful. One null dtype would
         not be enough to cover both bit and byte masks, so that would mean
         even more checking if we did it the Arrow way.

    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
         multiple buffers per array (= column here). Semantically it may make
         sense to have both: chunks were meant, for example, for lazy
         evaluation of data which doesn't fit in memory, while multiple
         buffers per column could also come from doing a selection operation
         on a single contiguous buffer.

         Given these concepts, one would expect chunks to be all of the same
         size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
         while multiple buffers could have data-dependent lengths. Not an
         issue in pandas if one column is backed by a single NumPy array, but
         in Arrow it seems possible.
         Are multiple chunks *and* multiple buffers per column necessary for
         the purposes of this interchange protocol, or must producers either
         reuse the chunk concept for this or copy the data?

    Note: this Column object can only be produced by ``__dataframe__``, so
          it doesn't need its own version or ``__column__`` protocol.
    """
    @property
    def name(self) -> str:
        """
        The column name, as a string.
        """
        pass
    @property
    def size(self) -> Optional[int]:
        """
        Size of the column, in elements.

        Corresponds to ``DataFrame.num_rows()`` if the column is a single
        chunk; equal to the size of the current chunk otherwise.
        """
        pass

    @property
    def offset(self) -> int:
        """
        Offset of the first element.

        May be > 0 if using chunks; for example, for a column with N chunks of
        equal size M (only the last chunk may be shorter),
        ``offset = n * M``, ``n = 0 .. N-1``.
        """
        pass
    @property
    def dtype(self) -> Tuple[int, int, str, str]:
        """
        Dtype description as a tuple ``(kind, bit-width, format string,
        endianness)``.

        Kind :

        - 0 : signed integer
        - 1 : unsigned integer
        - 2 : IEEE floating point
        - 20 : boolean
        - 21 : string (UTF-8)
        - 22 : datetime
        - 23 : categorical

        Bit-width : the number of bits as an integer
        Format string : data type description format string in Apache Arrow C
                        Data Interface format.
        Endianness : currently only native endianness (``=``) is supported

        Notes:

        - Kind specifiers are aligned with DLPack where possible (hence the
          jump to 20, leaving enough room for future extension).
        - Masks must be specified as boolean with either bit width 1 (for bit
          masks) or 8 (for byte masks).
        - Dtype width in bits was preferred over bytes.
        - Endianness isn't too useful, but is included now in case we need to
          support non-native endianness in the future.
        - Went with Apache Arrow format strings over NumPy format strings,
          because they're more complete from a dataframe perspective.
        - Format strings are mostly useful for datetime specification, and
          for categoricals.
        - For categoricals, the format string describes the type of the
          categorical in the data buffer. In case of a separate encoding of
          the categorical (e.g. an integer-to-string mapping), this can
          be derived from ``self.describe_categorical``.
        - Data types not included: complex, Arrow-style null, binary, decimal,
          and nested (list, struct, map, union) dtypes.
        """
        pass
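
    # Illustrative examples, not normative: under this scheme, an int64 column
    # would describe itself as ``(0, 64, "l", "=")`` (kind 0 = signed integer,
    # 64 bits, Arrow C Data Interface format string "l", native endianness),
    # and a float32 column as ``(2, 32, "f", "=")``.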

    @property
    def describe_categorical(self) -> Dict[str, Any]:
        """
        If the dtype is categorical, there are two options:

        - There are only values in the data buffer.
        - There is a separate dictionary-style encoding for categorical values.

        Raises RuntimeError if the dtype is not categorical.

        Content of the returned dict:

        - "is_ordered" : bool, whether the ordering of dictionary indices is
                         semantically meaningful.
        - "is_dictionary" : bool, whether a dictionary-style mapping of
                            categorical values to other objects exists.
        - "mapping" : dict, Python-level only (e.g. ``{int: str}``).
                      None if not a dictionary-style categorical.

        TBD: are there any other in-memory representations that are needed?
        """
        pass
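
    # Illustrative example, not normative: a dictionary-encoded categorical
    # with ordered categories "a" < "b" could be described as
    # ``{"is_ordered": True, "is_dictionary": True, "mapping": {0: "a", 1: "b"}}``,
    # while one storing the values directly in the data buffer could be
    # ``{"is_ordered": False, "is_dictionary": False, "mapping": None}``.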

    @property
    def describe_null(self) -> Tuple[int, Any]:
        """
        Return the missing value (or "null") representation the column dtype
        uses, as a tuple ``(kind, value)``.

        Kind:

        - 0 : NaN/NaT
        - 1 : sentinel value
        - 2 : bit mask
        - 3 : byte mask

        Value : if kind is "sentinel value", the actual value. None otherwise.
        """
        pass
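
    # Illustrative examples, not normative: a float column using NaN to
    # represent missing values would return ``(0, None)``; an integer column
    # using -1 as a sentinel would return ``(1, -1)``; a column with an
    # Arrow-style validity bitmap would return ``(2, None)``, with the mask
    # itself available via ``get_mask()``.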

    @property
    def null_count(self) -> Optional[int]:
        """
        Number of null elements, if known.

        Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
        """
        pass

    def num_chunks(self) -> int:
        """
        Return the number of chunks the column consists of.
        """
        pass

    def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[Column]:
        """
        Return an iterator yielding the chunks.

        See ``DataFrame.get_chunks`` for details on ``n_chunks``.
        """
        pass

    def get_buffer(self) -> Buffer:
        """
        Return the buffer containing the data.
        """
        pass
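
    # A hedged consumer-side sketch, assuming a CPU-resident int64 column;
    # ``col`` is a hypothetical Column and only the standard library is used:
    #
    #     import ctypes
    #     buf = col.get_buffer()
    #     n = buf.bufsize // 8                        # 8 bytes per int64
    #     raw = (ctypes.c_int64 * n).from_address(buf.ptr)
    #     values = list(raw)                          # copies into Python ints
    #
    # Real consumers would typically hand ``buf.ptr`` to an array library
    # instead of copying element by element.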

    def get_mask(self) -> Buffer:
        """
        Return the buffer containing the mask values indicating missing data.

        Raises RuntimeError if the null representation is not a bit or byte
        mask.
        """
        pass
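
    # A hedged sketch of interpreting the mask for element ``i``. It assumes
    # a set bit/byte marks a valid (non-null) element and Arrow-style
    # least-significant-bit ordering for bit masks - neither is pinned down
    # by this spec; ``col`` and ``mask_bytes`` are hypothetical:
    #
    #     kind, _ = col.describe_null
    #     if kind == 3:                                    # byte mask
    #         valid = mask_bytes[i] != 0
    #     elif kind == 2:                                    # bit mask
    #         valid = (mask_bytes[i // 8] >> (i % 8)) & 1 == 1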

# # NOTE: not needed unless one considers nested dtypes
# def get_children(self) -> Iterable[Column]:
#     """
#     Children columns underneath the column; each object in this iterator
#     must adhere to the column specification.
#     """
#     pass


class DataFrame:
    """
    A data frame class, with only the methods required by the interchange
    protocol defined.

    A "data frame" represents an ordered collection of named columns.
    A column's "name" must be a unique string.
    Columns may be accessed by name or by position.

    This could be a public data frame class, or an object with the methods
    and attributes defined on this DataFrame class could be returned from
    the ``__dataframe__`` method of a public data frame class in a library
    adhering to the dataframe interchange protocol specification.
    """
    def __dataframe__(self, nan_as_null: bool = False) -> dict:
        """
        Produce a dictionary object following the dataframe protocol spec.
        """
        self._nan_as_null = nan_as_null
        return {
            "dataframe": self,  # DataFrame object adhering to the protocol
            "version": 0,       # Version number of the protocol
        }
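
    # A hedged consumer-side sketch of the overall flow (``df`` is any object
    # exposing ``__dataframe__``; all names are illustrative only):
    #
    #     d = df.__dataframe__()
    #     assert d["version"] == 0          # check the protocol version
    #     inner = d["dataframe"]
    #     for name in inner.column_names():
    #         col = inner.get_column_by_name(name)
    #         kind, bitwidth, fmt, endianness = col.dtype
    #         buf = col.get_buffer()        # then interpret buf.ptr/buf.bufsize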

    def num_columns(self) -> int:
        """
        Return the number of columns in the DataFrame.
        """
        pass

    def num_rows(self) -> Optional[int]:
        # TODO: not happy with Optional, but need to flag it may be expensive
        #       why include it if it may be None - what do we expect consumers
        #       to do here?
        """
        Return the number of rows in the DataFrame, if available.
        """
        pass

    def num_chunks(self) -> int:
        """
        Return the number of chunks the DataFrame consists of.
        """
        pass

    def column_names(self) -> Iterable[str]:
        """
        Return an iterator yielding the column names.
        """
        pass

    def get_column(self, i: int) -> Column:
        """
        Return the column at the indicated position.
        """
        pass

    def get_column_by_name(self, name: str) -> Column:
        """
        Return the column with the indicated name.
        """
        pass

    def get_columns(self) -> Iterable[Column]:
        """
        Return an iterator yielding the columns.
        """
        pass

    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by index.
        """
        pass

    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """
        Create a new DataFrame by selecting a subset of columns by name.
        """
        pass

    def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable[DataFrame]:
        """
        Return an iterator yielding the chunks.

        By default (``None``), yields the chunks as the producer stores them.
        If given, ``n_chunks`` must be a multiple of ``self.num_chunks()``,
        meaning the producer must subdivide each stored chunk before yielding
        it.
        """
        pass
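
    # Illustrative example, not normative: a producer storing 2 chunks can
    # honor ``get_chunks(n_chunks=4)`` by splitting each stored chunk in two,
    # yielding 4 chunks in total; ``get_chunks(n_chunks=3)`` would be invalid,
    # because 3 is not a multiple of ``self.num_chunks() == 2``.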

    @property
    def device(self) -> int:
        """
        Device type the dataframe resides on.

        Uses device type codes matching DLPack:

        - 1 : CPU
        - 2 : CUDA
        - 3 : CPU pinned
        - 4 : OpenCL
        - 7 : Vulkan
        - 8 : Metal
        - 9 : Verilog
        - 10 : ROCm
        """
        pass
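

def _require_cpu(df: DataFrame) -> None:
    # Hypothetical consumer-side helper, not part of the spec: buffer
    # pointers are only meaningful on the consumer's own device, so a
    # CPU-only consumer could guard on ``device`` before dereferencing
    # any ``Buffer.ptr`` values.
    if df.device != 1:  # 1 == CPU in the DLPack device codes above
        raise RuntimeError("can only consume CPU-resident dataframes")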