diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index c11da438..c9253fec 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -17,6 +17,95 @@ class Column: constructor functions or an already-created dataframe object retrieved via """ + def to_array_obj(self, *, null_handling: str | None = None) -> object: + """ + Obtain an object that can be used as input to ``asarray`` or ``from_dlpack`` + + The returned object must only be used for one thing: calling the ``asarray`` + or ``from_dlpack`` functions of a library implementing the array API + standard, or equivalent ``asarray``/``from_dlpack`` functions to the + one in that standard. In practice that means that the returned object + may either already *be* an array type of a specific array library, or + it may implement one or more of the following methods or behaviors: + + - ``__dlpack__`` + - the Python buffer protocol + - ``__array__`` + - ``__array_interface__`` + - ``__cuda_array_interface__`` + - the Python ``Sequence`` interface + - any other method that is known to work with an ``asarray`` function + in a library of interest + + Importantly, the returned object must only implement/expose those + methods that work. An ``asarray`` function may try to use any of the + above methods, and the order in which it does so is not guaranteed. + Hence it is expected that if a method is present, it works correctly + and does not raise an exception (e.g., because of an unsupported dtype + or device). + + .. admonition:: Tip + + One way to expose methods only if they will work for the dtype, + device, and other characteristics of the column is to hide access to + the methods dynamically, so ``hasattr`` does the right thing. 
For + example:: + + def __dir__(self): + methods = dir(self.__class__) + attrs = list(self.__dict__.keys()) + keys = methods + attrs + if self.dtype not in _dlpack_supported_dtypes: + keys.remove("__dlpack__") + + return keys + + def __dlpack__(self): + ... + + Parameters + ---------- + null_handling : str or None + Determine how to treat ``null`` values that may be present in the + column. Valid options are: + + - ``None`` (default): no special handling. This assumes that either + no missing values are present, or there is an array type with + native support for missing values that is/can be converted to. + *Note: there is currently no such library that is in wide use; + NumPy's masked arrays are not recommended, and other array + libraries do not support missing values at all.* + - ``raise``: always raise a ``ValueError`` if nulls are present. + - ``to-nan``: for floating-point dtypes, convert any nulls to ``nan``. + For other dtypes, do the same as ``None``. + + Note that if it is desired to convert nulls to a dtype-specific + sentinel value, the user should do this before calling + ``to_array_obj``, by using ``.isnull()`` and replacing the values + directly. + + Returns + ------- + array_obj : object + Either a custom object or an instance of the array type of a + specific array library. In the latter case, the array must have the + closest matching dtype of the array library (e.g., a column with a + floating-point dtype must produce the corresponding floating-point + dtype of the same precision that the array library offers). + + Raises + ------ + TypeError + In case it is not possible to convert the column to any (known) array + library type, or use any of the possible interchange methods. + This can be due to the dtype (e.g., no array library supports datetime + dtypes with a time zone), device, or other reasons. + ValueError + If the column contains ``null`` values which prevent returning an + array object. 
+ + """ + @classmethod def from_sequence(cls, sequence: Sequence[object], dtype: dtype) -> Column: """