Add details to expectations for scalars #308


Merged: 31 commits, Nov 17, 2023
Changes shown are from 9 commits.

Commits:
751a131
note what may raise
MarcoGorelli Oct 31, 2023
7cb90ea
Merge remote-tracking branch 'upstream/main' into expand-on-scalars
MarcoGorelli Nov 7, 2023
fc65648
list required methods
MarcoGorelli Nov 8, 2023
7c24afd
add scalar class
MarcoGorelli Nov 8, 2023
2714c13
reword
MarcoGorelli Nov 8, 2023
99b91a5
fixup
MarcoGorelli Nov 8, 2023
a867f00
fixup
MarcoGorelli Nov 8, 2023
9e13924
Merge remote-tracking branch 'upstream/main' into expand-on-scalars
MarcoGorelli Nov 8, 2023
f197672
fixup
MarcoGorelli Nov 8, 2023
a417b1c
Merge remote-tracking branch 'upstream/main' into expand-on-scalars
MarcoGorelli Nov 14, 2023
409d8f3
replace Scalar|NullType with Scalar
MarcoGorelli Nov 14, 2023
0db9871
type null as Scalar
MarcoGorelli Nov 14, 2023
6520ac4
add example of working with scalars
MarcoGorelli Nov 14, 2023
46bc08c
use AnyScalar;
MarcoGorelli Nov 15, 2023
b879f31
add Scalar.dtype and Scalar.persist
MarcoGorelli Nov 15, 2023
456c152
Merge remote-tracking branch 'upstream/main' into expand-on-scalars
MarcoGorelli Nov 15, 2023
b8011c7
update shift arg
MarcoGorelli Nov 15, 2023
97d8f9a
use BoolScalar
MarcoGorelli Nov 15, 2023
3b7bcb6
use float scalar in some parts
MarcoGorelli Nov 15, 2023
d598a8d
use float scalar in some parts
MarcoGorelli Nov 15, 2023
a12585b
string scalar for rename
MarcoGorelli Nov 15, 2023
35cd4ed
intscalar for shift
MarcoGorelli Nov 15, 2023
fade164
numeric scalar for correction
MarcoGorelli Nov 15, 2023
29ceed2
simplify
MarcoGorelli Nov 15, 2023
bee402f
update python builtin types desc
MarcoGorelli Nov 15, 2023
d1f4daf
Merge remote-tracking branch 'upstream/main' into expand-on-scalars
MarcoGorelli Nov 15, 2023
15090ac
fixup
MarcoGorelli Nov 15, 2023
24d2ad8
enable extra ruff rule, note AnyScalar
MarcoGorelli Nov 15, 2023
8360d96
remove some unnecessary nitpick ignores
MarcoGorelli Nov 15, 2023
f69679a
return Self from Scalar.persist, add column.persist
MarcoGorelli Nov 17, 2023
216b5e6
fixup
MarcoGorelli Nov 17, 2023
3 changes: 2 additions & 1 deletion spec/API_specification/dataframe_api/column_object.py
@@ -5,7 +5,8 @@
if TYPE_CHECKING:
from typing_extensions import Self

from .typing import DType, Namespace, NullType, Scalar
from .scalar_object import Scalar
from .typing import DType, Namespace, NullType


__all__ = ["Column"]
87 changes: 87 additions & 0 deletions spec/API_specification/dataframe_api/scalar_object.py
@@ -0,0 +1,87 @@
from __future__ import annotations

from typing import Any, Protocol

__all__ = ["Scalar"]


class Scalar(Protocol):
Collaborator:

I think we need a dtype property similar to what we have with Columns?

MarcoGorelli (Contributor Author), Nov 14, 2023:

to be honest I'd be fine with departing completely from the idea of ducktyped Python scalars and just adding extra things (like dtype or persist) if they're useful

let's discuss this part in the next call

this would mean that Python scalars would no longer implement the Scalar Protocol

Collaborator:

Is there a clean way from a typing perspective to allow people to pass either Scalar objects or typical Python scalars that we can handle constructing Scalars from without issue? Without that I think the API will be a bit of a pain with people having to sprinkle Scalar(...) or Scalar.from_py(...) all over.

We should also have some way to go from Python scalars to these Scalar objects.

Contributor Author:

Yes, I've added an example (spec/API_specification/examples/05_scalars_example.py)

Python scalars do in fact implement the Scalar protocol, so they can be passed without any issue

I think this is an argument against adding dtype and other methods to our scalars which aren't supported by Python ones

Contributor Author:

Python scalars do in fact implement the Scalar protocol, so they can be passed without any issue

at least, Python floats do

trying to do fill_null('foo') wouldn't be fine

I'm starting to see the writing on the wall for FloatScalar / IntScalar / StringScalar ...
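The worry above can be made concrete with a structural-typing sketch. The protocol below is an illustrative subset, not the spec's `Scalar` definition: builtin `float` satisfies an arithmetic protocol structurally, while `str` does not, which is what motivates separate `FloatScalar` / `StringScalar` classes.

```python
from typing import Any, Protocol, runtime_checkable


@runtime_checkable
class ArithmeticScalar(Protocol):
    """Illustrative subset of an arithmetic scalar protocol."""

    def __add__(self, other: Any) -> Any: ...
    def __neg__(self) -> Any: ...


# Builtin floats structurally satisfy the protocol...
assert isinstance(1.5, ArithmeticScalar)
# ...but strings do not: str defines __add__ but has no __neg__,
# which is why fill_null('foo') on a numeric column is suspect.
assert not isinstance("foo", ArithmeticScalar)
```

Note that `runtime_checkable` protocols only check for the presence of methods, not their signatures, so this is a coarse check.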

Collaborator:

I imagine Scalars would be typed following the Column dtypes, which then allows us to define a consistent set of type handling and type promotion rules. Otherwise, if Scalars are not typed similarly to Columns, you may need to introspect the value of a Scalar for example in calling fill_null against an Int8 dtype column with an IntScalar where the value is above what is supported by Int8.
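The promotion concern can be sketched as follows (the helper below is hypothetical, not part of the spec): if scalars carry no dtype, a library filling nulls in an `Int8` column has to introspect the raw value at runtime.

```python
# Hypothetical runtime check a library would need if scalars carried no
# dtype: verify a fill value actually fits in the target Int8 column.
INT8_MIN, INT8_MAX = -128, 127


def check_int8_fill_value(value: int) -> int:
    """Raise if `value` cannot be represented in an Int8 column."""
    if not INT8_MIN <= value <= INT8_MAX:
        msg = f"{value} is out of range for an Int8 column"
        raise OverflowError(msg)
    return value


check_int8_fill_value(5)  # fine
# check_int8_fill_value(300) would raise OverflowError
```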

"""Scalar object.

Not meant to be instantiated directly, but rather created via
`:meth:Column.get_value` or one of the column reductions such
as `:meth:`Column.sum`.
"""

def __lt__(self, other: Any) -> Scalar:
...

def __le__(self, other: Any) -> Scalar:
...

def __eq__(self, other: object) -> Scalar: # type: ignore[override]
...

def __ne__(self, other: object) -> Scalar: # type: ignore[override]
...

def __gt__(self, other: Any) -> Scalar:
...

def __ge__(self, other: Any) -> Scalar:
...

def __add__(self, other: Any) -> Scalar:
...

def __radd__(self, other: Any) -> Scalar:
...

def __sub__(self, other: Any) -> Scalar:
...

def __rsub__(self, other: Any) -> Scalar:
...

def __mul__(self, other: Any) -> Scalar:
...

def __rmul__(self, other: Any) -> Scalar:
...

def __mod__(self, other: Any) -> Scalar:
...

def __rmod__(self, other: Any) -> Scalar:
...

def __pow__(self, other: Any) -> Scalar:
...

def __rpow__(self, other: Any) -> Scalar:
...

def __floordiv__(self, other: Any) -> Scalar:
...

def __rfloordiv__(self, other: Any) -> Scalar:
...

def __truediv__(self, other: Any) -> Scalar:
...

def __rtruediv__(self, other: Any) -> Scalar:
...

def __neg__(self) -> Scalar:
...

def __abs__(self) -> Scalar:
...

def __bool__(self) -> bool:
"""Note that this return a Python scalar.

Depending on the implementation, this may raise or trigger computation.
"""
...
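As a sketch of how an eager library might satisfy this protocol (the `EagerScalar` class below is hypothetical, not part of the spec): comparisons and arithmetic return another scalar of the same class, and only `__bool__` materialises a builtin `bool`.

```python
from __future__ import annotations

from typing import Any


class EagerScalar:
    """Hypothetical eager implementation: wraps a plain Python value."""

    def __init__(self, value: Any) -> None:
        self._value = value

    def _unwrap(self, other: Any) -> Any:
        return other._value if isinstance(other, EagerScalar) else other

    def __gt__(self, other: Any) -> EagerScalar:
        # Comparison returns another scalar, not a builtin bool.
        return EagerScalar(self._value > self._unwrap(other))

    def __add__(self, other: Any) -> EagerScalar:
        return EagerScalar(self._value + self._unwrap(other))

    def __bool__(self) -> bool:
        # Eager libraries can materialise immediately; lazy ones may raise.
        return bool(self._value)


total = EagerScalar(3.0) + EagerScalar(1.5)  # still an EagerScalar
assert bool(total > 4.0)
```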
6 changes: 2 additions & 4 deletions spec/API_specification/dataframe_api/typing.py
@@ -14,6 +14,8 @@
from dataframe_api.groupby_object import Aggregation as AggregationT
from dataframe_api.groupby_object import GroupBy

from .scalar_object import Scalar

if TYPE_CHECKING:
from collections.abc import Sequence

@@ -53,9 +55,6 @@
Duration,
]

# Type alias: Mypy needs Any, but for readability we need to make clear this
# is a Python scalar (i.e., an instance of `bool`, `int`, `float`, `str`, etc.)
Scalar = Any
# null is a special object which represents a missing value.
# It is not valid as a type.
NullType = Any
Collaborator:

I wonder if we can just have the special NULL scalar singleton be of type Scalar instead of a special type here? Not sure what NULL.dtype would yield though since we don't have a null or empty dtype

Contributor Author:

yes, brilliant point, thanks

I guess it could just return None?

Contributor Author:

from #308 (comment), I think we may need to end up with NullScalar

@@ -183,5 +182,4 @@ def __column_consortium_standard__(
"Scalar",
"SupportsColumnAPI",
"SupportsDataFrameAPI",
"Scalar",
]
48 changes: 35 additions & 13 deletions spec/design_topics/python_builtin_types.md
@@ -18,7 +18,7 @@ class DataFrame:
...

class Column:
def mean(self, skip_nulls: bool = True) -> float | NullType:
def mean(self, skip_nulls: bool = True) -> Scalar | NullType:
Collaborator:

I think it's weird that this can return either a Scalar which is a protocol or a NullType which doesn't guarantee the same interface. It makes it hard to guarantee the ability to nicely method chain here.

Contributor Author:

thanks, you're right - it should just be -> Scalar, correct? Because Scalar could be backed by a null value

Contributor Author:

if indeed we did go with #308 (comment), then Scalar could be a union of FloatScalar, IntScalar, NullScalar, ...

Collaborator:

I don't know if we would want Scalar to be type erased similar to Column

Contributor Author:

Could you clarify please? You could do isinstance(value, namespace.FloatScalar) to check the dtype

Collaborator:

Apologies, we're aligned here. We have different classes per type as opposed to just a top level Scalar class where type specific functionality is hidden underneath the class (similar to our Column class).

...

larger = df2 > df1.col('foo').mean()
@@ -27,15 +27,37 @@
For a GPU dataframe library, it is desirable for all data to reside on the GPU,
and not incur a performance penalty from synchronizing instances of Python
builtin types to CPU. In the above example, the `.mean()` call returns a
`float`. It is likely beneficial though to implement this as a library-specific
scalar object which duck types with `float`. This means that it should (a) have
the same semantics as a builtin `float` when used within a library, and (b)
support usage as a `float` outside of the library (i.e., implement
`__float__`). Duck typing is usually not perfect, for example `isinstance`
usage on the float-like duck type will behave differently. Such explicit "type
of object" checks don't have to be supported.

The following design rule applies everywhere builtin Python types are used
within this API standard: _where a Python builtin type is specified, an
implementation may always replace it by an equivalent library-specific type
that duck types with the Python builtin type._
`Scalar`. It is likely beneficial though to implement this as a library-specific
scalar object which (partially) duck types with `float`. The required methods it
must implement are listed in the spec for class `Scalar`.

## Example

For example, if a library implements `FancyFloat` and `FancyBool` scalars,
then the following should all be supported:
```python
df: DataFrame
column_1: Column = df.col('a')
column_2: Column = df.col('b')

scalar: FancyFloat = column_1.std()
result_1: Column = column_2 - column_1.std()
result_2: FancyBool = column_2.std() > column_1.std()
```

Note that the scalars above are library-specific ones - they may be used to keep
data on GPU, or to keep data lazy.

The following, however, may raise, depending on the
implementation:
```python
df: DataFrame
column = df.col('a')

if column.std() > 0: # this line may raise!
print('std is positive')
```
This is because `if column.std() > 0` will call `(column.std() > 0).__bool__()`,
which Python requires to return a Python `bool`.
Therefore, a purely lazy dataframe library may choose to raise here, whereas
one which allows for eager execution may return a Python `bool`.
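A minimal sketch of the lazy side of this trade-off (the `LazyScalar` class and its expression strings are hypothetical, not part of the spec): comparisons stay lazy, and `__bool__` raises rather than silently triggering computation.

```python
class LazyScalar:
    """Hypothetical lazy scalar: records an expression instead of computing."""

    def __init__(self, expr: str) -> None:
        self._expr = expr

    def __gt__(self, other: object) -> "LazyScalar":
        # Build up the expression lazily; nothing is computed here.
        return LazyScalar(f"({self._expr} > {other!r})")

    def __bool__(self) -> bool:
        # Python insists that __bool__ return a real bool, so a purely lazy
        # library must either raise or implicitly trigger computation.
        msg = "cannot convert a lazy scalar to bool; materialise it first"
        raise TypeError(msg)


condition = LazyScalar("col('a').std()") > 0  # fine: stays lazy
try:
    if condition:  # `if` calls __bool__, which raises
        pass
except TypeError as exc:
    print(exc)
```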