data-apis · MarcoGorelli · Oct 26, 2023 · Oct 23, 2023 · Oct 23, 2023 · Oct 23, 2023
diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py
@@ -40,6 +40,7 @@
     "Duration",
     "String",
     "is_dtype",
+    "Aggregation",
 ]
 
 

diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py
@@ -1,12 +1,15 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Protocol
 
 if TYPE_CHECKING:
     from .dataframe_object import DataFrame
 
 
-__all__ = ['GroupBy']
+__all__ = [
+    "Aggregation",
+    "GroupBy",
+]
 
 
 class GroupBy:
@@ -51,3 +54,71 @@ def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> DataFr
 
     def size(self) -> DataFrame:
         ...
+
+    def aggregate(self, *aggregation: Aggregation) -> DataFrame:
+        """
+        Aggregate the grouped columns with one or more ``Aggregation`` objects.
+
+        Examples
+        --------
+        >>> df: DataFrame
+        >>> namespace = df.__dataframe_namespace__()
+        >>> agg = namespace.Aggregation
+        >>> df.group_by('year').aggregate(
+        ...     agg.sum('l_quantity').rename('sum_qty'),
+        ...     agg.mean('l_quantity').rename('avg_qty'),
+        ...     agg.mean('l_extended_price').rename('avg_price'),
+        ...     agg.mean('l_discount').rename('avg_disc'),
+        ...     agg.size().rename('count_order'),
+        ... )
+        """
+
+class Aggregation(Protocol):
+    """Aggregation that can be passed to ``GroupBy.aggregate``."""
+
+    def rename(self, name: str) -> Aggregation:
+        """Assign given name to output of aggregation."""
+
+    @classmethod
+    def any(cls, column: str, *, skip_nulls: bool = True) -> Aggregation:
+        """Return an aggregation that applies ``any`` to ``column``."""
+
+    @classmethod
+    def all(cls, column: str, *, skip_nulls: bool = True) -> Aggregation:
+        """Return an aggregation that applies ``all`` to ``column``."""
+
+    @classmethod
+    def min(cls, column: str, *, skip_nulls: bool = True) -> Aggregation:
+        """Return an aggregation that applies ``min`` to ``column``."""
+
+    @classmethod
+    def max(cls, column: str, *, skip_nulls: bool = True) -> Aggregation:
+        """Return an aggregation that applies ``max`` to ``column``."""
+
+    @classmethod
+    def sum(cls, column: str, *, skip_nulls: bool = True) -> Aggregation:
+        """Return an aggregation that applies ``sum`` to ``column``."""
+
+    @classmethod
+    def prod(cls, column: str, *, skip_nulls: bool = True) -> Aggregation:
+        """Return an aggregation that applies ``prod`` to ``column``."""
+
+    @classmethod
+    def median(cls, column: str, *, skip_nulls: bool = True) -> Aggregation:
+        """Return an aggregation that applies ``median`` to ``column``."""
+
+    @classmethod
+    def mean(cls, column: str, *, skip_nulls: bool = True) -> Aggregation:
+        """Return an aggregation that applies ``mean`` to ``column``."""
+
+    @classmethod
+    def std(cls, column: str, *, correction: int | float = 1, skip_nulls: bool = True) -> Aggregation:
+        """Return an aggregation that applies ``std`` to ``column``."""
+
+    @classmethod
+    def var(cls, column: str, *, correction: int | float = 1, skip_nulls: bool = True) -> Aggregation:
+        """Return an aggregation that applies ``var`` to ``column``."""
+
+    @classmethod
+    def size(cls) -> Aggregation:
+        """Return a ``size`` aggregation; it takes no column argument."""
diff --git a/spec/API_specification/dataframe_api/typing.py b/spec/API_specification/dataframe_api/typing.py
@@ -15,7 +15,7 @@
 
 from dataframe_api.column_object import Column
 from dataframe_api.dataframe_object import DataFrame
-from dataframe_api.groupby_object import GroupBy
+from dataframe_api.groupby_object import GroupBy, Aggregation as AggregationT
 
 if TYPE_CHECKING:
     from .dtypes import (
@@ -147,7 +147,9 @@ def is_null(value: object, /) -> bool:
     @staticmethod
     def is_dtype(dtype: Any, kind: str | tuple[str, ...]) -> bool:
         ...
-
+
+    class Aggregation(AggregationT):
+        """Aggregation protocol, exposed as an attribute of the enclosing class."""
 
 class SupportsDataFrameAPI(Protocol):
     def __dataframe_consortium_standard__(
@@ -163,6 +165,7 @@ def __column_consortium_standard__(
 
 
 __all__ = [
+    "Aggregation",
     "Column",
     "DataFrame",
     "DType",

diff --git a/spec/API_specification/examples/tpch/q1.py b/spec/API_specification/examples/tpch/q1.py
@@ -0,0 +1,39 @@
+from typing import Any, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from dataframe_api.typing import SupportsDataFrameAPI
+
+
+def query(lineitem_raw: "SupportsDataFrameAPI") -> Any:  # quoted: name is imported only under TYPE_CHECKING
+    """Run TPC-H query 1 over ``lineitem_raw`` and return ``result.dataframe``.
+
+    Only rows with ``l_shipdate`` <= 1998-09-02 are kept before grouping and sorting.
+    """
+    lineitem = lineitem_raw.__dataframe_consortium_standard__()
+    namespace = lineitem.__dataframe_namespace__()
+
+    mask = lineitem.get_column_by_name("l_shipdate") <= namespace.date(1998, 9, 2)  # type: ignore
+    disc_price = (
+        lineitem.get_column_by_name("l_extended_price")
+        * (1 - lineitem.get_column_by_name("l_discount"))
+    )
+    charge = disc_price * (1 + lineitem.get_column_by_name("l_tax"))
+    lineitem = lineitem.assign(
+        [disc_price.rename("l_disc_price"), charge.rename("l_charge")]
+    )
+    result = (
+        lineitem.filter(mask)
+        .group_by(["l_returnflag", "l_linestatus"])
+        .aggregate(
+            namespace.Aggregation.sum("l_quantity").rename("sum_qty"),
+            namespace.Aggregation.sum("l_extended_price").rename("sum_base_price"),
+            namespace.Aggregation.sum("l_disc_price").rename("sum_disc_price"),
+            namespace.Aggregation.sum("l_charge").rename("sum_charge"),
+            namespace.Aggregation.mean("l_quantity").rename("avg_qty"),
+            namespace.Aggregation.mean("l_extended_price").rename("avg_price"),
+            namespace.Aggregation.mean("l_discount").rename("avg_disc"),
+            namespace.Aggregation.size().rename("count_order"),
+        )
+        .sort(["l_returnflag", "l_linestatus"])
+    )
+    return result.dataframe
diff --git a/spec/API_specification/examples/tpch/q5.py b/spec/API_specification/examples/tpch/q5.py
@@ -68,7 +68,6 @@ def query(
         * (1 - result.get_column_by_name("l_discount"))
     ).rename("revenue")
     result = result.assign(new_column)
-    result = result.select(["revenue", "n_name"])
-    result = result.group_by("n_name").sum()
+    result = result.group_by("n_name").aggregate(namespace.Aggregation.sum("revenue"))
 
     return result.dataframe
diff --git a/spec/conf.py b/spec/conf.py
@@ -84,6 +84,7 @@
     ('py:class', 'Scalar'),
     ('py:class', 'Bool'),
     ('py:class', 'optional'),
+    ('py:class', 'Aggregation'),
     ('py:class', 'NullType'),
     ('py:class', 'Namespace'),
     ('py:class', 'SupportsDataFrameAPI'),