diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index a11bde7a..c9a10442 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -29,4 +29,4 @@ jobs: - name: install-reqs run: python -m pip install --upgrade mypy==1.4.0 - name: run mypy - run: cd spec/API_specification && mypy dataframe_api + run: cd spec/API_specification && mypy dataframe_api && mypy examples diff --git a/spec/API_specification/dataframe_api/_types.py b/spec/API_specification/dataframe_api/_types.py index 2b43e115..81a824af 100644 --- a/spec/API_specification/dataframe_api/_types.py +++ b/spec/API_specification/dataframe_api/_types.py @@ -3,18 +3,22 @@ """ from __future__ import annotations -from dataclasses import dataclass from typing import ( + TYPE_CHECKING, Any, List, Literal, + Mapping, Optional, + Protocol, Sequence, Tuple, Union, - TYPE_CHECKING, ) -from enum import Enum + +if TYPE_CHECKING: + from .dataframe_object import DataFrame as DataFrameType + from .column_object import Column as ColumnType if TYPE_CHECKING: from .dtypes import ( @@ -41,6 +45,117 @@ NullType = Any +class Namespace(Protocol): + __dataframe_api_version__: str + + @staticmethod + def DataFrame() -> DataFrameType: + ... + + @staticmethod + def Column() -> ColumnType: + ... + + @staticmethod + def Int64() -> Int64:... + @staticmethod + def Int16() -> Int16:... + + @staticmethod + def Int32() -> Int32: + ... + + + @staticmethod + def Int8() -> Int8: + ... + + @staticmethod + def UInt64() -> UInt64: + ... + + @staticmethod + def UInt32() -> UInt32: + ... + + @staticmethod + def UInt16() -> UInt16: + ... + + @staticmethod + def UInt8() -> UInt8: + ... + + @staticmethod + def Float64() -> Float64: + ... + + @staticmethod + def Float32() -> Float32: + ... + + @staticmethod + def Bool() -> Bool: + ... + + @staticmethod + def concat(dataframes: Sequence[DataFrameType]) -> DataFrameType: + ... 
+ + @staticmethod + def column_from_sequence( + sequence: Sequence[Any], + *, + dtype: Any, + name: str = "", + api_version: str | None = None, + ) -> ColumnType: + ... + + @staticmethod + def dataframe_from_dict( + data: Mapping[str, ColumnType], *, api_version: str | None = None + ) -> DataFrameType: + ... + + @staticmethod + def column_from_1d_array( + array: Any, *, dtype: Any, name: str = "", api_version: str | None = None + ) -> ColumnType: + ... + + @staticmethod + def dataframe_from_2d_array( + array: Any, + *, + names: Sequence[str], + dtypes: Mapping[str, Any], + api_version: str | None = None, + ) -> DataFrameType: + ... + + @staticmethod + def is_null(value: object, /) -> bool: + ... + + @staticmethod + def is_dtype(dtype: Any, kind: str | tuple[str, ...]) -> bool: + ... + + +class SupportsDataFrameAPI(Protocol): + def __dataframe_consortium_standard__( + self, *, api_version: str | None = None + ) -> DataFrameType: + ... + +class SupportsColumnAPI(Protocol): + def __column_consortium_standard__( + self, *, api_version: str | None = None + ) -> ColumnType: + ... + + __all__ = [ "Any", "DataFrame", @@ -58,5 +173,4 @@ "device", "DType", "ellipsis", - "Enum", ] diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 7b0f552a..ff4d7ba3 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -3,7 +3,7 @@ from typing import Any,NoReturn, TYPE_CHECKING, Literal, Generic if TYPE_CHECKING: - from ._types import NullType, Scalar, DType + from ._types import NullType, Scalar, DType, Namespace __all__ = ['Column'] @@ -19,7 +19,7 @@ class Column: """ - def __column_namespace__(self) -> Any: + def __column_namespace__(self) -> Namespace: """ Returns an object that has all the Dataframe Standard API functions on it. 
diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 54e380d8..e8a9a21e 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: from .column_object import Column from .groupby_object import GroupBy - from ._types import NullType, Scalar, DType + from ._types import NullType, Scalar, Namespace, DType __all__ = ["DataFrame"] @@ -36,7 +36,7 @@ class DataFrame: **Methods and Attributes** """ - def __dataframe_namespace__(self) -> Any: + def __dataframe_namespace__(self) -> Namespace: """ Returns an object that has all the top-level dataframe API functions on it. diff --git a/spec/API_specification/examples/01_standardise_columns.py b/spec/API_specification/examples/01_standardise_columns.py new file mode 100644 index 00000000..cb6b49b1 --- /dev/null +++ b/spec/API_specification/examples/01_standardise_columns.py @@ -0,0 +1,15 @@ +from typing import Any + +from dataframe_api._types import SupportsDataFrameAPI + +def my_dataframe_agnostic_function(df_non_standard: SupportsDataFrameAPI) -> Any: + df = df_non_standard.__dataframe_consortium_standard__(api_version='2023.09-beta') + + for column_name in df.column_names: + if column_name == 'species': + continue + new_column = df.get_column_by_name(column_name) + new_column = (new_column - new_column.mean()) / new_column.std() + df = df.assign(new_column.rename(f'{column_name}_scaled')) + + return df.dataframe diff --git a/spec/API_specification/examples/02_plotting.py b/spec/API_specification/examples/02_plotting.py new file mode 100644 index 00000000..82068835 --- /dev/null +++ b/spec/API_specification/examples/02_plotting.py @@ -0,0 +1,24 @@ +from typing import Callable, Any + +my_plotting_function: Callable[[Any, Any], Any] + +from dataframe_api._types import SupportsColumnAPI + +def group_by_and_plot( + x_any: SupportsColumnAPI, + 
y_any: SupportsColumnAPI, + color_any: SupportsColumnAPI, +) -> None: + x = x_any.__column_consortium_standard__() + y = y_any.__column_consortium_standard__() + color = color_any.__column_consortium_standard__() + + namespace = x.__column_namespace__() + + df = namespace.dataframe_from_dict({"x": x, "y": y, "color": color}) + + agg = df.group_by("color").mean() + x = agg.get_column_by_name("x").to_array_object(namespace.Float64()) + y = agg.get_column_by_name("y").to_array_object(namespace.Float64()) + + my_plotting_function(x, y) diff --git a/spec/API_specification/examples/README.md b/spec/API_specification/examples/README.md new file mode 100644 index 00000000..f429f6f2 --- /dev/null +++ b/spec/API_specification/examples/README.md @@ -0,0 +1,5 @@ +# Examples + +Here are some examples of how to use the DataFrame API. + +These should work for any library which has an implementation of the Standard. diff --git a/spec/API_specification/examples/__init__.py b/spec/API_specification/examples/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/spec/conf.py b/spec/conf.py index c3aabb4d..2af862cc 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -85,6 +85,7 @@ ('py:class', 'Bool'), ('py:class', 'optional'), ('py:class', 'NullType'), + ('py:class', 'Namespace'), ] # NOTE: this alias handling isn't used yet - added in anticipation of future # need based on dataframe API aliases. @@ -112,7 +113,12 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = [ + '_build', + 'Thumbs.db', + '.DS_Store', + 'API_specification/examples/*', +] # MyST options myst_heading_anchors = 3 diff --git a/spec/purpose_and_scope.md b/spec/purpose_and_scope.md index 7e04bce8..96a0d5c9 100644 --- a/spec/purpose_and_scope.md +++ b/spec/purpose_and_scope.md @@ -275,36 +275,7 @@ latest version of the dataframe API specification. If the given version is invalid or not implemented for the given module, an error should be raised. Default: ``None``. -Example: - -```python -import pandas as pd -import polars as pl - - -df_pandas = pd.read_parquet('iris.parquet') -df_polars = pl.scan_parquet('iris.parquet') - -def my_dataframe_agnostic_function(df): - df = df.__dataframe_consortium_standard__(api_version='2023.09-beta') - - mask = df.get_column_by_name('species') != 'setosa' - df = df.filter(mask) - - for column_name in df.column_names: - if column_name == 'species': - continue - new_column = df.get_column_by_name(column_name) - new_column = (new_column - new_column.mean()) / new_column.std() - df = df.assign(new_column.rename(f'{column_name}_scaled')) - - return df.dataframe - -# Then, either of the following will work as expected: -my_dataframe_agnostic_function(df_pandas) -my_dataframe_agnostic_function(df_polars) -my_dataframe_agnostic_function(df_any_other_library_with_a_standard_compliant_namespace) -``` +For some examples, please check https://github.com/data-apis/dataframe-api/tree/main/spec/API_specification/examples. ### Checking a dataframe object for Compliance