From dd919a1cf5d24c1d76340d252841d968ad63edad Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 13 Mar 2020 19:05:58 -0500 Subject: [PATCH 1/4] Draft strawman data frame interchange protocol for discussion --- dataframe.py | 270 +++++++++++++++++++++++++++++++++++++ example_dict_of_ndarray.py | 148 ++++++++++++++++++++ 2 files changed, 418 insertions(+) create mode 100644 dataframe.py create mode 100644 example_dict_of_ndarray.py diff --git a/dataframe.py b/dataframe.py new file mode 100644 index 0000000..15a3bbe --- /dev/null +++ b/dataframe.py @@ -0,0 +1,270 @@ +# MIT License +# +# Copyright (c) 2020 Wes McKinney + +from abc import ABC, abstractmethod +from collections.abc import Mapping, MutableMapping +from typing import Any, Hashable, Iterable, Sequence + +# ---------------------------------------------------------------------- +# A simple data type class hierarchy for illustration + + +class DataType(ABC): + """ + A metadata object representing the logical value type of a cell in a data + frame column. 
This metadata does not guarantee a specific underlying data + representation + """ + def __eq__(self, other: 'DataType'): + return self.equals(other) + + def __str__(self): + return self.to_string() + + def __repr__(self): + return str(self) + + @abstractmethod + def to_string(self) -> str: + """ + Return human-readable representation of the data type + """ + + @abstractmethod + def equals(self, other: 'DataType') -> bool: + """ + Return true if other DataType contains the same metadata as this + DataType + """ + pass + + +class PrimitiveType(DataType): + + def equals(self, other: DataType) -> bool: + return type(self) == type(other) + + +class NullType(PrimitiveType): + """ + A data type whose values are always null + """ + def to_string(self): + return "null" + + +class Boolean(PrimitiveType): + + def to_string(self): + return "bool" + + +class NumberType(PrimitiveType): + pass + + +class IntegerType(NumberType): + pass + + +class SignedIntegerType(IntegerType): + pass + + +class Int8(SignedIntegerType): + + def to_string(self): + return "int8" + + +class Int16(SignedIntegerType): + + def to_string(self): + return "int16" + + +class Int32(SignedIntegerType): + + def to_string(self): + return "int32" + + +class Int64(SignedIntegerType): + + def to_string(self): + return "int64" + + +class Binary(PrimitiveType): + """ + A variable-size binary (bytes) value + """ + def to_string(self): + return "binary" + + +class String(PrimitiveType): + """ + A UTF8-encoded string value + """ + def to_string(self): + return "string" + + +class Object(PrimitiveType): + """ + Any PyObject value + """ + def to_string(self): + return "object" + + +class Categorical(DataType): + """ + A categorical value is an ordinal (integer) value that references a + sequence of category values of an arbitrary data type + """ + + def __init__(self, index_type: IntegerType, category_type: DataType, + ordered: bool = False): + self.index_type = index_type + self.category_type = category_type + 
self.ordered = ordered + + def equals(self, other: DataType) -> bool: + return (isinstance(other, Categorical) and + self.index_type == other.index_type and + self.category_type == other.category_type and + self.ordered == other.ordered) + + def to_string(self): + return ("categorical(indices={}, categories={}, ordered=)" + .format(str(self.index_type), str(self.category_type), + self.ordered)) + + +# ---------------------------------------------------------------------- +# Classes representing a column in a DataFrame + + +class Column(ABC): + + @property + @abstractmethod + def name(self) -> Hashable: + pass + + @property + @abstractmethod + def type(self) -> DataType: + """ + Return the logical type of each column cell value + """ + pass + + def to_numpy(self): + """ + Access column's data as a NumPy array. Recommended to return a view if + able but not required + """ + raise NotImplementedError("Conversion to NumPy not available") + + def to_arrow(self, **kwargs): + """ + Access column's data in the Apache Arrow format as pyarrow.Array or + ChunkedArray. Recommended to return a view if able but not required + """ + raise NotImplementedError("Conversion to Arrow not available") + + +# ---------------------------------------------------------------------- +# DataFrame: the main public API + + +class DataFrame(ABC, Mapping): + """ + An abstract data frame base class. + + A "data frame" represents an ordered collection of named columns. A + column's "name" is permitted to be any hashable Python value, but strings + are common. Names are not required to be unique. Columns may be accessed by + name (when the name is unique) or by position. 
+ """ + + def __dataframe__(self): + """ + Idempotence of data frame protocol + """ + return self + + def __iter__(self): + # TBD: Decide what iterating should return + return iter(self.column_names) + + def __len__(self): + return self.num_rows + + @property + @abstractmethod + def num_columns(self): + """ + Return the number of columns in the DataFrame + """ + pass + + @property + @abstractmethod + def num_rows(self): + """ + Return the number of rows in the DataFrame (if known) + """ + pass + + @abstractmethod + def iter_column_names(self) -> Iterable[Any]: + """ + Return the column names as an iterable + """ + pass + + # TODO: Should this be a method or property? + @property + @abstractmethod + def column_names(self) -> Sequence[Any]: + """ + Return the column names as a materialized sequence + """ + pass + + # TODO: Should this be a method or property? + @property + def row_names(self) -> Sequence[Any]: + """ + Return the row names (if any) as a materialized sequence. It is not + necessary to implement this method + """ + raise NotImplementedError("row_names") + + def __getitem__(self, key: Hashable) -> Column: + return self.column_by_name(key) + + @abstractmethod + def column_by_name(self, key: Hashable) -> Column: + """ + Return the column whose name is the indicated key + """ + pass + + @abstractmethod + def column_by_index(self, i: int) -> Column: + """ + Return the column at the indicated position + """ + pass + + +class MutableDataFrame(DataFrame, MutableMapping): + # TODO: Mutable data frames are fraught at this interface level and + # need more discussion + pass diff --git a/example_dict_of_ndarray.py b/example_dict_of_ndarray.py new file mode 100644 index 0000000..40bf757 --- /dev/null +++ b/example_dict_of_ndarray.py @@ -0,0 +1,148 @@ +# MIT License +# +# Copyright (c) 2020 Wes McKinney + +from typing import Dict, Hashable, Sequence +import dataframe + +import numpy as np + + +_numeric_types = { + 'int8': dataframe.Int8(), + 'int16': 
dataframe.Int16(), + 'int32': dataframe.Int32(), + 'int64': dataframe.Int64() +} + + +def _integer_factory(dtype): + return _numeric_types[dtype.name] + + +def _constant_factory(type_instance): + def factory(*unused): + return type_instance + return factory + + +_type_factories = { + 'b': _constant_factory(dataframe.Boolean()), + 'i': _integer_factory, + 'O': _constant_factory(dataframe.Object()), + 'S': _constant_factory(dataframe.Binary()), + 'U': _constant_factory(dataframe.String()) +} + + +class NumPyColumn(dataframe.Column): + + def __init__(self, name, data): + self._name = name + self._data = data + + @property + def name(self) -> Hashable: + return self._name + + @property + def type(self) -> dataframe.DataType: + factory = _type_factories.get(self._data.dtype.kind) + if factory is None: + raise NotImplementedError("Data frame type for NumPy Type {} " + "not known" + .format(str(self._data.dtype))) + return factory(self._data.dtype) + + def to_numpy(self): + return self._data + + +class DictDataFrame(dataframe.DataFrame): + """ + Construct data frame from dict of NumPy arrays + + Parameters + ---------- + columns : dict + names : sequence, default None + If not passed, the names will be determined by the data's keys + num_rows : int, default None + If not passed, determined from the data + """ + + def __init__(self, columns: Dict[Hashable, np.ndarray], + names: Sequence[Hashable] = None, + num_rows: int = None): + if names is None: + names = list(columns.keys()) + + assert len(columns) == len(names) + + self._columns = columns + self._names = list(names) + # self._name_to_index = {i: k for i, k in enumerate(self._names)} + + if len(columns) > 0: + assert num_rows is None + self._num_rows = len(next(iter(columns.values()))) + else: + self._num_rows = num_rows + + @property + def num_columns(self): + return len(self._columns) + + @property + def num_rows(self): + return self._num_rows + + def iter_column_names(self): + return iter(self._names) + + @property + 
def column_names(self): + return self._names + + def column_by_name(self, key: Hashable) -> NumPyColumn: + return NumPyColumn(key, self._columns[key]) + + def column_by_index(self, i: int) -> NumPyColumn: + return NumPyColumn(self._names[i], self._columns[self._names[i]]) + + +def get_example(): + data = { + 'a': np.array([1, 2, 3, 4, 5], dtype='int64'), + 'b': np.array(['a', 'b', 'c', 'd', 'e']), + 'c': np.array([True, False, True, False, True]) + } + names = ['a', 'b', 'c'] + return data, names, DictDataFrame(data, names=names) + + +def test_basic_behavior(): + raw_data, names, df = get_example() + + assert len(df) == 5 + assert df.num_columns == 3 + assert df.num_rows == 5 + + for i, name in enumerate(df.column_names): + assert name == names[i] + + for i, name in enumerate(df.iter_column_names()): + assert name == names[i] + + expected_types = { + 'a': dataframe.Int64(), + 'b': dataframe.String(), + 'c': dataframe.Boolean() + } + + for i, name in enumerate(names): + col = df[name] + assert col.name == name + assert col.type == expected_types[name] + assert col.to_numpy() is raw_data[name] + assert df.column_by_index(i).name == col.name From d6f5d9311728039eba538f5a18a7612c76a844c5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 13 Mar 2020 19:22:11 -0500 Subject: [PATCH 2/4] Fix a couple buglets --- dataframe.py | 2 +- example_dict_of_ndarray.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dataframe.py b/dataframe.py index 15a3bbe..58ecd81 100644 --- a/dataframe.py +++ b/dataframe.py @@ -139,7 +139,7 @@ def equals(self, other: DataType) -> bool: self.ordered == other.ordered) def to_string(self): - return ("categorical(indices={}, categories={}, ordered=)" + return ("categorical(indices={}, categories={}, ordered={})" .format(str(self.index_type), str(self.category_type), self.ordered)) diff --git a/example_dict_of_ndarray.py b/example_dict_of_ndarray.py index 40bf757..34f8a59 100644 --- a/example_dict_of_ndarray.py +++ 
b/example_dict_of_ndarray.py @@ -79,7 +79,7 @@ def __init__(self, columns: Dict[Hashable, np.ndarray], assert len(columns) == len(names) - self._columns = columns + self._columns = columns.copy() self._names = list(names) # self._name_to_index = {i: k for i, k in enumerate(self._names)} From aa71380c34d59fe0b74b574ff660939ecc2d3cdd Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 8 Apr 2020 17:17:11 -0500 Subject: [PATCH 3/4] Incorporate feedback from PR --- LICENSE | 2 +- dataframe.py | 69 +++++++++++++++++++------------------- example_dict_of_ndarray.py | 15 +++------ 3 files changed, 39 insertions(+), 47 deletions(-) diff --git a/LICENSE b/LICENSE index d2b0d6d..eef6a56 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2020 Wes McKinney +Copyright (c) 2020 DataFrame Protocol Contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/dataframe.py b/dataframe.py index 58ecd81..b989251 100644 --- a/dataframe.py +++ b/dataframe.py @@ -1,10 +1,6 @@ -# MIT License -# -# Copyright (c) 2020 Wes McKinney - from abc import ABC, abstractmethod -from collections.abc import Mapping, MutableMapping -from typing import Any, Hashable, Iterable, Sequence +from collections.abc import Mapping +from typing import Any, Hashable, Sequence # ---------------------------------------------------------------------- # A simple data type class hierarchy for illustration @@ -163,6 +159,13 @@ def type(self) -> DataType: """ pass + @property + def attrs(self) -> Mapping: + """ + Metadata for this column. Default implementation returns empty dict + """ + return {} + def to_numpy(self): """ Access column's data as a NumPy array. Recommended to return a view if @@ -182,7 +185,7 @@ def to_arrow(self, **kwargs): # DataFrame: the main public API -class DataFrame(ABC, Mapping): +class DataFrame(ABC): """ An abstract data frame base class. 
@@ -198,13 +201,6 @@ def __dataframe__(self): """ return self - def __iter__(self): - # TBD: Decide what iterating should return - return iter(self.column_names) - - def __len__(self): - return self.num_rows - @property @abstractmethod def num_columns(self): @@ -221,14 +217,6 @@ def num_rows(self): """ pass - @abstractmethod - def iter_column_names(self) -> Iterable[Any]: - """ - Return the column names as an iterable - """ - pass - - # TODO: Should this be a method or property? @property @abstractmethod def column_names(self) -> Sequence[Any]: @@ -237,34 +225,45 @@ def column_names(self) -> Sequence[Any]: """ pass - # TODO: Should this be a method or property? @property def row_names(self) -> Sequence[Any]: """ Return the row names (if any) as a materialized sequence. It is not necessary to implement this method """ - raise NotImplementedError("row_names") - - def __getitem__(self, key: Hashable) -> Column: - return self.column_by_name(key) + raise NotImplementedError("This DataFrame has no row names") @abstractmethod - def column_by_name(self, key: Hashable) -> Column: + def get_column(self, i: int) -> Column: """ - Return the column whose name is the indicated key + Return the column at the indicated position """ pass @abstractmethod - def column_by_index(self, i: int) -> Column: + def get_column_by_name(self, name: Hashable) -> Column: """ - Return the column at the indicated position + Return the column whose name is the indicated name. If the column names + are not unique, may raise an exception. 
""" pass + def select_columns(self, indices: Sequence[int]): + """ + Create a new DataFrame by selecting a subset of columns by index + """ + raise NotImplementedError("select_columns") -class MutableDataFrame(DataFrame, MutableMapping): - # TODO: Mutable data frames are fraught at this interface level and - # need more discussion - pass + def select_columns_by_name(self, names: Sequence[Hashable]): + """ + Create a new DataFrame by selecting a subset of columns by name. If the + column names are not unique, may raise an exception. + """ + raise NotImplementedError("select_columns_by_name") + + def to_dict_of_numpy(self): + """ + Convert DataFrame to a dict with column names as keys and values the + corresponding columns converted to NumPy arrays + """ + raise NotImplementedError("TODO") diff --git a/example_dict_of_ndarray.py b/example_dict_of_ndarray.py index 34f8a59..6ee3ff3 100644 --- a/example_dict_of_ndarray.py +++ b/example_dict_of_ndarray.py @@ -97,17 +97,14 @@ def num_columns(self): def num_rows(self): return self._num_rows - def iter_column_names(self): - return iter(self._names) - @property def column_names(self): return self._names - def column_by_name(self, key: Hashable) -> NumPyColumn: + def get_column_by_name(self, key: Hashable) -> NumPyColumn: return NumPyColumn(key, self._columns[key]) - def column_by_index(self, i: int) -> NumPyColumn: + def get_column(self, i: int) -> NumPyColumn: return NumPyColumn(self._names[i], self._columns[self._names[i]]) @@ -124,16 +121,12 @@ def get_example(): def test_basic_behavior(): raw_data, names, df = get_example() - assert len(df) == 5 assert df.num_columns == 3 assert df.num_rows == 5 for i, name in enumerate(df.column_names): assert name == names[i] - for i, name in enumerate(df.iter_column_names()): - assert name == names[i] - expected_types = { 'a': dataframe.Int64(), 'b': dataframe.String(), @@ -141,8 +134,8 @@ def test_basic_behavior(): } for i, name in enumerate(names): - col = df[name] + col = 
df.get_column(i) assert col.name == name assert col.type == expected_types[name] assert col.to_numpy() is raw_data[name] - assert df.column_by_index(i).name == col.name + assert df.get_column_by_name(name).name == col.name From ab385e92110f351f616e4df997b3f81e4e88c607 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 9 Apr 2020 10:17:37 -0500 Subject: [PATCH 4/4] Relax hashability requirements. Use Iterable for DataFrame.column_names --- dataframe.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dataframe.py b/dataframe.py index b989251..d618782 100644 --- a/dataframe.py +++ b/dataframe.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod from collections.abc import Mapping -from typing import Any, Hashable, Sequence +from typing import Any, Iterable, Sequence # ---------------------------------------------------------------------- # A simple data type class hierarchy for illustration @@ -148,7 +148,7 @@ class Column(ABC): @property @abstractmethod - def name(self) -> Hashable: + def name(self) -> Any: pass @property @@ -190,8 +190,8 @@ class DataFrame(ABC): An abstract data frame base class. A "data frame" represents an ordered collection of named columns. A - column's "name" is permitted to be any hashable Python value, but strings - are common. Names are not required to be unique. Columns may be accessed by + column's "name" is permitted to be any Python value, but strings are + common. Names are not required to be unique. Columns may be accessed by name (when the name is unique) or by position. 
""" @@ -219,7 +219,7 @@ def num_rows(self): @property @abstractmethod - def column_names(self) -> Sequence[Any]: + def column_names(self) -> Iterable[Any]: """ Return the column names as a materialized sequence """ @@ -241,7 +241,7 @@ def get_column(self, i: int) -> Column: pass @abstractmethod - def get_column_by_name(self, name: Hashable) -> Column: + def get_column_by_name(self, name: Any) -> Column: """ Return the column whose name is the indicated name. If the column names are not unique, may raise an exception. @@ -254,7 +254,7 @@ def select_columns(self, indices: Sequence[int]): """ raise NotImplementedError("select_columns") - def select_columns_by_name(self, names: Sequence[Hashable]): + def select_columns_by_name(self, names: Sequence[Any]): """ Create a new DataFrame by selecting a subset of columns by name. If the column names are not unique, may raise an exception.