From dd919a1cf5d24c1d76340d252841d968ad63edad Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm+git@apache.org>
Date: Fri, 13 Mar 2020 19:05:58 -0500
Subject: [PATCH 1/4] Draft strawman data frame interchange protocol for
 discussion

---
 dataframe.py               | 270 +++++++++++++++++++++++++++++++++++++
 example_dict_of_ndarray.py | 148 ++++++++++++++++++++
 2 files changed, 418 insertions(+)
 create mode 100644 dataframe.py
 create mode 100644 example_dict_of_ndarray.py

diff --git a/dataframe.py b/dataframe.py
new file mode 100644
index 0000000..15a3bbe
--- /dev/null
+++ b/dataframe.py
@@ -0,0 +1,270 @@
+# MIT License
+#
+# Copyright (c) 2020 Wes McKinney
+
+from abc import ABC, abstractmethod
+from collections.abc import Mapping, MutableMapping
+from typing import Any, Hashable, Iterable, Sequence
+
+# ----------------------------------------------------------------------
+# A simple data type class hierarchy for illustration
+
+
+class DataType(ABC):
+    """
+    A metadata object representing the logical value type of a cell in a data
+    frame column. This metadata does not guarantee a specific underlying data
+    representation
+    """
+    def __eq__(self, other: 'DataType'):
+        return self.equals(other)
+
+    def __str__(self):
+        return self.to_string()
+
+    def __repr__(self):
+        return str(self)
+
+    @abstractmethod
+    def to_string(self) -> str:
+        """
+        Return human-readable representation of the data type
+        """
+
+    @abstractmethod
+    def equals(self, other: 'DataType') -> bool:
+        """
+        Return true if other DataType contains the same metadata as this
+        DataType
+        """
+        pass
+
+
+class PrimitiveType(DataType):
+
+    def equals(self, other: DataType) -> bool:
+        return type(self) == type(other)
+
+
+class NullType(PrimitiveType):
+    """
+    A data type whose values are always null
+    """
+    def to_string(self):
+        return "null"
+
+
+class Boolean(PrimitiveType):
+
+    def to_string(self):
+        return "bool"
+
+
+class NumberType(PrimitiveType):
+    pass
+
+
+class IntegerType(NumberType):
+    pass
+
+
+class SignedIntegerType(IntegerType):
+    pass
+
+
+class Int8(SignedIntegerType):
+
+    def to_string(self):
+        return "int8"
+
+
+class Int16(SignedIntegerType):
+
+    def to_string(self):
+        return "int16"
+
+
+class Int32(SignedIntegerType):
+
+    def to_string(self):
+        return "int32"
+
+
+class Int64(SignedIntegerType):
+
+    def to_string(self):
+        return "int64"
+
+
+class Binary(PrimitiveType):
+    """
+    A variable-size binary (bytes) value
+    """
+    def to_string(self):
+        return "binary"
+
+
+class String(PrimitiveType):
+    """
+    A UTF8-encoded string value
+    """
+    def to_string(self):
+        return "string"
+
+
+class Object(PrimitiveType):
+    """
+    Any PyObject value
+    """
+    def to_string(self):
+        return "object"
+
+
+class Categorical(DataType):
+    """
+    A categorical value is an ordinal (integer) value that references a
+    sequence of category values of an arbitrary data type
+    """
+
+    def __init__(self, index_type: IntegerType, category_type: DataType,
+                 ordered: bool = False):
+        self.index_type = index_type
+        self.category_type = category_type
+        self.ordered = ordered
+
+    def equals(self, other: DataType) -> bool:
+        return (isinstance(other, Categorical) and
+                self.index_type == other.index_type and
+                self.category_type == other.category_type and
+                self.ordered == other.ordered)
+
+    def to_string(self):
+        return ("categorical(indices={}, categories={}, ordered=)"
+                .format(str(self.index_type), str(self.category_type),
+                        self.ordered))
+
+
+# ----------------------------------------------------------------------
+# Classes representing a column in a DataFrame
+
+
+class Column(ABC):
+
+    @property
+    @abstractmethod
+    def name(self) -> Hashable:
+        pass
+
+    @property
+    @abstractmethod
+    def type(self) -> DataType:
+        """
+        Return the logical type of each column cell value
+        """
+        pass
+
+    def to_numpy(self):
+        """
+        Access column's data as a NumPy array. Recommended to return a view if
+        able but not required
+        """
+        raise NotImplementedError("Conversion to NumPy not available")
+
+    def to_arrow(self, **kwargs):
+        """
+        Access column's data in the Apache Arrow format as pyarrow.Array or
+        ChunkedArray. Recommended to return a view if able but not required
+        """
+        raise NotImplementedError("Conversion to Arrow not available")
+
+
+# ----------------------------------------------------------------------
+# DataFrame: the main public API
+
+
+class DataFrame(ABC, Mapping):
+    """
+    An abstract data frame base class.
+
+    A "data frame" represents an ordered collection of named columns. A
+    column's "name" is permitted to be any hashable Python value, but strings
+    are common. Names are not required to be unique. Columns may be accessed by
+    name (when the name is unique) or by position.
+    """
+
+    def __dataframe__(self):
+        """
+        Idempotence of data frame protocol
+        """
+        return self
+
+    def __iter__(self):
+        # TBD: Decide what iterating should return
+        return iter(self.column_names)
+
+    def __len__(self):
+        return self.num_rows
+
+    @property
+    @abstractmethod
+    def num_columns(self):
+        """
+        Return the number of columns in the DataFrame
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def num_rows(self):
+        """
+        Return the number of rows in the DataFrame (if known)
+        """
+        pass
+
+    @abstractmethod
+    def iter_column_names(self) -> Iterable[Any]:
+        """
+        Return the column names as an iterable
+        """
+        pass
+
+    # TODO: Should this be a method or property?
+    @property
+    @abstractmethod
+    def column_names(self) -> Sequence[Any]:
+        """
+        Return the column names as a materialized sequence
+        """
+        pass
+
+    # TODO: Should this be a method or property?
+    @property
+    def row_names(self) -> Sequence[Any]:
+        """
+        Return the row names (if any) as a materialized sequence. It is not
+        necessary to implement this method
+        """
+        raise NotImplementedError("row_names")
+
+    def __getitem__(self, key: Hashable) -> Column:
+        return self.column_by_name(key)
+
+    @abstractmethod
+    def column_by_name(self, key: Hashable) -> Column:
+        """
+        Return the column whose name is the indicated key
+        """
+        pass
+
+    @abstractmethod
+    def column_by_index(self, i: int) -> Column:
+        """
+        Return the column at the indicated position
+        """
+        pass
+
+
+class MutableDataFrame(DataFrame, MutableMapping):
+    # TODO: Mutable data frames are fraught at this interface level and
+    # need more discussion
+    pass
diff --git a/example_dict_of_ndarray.py b/example_dict_of_ndarray.py
new file mode 100644
index 0000000..40bf757
--- /dev/null
+++ b/example_dict_of_ndarray.py
@@ -0,0 +1,148 @@
+# MIT License
+#
+# Copyright (c) 2020 Wes McKinney
+
+from typing import Dict, Hashable, Sequence
+import dataframe
+
+import numpy as np
+
+
+_numeric_types = {
+    'int8': dataframe.Int8(),
+    'int16': dataframe.Int16(),
+    'int32': dataframe.Int32(),
+    'int64': dataframe.Int64()
+}
+
+
+def _integer_factory(dtype):
+    return _numeric_types[dtype.name]
+
+
+def _constant_factory(type_instance):
+    def factory(*unused):
+        return type_instance
+    return factory
+
+
+_type_factories = {
+    'b': _constant_factory(dataframe.Boolean()),
+    'i': _integer_factory,
+    'O': _constant_factory(dataframe.Object()),
+    'S': _constant_factory(dataframe.Binary()),
+    'U': _constant_factory(dataframe.String())
+}
+
+
+class NumPyColumn(dataframe.Column):
+
+    def __init__(self, name, data):
+        self._name = name
+        self._data = data
+
+    @property
+    def name(self) -> Hashable:
+        return self._name
+
+    @property
+    def type(self) -> dataframe.DataType:
+        factory = _type_factories.get(self._data.dtype.kind)
+        if factory is None:
+            raise NotImplementedError("Data frame type for NumPy Type {} "
+                                      "not known"
+                                      .format(str(self._data.dtype)))
+        return factory(self._data.dtype)
+
+    def to_numpy(self):
+        return self._data
+
+
+class DictDataFrame(dataframe.DataFrame):
+    """
+    Construct data frame from dict of NumPy arrays
+
+    Parameters
+    ----------
+    columns : dict
+    names : sequence, default None
+        If not passed, the names will be determined by the data's keys
+    num_rows : int, default None
+        Row count to use when columns is empty; otherwise taken from the data
+    """
+
+    def __init__(self, columns: Dict[Hashable, np.ndarray],
+                 names: Sequence[Hashable] = None,
+                 num_rows: int = None):
+        if names is None:
+            names = list(columns.keys())
+
+        assert len(columns) == len(names)
+
+        self._columns = columns
+        self._names = list(names)
+        # self._name_to_index = {i: k for i, k in enumerate(self._names)}
+
+        if len(columns) > 0:
+            assert num_rows is None
+            self._num_rows = len(next(iter(columns.values())))
+        else:
+            self._num_rows = num_rows
+
+    @property
+    def num_columns(self):
+        return len(self._columns)
+
+    @property
+    def num_rows(self):
+        return self._num_rows
+
+    def iter_column_names(self):
+        return iter(self._names)
+
+    @property
+    def column_names(self):
+        return self._names
+
+    def column_by_name(self, key: Hashable) -> NumPyColumn:
+        return NumPyColumn(key, self._columns[key])
+
+    def column_by_index(self, i: int) -> NumPyColumn:
+        return NumPyColumn(self._names[i], self._columns[self._names[i]])
+
+
+def get_example():
+    data = {
+        'a': np.array([1, 2, 3, 4, 5], dtype='int64'),
+        'b': np.array(['a', 'b', 'c', 'd', 'e']),
+        'c': np.array([True, False, True, False, True])
+    }
+    names = ['a', 'b', 'c']
+    return data, names, DictDataFrame(data, names=names)
+
+
+def test_basic_behavior():
+    raw_data, names, df = get_example()
+
+    assert len(df) == 5
+    assert df.num_columns == 3
+    assert df.num_rows == 5
+
+    for i, name in enumerate(df.column_names):
+        assert name == names[i]
+
+    for i, name in enumerate(df.iter_column_names()):
+        assert name == names[i]
+
+    expected_types = {
+        'a': dataframe.Int64(),
+        'b': dataframe.String(),
+        'c': dataframe.Boolean()
+    }
+
+    for i, name in enumerate(names):
+        col = df[name]
+        assert col.name == name
+        assert col.type == expected_types[name]
+        assert col.to_numpy() is raw_data[name]
+        assert df.column_by_index(i).name == col.name

From d6f5d9311728039eba538f5a18a7612c76a844c5 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm+git@apache.org>
Date: Fri, 13 Mar 2020 19:22:11 -0500
Subject: [PATCH 2/4] Fix a couple buglets

---
 dataframe.py               | 2 +-
 example_dict_of_ndarray.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dataframe.py b/dataframe.py
index 15a3bbe..58ecd81 100644
--- a/dataframe.py
+++ b/dataframe.py
@@ -139,7 +139,7 @@ def equals(self, other: DataType) -> bool:
                 self.ordered == other.ordered)
 
     def to_string(self):
-        return ("categorical(indices={}, categories={}, ordered=)"
+        return ("categorical(indices={}, categories={}, ordered={})"
                 .format(str(self.index_type), str(self.category_type),
                         self.ordered))
 
diff --git a/example_dict_of_ndarray.py b/example_dict_of_ndarray.py
index 40bf757..34f8a59 100644
--- a/example_dict_of_ndarray.py
+++ b/example_dict_of_ndarray.py
@@ -79,7 +79,7 @@ def __init__(self, columns: Dict[Hashable, np.ndarray],
 
         assert len(columns) == len(names)
 
-        self._columns = columns
+        self._columns = columns.copy()
         self._names = list(names)
         # self._name_to_index = {i: k for i, k in enumerate(self._names)}
 

From aa71380c34d59fe0b74b574ff660939ecc2d3cdd Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm+git@apache.org>
Date: Wed, 8 Apr 2020 17:17:11 -0500
Subject: [PATCH 3/4] Incorporate feedback from PR

---
 LICENSE                    |  2 +-
 dataframe.py               | 69 +++++++++++++++++++-------------------
 example_dict_of_ndarray.py | 15 +++------
 3 files changed, 39 insertions(+), 47 deletions(-)

diff --git a/LICENSE b/LICENSE
index d2b0d6d..eef6a56 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2020 Wes McKinney
+Copyright (c) 2020 DataFrame Protocol Contributors
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/dataframe.py b/dataframe.py
index 58ecd81..b989251 100644
--- a/dataframe.py
+++ b/dataframe.py
@@ -1,10 +1,6 @@
-# MIT License
-#
-# Copyright (c) 2020 Wes McKinney
-
 from abc import ABC, abstractmethod
-from collections.abc import Mapping, MutableMapping
-from typing import Any, Hashable, Iterable, Sequence
+from collections.abc import Mapping
+from typing import Any, Hashable, Sequence
 
 # ----------------------------------------------------------------------
 # A simple data type class hierarchy for illustration
@@ -163,6 +159,13 @@ def type(self) -> DataType:
         """
         pass
 
+    @property
+    def attrs(self) -> Mapping:
+        """
+        Metadata for this column. Default implementation returns empty dict
+        """
+        return {}
+
     def to_numpy(self):
         """
         Access column's data as a NumPy array. Recommended to return a view if
@@ -182,7 +185,7 @@ def to_arrow(self, **kwargs):
 # DataFrame: the main public API
 
 
-class DataFrame(ABC, Mapping):
+class DataFrame(ABC):
     """
     An abstract data frame base class.
 
@@ -198,13 +201,6 @@ def __dataframe__(self):
         """
         return self
 
-    def __iter__(self):
-        # TBD: Decide what iterating should return
-        return iter(self.column_names)
-
-    def __len__(self):
-        return self.num_rows
-
     @property
     @abstractmethod
     def num_columns(self):
@@ -221,14 +217,6 @@ def num_rows(self):
         """
         pass
 
-    @abstractmethod
-    def iter_column_names(self) -> Iterable[Any]:
-        """
-        Return the column names as an iterable
-        """
-        pass
-
-    # TODO: Should this be a method or property?
     @property
     @abstractmethod
     def column_names(self) -> Sequence[Any]:
@@ -237,34 +225,45 @@ def column_names(self) -> Sequence[Any]:
         """
         pass
 
-    # TODO: Should this be a method or property?
     @property
     def row_names(self) -> Sequence[Any]:
         """
         Return the row names (if any) as a materialized sequence. It is not
         necessary to implement this method
         """
-        raise NotImplementedError("row_names")
-
-    def __getitem__(self, key: Hashable) -> Column:
-        return self.column_by_name(key)
+        raise NotImplementedError("This DataFrame has no row names")
 
     @abstractmethod
-    def column_by_name(self, key: Hashable) -> Column:
+    def get_column(self, i: int) -> Column:
         """
-        Return the column whose name is the indicated key
+        Return the column at the indicated position
         """
         pass
 
     @abstractmethod
-    def column_by_index(self, i: int) -> Column:
+    def get_column_by_name(self, name: Hashable) -> Column:
         """
-        Return the column at the indicated position
+        Return the column whose name is the indicated name. If the column names
+        are not unique, may raise an exception.
         """
         pass
 
+    def select_columns(self, indices: Sequence[int]):
+        """
+        Create a new DataFrame by selecting a subset of columns by index
+        """
+        raise NotImplementedError("select_columns")
 
-class MutableDataFrame(DataFrame, MutableMapping):
-    # TODO: Mutable data frames are fraught at this interface level and
-    # need more discussion
-    pass
+    def select_columns_by_name(self, names: Sequence[Hashable]):
+        """
+        Create a new DataFrame by selecting a subset of columns by name. If the
+        column names are not unique, may raise an exception.
+        """
+        raise NotImplementedError("select_columns_by_name")
+
+    def to_dict_of_numpy(self):
+        """
+        Convert DataFrame to a dict with column names as keys and values the
+        corresponding columns converted to NumPy arrays
+        """
+        raise NotImplementedError("TODO")
diff --git a/example_dict_of_ndarray.py b/example_dict_of_ndarray.py
index 34f8a59..6ee3ff3 100644
--- a/example_dict_of_ndarray.py
+++ b/example_dict_of_ndarray.py
@@ -97,17 +97,14 @@ def num_columns(self):
     def num_rows(self):
         return self._num_rows
 
-    def iter_column_names(self):
-        return iter(self._names)
-
     @property
     def column_names(self):
         return self._names
 
-    def column_by_name(self, key: Hashable) -> NumPyColumn:
+    def get_column_by_name(self, key: Hashable) -> NumPyColumn:
         return NumPyColumn(key, self._columns[key])
 
-    def column_by_index(self, i: int) -> NumPyColumn:
+    def get_column(self, i: int) -> NumPyColumn:
         return NumPyColumn(self._names[i], self._columns[self._names[i]])
 
 
@@ -124,16 +121,12 @@ def get_example():
 def test_basic_behavior():
     raw_data, names, df = get_example()
 
-    assert len(df) == 5
     assert df.num_columns == 3
     assert df.num_rows == 5
 
     for i, name in enumerate(df.column_names):
         assert name == names[i]
 
-    for i, name in enumerate(df.iter_column_names()):
-        assert name == names[i]
-
     expected_types = {
         'a': dataframe.Int64(),
         'b': dataframe.String(),
@@ -141,8 +134,8 @@ def test_basic_behavior():
     }
 
     for i, name in enumerate(names):
-        col = df[name]
+        col = df.get_column(i)
         assert col.name == name
         assert col.type == expected_types[name]
         assert col.to_numpy() is raw_data[name]
-        assert df.column_by_index(i).name == col.name
+        assert df.get_column_by_name(name).name == col.name

From ab385e92110f351f616e4df997b3f81e4e88c607 Mon Sep 17 00:00:00 2001
From: Wes McKinney <wesm+git@apache.org>
Date: Thu, 9 Apr 2020 10:17:37 -0500
Subject: [PATCH 4/4] Relax hashability requirements. Use Iterable for
 DataFrame.column_names

---
 dataframe.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/dataframe.py b/dataframe.py
index b989251..d618782 100644
--- a/dataframe.py
+++ b/dataframe.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
 from collections.abc import Mapping
-from typing import Any, Hashable, Sequence
+from typing import Any, Iterable, Sequence
 
 # ----------------------------------------------------------------------
 # A simple data type class hierarchy for illustration
@@ -148,7 +148,7 @@ class Column(ABC):
 
     @property
     @abstractmethod
-    def name(self) -> Hashable:
+    def name(self) -> Any:
         pass
 
     @property
@@ -190,8 +190,8 @@ class DataFrame(ABC):
     An abstract data frame base class.
 
     A "data frame" represents an ordered collection of named columns. A
-    column's "name" is permitted to be any hashable Python value, but strings
-    are common. Names are not required to be unique. Columns may be accessed by
+    column's "name" is permitted to be any Python value, but strings are
+    common. Names are not required to be unique. Columns may be accessed by
     name (when the name is unique) or by position.
     """
 
@@ -219,7 +219,7 @@ def num_rows(self):
 
     @property
     @abstractmethod
-    def column_names(self) -> Sequence[Any]:
+    def column_names(self) -> Iterable[Any]:
         """
         Return the column names as a materialized sequence
         """
@@ -241,7 +241,7 @@ def get_column(self, i: int) -> Column:
         pass
 
     @abstractmethod
-    def get_column_by_name(self, name: Hashable) -> Column:
+    def get_column_by_name(self, name: Any) -> Column:
         """
         Return the column whose name is the indicated name. If the column names
         are not unique, may raise an exception.
@@ -254,7 +254,7 @@ def select_columns(self, indices: Sequence[int]):
         """
         raise NotImplementedError("select_columns")
 
-    def select_columns_by_name(self, names: Sequence[Hashable]):
+    def select_columns_by_name(self, names: Sequence[Any]):
         """
         Create a new DataFrame by selecting a subset of columns by name. If the
         column names are not unique, may raise an exception.