PR: Add allow_copy flag to interchange protocol #44

Closed · wants to merge 3 commits
8 changes: 7 additions & 1 deletion protocol/dataframe_protocol.py
@@ -335,16 +335,22 @@ class DataFrame:
    ``__dataframe__`` method of a public data frame class in a library adhering
    to the dataframe interchange protocol specification.
    """
    def __dataframe__(self, nan_as_null : bool = False) -> dict:
    def __dataframe__(self, nan_as_null : bool = False,
                      allow_copy : bool = True) -> dict:
        """
        Produces a dictionary object following the dataframe protocol spec

        ``nan_as_null`` is a keyword intended for the consumer to tell the
        producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
        It is intended for cases where the consumer does not support the bit
        mask or byte mask that is the producer's native representation.

        ``allow_copy`` is a keyword that defines if the given implementation
        is going to support striding buffers. It is optional, and the libraries
        do not need to implement it.
Review comment (Member):
Rephrasing this a bit:

        ``allow_copy`` is a keyword that defines whether or not the library is
        allowed to make a copy of the data. This can for example happen if a
        library supports strided buffers (those would need a copy, because this
        protocol specifies contiguous buffers).

"""
self._nan_as_null = nan_as_null
self._allow_zero_zopy = allow_copy
return {
"dataframe": self, # DataFrame object adhering to the protocol
"version": 0 # Version number of the protocol
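As an illustration of the contract this docstring describes, here is a toy producer (not part of this diff; the class name and body are hypothetical):

    class ToyDataFrame:
        """Toy producer: ``__dataframe__`` returns a dict holding the
        protocol object and the protocol version number."""
        def __dataframe__(self, nan_as_null: bool = False,
                          allow_copy: bool = True) -> dict:
            # A real producer would raise a RuntimeError here if
            # allow_copy=False and its buffers are not contiguous.
            return {"dataframe": self, "version": 0}

    d = ToyDataFrame().__dataframe__(allow_copy=False)
    assert d["version"] == 0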
19 changes: 9 additions & 10 deletions protocol/dataframe_protocol_summary.md
@@ -40,8 +40,8 @@ libraries, the example above can change to:
def get_df_module(df):
    """Utility function to support programming against a dataframe API"""
    if hasattr(df, '__dataframe_namespace__'):
       # Retrieve the namespace
       pdx = df.__dataframe_namespace__()
        # Retrieve the namespace
        pdx = df.__dataframe_namespace__()
    else:
        # Here we can raise an exception if we only want to support compliant dataframes,
        # or convert to our default choice of dataframe if we want to accept (e.g.) dicts
@@ -168,13 +168,12 @@ We'll also list some things that were discussed but are not requirements:
3. Extension dtypes, i.e. a way to extend the set of dtypes that is
   explicitly supported, are out of scope.
   _Rationale: complex to support, not used enough to justify that complexity._
4. "virtual columns", i.e. columns for which the data is not yet in memory
because it uses lazy evaluation, are not supported other than through
letting the producer materialize the data in memory when the consumer
calls `__dataframe__`.
_Rationale: the full dataframe API will support this use case by
"programming to an interface"; this data interchange protocol is
fundamentally built around describing data in memory_.
4. Support for strided storage in buffers.
_Rationale: this is supported by a subset of dataframes only, mainly those
that use NumPy arrays. In many real-world use cases, strided arrays will
force a copy at some point, so requiring contiguous memory layout (and hence
an extra copy at the moment `__dataframe__` is used) is considered a good
trade-off for reduced implementation complexity._
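To make the strided-storage rationale concrete, a small NumPy sketch (not part of this diff):

    import numpy as np

    # A column of this 2-D array is strided: consecutive elements are
    # 3 * 8 = 24 bytes apart instead of itemsize (8) bytes apart.
    arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    col = arr[:, 0]
    assert col.strides == (24,)

    # Making the column contiguous forces a copy; this is exactly the
    # copy that ``allow_copy`` permits (True) or forbids (False).
    contig = np.ascontiguousarray(col)
    assert contig.strides == (8,)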

### To be decided

@@ -245,7 +244,7 @@ library that implements `__array__` must depend (optionally at least) on
NumPy, and call a NumPy `ndarray` constructor itself from within `__array__`.


### What is wrong with `.to_numpy?` and `.to_arrow()`?
### What is wrong with `.to_numpy()` and `.to_arrow()`?

Such methods ask the object they are attached to to turn itself into a NumPy or
Arrow array, which means each library must have at least an optional
60 changes: 38 additions & 22 deletions protocol/pandas_implementation.py
@@ -35,7 +35,8 @@
ColumnObject = Any


def from_dataframe(df : DataFrameObject) -> pd.DataFrame:
def from_dataframe(df : DataFrameObject,
                   allow_copy : bool = True) -> pd.DataFrame:
    """
    Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__``
    """
@@ -46,7 +47,7 @@ def from_dataframe(df : DataFrameObject) -> pd.DataFrame:
    if not hasattr(df, '__dataframe__'):
        raise ValueError("`df` does not support __dataframe__")

    return _from_dataframe(df.__dataframe__())
    return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))


def _from_dataframe(df : DataFrameObject) -> pd.DataFrame:
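A small usage sketch for the consumer-facing entry point above (`some_df` is a hypothetical object implementing `__dataframe__`):

    # Prefer a zero-copy import; fall back to a copying import if the
    # producer raises because its buffers are not contiguous.
    try:
        pandas_df = from_dataframe(some_df, allow_copy=False)
    except RuntimeError:
        pandas_df = from_dataframe(some_df, allow_copy=True)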
@@ -160,7 +161,8 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
    return series


def __dataframe__(cls, nan_as_null : bool = False) -> dict:
def __dataframe__(cls, nan_as_null : bool = False,
                  allow_copy : bool = True) -> dict:
    """
    The public method to attach to pd.DataFrame

@@ -171,8 +173,14 @@ def __dataframe__(cls, nan_as_null : bool = False) -> dict:
    producer to overwrite null values in the data with ``NaN`` (or ``NaT``).
    This currently has no effect; once support for nullable extension
    dtypes is added, this value should be propagated to columns.

    ``allow_copy`` is a keyword that defines if the given implementation
    is going to support striding buffers. It is optional, and the libraries
    do not need to implement it. Currently, if the flag is set to ``True`` it
Review comment (Member):
"False and a copy is needed"

    will raise a ``RuntimeError``.
    """
    return _PandasDataFrame(cls, nan_as_null=nan_as_null)
    return _PandasDataFrame(
        cls, nan_as_null=nan_as_null, allow_copy=allow_copy)


# Monkeypatch the Pandas DataFrame class to support the interchange protocol
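The line performing the monkeypatch is collapsed out of this diff; a minimal sketch of what it would look like:

    # Attach the protocol entry point to pandas DataFrames at import time.
    pd.DataFrame.__dataframe__ = __dataframe__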
@@ -187,16 +195,16 @@ class _PandasBuffer:
    Data in the buffer is guaranteed to be contiguous in memory.
    """

    def __init__(self, x : np.ndarray) -> None:
    def __init__(self, x : np.ndarray, allow_copy : bool = True) -> None:
        """
        Handle only regular columns (= numpy arrays) for now.
        """
        if not x.strides == (x.dtype.itemsize,):
            # Array is not contiguous - this is possible to get in Pandas,
            # there was some discussion on whether to support it. Some extra
            # complexity for libraries that don't support it (e.g. Arrow),
            # but would help with numpy-based libraries like Pandas.
            raise RuntimeError("Design needs fixing - non-contiguous buffer")
        if not allow_copy:
Review comment (Member):
This removes the actual check for striding (`not x.strides == (x.dtype.itemsize,):`); that needs to be put back.

Review comment (Member):
Logic:

        if not x.strides == (x.dtype.itemsize,):
            # The protocol does not support strided buffers, so a copy is
            # necessary. If that's not allowed, we need to raise an exception.
            if allow_copy:
                x = x.copy()
            else:
                raise RuntimeError("Exports cannot be zero-copy in the case "
                                   "of a non-contiguous buffer")

Review comment (Member):
And then adding a test case shows there's a problem - we don't hold on to the memory.
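A sketch of how the suggested logic could also hold on to the memory, assuming the copy is kept in the buffer's existing private attribute (here called `self._x`, matching the code below):

    def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None:
        if not x.strides == (x.dtype.itemsize,):
            # The protocol requires contiguous buffers, so a strided
            # array can only be exported by copying it first.
            if not allow_copy:
                raise RuntimeError("Exports cannot be zero-copy in the case "
                                   "of a non-contiguous buffer")
            x = x.copy()
        # Store the (possibly copied) array so the memory backing the
        # exported pointer stays alive as long as this buffer does.
        self._x = x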

            # Array is not contiguous and strided buffers do not need to be
            # supported. It brings some extra complexity for libraries that
            # don't support it (e.g. Arrow).
            raise RuntimeError(
                "Exports cannot be zero-copy in the case of a non-contiguous buffer")

        # Store the numpy array in which the data resides as a private
        # attribute, so we can use it to retrieve the public attributes
@@ -251,7 +259,8 @@ class _PandasColumn:

"""

def __init__(self, column : pd.Series) -> None:
def __init__(self, column : pd.Series,
allow_copy : bool = True) -> None:
"""
Note: doesn't deal with extension arrays yet, just assume a regular
Series/ndarray for now.
@@ -262,6 +271,7 @@ def __init__(self, column : pd.Series) -> None:

        # Store the column as a private attribute
        self._col = column
        self._allow_copy = allow_copy

    @property
    def size(self) -> int:
@@ -446,11 +456,13 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]:  # Any is for self.dtype
"""
_k = _DtypeKind
if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
buffer = _PandasBuffer(self._col.to_numpy())
buffer = _PandasBuffer(
self._col.to_numpy(), allow_copy=self._allow_copy)
dtype = self.dtype
elif self.dtype[0] == _k.CATEGORICAL:
codes = self._col.values.codes
buffer = _PandasBuffer(codes)
buffer = _PandasBuffer(
codes, allow_copy=self._allow_copy)
dtype = self._dtype_from_pandasdtype(codes.dtype)
else:
raise NotImplementedError(f"Data type {self._col.dtype} not handled yet")
@@ -483,7 +495,8 @@ class _PandasDataFrame:
    ``pd.DataFrame.__dataframe__`` as objects with the methods and
    attributes defined on this class.
    """
    def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None:
    def __init__(self, df : pd.DataFrame, nan_as_null : bool = False,
                 allow_copy : bool = True) -> None:
        """
        Constructor - an instance of this (private) class is returned from
        `pd.DataFrame.__dataframe__`.
@@ -494,6 +507,7 @@ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None:
        # This currently has no effect; once support for nullable extension
        # dtypes is added, this value should be propagated to columns.
        self._nan_as_null = nan_as_null
        self._allow_copy = allow_copy

    def num_columns(self) -> int:
        return len(self._df.columns)
@@ -508,13 +522,16 @@ def column_names(self) -> Iterable[str]:
        return self._df.columns.tolist()

    def get_column(self, i: int) -> _PandasColumn:
        return _PandasColumn(self._df.iloc[:, i])
        return _PandasColumn(
            self._df.iloc[:, i], allow_copy=self._allow_copy)

    def get_column_by_name(self, name: str) -> _PandasColumn:
        return _PandasColumn(self._df[name])
        return _PandasColumn(
            self._df[name], allow_copy=self._allow_copy)

    def get_columns(self) -> Iterable[_PandasColumn]:
        return [_PandasColumn(self._df[name]) for name in self._df.columns]
        return [_PandasColumn(self._df[name], allow_copy=self._allow_copy)
                for name in self._df.columns]

    def select_columns(self, indices: Sequence[int]) -> '_PandasDataFrame':
        if not isinstance(indices, collections.Sequence):
@@ -552,13 +569,12 @@ def test_mixed_intfloat():


def test_noncontiguous_columns():
    # Currently raises: TBD whether it should work or not, see code comment
    # where the RuntimeError is raised.
    # Currently raises when ``allow_copy`` is set to ``False``.
    arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    df = pd.DataFrame(arr)
    assert df[0].to_numpy().strides == (24,)
    pytest.raises(RuntimeError, from_dataframe, df)
    #df2 = from_dataframe(df)
    with pytest.raises(RuntimeError):
        df2 = from_dataframe(df, allow_copy=False)

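A hypothetical companion test, assuming the reviewer's suggested copy-on-export logic is adopted (the current diff does not implement the copying branch):

    def test_noncontiguous_columns_with_copy():
        # With allow_copy=True (the default), the producer may copy the
        # strided column into a contiguous buffer, so the round-trip
        # should succeed instead of raising.
        arr = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        df = pd.DataFrame(arr)
        df2 = from_dataframe(df, allow_copy=True)
        assert df2[0].to_numpy().strides == (8,)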

def test_categorical_dtype():