Change default value and update the requirements doc

steff456 · steff456 · commit c82e90ba14f6 · 2021-06-25T17:17:50.000-05:00
diff --git a/protocol/dataframe_protocol_summary.md b/protocol/dataframe_protocol_summary.md
@@ -40,8 +40,8 @@ libraries, the example above can change to:
 def get_df_module(df):
     """Utility function to support programming against a dataframe API"""
     if hasattr(df, '__dataframe_namespace__'):
-       # Retrieve the namespace 
-       pdx = df.__dataframe_namespace__()  
+       # Retrieve the namespace
+       pdx = df.__dataframe_namespace__()
     else:
         # Here we can raise an exception if we only want to support compliant dataframes,
         # or convert to our default choice of dataframe if we want to accept (e.g.) dicts
@@ -168,13 +168,12 @@ We'll also list some things that were discussed but are not requirements:
 3. Extension dtypes, i.e. a way to extend the set of dtypes that is
    explicitly supported, are out of scope.
    _Rationale: complex to support, not used enough to justify that complexity._
-4. "virtual columns", i.e. columns for which the data is not yet in memory
-   because it uses lazy evaluation, are not supported other than through
-   letting the producer materialize the data in memory when the consumer
-   calls `__dataframe__`.
-   _Rationale: the full dataframe API will support this use case by
-   "programming to an interface"; this data interchange protocol is
-   fundamentally built around describing data in memory_.
+4. Support for strided storage in buffers.
+   _Rationale: this is supported by a subset of dataframes only, mainly those
+   that use NumPy arrays. In many real-world use cases, strided arrays will
+   force a copy at some point, so requiring contiguous memory layout (and hence
+   an extra copy at the moment `__dataframe__` is used) is considered a good
+   trade-off for reduced implementation complexity._
 
 ### To be decided
 
@@ -245,7 +244,7 @@ library that implements `__array__` must depend (optionally at least) on
 NumPy, and call a NumPy `ndarray` constructor itself from within `__array__`.
 
 
-### What is wrong with `.to_numpy?` and `.to_arrow()`? 
+### What is wrong with `.to_numpy()` and `.to_arrow()`?
 
 Such methods ask the object they are attached to to turn itself into a NumPy or
 Arrow array. Which means each library must have at least an optional
diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py
@@ -36,7 +36,7 @@
 
 
 def from_dataframe(df : DataFrameObject,
-                   allow_copy : bool = False) -> pd.DataFrame:
+                   allow_copy : bool = True) -> pd.DataFrame:
     """
     Construct a pandas DataFrame from ``df`` if it supports ``__dataframe__``
     """
@@ -162,7 +162,7 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series:
 
 
 def __dataframe__(cls, nan_as_null : bool = False,
-                  allow_copy : bool = False) -> dict:
+                  allow_copy : bool = True) -> dict:
     """
     The public method to attach to pd.DataFrame
 
@@ -195,11 +195,11 @@ class _PandasBuffer:
     Data in the buffer is guaranteed to be contiguous in memory.
     """
 
-    def __init__(self, x : np.ndarray, allow_copy : bool = False) -> None:
+    def __init__(self, x : np.ndarray, allow_copy : bool = True) -> None:
         """
         Handle only regular columns (= numpy arrays) for now.
         """
-        if allow_copy:
+        if not allow_copy:
             # Array is not contiguous and strided buffers do not need to be
             # supported. It brings some extra complexity for libraries that
             # don't support it (e.g. Arrow).
@@ -260,7 +260,7 @@ class _PandasColumn:
     """
 
     def __init__(self, column : pd.Series,
-                 allow_copy : bool = False) -> None:
+                 allow_copy : bool = True) -> None:
         """
         Note: doesn't deal with extension arrays yet, just assume a regular
         Series/ndarray for now.
@@ -496,7 +496,7 @@ class _PandasDataFrame:
     attributes defined on this class.
     """
     def __init__(self, df : pd.DataFrame, nan_as_null : bool = False,
-                 allow_copy : bool = False) -> None:
+                 allow_copy : bool = True) -> None:
         """
         Constructor - an instance of this (private) class is returned from
         `pd.DataFrame.__dataframe__`.
@@ -574,7 +574,7 @@ def test_noncontiguous_columns():
     df = pd.DataFrame(arr)
     assert df[0].to_numpy().strides == (24,)
     with pytest.raises(RuntimeError):
-        df2 = from_dataframe(df, allow_copy=True)
+        df2 = from_dataframe(df, allow_copy=False)
 
 
 def test_categorical_dtype():