From 5a8d030b1d2ce70b941492a0391625f51e0bcefa Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Wed, 23 Jun 2021 13:32:22 -0700 Subject: [PATCH 01/32] Update protocol to support returning an offsets buffer --- protocol/dataframe_protocol.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 00cf5b12..0b88058f 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -133,9 +133,10 @@ class Column: A column object, with only the methods and properties required by the interchange protocol defined. - A column can contain one or more chunks. Each chunk can contain either one - or two buffers - one data buffer and (depending on null representation) it - may have a mask buffer. + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). TBD: Arrow has a separate "null" dtype, and has no separate mask concept. Instead, it seems to use "children" for both columns with a bit mask, @@ -313,6 +314,13 @@ def get_mask(self) -> Buffer: """ pass + def get_offsets(self) -> Buffer: + """ + Return the buffer containing the offset values for variable-size binary + data (e.g., variable-length strings). 
+ """ + pass + # def get_children(self) -> Iterable[Column]: # """ # Children columns underneath the column, each object in this iterator From 3aac477243c71499a21c3e8af9db7379304b1fca Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Wed, 23 Jun 2021 15:05:51 -0700 Subject: [PATCH 02/32] Add punctuation --- protocol/dataframe_protocol.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 0b88058f..c9434609 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -83,14 +83,14 @@ class Buffer: @property def bufsize(self) -> int: """ - Buffer size in bytes + Buffer size in bytes. """ pass @property def ptr(self) -> int: """ - Pointer to start of the buffer as an integer + Pointer to start of the buffer as an integer. """ pass @@ -186,7 +186,7 @@ def size(self) -> Optional[int]: @property def offset(self) -> int: """ - Offset of first element + Offset of first element. May be > 0 if using chunks; for example for a column with N chunks of equal size M (only the last chunk may be shorter), @@ -197,7 +197,7 @@ def offset(self) -> int: @property def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: """ - Dtype description as a tuple ``(kind, bit-width, format string, endianness)`` + Dtype description as a tuple ``(kind, bit-width, format string, endianness)``. Kind : @@ -324,7 +324,7 @@ def get_offsets(self) -> Buffer: # def get_children(self) -> Iterable[Column]: # """ # Children columns underneath the column, each object in this iterator -# must adhere to the column specification +# must adhere to the column specification. # """ # pass @@ -345,7 +345,7 @@ class DataFrame: """ def __dataframe__(self, nan_as_null : bool = False) -> dict: """ - Produces a dictionary object following the dataframe protocol spec + Produces a dictionary object following the dataframe protocol specification. 
``nan_as_null`` is a keyword intended for the consumer to tell the producer to overwrite null values in the data with ``NaN`` (or ``NaT``). @@ -360,7 +360,7 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict: def num_columns(self) -> int: """ - Return the number of columns in the DataFrame + Return the number of columns in the DataFrame. """ pass @@ -369,13 +369,13 @@ def num_rows(self) -> Optional[int]: # why include it if it may be None - what do we expect consumers # to do here? """ - Return the number of rows in the DataFrame, if available + Return the number of rows in the DataFrame, if available. """ pass def num_chunks(self) -> int: """ - Return the number of chunks the DataFrame consists of + Return the number of chunks the DataFrame consists of. """ pass @@ -405,7 +405,7 @@ def get_columns(self) -> Iterable[Column]: def select_columns(self, indices: Sequence[int]) -> DataFrame: """ - Create a new DataFrame by selecting a subset of columns by index + Create a new DataFrame by selecting a subset of columns by index. """ pass From c7728e2a09565c37f11859d446a9e073ff75355c Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 24 Jun 2021 08:45:37 -0700 Subject: [PATCH 03/32] Update protocol to return dtype along with buffer --- protocol/dataframe_protocol.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index c9434609..cb97c6cd 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -300,24 +300,25 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]: """ pass - def get_data_buffer(self) -> Buffer: + def get_data_buffer(self) -> Tuple[Buffer, Any]: """ - Return the buffer containing the data. + Return the buffer containing the data and the buffer's associated dtype. 
""" pass - def get_mask(self) -> Buffer: + def get_mask(self) -> Tuple[Buffer, Any]: """ - Return the buffer containing the mask values indicating missing data. + Return the buffer containing the mask values indicating missing data and + the buffer's associated dtype. Raises RuntimeError if null representation is not a bit or byte mask. """ pass - def get_offsets(self) -> Buffer: + def get_offsets(self) -> Tuple[Buffer, Any]: """ Return the buffer containing the offset values for variable-size binary - data (e.g., variable-length strings). + data (e.g., variable-length strings) and the buffer's associated dtype. """ pass From c8000f7da0448b21d1866a6b43e7adb1174223dc Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 24 Jun 2021 09:28:23 -0700 Subject: [PATCH 04/32] Add string support in various methods and add todos --- protocol/pandas_implementation.py | 69 ++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index e3e3e62e..5311bccc 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -23,6 +23,8 @@ import ctypes from typing import Any, Optional, Tuple, Dict, Iterable, Sequence +from io import StringIO + import pandas as pd import numpy as np import pandas._testing as tm @@ -70,6 +72,8 @@ def _from_dataframe(df : DataFrameObject) -> pd.DataFrame: columns[name] = convert_column_to_ndarray(col) elif col.dtype[0] == _k.CATEGORICAL: columns[name] = convert_categorical_column(col) + elif col.dtype[0] == _k.STRING: + columns[name] = convert_string_column(col) else: raise NotImplementedError(f"Data type {col.dtype[0]} not handled yet") @@ -88,7 +92,7 @@ class _DtypeKind(enum.IntEnum): def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: """ - Convert an int, uint, float or bool column to a numpy array + Convert an int, uint, float or bool column to a numpy array. 
""" if col.offset != 0: raise NotImplementedError("column.offset > 0 not handled yet") @@ -131,7 +135,7 @@ def buffer_to_ndarray(_buffer, _dtype) -> np.ndarray: def convert_categorical_column(col : ColumnObject) -> pd.Series: """ - Convert a categorical column to a Series instance + Convert a categorical column to a Series instance. """ ordered, is_dict, mapping = col.describe_categorical if not is_dict: @@ -160,9 +164,19 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: return series +def convert_string_column(col : ColumnObject) -> pd.Series: + """ + Convert a string column to a Series instance. + """ + buffer, bdtype = col.get_data_buffer() + offsets, odtype = col.get_offsets() + + # TODO: implementation + + def __dataframe__(cls, nan_as_null : bool = False) -> dict: """ - The public method to attach to pd.DataFrame + The public method to attach to pd.DataFrame. We'll attach it via monkeypatching here for demo purposes. If Pandas adopt the protocol, this will be a regular method on pandas.DataFrame. @@ -205,20 +219,20 @@ def __init__(self, x : np.ndarray) -> None: @property def bufsize(self) -> int: """ - Buffer size in bytes + Buffer size in bytes. """ return self._x.size * self._x.dtype.itemsize @property def ptr(self) -> int: """ - Pointer to start of the buffer as an integer + Pointer to start of the buffer as an integer. """ return self._x.__array_interface__['data'][0] def __dlpack__(self): """ - DLPack not implemented in NumPy yet, so leave it out here + DLPack not implemented in NumPy yet, so leave it out here. """ raise NotImplementedError("__dlpack__") @@ -242,9 +256,10 @@ class _PandasColumn: A column object, with only the methods and properties required by the interchange protocol defined. - A column can contain one or more chunks. Each chunk can contain either one - or two buffers - one data buffer and (depending on null representation) it - may have a mask buffer. + A column can contain one or more chunks. 
Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). Note: this Column object can only be produced by ``__dataframe__``, so doesn't need its own version or ``__column__`` protocol. @@ -322,7 +337,7 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: """ - See `self.dtype` for details + See `self.dtype` for details. """ # Note: 'c' (complex) not handled yet (not in array spec v1). # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled @@ -340,7 +355,7 @@ def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: raise ValueError(f"Data type {dtype} not supported by exchange" "protocol") - if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL): + if kind not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL, _k.CATEGORICAL, _k.STRING): raise NotImplementedError(f"Data type {dtype} not handled yet") bitwidth = dtype.itemsize * 8 @@ -407,13 +422,15 @@ def describe_null(self) -> Tuple[int, Any]: null = 1 # np.datetime64('NaT') elif kind in (_k.INT, _k.UINT, _k.BOOL): # TODO: check if extension dtypes are used once support for them is - # implemented in this procotol code + # implemented in this protocol code null = 0 # integer and boolean dtypes are non-nullable elif kind == _k.CATEGORICAL: # Null values for categoricals are stored as `-1` sentinel values # in the category date (e.g., `col.values.codes` is int8 np.ndarray) null = 2 value = -1 + elif kind == _k.STRING: + null = 1 # np.nan (object dtype) else: raise NotImplementedError(f'Data type {self.dtype} not yet supported') @@ -442,7 +459,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasColumn def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple """ - Return the buffer containing the data. 
+ Return the buffer containing the data and the buffer's associated dtype. """ _k = _DtypeKind if self.dtype[0] in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): @@ -452,14 +469,19 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype codes = self._col.values.codes buffer = _PandasBuffer(codes) dtype = self._dtype_from_pandasdtype(codes.dtype) + elif self.dtype[0] == _k.STRING: + buffer = _PandasBuffer(self._col.to_numpy()) + bdtype = buffer.dtype; # should be object dtype + dtype = (_k.STRING, bdtype.itemsize*8, '|U', bdtype.byteorder) else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") return buffer, dtype - def get_mask(self) -> _PandasBuffer: + def get_mask(self) -> Tuple[_PandasBuffer, Any]: """ - Return the buffer containing the mask values indicating missing data. + Return the buffer containing the mask values indicating missing data and + the buffer's associated dtype. Raises RuntimeError if null representation is not a bit or byte mask. """ @@ -473,6 +495,23 @@ def get_mask(self) -> _PandasBuffer: raise RuntimeError(msg) + def get_offsets(self) -> Tuple[_PandasBuffer, Any]: + """ + Return the buffer containing the offset values for variable-size binary + data (e.g., variable-length strings) and the buffer's associated dtype. + + Raises RuntimeError if the data buffer does not have an associated + offsets buffer. 
+ """ + _k = _DtypeKind + if self.dtype[0] == _k.STRING: + # TODO: implementation => we need to manually create the offsets array + + else: + raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") + + return buffer, dtype + class _PandasDataFrame: """ From 040f928d4edaaae4e27b0ebe78a5a7bbc7c65af9 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 24 Jun 2021 09:46:42 -0700 Subject: [PATCH 05/32] Add support for resolving an offsets buffer --- protocol/pandas_implementation.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 5311bccc..e23113a7 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -471,8 +471,7 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype dtype = self._dtype_from_pandasdtype(codes.dtype) elif self.dtype[0] == _k.STRING: buffer = _PandasBuffer(self._col.to_numpy()) - bdtype = buffer.dtype; # should be object dtype - dtype = (_k.STRING, bdtype.itemsize*8, '|U', bdtype.byteorder) + dtype = (_k.STRING, 8, '|U', '=') else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") @@ -505,8 +504,17 @@ def get_offsets(self) -> Tuple[_PandasBuffer, Any]: """ _k = _DtypeKind if self.dtype[0] == _k.STRING: - # TODO: implementation => we need to manually create the offsets array - + # For each string, we need to manually determine the next offset + values = self._col.to_numpy() + ptr = 0 + offsets = [ptr] + for v in values: + b = v.encode(encoding="utf-8") + ptr += len(b) + offsets.append(ptr) + + buffer = np.asarray(offsets, dtype='int64') + dtype = (_k.INT, buffer.itemsize*8, buffer.str, buffer.byteorder) else: raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") From a982987e85f49413c7b203489cc90c2b78388eb5 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 24 Jun 2021 
13:34:30 -0700 Subject: [PATCH 06/32] Add support for returning a data buffer for string dtypes --- protocol/pandas_implementation.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index e23113a7..5c09dc5a 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -470,8 +470,16 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype buffer = _PandasBuffer(codes) dtype = self._dtype_from_pandasdtype(codes.dtype) elif self.dtype[0] == _k.STRING: - buffer = _PandasBuffer(self._col.to_numpy()) - dtype = (_k.STRING, 8, '|U', '=') + # Marshal the strings from a NumPy object array into a byte array + b = bytearray() + for v in self._col: + b.extend(v.encode(encoding="utf-8")) + + # Convert the byte array to a Pandas "buffer" using a NumPy array as the backing store + buffer = _PandasBuffer(np.frombuffer(b, dtype="uint8")) + + # Define the dtype for the returned buffer + dtype = (_k.STRING, 8, "=U1", "=") else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") From fd4d71bf8a4fca4b7c456f73abe17c45605da2c8 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 24 Jun 2021 13:43:29 -0700 Subject: [PATCH 07/32] Update offsets buffer accessor --- protocol/pandas_implementation.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 5c09dc5a..0a4ada79 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -432,7 +432,7 @@ def describe_null(self) -> Tuple[int, Any]: elif kind == _k.STRING: null = 1 # np.nan (object dtype) else: - raise NotImplementedError(f'Data type {self.dtype} not yet supported') + raise NotImplementedError(f"Data type {self.dtype} not yet supported") return null, value @@ -479,7 +479,7 @@ def get_data_buffer(self) -> 
Tuple[_PandasBuffer, Any]: # Any is for self.dtype buffer = _PandasBuffer(np.frombuffer(b, dtype="uint8")) # Define the dtype for the returned buffer - dtype = (_k.STRING, 8, "=U1", "=") + dtype = (_k.STRING, 8, "=U1", "=") # note: currently only support native endianness else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") @@ -498,7 +498,7 @@ def get_mask(self) -> Tuple[_PandasBuffer, Any]: elif null == 1: msg = "This column uses NaN as null so does not have a separate mask" else: - raise NotImplementedError('See self.describe_null') + raise NotImplementedError("See self.describe_null") raise RuntimeError(msg) @@ -521,8 +521,15 @@ def get_offsets(self) -> Tuple[_PandasBuffer, Any]: ptr += len(b) offsets.append(ptr) - buffer = np.asarray(offsets, dtype='int64') - dtype = (_k.INT, buffer.itemsize*8, buffer.str, buffer.byteorder) + # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) + buf = np.asarray(offsets, dtype="int64") + + # Convert the offsets to a Pandas "buffer" using the NumPy array as the backing store + buffer = _PandasBuffer(buf) + + # Assemble the buffer dtype info + bdtype = buf.dtype; + dtype = (_k.INT, bdtype.itemsize*8, bdtype.str, "=") # note: currently only support native endianness else: raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") From e40f9022f7d4683f1364834982cee9269ae77fc0 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 24 Jun 2021 14:55:57 -0700 Subject: [PATCH 08/32] Add implementation to convert a string column --- protocol/pandas_implementation.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 0a4ada79..7b897943 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -23,8 +23,6 
@@ import ctypes from typing import Any, Optional, Tuple, Dict, Iterable, Sequence -from io import StringIO - import pandas as pd import numpy as np import pandas._testing as tm @@ -168,10 +166,32 @@ def convert_string_column(col : ColumnObject) -> pd.Series: """ Convert a string column to a Series instance. """ + # Retrieve the data buffer containing the UTF-8 code units buffer, bdtype = col.get_data_buffer() + + # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string offsets, odtype = col.get_offsets() - # TODO: implementation + # Convert the buffers to NumPy arrays + dt = (_DtypeKind.UINT, 8, None, None) # note: in order to go from STRING to an equivalent ndarray, we need to claim that the buffer is uint8 (i.e., a byte array) + dbuf = buffer_to_ndarray(buffer, dt) + + obuf = buffer_to_ndarray(offset, odtype) # note: we assume that the offsets buffer has an intelligible dtype + + # Assemble the strings from the code units + str_list = [] + for i in obuf.size-1: + # Extract a range of code units + b = bytes(dbuf[obuf[i]:obuf[i+1]]) + + # Create the string + s = b.decode(encoding="utf-8") + + # Add to our list of strings: + str_list.append(s) + + # Convert the string list to a NumPy array + return np.asarray(str_list, dtype="object") def __dataframe__(cls, nan_as_null : bool = False) -> dict: @@ -430,6 +450,7 @@ def describe_null(self) -> Tuple[int, Any]: null = 2 value = -1 elif kind == _k.STRING: + # For Pandas string extension dtype, this may change! 
null = 1 # np.nan (object dtype) else: raise NotImplementedError(f"Data type {self.dtype} not yet supported") From c122b3cc4d51be66cc8f6be6ddb5c3ee78c51acf Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 24 Jun 2021 14:59:45 -0700 Subject: [PATCH 09/32] Add tests --- protocol/pandas_implementation.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 7b897943..406dded2 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -661,9 +661,26 @@ def test_categorical_dtype(): tm.assert_frame_equal(df, df2) +def test_string_dtype(): + df = pdf.DataFrame({"A": ["a", "b", "cdef", "g"]}) + df["B"] = df["A"].astype("object") + df.at[1, "B"] = np.nan # Set one item to null + + # Test for correctness and null handling: + col = df.__dataframe__().get_column_by_name("B") + assert col.dtype[0] == _DtypeKind.STRING + assert col.null_count == 1 + assert col.describe_null == (1, None) + assert col.num_chunks() == 1 + + df2 = from_dataframe(df) + tm.assert_frame_equal(df, df2) + + if __name__ == '__main__': test_categorical_dtype() test_float_only() test_mixed_intfloat() test_noncontiguous_columns() + test_string_dtype() From 0d04af3b251c820d71b0f0ac9c4c4b0a41119c48 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 24 Jun 2021 16:22:35 -0700 Subject: [PATCH 10/32] Handle missing values --- protocol/pandas_implementation.py | 73 ++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 406dded2..16a7cdf0 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -25,7 +25,7 @@ import pandas as pd import numpy as np -import pandas._testing as tm +import pandas.testing as tm import pytest @@ -167,22 +167,34 @@ def convert_string_column(col : ColumnObject) -> pd.Series: Convert a string column to a Series 
instance. """ # Retrieve the data buffer containing the UTF-8 code units - buffer, bdtype = col.get_data_buffer() + dbuffer, bdtype = col.get_data_buffer() # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string - offsets, odtype = col.get_offsets() + obuffer, odtype = col.get_offsets() + + # Retrieve the mask buffer indicating the presence of missing values: + mbuffer, mdtype = col.get_mask() # Convert the buffers to NumPy arrays - dt = (_DtypeKind.UINT, 8, None, None) # note: in order to go from STRING to an equivalent ndarray, we need to claim that the buffer is uint8 (i.e., a byte array) - dbuf = buffer_to_ndarray(buffer, dt) + dt = (_DtypeKind.UINT, 8, None, None) # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) + dbuf = buffer_to_ndarray(dbuffer, dt) - obuf = buffer_to_ndarray(offset, odtype) # note: we assume that the offsets buffer has an intelligible dtype + obuf = buffer_to_ndarray(obuffer, odtype) + mbuf = buffer_to_ndarray(mbuffer, mdtype) # Assemble the strings from the code units str_list = [] - for i in obuf.size-1: + for i in range(obuf.size-1): + # Check for missing values + if mbuf[i] == 0: # FIXME: we need to account for a mask buffer which is a bit array + str_list.append(np.nan) + continue + # Extract a range of code units - b = bytes(dbuf[obuf[i]:obuf[i+1]]) + units = dbuf[obuf[i]:obuf[i+1]]; + + # Convert the list of code units to bytes: + b = bytes(units) # Create the string s = b.decode(encoding="utf-8") @@ -198,7 +210,7 @@ def __dataframe__(cls, nan_as_null : bool = False) -> dict: """ The public method to attach to pd.DataFrame. - We'll attach it via monkeypatching here for demo purposes. If Pandas adopt + We'll attach it via monkey-patching here for demo purposes. If Pandas adopts the protocol, this will be a regular method on pandas.DataFrame. 
``nan_as_null`` is a keyword intended for the consumer to tell the @@ -353,6 +365,11 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: and nested (list, struct, map, union) dtypes. """ dtype = self._col.dtype + + # For now, assume that, if the column dtype is 'O' (i.e., `object`), then we have an array of strings + if not isinstance(dtype, pd.CategoricalDtype) and dtype.kind == 'O': + return (_DtypeKind.STRING, 8, '=U1', '=') + return self._dtype_from_pandasdtype(dtype) def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: @@ -450,8 +467,8 @@ def describe_null(self) -> Tuple[int, Any]: null = 2 value = -1 elif kind == _k.STRING: - # For Pandas string extension dtype, this may change! - null = 1 # np.nan (object dtype) + # For Pandas string extension dtype, use of `np.nan` for missing values may change! + null = 1 # np.nan else: raise NotImplementedError(f"Data type {self.dtype} not yet supported") @@ -492,9 +509,11 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype dtype = self._dtype_from_pandasdtype(codes.dtype) elif self.dtype[0] == _k.STRING: # Marshal the strings from a NumPy object array into a byte array + buf = self._col.to_numpy() b = bytearray() - for v in self._col: - b.extend(v.encode(encoding="utf-8")) + for i in range(buf.size): + if type(buf[i]) == str: + b.extend(buf[i].encode(encoding="utf-8")) # Convert the byte array to a Pandas "buffer" using a NumPy array as the backing store buffer = _PandasBuffer(np.frombuffer(b, dtype="uint8")) @@ -513,6 +532,26 @@ def get_mask(self) -> Tuple[_PandasBuffer, Any]: Raises RuntimeError if null representation is not a bit or byte mask. 
""" + _k = _DtypeKind + if self.dtype[0] == _k.STRING: + # For now, have the mask array be comprised of bytes, rather than a bit array + buf = self._col.to_numpy() + mask = [] + for i in range(buf.size): + v = 0; + if type(buf[i]) == str: + v += 1; # follows Arrow where a valid value is 1 and null is 0 + + mask.append(v) + + # Convert the mask array to a Pandas "buffer" using a NumPy array as the backing store + buffer = _PandasBuffer(np.asarray(mask, dtype="uint8")) + + # Define the dtype of the returned buffer + dtype = (_k.UINT, 8, "=B", "=") + + return buffer, dtype + null, value = self.describe_null if null == 0: msg = "This column is non-nullable so does not have a mask" @@ -538,8 +577,10 @@ def get_offsets(self) -> Tuple[_PandasBuffer, Any]: ptr = 0 offsets = [ptr] for v in values: - b = v.encode(encoding="utf-8") - ptr += len(b) + if type(v) == str: + b = v.encode(encoding="utf-8") + ptr += len(b) + offsets.append(ptr) # Convert the list of offsets to a NumPy array of signed 64-bit integers (note: Arrow allows the offsets array to be either `int32` or `int64`; here, we default to the latter) @@ -662,7 +703,7 @@ def test_categorical_dtype(): def test_string_dtype(): - df = pdf.DataFrame({"A": ["a", "b", "cdef", "g"]}) + df = pd.DataFrame({"A": ["a", "b", "cdef", "", "g"]}) df["B"] = df["A"].astype("object") df.at[1, "B"] = np.nan # Set one item to null From 58fee899f3b2fd32ed274808c44c79ca15b8872b Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 24 Jun 2021 16:26:40 -0700 Subject: [PATCH 11/32] Update typing and docs --- protocol/pandas_implementation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 16a7cdf0..92e45279 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -162,9 +162,9 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: return series -def convert_string_column(col : ColumnObject) -> 
pd.Series: +def convert_string_column(col : ColumnObject) -> np.ndarray: """ - Convert a string column to a Series instance. + Convert a string column to a NumPy array. """ # Retrieve the data buffer containing the UTF-8 code units dbuffer, bdtype = col.get_data_buffer() From 2c4a84620259d8f3d6a833d483f3050c3793c297 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 24 Jun 2021 16:29:42 -0700 Subject: [PATCH 12/32] Add comment --- protocol/pandas_implementation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 92e45279..b8065b70 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -577,6 +577,7 @@ def get_offsets(self) -> Tuple[_PandasBuffer, Any]: ptr = 0 offsets = [ptr] for v in values: + # For missing values (in this case, `np.nan` values), we don't increment the pointer) if type(v) == str: b = v.encode(encoding="utf-8") ptr += len(b) From 2e3914fac570f219ea45c5a9907d4ff26fb53128 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Fri, 25 Jun 2021 21:45:21 +0200 Subject: [PATCH 13/32] Requirements document for the dataframe interchange protocol (#35) * Add a summary document for the dataframe interchange protocol Summarizes the various discussions about and goals/non-goals and requirements for the `__dataframe__` data interchange protocol. The intended audience for this document is Consortium members and dataframe library maintainers who may want to support this protocol. The aim is to keep updating this till we have captured all the requirements and answered all the FAQs, so we can actually design the protocol after and verify it meets all our requirements. Closes gh-29 * Process some review comments * Process a few more review comments. 
* Link to Release callback semantics in Arrow C Data Interface docs * Add design requirements for column selection and df metadata * Edit the nested/heterogeneous dtypes non-requirement * Add requirements for chunking and memory layout description Also address some smaller review comments. * Add TBD notes on dataframe-array connection and from_dataframe Also add more details on the Arrow C Data Interface. * Address review comments * Add details on implementation options * Add details about the C implementation * Add an image of the dataframe model and its memory layout. * Add link to discussion on array-dataframe connection * Some more updates for review comments * Update table to indicate Arrow does support categoricals. * Add section on dtype format strings * Reflow some lines * Add a requirement on semantic meaning of NaN/NaT, and timezone detail * Textual tweak: say columns in a data frame are ordered * Update requirements document for recent decisions/insights --- protocol/conceptual_model_df_memory.png | Bin 0 -> 21396 bytes protocol/conceptual_model_df_memory.svg | 627 ++++++++++++++++++ protocol/dataframe_protocol_summary.md | 371 +++++++++++ .../images/dataframe_conceptual_model.png | Bin 0 -> 40752 bytes 4 files changed, 998 insertions(+) create mode 100644 protocol/conceptual_model_df_memory.png create mode 100644 protocol/conceptual_model_df_memory.svg create mode 100644 protocol/dataframe_protocol_summary.md create mode 100644 protocol/images/dataframe_conceptual_model.png diff --git a/protocol/conceptual_model_df_memory.png b/protocol/conceptual_model_df_memory.png new file mode 100644 index 0000000000000000000000000000000000000000..b5bbd31033213a83a89c254fdbbdc8ff9746bab3 GIT binary patch literal 21396 zcmc%xcTiK``vnRI8y`TyhDcWly$MKfHcF_{J18yEd*}oUB2o>#1(g5-(tAe{kQSsv zsM10U9YW7NiGF|czBBj#>&W2boU_j^&$FJj*4{DfnTFC8N(M?02y_La{8SqRI(G^J zk!AgN3HZcr2XX`WMd7M!c*f~3R zSiNwy5pZ#brmo8{fIxRZkf)DyebUw@y?t&Djkj(2i>FdkuwS^$b@hX%!ZQ?w3OVCF 
z9>MpP3cBA$rN0E<;rV2El}uro>-D{RUpycAePpc|lj2pf z=|Q*LpbiaOdL#Zaam;r{ja_R0KcJTnJLc5nf!``2%+i&Khc73pdA)IGykdVgn(`%? zs8`?hjGObX?em*FfJ@1;u{{<>JBGe}i&9X~lNv6y$>-O+;%lqps;nGlG26%H=p1!n#xy1H3b z5-R1U`>fg<0i ztMZt8w>On%wu9(p94?YBaeJd0Du4QO&dK!Hbbl4`mH4dOu_3~9AX4_oJkX{L8b0KM z-DGJac0KNXYJs&&%*9;7q<4XN?QY_fgMPs`LA&w8S;Y10nbKa%`+;vRU5P@<1xmS2 zr5UAj1nl-%?X8Fh7zYE?daEy6uHW z%kLlPjE`4r#N1@K1+)K-rz#9QYc&w7-42uY3&v34s<5UHc`CrsO9y=0v7+1 zTpVI>J!M}N+Kcq7e4N+vxjTV0()4IQJB4-(ozbCb^MzIp&R#&%ad6-7) zz1mgRE61x}n4u<}X}C4e5<|=1JxP2MbV+YJ-xvO;EhYED9YkUR*!ef>c#nMbs!QJ1! z-8V2*8_F}9aF6 z6I1-Bu&67ZH_<2wm#vnJd5T~VwX4Sl2yHY-*V;XP{1`{m7I;waaMB1CZTQyb0Btu$ z?KHYkl^gf^;KCuXmiUeFk$}xP%_yer?lKe8)$s~D$%CIAv3-YD$NQ^IyERZ`<9ckg zd|uhhzG#CBbu8Vvc8D9-e9-__lmZ zin;2!yVNIj_w#4yT9r$a^qe9!oY#3cVtJ&kwN=!4w0i5q`(M8%W?9BVzOVvImi1WQ zHrwmuz-^Cx*+<(w4%}6jsU#YQ;!sZk!`22Rzfa0(Yi1>QIK^(_r>{v@xY z{MHZY6zOyyj^vu*H~TGuDEu8aGjWQMLHH_QHT7?A&(GogF@@QQqOvl$F2TJYJH4>< zgTwia{q@NkVh0}|otBlAwV(x#*mIr6VX951u=PmcNMAF1OQvq{uT#BSW zMSC2M)HFH|ET+^jOM4;boLQBkS?uoH?n++)ci2}fDkv(l*!i{6sshuspObG=DW&zh zQm}Em^PNQ^znLU-IO2G*C)ukCh`yKZhYPCb&YhEy8VFLF4kEWjrGa9 z39FmD#mIQ9)Zn*gz%1#9Y8%_ql-=ph+h`&bx`a^DQcw1;4*O_E;T{u_Az1rvw!PYL ze^%Ee2HROj$5@}aDC^& zuWpv1QOq}P+_?U_UnA!fyI$V`fqaM6$$s)m6=TU*=W^vNc_H$M+MVc4sj2L#6U zyt0;-)>J;6X-*Wscv5%q(j{9N!jCRq!j9!NBfd5@EfNSlV8hKpJX4pG0xIItgKC*K9e1 zwO?*pMQ_BE&plqNYaYJ}6p)$6pDsM);9` zg^5a7sEhmB18MSJt<6NmfXWL%R3C5yF!E7ifQa8bsI2Tu-^?5GzKr?$@Q79Fr&KL= zg<_L(fbjox)97fme+ilqwB2**3d7B1fzm99rR5j?N*L~9Y-5sC@0#iVxXTxtt^}j- zTl_o@FqT$PuNB?E!-WU$q7&~G{*gU-2E0^-iLUO8b@e+uY5L0`cew@A~-9`#3-%gFQ>70};ViH#L>6!Ul&-`R3b z_?z?RcQBPs{^gJ6IC2jyG*#8FB%Sq)i0WvhQ#y|{1_qSb_UKSzrgP(Nn*1oq{9fX< zf-)%}qaHc>`S$In@&P+CQHM6$G-kRgd4b*I4Xi5@|jKT%Gp! 
zj$0`0N)bScLe~wx-@S`HxovY`(vqT?E_L|Frl~04J|XbJ`pR_G#-D?OQFRmVdOpaL z(8l4}RJ(yoXBJICtUU*dsbn=ZHFpsY{$y@71qvN@aX0wy1xTeht+OYCe~1BXw!ZhV@A0H+z9~= zl-desH8CHm-^!!ql~y)hiZngiU6#(AHuK&{MA66}&9}eXTbW3UjlCfgIJ}k&kO(#a zHFot2OaxBt`o4^NxV!Vcdi9EMnZ9G$WP;^NIY@hQ))k0+~)4;+j4c9+XlanE z_a^Z*$k33Gg^+je(j&rhYvl=hAU%QFrRquND%?a(rrMcPs%g;S)QN?Xl7WVbrl#OV zWc`Ou+X^YLDNmuazSt(+1Mz)A>>bg64eaAI)1g&G}S`m2iC zWlpjE1;541FC zF8jz7QLAXxe!->_0rd4~5}$|-9n7t*=l|4^Up_EK+`Gr-!Ksdvvizl^xeGAa zHd$X^-&fbJT~oR=;W0f*XC7qp#z9_7ON*07KjH}c&KO`kzD4!hn9(Yw`oN8!n)38a zH*X>^H6A&>c6N3zLZg`qS3cxFyUSDXSa<@5@o6G{w6MAJJXOF68hn0idsh8{Hd8|{;=_lX74f|HY7L)&<(+{)8w!if-d&%_ z>Y^97Pmw3og)jCX54XQ4gx9;@&7;rONY4{&N)z**rHjS-Siy;W8$hi6lYtjVoUY1; zg0J3n>YAD~uCyy>&da^wnMe2y@$kQzGrlv9f3@V5I$L7FGn=OnY z-T&&+toM`abt1EEGdr zsc`=t{u4@zh&|0v$LZ1<$0~ICda^VWHiVE_mu6FX(D>)b**jrgsDacXNN(;|iPdM5 zD(6O6FH)9PWxIO3(y_1+&>@xVzG^OCmdJrtPvSk0K?3uDx$rQzY;7bVm!ADXr%T_%R5XdXjSs6|+i6O-;H_1OFP95EbFm;6vtyaW zrx!1UjG7(udZq2`k|@ikL5X;TWnINc&v}^R^wcoGFfU%XU=2U%J1AZsv5;Sw)8r6< zwc9okiM7rIrw-lW3qM1a${MD}qZdQUB!oK}%s)+)OhR6MWIcDQm z=(jzKo@|BMyG+&20Y&Ke%-=vQb${VW)r8+FbQ+!Rp9oeWO1-x+PpWu@|85EA6mNk7yl~R_qxB1-o&l*u)~&Dkk6TE4S2ivTk8`Y#R^S^0 z$!A%a2z^Su=0hi)r-yS1r~9jWczKD0-YCjMZ^tDMI%D?LxJ^9lZYM@M^pr0+(9cyGLk-P6C4 zJSm4TaZAAT89yMkzqStXi`Rua4SwpQhn{8f>PaC#e!TWEHrA>`{#{dXt=H=HM8fL( zU+?E2VPQ};s+HpL^R=+`O;cN2*a6l;jR8}uUN`@RrJe8c1~3pVqg$aZ8hvnTZf@@H zh?}%}%jI>WF~!AcCl)6B`k~-(S;3;_je;lGpcY!A!pHmc$7RMi>UOv#KRrGDHL+CD zI`^#3N8Szix+Q|DQmMW|57G=hhMXGh&foni!wFtd(voEw*0`<~J(cy*^S) z^vc4bql-2=f=nvx15f@o)=9aO^&{i2#(ND&kR`YQAr}IHlsK7KoUX;>WH_E4ITSru z4eaWvl0DjNzGsFSU#%RcN#HdQ2--&VZUk+IeRpDIJaMfX_uC5)0!v5?4an@!oi+I5 z{b18m4KI-EJCEJ2XpvHx%MDg`gR24w--?gayh1Ph7-0iwPH)ZRlhy_|^4XO^m)}e8 z!SO)b(i|MN;|lS-ISbPQ9N8;jNl8gg!@;0XFrYA`6r=p`r|t`#%&Fo~jiwCvQLfU+ z`ec*Kc;)1*{ZJn66k2C5#A#(|Ss70u7@G)(16ot|x=sP65k8m4p%?)(*Iy)O>Nk7Fb;pLL<`wvGb@+Jzp#KHr0b$9z%gN#oOD@MwL!2JCD z!UP9dzlq|nK`rknG8L(tcGgaNPWO-d1RHh(LY+)1O_A`xNCr{w-ZT*x#qdS-jIwbY z*`Y<U+S=MZ#$GEYAK`)4 zhrb41eE#nHaiNPdy+PgZGP 
zPcM>_kF&9L<(xv{3tj05)Hd+&0#a{RNm0>Oa(kPP*ch{Hz=QqKk8DHX7?`O(MN%w=OdeJEc)VS^azW+R0?pUlOleUu=q(nfQBk3kb)Q#mnQM=I@}cb1duP!+Eg|8S>}-0U;Hd>*P1L7bK>pn>wf)4qtM;EMeoPy4EW{Fp7U$sW7FbpO2F3q6SLBkDeAHVj~fikef73$W*d)T z=G`en`GV+Q-T&DeHl6jIonN11ynTBq26gZx^l_#07~Azti?+#V(_IFiBC@(XYo>qa+m2wN(+s%ADDyf zuGrPVTz@}wH+`aM;NB30zR4%6S!jkBx~GfcHqi)$9O0^LXyHbs)wZ2sam6VW_U!w( zaTiU%gMbTUYY>I4ICP)A08@S{%`4q?)QSpI2QPmoCwGb{(Bm~j$YmTD6%?3=w?D8S zoc|?@pQ^&I!LR^vmOJ$MkyQs<|9V|ZKsc~D__(sRQkIp(zYzvqt#;`O^hri`T9q%FHTzsd+&ccLp-oL+O2~c+m5m?f8Z^=-4D2UgKbw< zBYS!}D6UXZ$s^NanU7kr8Hsj5LwfSv>EZ+j2!mvRbu@0gs`4YMSN_=g6W=W*ytAyW zjEv7F2AcujkIXM}mL&9EfmeEu;>ZBiWyGc!W$xP>d^SlZXgtUX#JK4F`&?|Yr=G1e z?UAxV>n&R@-fM%Rj}T{=J%eOY8-U7(*EvtJG6QCpyNHNL7dl-QJBaGAs0ZS>Z5JwOnZ)A%}v{;ne>GrT< zVcOj+Bm_JuXk%0(x%{67eYV5ekDxX-@xJi213K6mW@m6Wxh!L16PrL=$J z0b7&@QZ)^0kK-SQO*L7KAah_FlLS}%YEee5GY+$ULq7SL3F6jC^~>$%S8v)N{iu#u ziwWZM)zKU$vr=b(Y3A98M;^;fdfz$w9-mL}Q9%?o)-J4Tat3YfB2UT5aibO3+}wdi zU!FV738xs|qFrfeg4gQV(&lzoQ52>^e7&!?DQR0x!&&3kDSqnM)Z85Dv@^_1vo;2I zeYUe`a8~TTu+1pugt`3(Oa-LS`0RuEFB7Cbgn;?haQE>?5ri&_N3%g zH#VL|$&oXP+SkuMH#JTF!L(X%jm<#msp6sv0x0hX2eDz{M3HIm&Cuv8{p%I+m&LOo zi56pbAccRVX=Dz5uNO|%p5i{#ET+IZaD`6zfxb*7LhJx-Pb$H^%412RT_gR5K(2XU z*YZFPZ{X1ws-v);<@UjELBJKQwBGun3bs2Ex3F*r?ByfE$OhW3a8CC_6q-e94JRlf z+}{O1T$tafO#dSbfbr&sU+&j(`ujUMYoleVhIkEh-6fdFdQ!yYt(g3C&}RDqdkb+Q zmg3HabcaXzMSG)FE-t47IU=3o%)Vm&=H|~`T{oA3A?}1_45i4}Y2#5SlznpY@-^xX zL~87fl%yoXlKbN$g?gOpYhGKS)FCqXwE?qAm+|2T{5a2$iTa5V0E-m` zxSh~i%hka=4=zF_AoH`|hJ-Np%8NCbpB`}|cJU=||AQFUxcjO@j4Lb^V$G{2Jlj%l zgtuBwm-&0!K*oEOJ0_ZM`Sb;9U!u;z)@`;bUko2DnWeqHS+JF5<|5nQJ2IFt`v*!Ur zMAqT!R_QYM^-(DpH(8eR1r=DhmwI-x6LEOhltD1-&eh2g@WWM%STx~Zz4A9*%~4C9 z;D%2ewv`ZypFbLiiHXrsO*q?sN-$t$Rk2E9>r9Wb-0+uOw(_bSKb{Jp!$ zo9c!M^*&mev+FyKeR^LsRr)8-ub<|A)p;*W^p`%}^f|3CNb$mXP6pay|Af7@qR^tC zXwRedKF&8TmttXIL7sFWTAg-^GUQHEMH)3^y?6`})`wMblchEuW3v7*@h(;a970u_ zeR;I6#b&3d7mcPwc(?pN}Cgr}vY70)grz5bbumJ|hNrO=4#(1qw z*N<=Cx<6UK%`ogTGABHQX8Tw#Or z0I;E5>%H!S`*e%n_^bv#J(b5Ob}}Z%$G7&>&7;Vo>5;Lg=kEGOW2(2m-_deRb93{= 
zG@MY-Fl2wakv{q%m$>0P?X%Nkfld}(ok}ne4SzYj*6H==?jy-iYNq;^T?zdB?eN_# z6Tpp-*-4=!nFL95rY-)@eV~I$w1CqyoIxkJzCBrAli2q2=b9f-9JBx*H^|@7(Gg+Y zZ7AY6I-c1U;ft>fp`x3(e*KnTo=&2`bYe6g9I!O?66^M90U{y}J308n8cEo!ookOa zDDQgF(}^yn4qDq{1`_H7aGh^?0Vc#SGcAw4%WX$|CcIqmEHb z#fbh_L+&nMe>=Ifrli85D6+t4PG0;nyw4cN7sLgiZPvpY_XTY>Hn!+wz}=BMv!1mq^JuOixixX;&~_|vZ$`rpdmH1} zwkM&~=D=PQ?OZqdBJ?l2yomI-RqnD#;A^b(>q)6ITV)V)tOjfY*69bXGpFJi%!FTT z(R`C-4&F7aaO1%Tgrm|wHkxob-_9S8Xk>ri6%s0Wd^TcS>DbVqgGG~1`GLi4}9<^v2J5!WNvqP0IFs;R$vk>pKnm)X8+^u7LMJn zcYX3o!L9HS+x%DmHc`&eCoezB&CR8)a-K5X%7}-Bva_*Stq%937RgBm5vESz0JSUY z94i6@^5R=mh>5t1b6Qf8kK5Tur^3^0F}r@QEz@JwgX5+Cg7Hc-_k9Prk`im&ZdZbO zYN0YN{KhGia+W%hIp&iW(QZZviHPz7NDf5-g{JJWup?Q-gm|9 z7Dvc=TavjUw!}B9>PHe35ZS8sg1K-qH);0BIPdZ8*C;FDqr8=iKLh#fB7qy4z>zV! zPud7YI~{Qkl&74!I1&1)wW`{6=Vncv?648CiAC)Eu-L>VMsDs-^=@*6n2O&s?eKH6 zja~v~;7Rj`6C98Y)x4b7$GbNnn#zB;Jwl&@B_Mvr{ciLXYTbvgnL-RxB&-bT!Wd^F7_l7A z^&-x}sntX1>Ne+D_nivParHGDbhV!I?7CZ9q}L7X(OmVo`b5g(N_IgDlhTQw=IaqA z%N*!E+xhe2ZiVtubrU}eyxK&09-9bH{uJ|SD{#Gm<*A>nXx(e?n^vD-MOVLbo?Sb{ zAl#5i5`WxiPXaxE`DF6Z6E0AGP<3k?qb$)JVQf`*o;6M1d@AmindbIp;wQgXTZQWl z?@oHUXYVTqz1o~TZ(#jA77F<@)7@xl!j6aNz2j4>b)jh$SDk7G>Fd6$PY@*MpUCe+E(FlBXK5zSLbczn&*v?0eXCVs>Ddg6K|)(K6>xS)as{DQ=hw- zSt&^QT!7GF?|XS}ZY3;i^iV?OV$&0|g@ClXbBsQ6sXMLm#t+k0X;z9;t4}VDSqYz& zAT{fE+~O#M46ycHYV=K`*s|8cYK(eRt6kR|kEw}b{!HaJPdRj4-=DPTe13S?64qlJ|}Yr?uSaQG#t$I#6}&86OUuOBN=+LN6q54m;s8{U}1q-72b zVKj$o8)NkL#h*18VMrUKc*m{6CM?yeU6R$iE=<*1?p7AR%h#F8>L5bX?1&{% zKh*S3xaQQ*=Qn-DPNjPvfP8q&2^v-uAfz!)?!v5s4hIapNe8ZV=qo#pafB}=__XZ05*iCem`n!c-EPc{@=_hX+sgT>Crtv zlpvAbzIl605JHV57513MMVZjjB;y;J=Vx9k0}(%6&F zc&xIS3UTdu_QOy!GhSwXbha3Ttv-!9m|(V=28^NQt)aE1wURk5FyrBd4+PfHHucU!bTT#+|}m7v`E9UDFd3l8)c$RFTb^PT<$+4<4ax z(wfj1G7&LQFIo-wzPTNx=z1>2!rj8cyM6FQ6_2E$Z?Gv;lM%Yd!dAKyD6NSn2! 
zDS?WJ4}XB$5kjESOqY?@as(b`D~P*4r#I8>s}9ty%VkyOF+qKi_a3XN#lL$|yPWN; z((AeOy)Tz#!6UHi&Sa3;`&-}8!{VyKvY3&jM>aRjrt%iL5*_+ddzEjV2KQpvaz7`L zQ3ZR~^H>F)SpiYr(k5GT|$q-LNd!Fd*`aeNVYVly$A%au%h2cDYP>Zpe?P@bR zz4Dd)Rg}}QEMcrk?2akGi7_FZ!q$UaK)ZC-su*Y3bHf zjhfD3Iw{fxe=X@8C8hrd%An)JdScR+$z|wxa+6V9q;RkYuJWV;YcoxECsY2TdcPx9 zWpRmoCU;W!C~l9GjeK!JX3*to%+1XgVvwqo^b4S-lI9vvu=r(RX|iUEQLvYXoCUbU z*-`)$VBrP10%ndY@0m9R9W=raKftFP%jF$4Ah{l%xy-T<#A7vLBCc|0%KLV&Jp~UH6Sp5CE#m1A4Kjxn^B}+XHd?6v88+|+0DJ?u?s(J=4oM&n%=;Bq7D{I zPLAHxXHnXpB z^=)oiR~7;`b^eGON-4c!mBTX1m*upn7={a>B)g z(hA3QT^-t!m~ghY1j^NtwI>BS4jP2NgCV@&0@1k{(1}Vy1Vf1Q_Br#IdsY+N00Qt3!>Tj3 zOp#kufk0j+vfGuh2!HFRlluJ1(qe-{3JvGj-e!_+5w7=!h()ySCp!2nTXvSikk}4+ z|3r{k@sa2KJ*izDg!#ufzGWMc8?Wy2w|owANr+n2l=IL2_T^rK_s$N*w9Qhx=psJe z%;bXIOyra_)%r3a_+e^a`@$lmsu3}(B_L~jY58nbU9x3k8;-yzr*PZlCIc|;`wB6Y zsG`-^6l4(9Xr@(OQb_mMQ7za2pO~`FMGH)iOx=@L=Jk%wJyHEzA`aSJyB2el2o{9r z!<(mBk6{|ihjqty$|7zqH=H_6#G3G6^s~O;r+z8-XVl5AYMOFBScCP#_vw`UtDIVY zn*$gOBUU%)Ih#SQQi#;NX$R9ONXAP=F$3WUBUYss^aJ6kgc9MIIY)$mkTeA-XwCze z!KvSvwF@Bli+J5WAa8OWTkmrIA*EJaY+sI#_Zt7E{4XBvj+(d00PIeSKU0JYw~>+~ z5=ahJpvr7|Wm^;wPwvh9_;)lQZP|nQT+mE4il#L}6jW~o)E;}WWMY;lvSliMULvob zNkWD8Xq*e4KEM!P>OTKX3)s);RZv;s=2q|Li}fIzIX3RBcJ(Si;Q+*LuDYmPs{^!N z|2Yo@5nlhL+DFr@bKZL5Bc1}Nmo?gE4D9V!i5wch5%L418_GW zU9TU94oMCG`K)NupHk=~o!flJXuZq*ku+k`nGetjlY%^P$+5Tl&-*ynMWk8)m@6}5 z3ToC_b|2;ffE>6=ylJM!524W|F-HSJGVo?<`i>G~r~Vnhoa_MO30c-z?W>jq%=0GI zcj2ShheX`p;d3A66oa@(87w9xP=nwlii#QAp`?3&QYXmW2Q*;;@P|?oqsb?+fjV1G z&;eB{iZi`eJ|@nA1ot|S5fu#IGL#fe>+x<@g@RyaadFz_sxfdROfG}??H8mSXQt+% zF5sZb;7d^}h@iMI1$cDAqdZ;oNsXp`>jfV#_$#q+x!b10`Nw>PoeK^4(<@X7?Kpe= zk?OSytFk-Mcs=g^lWFEKU*cH`bd2}0l^-Yf!wrptsWn}6+?|HUuNtb|Y?68vopQXUNXlw+;A zU1y%z_&}=^NSnJcnu=xA7)RLk5Urf|TUtU=w}*7d1(k)(b<37g(;D-_T~4}JQoLUh z1MC=F5ff)aT3r)_T&!m^Y4X+RMJa)WxtNR?CKb$USkq^w)a<{$WD>*o zNE`D7tZc)q5JyR=lpy&{6p9fX8T(*--ZVg`s*|}}gN%#)wTYHusx85-oz;ymqJc8K8Q8-fqHHXfEY=`7L-!34AVTTVXpy_&N^1HgS29$d!vdyt zEanUA=7bg`T1D#b96f-f+FW*c09=N`8RJhw%bK%sUnk4rTw 
zYHGkVj9km=0K#v+VM*XHgDpncGc{kJ1Dl^rd<4#A0y2uUj}n|=HNihq6N|`JQF%ZF z-WTzxtB`Tx6TMyWTEx6CcYUf2FN+1uh;vBNfi`t3(?|(bQ;Mklm6Uo$P6C|5n47$I znJC$RvP2aQ2?K-H#+JI}fP5k<`E3H&Z?tLOi(vc$BFqDKwVemIIKQYKYwi z)J-BJ(qSN!aisLMfZmw9NamyYf$MYG2^-wT7$IOPQh%v}7NE%7NKm{6 z9FRQoDi=P z7AAx0IaSp%uQdB>vjbIY>X|{m5Cl%tElrzI5poMbq`7(w%ziuY#RVULGL#;0u>^ZR z?Of9NL5iF*AS$h6cN;xyqU@GIn{>o_%1esE15@pbkPcF5O+~enDz)JvP)8Yo12QB9 zBRdlyr=^VyBLZL~xFW9nNx%Ewk`|+@->(MY1nh%NJADYNbmDW;=Ta(1#P}qJ4ROjX zyNMjFn|KTg80|m1f<#y(=v$110dpc4?qksJ38@2a1Y$hyZ;ZzX*oBb_tTfRw=oElP z0`5bz2c7DXMt=^81j+E(HKG_v!=pm8)>(Q)>+G-_#IL>3@RIP+@Q&b&LjL zpll4bE9j^>(JU=*cr*}Y{qk&(^~}s@s6N0nNHaDl!(OHrFc^L@Zs==k!JZvn52;)( z99K1}121od5acxLc2#l}-`sOzbv4rrti@Mn*I*W3)Y-PHmZeWSXyW&N1uae_mEPhDPUB!HH(mB<} zeLIVQna8}X|Eg4W&jsP$m1*osroEN-65=HtzpM2I5cH)T5;#F2C7|dmbS&w(lYFLm zAgI4;foR6SzTUyNg9$cPtvSq~`kW|Y6@N|KQEhG-b;t1XzK>1WlozXO7)|X=>QqA9 z%R)3(0QM@Z7@rgKsA-{%Yx({{>t87t@!s z$$byx@*kov=UFC*Wj3mwgj{ouOfxk)h(#5Z`@}5&qX{4v3n?4jfovqY5Iqt&>~qaDG6Hi^ z^_t3ly#V(ah=#+ta88Kk9KLKc> z3fqfE*&q56RZU$Le1dnQ}n|_g;4YRvQfNF=A%F2 z%9?tk({80kDtITFHR{lQ9{u=xJU9JPgAXPtdwXhwjeM1ATeBxt_S7UR)%)GCJJbjj zf;ZJeyJ3@lD@0@bZWO65h_IW1b2CuY2&tD#$C@v9Gx5unTrSmbXNX@{QpGioXrY8=!Dv@)>a_Ep-+r!dV=n1^Oy3j7~W)Xtfya| zDofOVu*791|LOO5C^h^UD@`C|vd-H8X=dCoSq9U~x7OP0yu}Pj*k|2ywCGtezIolc zZ)GUoYbX6QPTD3@>6lj_6*Y{;l4VBD2ne(*9QnVXAqyq(WHs|5pfkxH(d@9YLTQ6vY1*KwBG$ z(jg&LNJmn(u$fvhS+mCiJ8X{}$7gI^d??a@@F(l}D0g#fpfApga!mt#8bh0G^ z4Qs&=-{Y>K6b23$`+Io;05|m#4xSN8y33%u; z$NzKo=07aYgZ^=l(0^t!)?O9EwroJWk97(_-w7vo766jpN1R#H0K4D47a@7V?oMnOYXqcdDI{gdw8O_5gv9UwN^FwE6kWgy5UCe`*-LFNDSIYP5>vO(wGcbuI3w_gC3+k~O+N>^L`p&!hmn64RG)S%id*;sF0 zBoKY^IU>lWfE1UK|4QO@_J)`LC)1FfAIpd?j*P)0Z8*KiaDX!jM9CqM?EJCi--&L9 zSSm`%R1@Tr{Q)I}pnebdQds9i&n6Tn@Fjp|kw-wQ|7?VGbppD#m%otA$Nn3N0^$C5 z7IhH{+6h1h0ABDR5HuBwCGsJzeKNrk6{&So;-O(pSuSp)en9mDG}#d-4ub&6BTiYo zw_H^c(F?iCVaN7vC&3862RY7$p&UyD+?OfA63V|Y=_?UjV1+tc8y&#A+XFWvoJ z(r>@rek`DUS$o3~zcT6En2{$?0n0!hHi(qQZD<$hO{yR+jX=Qv`+}bluJt 
z_!jU2n~XCHgO9*^KQiAUOYeJKg=CF9S>n_<$~q@QwU#RN7S$;sFMx;}ai3*sg*3%Y zKYLF1Be|GGIZZ+P&`F_sX?mbpHLOgTr%LL6jkkifoXI(FvLH_l4roNjiY#tE~rhTFDLAt={ zkit3e$cCk9YZ%`ITYAKe2>6{W{tiypuus{-x!z_{;3yMt^U0YbFPiD?T~W-z4#{2n$V%j4M0FlA%$sDP~d!)oa4 zLLvLYjzP0qGk^0%;?SG&U%54xycNkRpbL-A_+GKPiav6X*gQQo9=Dxkkr8sE3A%sh z6{Tl4DnZ=n0q&eF^~1QwPVy8Q(%`X4@!%-~wUP{a$PpLj^>3ZwaX}~~exLriM}22i zTq^mt3IYtF6_<#bis>vMH@r|EEA<|wV)%H8d)ZbQW`PZQzb%Ss- za;MtYIL=Kr$L79K&U>p$A-jMA!Xn{2Vd_N)$)I^xp*e zM4Uu8YFnEA_7&6r>W8rfwheyw?gs@6LzW*H@XrNroz{MJKQAFtnDHhy2y(avS>iUB zL0M=AO@mE8;-Tfj%Q=T$&)Z*xf!THQ`8n41hlOe81 zErbT0(fWvb#YE>7E{D1fp6QJGcVa(tgn*44&FZ0FiTpo}a9_;F*xph_->r zzCUbna`bFnHuY@n#y6Ea#-ud_Dj)OMZ{0ncS|40A$XC1=g>NL`RP$)5>*JI_c z%(EI~7C?Z%nwpWck{QQSC9+~0FVM+CZ)`?RTnfh>`m}!#(?>Vhc_xfBN%<0gH1nv} zY6hNCPrFcprjrj@pcXskm}kv&`z${M(Nmz{8jp5YMft@qEw!~%tp86HXBy6C-i7g~ zqH1X@8MVezRZHzft!<`4s$?uJ#wbHAt(K~Z*q33tA&9k>s6j0?ww4e}GO99-7S)ur zgoLqc>|1Q_lg#^G*Sw#S56|`dbI!T%`**HvBTcNc-&;*|mx}WS9$xUJ#ld7jc|~}o zb&s(w(WU0TemTN~ z3XQ9X;z48}$hb$R?%H=YPS?#H=G4QuN-&l<+;Pt~ee4o0!dT%JSwE$Wu-<_$fSj%y zCL~R<&y#AB-OuDq{ zs+%?Fe70OvIj1;aIrHCR#x*%pmiZWVt;FhbpPvNBc*gPoXAF5bPu=bdag(W;F) zfyi0iYqfQ{X~7l0I?+DiRl-Q(r%&)q7ap(UTl{qb!AKy;+M0Q#68ARjtWS%`3CU%0 z97cOI=o}8n+G)O7g8i;h#QXC8@3V7x*(1s@WTZu?DB|;q+~9G7vp^=L?s98B<_v^c z1DqtbOVkBfdJ&cat1DdGUAg z7o~D6Cam<{@xKQX_!~lh^1|u^_3sGYmuD(oK~RFEx>#iJXJn>JsshXks0HH9qkEBapq|RV0R;9odoCNH#W{0Z zuRByvXHt6W%(OFa__6Kzyr&HiU8(#W&IEU~ODkjaI6yPb9z@QV{x8GiBu6@*W6C?? 
z1kHKO3prPsh|hK7l&6P5!3a)lYCQ&s!bDf|+Ft(%ic=U#SCo64-=nIQ&LAJ@ntuj< zLN=H2V<{g0&Hr9U=qLDZRTRRCrB65|}hh_U9P1W&Iz zRZ!CI_$Gy(veqH-a9xKithr&dS2h>-VD+_F5gK+BFF?JcgtUo1za8eaLCfxYOLAk} zV(j*~mpcW4oTu2XwkGq?VRv7*fK8cuouaas+L=;`H;;zB12=I7(@p0~D?6;a5%GWA zpqI4Q^~-odmU65QMXMIVj>J&lhKo#wKjI=yn%i-^aNOZ_`<-@`^xt1;9+0U)>*HoA zbGJh*CW@k2{)NSABA0%j>}X&-%f;U|VrIM%tt@*L1`>z(u8xL$G`N$s8H=~~2Gowe z`u99}<`lc6eHIsYa%%@>H@acS{Hq>nGqNJao2@}qUDke435{l3?2`_X4ljD_kkxLK zuzU{On|qZJksJMvL-y>v_;qvKvi6^XNd$Py4dPyaJAr9}lgl@J`}IY$SBqw)7v+_O zCjYr>iSX2$6Wb=xK#vJ@&4p`YNsv!E>f{0XaLL{3m@k5}E0ayzoXbo-Vv=qRS!x{( z|27@b-!h7dotO#*fYgyIK6`Gy?yvT}>|1Dy=e@`ic94VJu4)NeXz=@p9cp5=vv8`X zRhH?Wq|WKoG}f(t~%JL)H^ zQ*O_60WOY;Z}iZF%ml(U}vI$5r z9_(#`Xto()gcvpR&B`iUqeV7_?;{OsiM##nKZM+S`t|NI1Jkx15n$UI6*(TdX*oP) zGfn)X9eU@%_V+8~-Sw(wb`his-fFHx()C7^;EOgXW+X+z2T^}_{XZ{OYjAols6;{uLR=i9lgDD*)OnI5gH_>(o3UT-`u*Qe9~N23PE`ZaI6&x?4>iL zVv-0jo|f)$G2*ay-yn3fVclZU)eH9Ih#>nQzB|m`&cRT)GRY zC+kG5f)SDDl{O^PLo!^FjT3$*QK*;8n}R^Hl^hOflhM1XkN`Xt_&BRdVtfldqJXdt z1hWw~oK}_s1>gAxf#WGD67h)0V%s@2DF7_h;tDDo*+?$rrDlOju^Pc2oWn&uePh*` zq;aK7qoyrhq8nfb0csb9013UFf^RdjSTzcY$Ic=z`1X#EEwX6?jkU$T?(H{vC&~R# zYVQK_R5Wzmlv&1Zb_c$c$o{)=d-Kyf!bHbh?7=S$EN98>t~i<6=#ewzreKWUw;!cb zEUVDQ&;&n{MicPgDcw9I_w-$gTj3_#9kMLjFjm8;ZKcI$QLY1i9&v9n|TnlKYkOs)_Eb%*vB&f?kPdj zAMWp~DwJ8QdsnshbFGATT zA&U=R>*hW(SU5a8d=L$%r@Eni;Pen*`roMpysxJXw72PcZS1)+)`}bYI@~iP{x6xn zZ^;Ul5;UpvjHL9BiL22^n@STNVHXXC4}9VDdLMe1G-`lM5HVONNM6K)%!?h-Kh*9; z!s%$XNp7JJeO7j*;z^l3qCF|!5BK@O6wE=^ih`$T$V|5r{Ir?m0 cRJIQd!7Z++g2Rx&`)VMBwZoMz>% literal 0 HcmV?d00001 diff --git a/protocol/conceptual_model_df_memory.svg b/protocol/conceptual_model_df_memory.svg new file mode 100644 index 00000000..9081e42f --- /dev/null +++ b/protocol/conceptual_model_df_memory.svg @@ -0,0 +1,627 @@ + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + 1-D array + column + chunk + dataframe + + + + diff --git a/protocol/dataframe_protocol_summary.md b/protocol/dataframe_protocol_summary.md new file mode 100644 index 00000000..41f1421a --- /dev/null +++ b/protocol/dataframe_protocol_summary.md @@ -0,0 +1,371 @@ +# The `__dataframe__` protocol + +This document aims to describe the scope of the dataframe interchange protocol, +as well as its essential design requirements/principles and the functionality +it needs to support. + + +## Purpose of `__dataframe__` + +The purpose of `__dataframe__` is to be a _data interchange_ protocol. I.e., +a way to convert one type of dataframe into another type (for example, +convert a Koalas dataframe into a Pandas dataframe, or a cuDF dataframe into +a Vaex dataframe). + +Currently (June 2020) there is no way to do this in an +implementation-independent way. + +The main use case this protocol intends to enable is to make it possible to +write code that can accept any type of dataframe instead of being tied to a +single type of dataframe. To illustrate that: + +```python +def somefunc(df, ...): + """`df` can be any dataframe supporting the protocol, rather than (say) + only a pandas.DataFrame""" + # could also be `cudf.from_dataframe(df)`, or `vaex.from_dataframe(df)` + # note: this should throw a TypeError if it cannot be done without a device + # transfer (e.g. move data from GPU to CPU) - add `force=True` in that case + new_pandas_df = pd.from_dataframe(df) + # From now on, use Pandas dataframe internally +``` + +### Non-goals + +Providing a _complete, standardized dataframe API_ is not a goal of the +`__dataframe__` protocol. Instead, this is a goal of the full dataframe API +standard, which the Consortium for Python Data API Standards aims to provide +in the future. 
When that full API standard is implemented by dataframe +libraries, the example above can change to: + +```python +def get_df_module(df): + """Utility function to support programming against a dataframe API""" + if hasattr(df, '__dataframe_namespace__'): + # Retrieve the namespace + pdx = df.__dataframe_namespace__() + else: + # Here we can raise an exception if we only want to support compliant dataframes, + # or convert to our default choice of dataframe if we want to accept (e.g.) dicts + pdx = pd + df = pd.DataFrame(df) + + return pdx, df + + +def somefunc(df, ...): + """`df` can be any dataframe conforming to the dataframe API standard""" + pdx, df = get_df_module(df) + # From now on, use `df` methods and `pdx` functions/objects +``` + + +### Constraints + +An important constraint on the `__dataframe__` protocol is that it should not +make the goal of the complete standardized dataframe API more +difficult to achieve. + +There is a small concern here. Say that a library adopts `__dataframe__` first, +and it goes from supporting only Pandas to officially supporting other +dataframes like `modin.pandas.DataFrame`. At that point, changing to +supporting the full dataframe API standard as a next step _implies a +backwards compatibility break_ for users that now start relying on Modin +dataframe support. E.g., the second transition will change from returning a +Pandas dataframe from `somefunc(df_modin)` to returning a Modin dataframe +later. It must be made very clear to libraries accepting `__dataframe__` that +this is a consequence, and that that should be acceptable to them. + + +### Progression / timeline + +- **Current status**: most dataframe-consuming libraries work _only_ with + Pandas, and rely on many Pandas-specific functions, methods and behavior.
+- **Status after `__dataframe__`**: with minor code changes (as in first + example above), libraries can start supporting all conforming dataframes, + convert them to Pandas dataframes, and still rely on the same + Pandas-specific functions, methods and behavior. +- **Status after standard dataframe API adoption**: libraries can start + supporting all conforming dataframes _without converting to Pandas or + relying on its implementation details_. At this point, it's possible to + "program to an interface" rather than to a specific library like Pandas. + + +## Conceptual model of a dataframe + +For a protocol to exchange dataframes between libraries, we need both a model +of what we mean by "dataframe" conceptually for the purposes of the protocol, +and a model of how the data is represented in memory: + +![Conceptual model of a dataframe, containing chunks, columns and 1-D arrays](images/dataframe_conceptual_model.png) + +The smallest building blocks are **1-D arrays** (or "buffers"), which are +contiguous in memory and contain data with the same dtype. A **column** +consists of one or more 1-D arrays (if, e.g., missing data is represented with +a boolean mask, that's a separate array). A **dataframe** contains one or more columns. +A column or a dataframe can be "chunked"; a **chunk** is a subset of a column +or dataframe that contains a set of (neighboring) rows. + + +## Protocol design requirements + +1. Must be a standard Python-level API that is unambiguously specified, and + not rely on implementation details of any particular dataframe library. +2. Must treat dataframes as an ordered collection of columns (which are + conceptually 1-D arrays with a dtype and missing data support). + _Note: this relates to the API for `__dataframe__`, and does not imply + that the underlying implementation must use columnar storage!_ +3. Must allow the consumer to select a specific set of columns for conversion. +4. 
Must allow the consumer to access the following "metadata" of the dataframe: + number of rows, number of columns, column names, column data types. + _Note: this implies that a data type specification needs to be created._ + _Note: column names are required; they must be strings and unique. If a + dataframe doesn't have them, dummy ones like `'0', '1', ...` can be used._ +5. Must include device support. +6. Must avoid device transfers by default (e.g. copy data from GPU to CPU), + and provide an explicit way to force such transfers (e.g. a `force=` or + `copy=` keyword that the caller can set to `True`). +7. Must be zero-copy wherever possible. +8. Must support missing values (`NA`) for all supported dtypes. +9. Must support string, categorical and datetime dtypes. +10. Must allow the consumer to inspect the representation for missing values + that the producer uses for each column or data type. Sentinel values, + bit masks, and boolean masks must be supported. + Must also be able to define whether the semantic meaning of `NaN` and + `NaT` is "not-a-number/datetime" or "missing". + _Rationale: this enables the consumer to control how conversion happens, + for example if the producer uses `-128` as a sentinel value in an `int8` + column while the consumer uses a separate bit mask, that information + allows the consumer to make this mapping._ +11. Must allow the producer to describe its memory layout in sufficient + detail. In particular, for missing data and data types that may have + multiple in-memory representations (e.g., categorical), those + representations must all be describable in order to let the consumer map + that to the representation it uses. + _Rationale: prescribing a single in-memory representation in this + protocol would lead to unnecessary copies being made if that representation + isn't the native one a library uses._ + _Note: the memory layout is columnar.
Row-major dataframes can use this + protocol, but not in a zero-copy fashion (see requirement 2 above)._ +12. Must support chunking, i.e. accessing the data in "batches" of rows. + There must be metadata the consumer can access to learn in how many + chunks the data is stored. The consumer may also convert the data in + more chunks than it is stored in, i.e. it can ask the producer to slice + its columns to shorter length. That request may not be such that it would + force the producer to concatenate data that is already stored in separate + chunks. + _Rationale: support for chunking is more efficient for libraries that + natively store chunks, and it is needed for dataframes that do not fit in + memory (e.g. dataframes stored on disk or lazily evaluated)._ + +We'll also list some things that were discussed but are not requirements: + +1. Object dtype does not need to be supported +2. Nested/structured dtypes within a single column do not need to be + supported. + _Rationale: not used a lot, additional design complexity not justified. + May be added in the future (does have support in the Arrow C Data + Interface). Also note that Arrow and NumPy structured dtypes have + different memory layouts, e.g. a `(float, int)` dtype would be stored as + two separate child arrays in Arrow and as a single `f0, i0, f1, i1, ...` + interleaved array in NumPy._ +3. Extension dtypes, i.e. a way to extend the set of dtypes that is + explicitly supported, are out of scope. + _Rationale: complex to support, not used enough to justify that complexity._ +4. "virtual columns", i.e. columns for which the data is not yet in memory + because it uses lazy evaluation, are not supported other than through + letting the producer materialize the data in memory when the consumer + calls `__dataframe__`. + _Rationale: the full dataframe API will support this use case by + "programming to an interface"; this data interchange protocol is + fundamentally built around describing data in memory_.
+ + +### To be decided + +_The connection between dataframe and array interchange protocols_. If we +treat a dataframe as a set of columns which each are a set of 1-D arrays +(there may be more than one in the case of using masks for missing data, or +in the future for nested dtypes), it may be expected that there is a +connection to be made with the array data interchange method. The array +interchange is based on DLPack; its major limitation from the point of view +of dataframes is the lack of support of all required data types (string, +categorical, datetime) and missing data. A requirement could be added that +`__dlpack__` should be supported in case the data types in a column are +supported by DLPack. Missing data via a boolean mask as a separate array +could also be supported. + +_Should there be a standard `from_dataframe` constructor function?_ This +isn't completely necessary, however it's expected that a full dataframe API +standard will have such a function. The array API standard also has such a +function, namely `from_dlpack`. Adding at least a recommendation on syntax +for this function makes sense, e.g., simply `from_dataframe(df)`. +Discussion at https://github.com/data-apis/dataframe-api/issues/29#issuecomment-685903651 +is relevant. + + +## Frequently asked questions + +### Can the Arrow C Data Interface be used for this? + +What we are aiming for is quite similar to the Arrow C Data Interface (see +the [rationale for the Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html#rationale)), +except `__dataframe__` is a Python-level rather than C-level interface. +The data types format specification of that interface is something that could +be used unchanged. + +The main limitation is that it does not have device support +-- `@kkraus14` will bring this up on the Arrow dev mailing list.
Another +identified issue is that the "deleter" on the Arrow C struct is present at the +column level, and there are use cases for having it at the buffer level +(mixed-device dataframes, more granular control over memory). + +Note that categoricals are supported, Arrow uses the phrasing +"dictionary-encoded types" for categorical. Also, what it calls "array" means +"column" in the terminology of this document (and every Python dataframe +library). + +The Arrow C Data Interface says specifically it was inspired by [Python's +buffer protocol](https://docs.python.org/3/c-api/buffer.html), which is also +a C-only and CPU-only interface. See `__array_interface__` below for a +Python-level equivalent of the buffer protocol. + +Note that specifying the precise semantics for implementers (both producing +and consuming libraries) will be important. The Arrow C Data interface relies +on providing a deletion / finalization method similar to DLPack. The desired +semantics here need to be ironed out. See Arrow docs on +[release callback semantics](https://arrow.apache.org/docs/format/CDataInterface.html#release-callback-semantics-for-consumers). + + +### Is `__dataframe__` analogous to `__array__` or `__array_interface__`? + +Yes, it is fairly analogous to `__array_interface__`. There will be some +differences though, for example `__array_interface__` doesn't know about +devices, and it's a `dict` with a pointer to memory so there's an assumption +that the data lives in CPU memory (which may not be true, e.g. in the case of +cuDF or Vaex). + +It is _not_ analogous to `__array__`, which is NumPy-specific. `__array__` is a +method attached to array/tensor-like objects, and calling it is requesting +the object it's attached to to turn itself into a NumPy array. Hence, the +library that implements `__array__` must depend (optionally at least) on +NumPy, and call a NumPy `ndarray` constructor itself from within `__array__`. 
+ + +### What is wrong with `.to_numpy()` and `.to_arrow()`? + +Such methods ask the object they are attached to to turn itself into a NumPy or +Arrow array. Which means each library must have at least an optional +dependency on NumPy and on Arrow if it implements those methods. This leads +to unnecessary coupling between libraries, and hence is a suboptimal choice - +we'd like to avoid this if we can. + +Instead, it should be dataframe consumers that rely on NumPy or Arrow, since +they are the ones that need such a particular format. So, they can call the +constructor they need. For example, `x = np.asarray(df['colname'])` (where +`df` supports `__dataframe__`). + + +### Does an interface describing memory work for virtual columns? + +Vaex is an example of a library that can have "virtual columns" (see `@maartenbreddels` +[comment here](https://github.com/data-apis/dataframe-api/issues/29#issuecomment-686373569)). +If the protocol includes a description of data layout in memory, does that +work for such a virtual column? + +Yes. Virtual columns need to be materialized in memory before they can be +turned into a column for a different type of dataframe - that will be true +for every discussed form of the protocol; whether there's a `to_arrow()` or +something else does not matter. Vaex can choose _how_ to materialize (e.g., +to an Arrow array, a NumPy array, or a raw memory buffer) - as long as the +returned description of memory layout is valid, all those options can later +be turned into the desired column format without a data copy, so the +implementation choice here really doesn't matter much. + +_Note: the above statement on materialization assumes that there are many +ways a virtual column can be implemented, and that those are all +custom/different and that at this point it makes little sense to standardize +that.
For example, one could do this with a simple string DSL (`'col_C = +col_A + col_B'`), with a fancier C++-style lazy evaluation, with a +computational graph approach like Dask uses, etc._ + + +## Possible direction for implementation + +### Rough initial prototypes (historical) + +The `cuDFDataFrame`, `cuDFColumn` and `cuDFBuffer` sketched out by @kkraus14 +[here](https://github.com/data-apis/dataframe-api/issues/29#issuecomment-685123386) +looked like it was in the right direction. + +[This prototype](https://github.com/wesm/dataframe-protocol/pull/1) by Wes +McKinney was the first attempt, and has some useful features. + + +### Relevant existing protocols + +Here are the four most relevant existing protocols, and what requirements they support: + +| *supports* | buffer protocol | `__array_interface__` | DLPack | Arrow C Data Interface | +|---------------------|:---------------:|:---------------------:|:------:|:----------------------:| +| Python API | | Y | (1) | | +| C API | Y | Y | Y | Y | +| arrays | Y | Y | Y | Y | +| dataframes | | | | | +| chunking | | | | | +| devices | | | Y | | +| bool/int/uint/float | Y | Y | Y | Y | +| missing data | (2) | (3) | (4) | Y | +| string dtype | (4) | (4) | | Y | +| datetime dtypes | | (5) | | Y | +| categoricals | (6) | (6) | (7) | Y | + +1. The Python API is only an interface to call the C API under the hood, it + doesn't contain a description of how the data is laid out in memory. +2. Can be done only via separate masks of boolean arrays. +3. `__array_interface__` has a `mask` attribute, which is a separate boolean array also implementing the `__array_interface__` protocol. +4. Only fixed-length strings as sequence of char or unicode. +5. Only NumPy datetime and timedelta, not timezones. For the purpose of data + interchange, timezones could be represented separately in metadata if + desired. +6. No explicit support, however categoricals can be mapped to either integers + or strings.
Unclear how to communicate that information from producer to consumer. +7. No explicit support, categoricals can only be mapped to integers. + +| *implementation* | buffer protocol | `__array_interface__` | DLPack | Arrow C Data Interface | +|---------------------|:---------------:|:---------------------:|:---------:|:----------------------:| +| refcounting | Y | Y | | | +| call deleter | | | Y | Y | +| supporting C code | in CPython | in NumPy | spec-only | spec-only | + +It is worth noting that for all four protocols, both dataframes and chunking +can be easily layered on top. However only arrays, which are the only parts +that have a specific memory layout, are explicitly specified in all those +protocols. For Arrow this would be a little easier than for other protocols +given the inclusion of "children" and "dictionary-encoded types", and indeed +PyArrow already does provide such functionality. + +#### Data type descriptions + +There are multiple options for how to specify the dtype. The buffer protocol, +NumPy and Arrow use format strings, DLPack uses enums. Furthermore dtype +literals can be used for a Python API; this is what the array API standard +does. Here are some examples: + +| *dtype* | buffer protocol | `__array_interface__` | DLPack | Arrow C Data Interface | +|---------------------|:---------------:|:---------------------:|:---------:|:----------------------:| +| int8 | `'=b'` | `'i1'` | `(0, 8)` | `'c'` | +| int32 | `'=i'` | `'i4'` | `(0, 32)` | `'i'` | +| float64 | `'=d'` | `'<f8'` | `(2, 64)` | `'g'` | + +Notes: + +- The `<`, `>` are denoting endianness; Arrow only supports native endianness.
+ + +## References + +- [Python buffer protocol](https://docs.python.org/3/c-api/buffer.html) +- [`__array_interface__` protocol](https://numpy.org/devdocs/reference/arrays.interface.html) +- [Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) +- [DLPack](https://github.com/dmlc/dlpack) +- [Array data interchange in API standard](https://data-apis.github.io/array-api/latest/design_topics/data_interchange.html) diff --git a/protocol/images/dataframe_conceptual_model.png b/protocol/images/dataframe_conceptual_model.png new file mode 100644 index 0000000000000000000000000000000000000000..3643b3895786ca938efafc05611fdbfa50934186 GIT binary patch literal 40752 zcmeFac|4Tu8$K*iNkuBO*peh7DU`KTiXttR#Pl>%DqFH|BW)^sieworQrR9O`<8?$ zS+eg$_GN5i8OFTVboFcz#>2L1(m(VdBb;TFU3!w1jW)%G@^Obj~gi4%swX=_Se9+_U> z8C>+?j>q?R4w9WG|5$ZVw8As&!uq94t*5QzxtHhrvT0tI)ane@wcvhnjy*!;yTH;j zYIn)owqr)$n$&J$>GKs_(ZaILbEoioE#}k}V%g%W5;J;Ea^Dy!F?95|9XC32{z6X; zk?iEU+_QdHpAIp{HUmW&ttFCEa7#h&{`coUnE`$!HieNAN5#=N@4~4ceA7Q7ZLb@M zgs+U$Y2CzCbwJ07p*c=XBp1DeA>?KUBlVA<;M4C_Nq^{I=l)&2SI}D?Po5CQO@Feq zY;rgz60Y_>IWc#i*EdE6?{jJ*ysQkJ9^+qCthJP;s(%ECPgS@$Hd84@k<_5mYEN}7 zJl%~AtQ&?CBxc);9xBq)-)y_yM>qT|3X_&Zv>XTb}DE z@Mmi2`8VnD2$A8u~|Cz{J?UTf#p;cYk{$S z65;E8nHdIj!=JRoG2HS$T$%Ug$yPiA18?M$)AF1Q%9y989}yTC?4nMq4eB!RlR@r^7vnUSz3C2>kL~#n}<0WKB7UJ`HT#{U~aau&i3js=>G$;nXdfu<%Bro z<6HUQmEOz@?0Vtz`}qL$Z>=7~%zV@P`IlcVJo&p}&}U)Mm}mM)o8MbJ$?;)|7b^0> zmchvg^5Yw-I&e;8)^pBd_X60Sts9QwJIvYa$KaiS69s>l=e+tyPL|+@<;3zhIi-9j_wjh%W=$U8bBoIm}u%#V6)sK<>@ZtC>eDYAs7 zg!9`kWA9=M?1~$vr+sc+cRCl?;fU)HRwDh>aqCEUul8|zWE{tnIiN%Bb1h{u*@EqN z%b&pUQh%b3D&dbw`ZD9Ds1u#4N@PAaN}TrQ?0!|nA>Tq|c4VDJl?0VX+HSK(-B%vvxrVJ8KNM!oA@tDX!50_15v2=_W(LzJYR4;?j!^qi zD&fx;Zl_jzOGsM{JDCq}x%nX{!TywhR;&1*twWZ4hce1!Td7SS?AGZGd`TPoVtc#F zHzvY%npf(;O7O};Dety8dCt!Bp^*;p4`c7{V<%*)1#966Uid&A5<$>AN_tSFJ>!uH zAvgJ9h3vOsv%@wQrU!TTD~$<63{JoDkg7ZqVC&2d8)wBhSF?dO?ZuEbtECzzx~V}b z+3neCk6WpEu|2UYHI4nntk)jY_V*Z$aS-yV8oF)u%JbVruk1L8L0j@RZflL)R`G_1 
zDuXp}IcYLoN3o9!JYt&kqS2j@muJ|N+=DJ4Z+LAH*pW&w8zrTT-_4lzelXtDx+AW& z)ccQzo>T+sdxI$|I+&y+lez3DCR-Q``GxYRkyqLyIa@cy5tnne*=YMH`}R0_|9nPI zsi_KTbU%}w>Z#?n`&PMB^oA&P6c+a1s_r$u20EB&$y<|0^o;-5owcT>>YPU!VU5e> z>yes|R4#T0bq(mCT8e9W%ZdvP9b>m{-fT^BY>j$Ljzyo5DRvabYvr_*ld9a+`c)}L zj%g_|W80=JzE|j7ILzsEa|EnwvAizTGDvZZ6Knwg{2^om;OYtu7z92A9q-zKqCRT= z@J=~Vyj${3YQ4E_67z%JDYaEU zAkrcu{^A94Tu|(uJlQnt4o0y1>aF=5zwzMby(ojTlgvmRFmrLKUm`hTB+pkmP~_rb z-5F`Dr5MQvxl41Hgo z>DRVWDTPC?i!6dWQiF4G`U<{@?|;O<$2We_I;7NnT2|chBv^-}RTz|Whgbc<)i~Fl zAf0(#dBUi$EvDFftfPu}J-JRAciCVVUNp^1Ts^yD^&NKHMWHm0?# zqOgd2(=<82zOl|t;Iq<}{&oRQvc=d|xi!P|_ar%YRyW97!FioISO%;k4fV`_HdGN3 z*Y&2cU|L_S(Whj-kTY`5Se5&rx54RVN<-E8#JU0El{;Qbo0G`SB)K(vu#~G8+U;dD zBRuWB;xNt(xyfU92#l?LerYQ(GERH)T3qAijU$7-Q-?U}bzhch9I;%soOm^QdARL5 zuurWMU!?X}UOR}14?XyyPE)r>^I=1`@&q}A!8$uVXthIpY*o4F@EaxZ;q;mseS!BC zembuX)gQI@xrLi_B<3dlbN`VrTc_lM zyVx6aTt1lw|41m~S*xPDD&0x~tYnH|-8qWv(u?(nZJy*1dvJ|axa5sSEBi)ROr2F? zmEJa+y|x^$kLY_py>It$eQyqXqGOiDJBu!`Yq1fyF7}2P>JnA3fVN6S(T!~L=Je=_ zE}kP-nta>3v-M)NZCQKQ->&3Kw3a|gZuTc>6-}2KctjHNw#$EQ-xOp^`sh0T`UCli zrLcXJ((7i!%-eg#*j@#%z8pHHfpH^>bIbVh1YpaHpVajqVpZTl3DauaACMrPVqQr+zoMl|#Wo^SPvJ-6bW^KKRdLOx>6PaxUT$gZ zoorQ2F7d)_oQC&r^wmSv?)QSGx$knP%xV zlNlL5ph2O?$PusU+DJL?y8_mKy|3q3B-Nnh%!tLh8jG&{fnuv;*QX|OTBBU7cTL;s zbi7y#tX~5&o|xVglrR?b{f_~7mJutc+-h=Dx`!;cl|1EhLFZscW+Bsy?N!TL`|tmt z8V2-$PU@)c_X-oQW!lub6CV_E?VM??z#dXz=6mveC8vvy>59_*KQs@!wOJ-@c5r^x z*e@z!y|!SxZtL!1(nMsQcB}(mUYF0bMPL4aQcE{JwMMqJD(9s-ST4EYFC8))^)n24 zcq2ME;AWxqiFwq+HuL&uy9TPFy5&n!zRdx}i{s)C8y@s;XLGEU=~h}MzmW^{r9uZK zU6<}=94~CK#=J4;OZGrSVp>g!o6@_6eq*tmn~(dxybJ8GPtuIwsY+X!_l~c^fIa9- z@0AKt!(M3N3Y2Z07;%i>;G3~+_F{i_a8QPc@}HY03h!FUeP4aB@#k2^rmR@Y<7Np>)FAC$LxNEnC;V(Hk-{&s&ld=6e%7}Nj3HdK|gvv{aF@x zKCX*sbt=QYF@vCu!{}`I^rVlEMQ9W~&Sy2)u|{eAw#vg%zT(6k{P6%2r*Id0Ul5fe(f18~wJY(vLMi?GcM zs9N-82!yt(W0oxX_6)rH&9?!(`-?9_Ty)qUfI&K_T?7UfA+i}1_1iD+-a~`kL%!bi zE*)-DIYMWr3|fe< z7NHB{u>-pNEX4Zkfh*ZV7b=1VZ7VE*Dc`yocSv!FAQ%}QTQDw{*Hj@NpioTqeFc!6 
z(__us`L)HshgY)kvi?l*`(gBRLW%kXw?QlD_y@Vo4@_))JvDh`>b|MN_x82;X%F{y zWSal_+9f$C(aAFEP*sn5l}=R_r8bLGYtZ&#R@p#Rp;aj13TMra*p!h!4Dy<4nx|n6isi%>m89UYVGPtH5nG$3OnrmONYbIisbL! z)A{6@?cs)d-)}lF`7;O0_`u|-uk+9uO>9DJ&k$TL=HUyKV4h>J0F)h9WZ^Yd>WFLC zQ?e5)kt7Xgs%1upDx-rBCt{~hHC0{?+MH@XYN?4?qWDEP8{O9dJZ8^wZ{`CVI;oC> zLX4}o2AQZ3>r#8wFop3r$E~ajTT~8X+4~@Bb%Xpmy@eYCdL`HGCH#14tF6Z13=tS- zmQBpN0xTQ*RnWAticIlgZKVfd>GI_L4ZJiqgm+e8|NmCKEx0L$D$#Ey?>kb!UMkhin09<#Xc@rWd{2AhfiR@_+MVc zO|@4fRc1L&lSswscJY3X#=Im4jt^8fRP_!AD2@<%@8vAd9Lnf=&pY)~O37&;4&9^W zT_jt6m6 zlFO9Fi4#$=QBhG@q)Ppev+1@Tf5@~7m+X-A*>%m}${I|eWh2&UYMr>N}kBg?x99dr%*1sGZUKh-J9B0 z)>J)Cnyzt2y7uB7)`at`v~`4-_YrgE1DTKXGHlGu%$!Y4O*4!yt&;S=enHOJ;ED39 z56X!jtONJ?>?^jPTw8ZA`fmnPgJjK0Y;8@#-bT5+-A+zR zz_e3pHm%X`r$~|3jKc8`ozr;e((Ar4%zPlael3{}g zCwf~+=NW$e+SG45@$LJKlv-lSS8?bE)O2gqM1h0YSt{K6LoeqK6VCf8HGRAzxx(L7 zy*=GVR=LGZJ%Xnp?VJ+$WIOl-{k={((cz++(>dwQl5e`RcQWdYi1_R>HMp`tZ#X!f zFU|S6EBQTK zlUH%FHm%5`@F?X(4LBu!z|@;rB-PoVl-hM5kf<*7+P{C`De>g6ZnHR3YUXY!=fZ>h z?uyItLNNs%z|;(n3YJx+GXWfzjDY%o0~I*Za!;7Xyv9 ziuwrQ*$FE#*=S}w5@EQC@%wBb9O761^MRPJ5S~$Q1%?@aOW7YasLLzr(-y~H!l=it z=PCy5#9{FA6&UCZl#!SZypjodfW;q}Ek?bqtgcbYoS=(HXug5K$OxW$AeOxq-yR5G zDLmV70eJhzMkc(mu`y`92D8~gq-ReW;d#deif(G=PS8j0GG&_|-KMa2ZE*HhV(i6M zjM#8_CF`Dzb$5TG%y~F*a&^XM>p(5!)ZN{?67x_jFJGAL8;X}HR9K>(oxrHD1~UO? 
zO$LmWlgXijRki%{&+@z;ZEfvR0|NSlWFq&gg-e>R5!>_f5^Y7V@jDKEc8DAz9(P$XFZr!`ges=Uj$=vEo9*cG%AN6x9_;dsK z^m6}l-y{CYqs_kSx?&nuO zL1+YK_k^oSV$VCUw22M0jiH|i7Q}Vg(wZOcs{Wg-dzm%~EyD{%7kq;b^VMBoH83m~ zz7Ui~5|-$4MUS7Ax9%oC@Z~4v6YH@n=I_Tya5UjILG~0t7+}>}xd>*Yb8g~aeVN$_ z{z>BbgOP^q`-^olGOY(ydr;J(r`n>8WxmYKeuZ1~N=T$~@YgtdFR^C0PrR~V&+aqo zv5^>kt&e572|P1mQyDlV5%x~GQYGUSb~zp~BW$D=u(bs*$i6YsSgoj4aAR#O>RY7d zugf=h4ZG}Nbb*Pr+d*gz!MNCX_xtucq!K%?!42U~-w-DB4FNpZ=2-VwZj4~whqy2G z8{r09iBZ=N(1SLkH~($Ng#^6{of7=$9l}<1en+Q4r_`PJd2db>8=$26Y{l&U=yqW-w$KemG#cYsVK>nNkByK5YMExLD!z z#mYn$>mMx+C7ug3@u{MYQ7LFx0{;Z29pMt7Z2|Vl6`C6mW~{^r0J8=x5IhOcUfovq zw)>&0ufL9Po3OpEWOg9=rKEGfy5&AwhG*3=XN7wc9&l*H4u&PA-T0jQ(!t=uUS>St zYRx<6EcI(GeszqbqN4r9=^P z1^6PDn;GA!&adB+`fUgBe&$Lx4hCEs&|e|A(O6pJr?bqpV=#9ZSO>Q&#Vo)|>#wAl zur^09Syy-XgoN%VjkW|oJ4Uj0<5u?&$>T(?C%n{jHM!QaClVOYV9NNoFs~-bIu?Fv z2>>dd3BTMs_?WFigYC!%xGi$$pW*wAHMycm%iIAl5YNq(&tpfw%iFN9$#lwtES+V6 zo|5Gqvc|Lq*%WQWfKAL44h5Uw7YnAJbuWW_cUR^m_KwUC_FCRFTcW77*?Nj;Q#dm| zJh~tX7z)cVLB&FERDOz%shVlJPP$RLt+ndGQFzE!V&V-nnSt-?TZSxtHC_R%?Otxe z8k9V~{4IJGa&bavC^Ui`rOD2zo2ev6s#F0yO8!grCik1S#I=p8K(EEJm%ix$#F*#* zdfb#bxwDjQB4ZC&X?cm=Qcg^_-tUDX?I|WpYJjg2I0$Aon1fY4RdgfsCvJE;~U&gx_CJaCpmALR1C;VvwCl4gGt-ODw}RW0J3yOYl|*ACNo> z&hmPKf1-FiILnpZrvc>m74lKA`*}E(ls0QvK+q*KSdE_)K&-eh+(}$XAB=!uvCvAB z`inN8w<0U-b<}`H_Y&Rc_5!2px{S%8i`;!NJ9oce%>&7Et?LLcC0tD&_Uw5DA`;4u zcZ=*ubdwt{Z5udXrXoHdzS+*Jk5yvAD=%ft`17XapUrz3v?Y9w;Fi{KR9!v_PAeJo z^7-JGg+_!nf+1uCYcNtC@B@R_Azio>jCvrZ5X;^}3*8`i z830b#o7Q_8mH}awKcO3J3)m7rZR9i@BxopGSy{;h48a%ak%fccifQD8ts^^*Q4d6H z84dT)wAtPYjtIdf>qP*~a>GHCcxduI%aUIquW6VJrp;js0dNBAymw{s0n@w0)$94Q z;99$euV2+tSyg9__-~d(TC#D>2_A5WEjuemwR`DU7OId$R?FOQV~dRW(_m%~^#(R>Qvm_H_XwvHScRL=wdp zPCtRo=zP%3WT+iDt&4UaFf?!NPLjyuBiqcbd0QT>O@Hv&G1oEAQ3u!TaRAHcS;|;b zT6tL;?hGw8U}p9(<7+BoX`o~NX#$igm$%ry?rN63YO65n=K}(PL+Vjwp`BL616jdl zlt$_|Jbbc^ehyRqaq;scd>Xnd$qPY8$|OCSBkOaye3&K?FI-D#QD_BGn3os>Ef!y4aT)I;rZxF&c>Ld11(pUvqD4H`8!%G{9@;Q!@{gMoc*BLDc5>mTUaA 
zo5K>Cb27mpwrxE@qd0hz(@Xy!TMe_GLA(ZC#ezVoY>E~rt+@jOC19F(V&|F}Yaeg`0mHS?mWH?j7>;|bduW=$GNtYIv^7n#J2SN5iX+1~ev4=472^8JU{ zXMVr{O^G(}(;y2$4@c-Y6C&dUv5TwNj5WyGQ;c|jh#zhfIK+Ub0%(KN=`<4j%miWc zJPJS$_aO8@@C#sU5R(J#$L%8^Bmpl*a0(s80BghS$~*WeE-`Vt53}#d#1{d8kDvu{ zAn*o3l)EIDLh)(c+1Jh36G3s#ik_C201^52VO1n%r(eJmJ~cS_CC3Q0nJIKF769OB z`tk+Q9v;3Yc>U7N>#f$JqM}{mKPBIA>w9SNyJs@Zg`Vh$Dr+stH{Za{0wFl0dJZ;k zB>5%a-lX{F>2{;tp50BJ#2I3B7|w@?Y|icsG&cWmdeD$0g^q*}4;c?F z&iEIA%N@i_&Etkos}a--h^@xWuepkBR)2o zh|8$H4vnw*7=r|$S?daYnGwg4kGLQK*U?iF?0Rk^j@-8aJQ;*xYofBv7I)1<{J^r& z-*)w6SOg57vcKH;{uU+wUeoH52h*mxV5dh_1k4u=Y{fItN7N4;)3LB$zzn0O;w&&E zy?O#bARJe7oF3h&ULFG|0dR1D?Ox6*UbOKG^Z(gMsXxU!V#T-id>;<_aBE zBFAbMIEe`_*69Jp+i(FhLi0Azq#$w5>@0wVf=OLEs3FvrJfwk|@z-#f5e!g)U>urG6;I^Dbp$D6*E!JJm4{W4{d6C5`)mjDwl zuMH8PI-`Q*{)EBQr%#_+ad~l{@z|AjEbM!Law3h40iXixHyRZ*YS!&r!|E1!QKa@l34aG#>a>`&Tl+f zsOZwALb`D|63yBRA!B#`fB{L%V)c)>j|MA8TAu3PIU*w?6VTGK7X2XZZ0jRQj;nK| zAPuR|X4JH0OU=$FfbFA!i!4Y-|JZGlhSsnHOw$8Kp4fMWe7MlhyZS7Ojs?lb}6CFKTSz{G#q$aAnah-Ph8wU!}iCjZ+>nM&^vT5%o z^HE4jQA&e-Rp;N)dcvJ!mPbv~lvrg*4ZanOI%~9g0kwK2W%KUIhNKuuDsl?Jd2rH{3NC>T2tYrm+9@XZ3 zygYM64Po4?3&&mB-evKmPX{QC*;3CJ<`@Rpn;C_Y(6(UTxeBu0QeeM|&_g@YAw5I^ z>`=3z)6MZ`w^Q7e45?pgHRbfv1{)>s12a1g*iJWxL73eE!QN~3PLQRzBj>y4!rpE! z@9v`4+=8x0af^^EaB+5al{dJ9?)^ArleXM>@c77gv=J_AR~(}OdUjK>}5g0FZ^|n>T#N{XTv%785reVd4+w`Yh;)X zb7@J*b*5>te04A?sqkLFTZ6<#`^D_bsnLe>scRCn&aHj`xV@2&Q3hBjEo~SqV$4U{ za6$B`ut|F~IaFs{l&+o0McWM0lgk2LN3SY#4&Q zR?s3Age6`aKeqWb#{qi{<^~BgnSo3CjctYDBf=no)SfU*4?!LtnoWeZgof{Tgf&7~ z7BbCHM~K&f5hb&0WB;c(D`)FJn=ko7Mk}qUgq}Bml*84$#XL1F;A(;a*DYn$lP5Gx z)6Fhl7HYUBI1{xkGO!EiqDms2K=h}(%QnPaA~D5c(>gz@pc^kbC0$9(d=&wm?yhC? 
zz4Z-O+{+nILf2sdi2^1i3Md0fQ$%atTE})fq|si0vhB09P4at?7SE| z!9dTXop3v*Dprv7BYJp?-bJ`cZKmj~j6yl<;mQ{Sy!S7Ll5zmQ{03p3$viY~PC6w4 zxB!Ju_s|aQ<<0AYS8;igb6=4U3;*tj(}}ty>==&%fEK?@UoVooXz?HY8#4jg$mc4K z(9fEo4;O^;x0EFeP&{JEvPRo}iQ@o`22;Ht?Ly}>kEW6)2P^}Ar4^LJxU|jIa9(&* z*A4cPZMG*|{<&%kjyi=zv&B5JX&kx&ZfCm!DkURqUa`u_94Z+0-ukG-eHNUp@IfyBxYwjB^90Wp}&lrmt z?pO?&ZQa)}$JH-0vXCiTI>Xq5X#+tnjjW`jTrDJvps%NQ-1f(VnWWNe$UVDMqd%V> zY_^9P1d`*`(O}4Ugwq+135Y@G=cmkKz2iPVyJYegC-?8UN3PiVO=77jDJko@yt!38 z!Z+!_@NqF>hdCdoUR}_crgg!v)PZ!kX<29V?;p$cAjIqu_$kiP@yOI9ouU+&pBG; z?7(abqQ0Tj!;B2@_q43?eD{w?W$5l7P%jC%p!pdjnpy_9S{iQy#6h$Wd0{k73o3pg zmxb{_+Cw9k7ucoW@iz29q}?DT&IWNXCB@#^C%%Oit#X3Y>AY+;9E9r?r=gEiZQ%3{ zeFBVhN$Y}RyTWQ-o?7|3cQQxb}{GdzZx3@%ixk zx*oLH4Ds`ulMAz&W|V$82avt{mAajmBNF^gj)?BD!RcGhC+83N*SSpeyxq5+a1=uJ z{0LzSb;n?-()J_U8buqW8q>&3M&*M*oWr>&B_iTen24pPfso@=fY1lKeRwkMFz>YT zHNvzm@#~kSeiMV-Fn&|#)VV-$DRWe^DFw3Aj4iFw% z_1Mlax(lXH1HwXavunkG(QSQcW3)hOFxSQchmcF(S+rEtijh)MMfPoX%cJkQf7Av&L)!DfE!FpqW5hk5&5ry0U&wfo2ngi+5V(& zNTbG%_+z(p9sz`wV~Puc0!E&H4--_;6o5_Ck)brklI{~VD~^H;;;W~R#@^#L>YoE# z)UPkO11bq}1?STrSCA~zA(&+v2ePnKy&ZMdxz7^ii{I#H!w@afA%k}A>YXa zQa8ZS>o+Mm##KOlO=JHOLm&c83=*VqSzazkpv2fWmpwvghrJsKUg$x<^g=j6%Vb8g zi~!4?EiAiHd=trCmGn@taId4o#Y|2qK69t{<8X zI?9I}DWZ2S^z4IzTxgZ;Xkxi7VvSbt&AL zk%~JJ8cW^P@HZ^MU(xy}g9Y{Pf!dg_h|TY_Rcd*@drO{~?Kc*f9@7c~^({TEWqttb z7HzyXyp@BVrnxP(Fik^CF%UQj*x_#d9+V=LOa=^A&GB5vd~O)H%2n0G2e3`GoGGAQ zU_1Y^acrWEZ#r+JKmyzBEQn{lX6~k1JV;7G$a|vy6$E7xD3Y|iRW(lpw*rd0&*qa0gZesnR6A2 zuLbbiYv{URCZqXiWy@T+;xu%RFDk|bWUf$r({B1YP$dX8$RWj#Dffz<$58kvgYdJB zkBpD3GXf`{9dV2o)%a=ds{WN`p20h!%Yi1PEkMmE4Xw2^e_D+JDBDP=X#lKOn1E|z zx9CfQCxWc}KD*2fyjp#k+y6eTT|d#G^#2|yTi2QINc?tWirmCUb*ZFIb_M81!))7_ zbh&pjr!1bKdMP~_^$8}UiYmRw9d{X^`lbYwW>V=t+xI!0$ga=ji=|@vB0>z*VS+v< zp9=`obs_u$DOsS&@i~NKtNK^brG9d`3$s-ny{h$3pC7!`vq!%{B#5pUmX|k{^(g#w zV@F!Uoy_rVI0W|?Aq5>T8Kk>OV)tj7CgbC&O_s@Dj+>)`Lt_Vau`zJ-90r@9L^kwvXt2P0J zX_Rm7GB7L23FPKuBs5Lv+|c{XZz*b`aWClop|BoSsj{jqfS;KqbbvZ-k8`0qKdWNO 
zFu6g=QL@c~8R`rgLio}uED*f|ln67A?)EAC2X!D+^QK+A2Q|e|`a6%_-2k{2TBZ*H zVSvH`nJ_5c6Ve4b91fttp@A@D!soCdQ2#xLVrF2jfdfTcuox6zLWf7sy8T63{;>bW zwR#}427&{G-leZ3quxbx#+rnLeE@ly;8$lnu?{oVnw&s@x-I7G%%B%pKaqj6clT+UmEwoi2v}0D+gD6#mb$@_T(|!@Yh7hgrS{QEKZW z8c(3HnKkDq+Xuhldw}f2+ofgjENbi#_;e2_356O|7<5#X&8dG7(e!1y;&2}M`wdiO zfU1Fo6cUi4jyqoVso_>}_KK&8S|_0n$Uu|m*aDTkzer?B`qe;b+akKmqF+6Ti!_s1 zOIVm#+Yg*V+0?jouw%uW@;sVU$aX!32!cFF43R*>5;DWc`t7O%yGr@*a2_B=K|u;2 zM#q%m#bBJgpN>+@0OXptAAe0ZxDK*n{wAy5dqtJgDg(=-$G{uZL109pomPa$KLvB; zKn2tVRX`vM-+(BoK{ja#Wrh}^3#{Z6EUyo+Xxw){N@>w@ez|Ue{&Ft1zN4z~T0d(} zHck_bA)MF?dJ+i)*3y?wZ01Uk`-?Uf%EcD&{K{bgF|mNx4VL;qC=QKz-{mLsu`PFCT#Mjn=dWg`m zgeG)+HCzD5QBYuo2pwn6VckD1{>3TAnrO3V;#?9ttNSv@_{>%zkgYLG>`X!z26C?8 zu)7ittZ&R=2YGorAPhbWZGv(Zos{M6&IKZK)#Mu|p!Dyy-;bACS$1b}ou3y#hkr3i zM}r^rrJ}+Cw+pQFGv*8Y=xNWfV#6&=p@`BRx3deldrO3BCSlnMLqXv~e&TvsUXrFX22KN{CF|TRFKw;pBY{dvDCPN$ z_TV2AlfO~j{9a~!$D2nTlmyM<0-9Z$A(mdX{994qdcqa`Ea-r27UVeo4;Bghp4`Dm z{SyK}bigzwXT$vZ07O~B-nmwM%l`@lm^b}@xj(2Hcl5S7Y58i4U$V?gbG(gfL| zrvD^6v{1yWmnP{?uD%onq9q&Re<0H{^iQDEapa>LCY4)JrGoLU;;<1$<0r`0HN- z2lvB@8*N{#;X{>N&egR^m4c)tM0+&Zq%dPfMKoz16Qh#>KmlMloNkV}{`=Zogbt}> z13I?nt1$U-<3KhH;RKC|u4HmpEI9esNZ$KQ4u}|SUK}yMT$=_XXlnN$q4`8U;CT@) zvnALU$Wz`AuDv=}B0$qC&{UU&be-!=t&5><%6{D&fo73hgqOv0)ksLKjv|YzNjx2) zfVdors(>EqmnZ=$Tfw?W)4D^0*NY&6KhQJ6Q%6Vnw0h@X#AOg6APfRM_XH#qvjs$u z>_j^CUlb65dcfZn5J3(D)K(H<9X-5B0^zX3m|YD)(TMJixML8p%+(&!RBV7x?vGph zNjqEZuGlPm`LM0z7vTs{WY3nY)AGhNMKP3QPF&_%uybAov(-BZ->VR`?U}01%}xh7Zw=ObmSOT zOqM+*|rB08lOMc@ov?569UR`1hrF%{8x68hFs-|_Zi^2!S;I5KQ?44w#+wt>D8o@0R zKthd#EAwutP_jyH61V+GAK+W5h z23TiM_^BEc0hS*(<^0-j6(NQ@j8d?P6l865>!bMRyWqw`wS6-U4SD@Vc;g?+1vfR=S1)iB0l45t_uN z|HSW<-!Ij19&Sdf-Qx7P*IZ6=EaAc(=2Ng4olxi7v<6-vLldWi&sve}$f}tdszPx6 z!(V0GVVlExJyhAAp`z--{wLAirfKmo-_sdqu?FMpxvFU+yu}7Mf&0WK(I~0svg-c% zF9T-mWqLYdFIg~)74+^P9SbtVjn-F8q`Qs2V%u|rp!0T5Ye<|V$I;R+)e9)xe~{%} z`nupgiP@Y|bWFCRf+bRDxO5~$Hk&wmjjEv9Zj!eZ7z&XjNeTAT^^T~NXM+qezX zI>Ju%p5#wkTu}8tVGgJVpCb?dC(QYuFlS*|<^NBFIW)4FUVk=|wgI<3KpE7LoePR7 
z=_IxMZaT>gEXzU(^HN5heT!?K0AcbQ4V9+hMyAk(Qfw$u0tHL{^yB~_{k!{e_e=VA zkCK0Llf&G}{d?LU+?8%#ZW6qDw41iL3H~3%r9fqCr8fudegkJK?EVIF1~u?9e6oQ4 zBa9rh6idG=fz1YfqsUy)0U

a&Zce@}3GBw43-lc^S-~&)i2Gl7t3~7+jEJF%QZzkadcu?H~voZ9m z9e`enb{`bjhP0q{-X&FwL_4&=3nq4fCRlwouS1U{k)&|vT8S--FO!-X1pQiFV4z>?R16gk;0`UM()oYaM1hOx7T)lJT$c3z zuWO<#xmAL7{5d1@#50U*r;!p}7l1 zXvKfzRF&V{{$g2#+-?#^)sReVCMl*wsw|FqjxaV~bfMfl1@Q3i%7MT| zhSP={G17UbaIMIlj0!b3GNqIn-0YU1D(0PQoi zTEAW_b?hg+=5t_>*+ARla)cammjm#7w8~SfttI--6))UxVwN!B+oB6BrrrnG7Y(0_ zH8+_0POi$b3j9dlTN~G!g=3Uh=x|?Z^51!5%?Uy*}-z&Ci+__VR_cq6V zc9GLZj~{-yBjY@=UFdlLb4!4hn9w7!gJ|_hpJNy)(L*P0pS*!y_2@RgZ1mNDi(*5| z%l$2MS8Y4it1C$uTDB3xeIx4d(9+uvyhu$YUSUX$gxcS?w*N0<1}#$}Skvpu61lGqMl!{p?-bcA36nc<%PZ zN8tl@e}%HPf@ZuzX~UEVQ=H_pJ|pfE2?|X3V+1wAR<97XL)uP4UK{dxZD31|E5lG& z>he%0afM5}_gxDLy@260|D~xn&&Ip>4hCI!T!HcRdFpdyi`>(@QR8Y`M7e4y_$9t;Qe3i^O-R7;0p;c+*C>ZjGe@Tu&)5^PHuEhK&yu*)uj>wA$ zuyWvAL}%tebC-bJ5e~fk$lcx-BL(l4$I$J51(dMrRj#ewI*KR^p%a^t%7;3UA`j}cb>3FUFFpeyiqegV-;j{HZ+w^oD+ zVb+4#TZyTNR|dn~hROA#J>Q}WyN=B4KeP`>7hOg@k{F!6#Y36YAY_6|a5?ZRFv2zm zm7ZYtli(6YDkxIIWrP4*LtDCxw!6K9Y|f*v<|ZKtOuXL_YKm3q6Q3jej$>RSD!l<` zR$(fJt?C4sJvqRDwqZc$^A#zNAOGomNk~ZVvkvSuTs1S1Ti6K-pWRT3Orst8rtr{* zlh^7>rB{a8?_jB!YX|-4A@0M;GiE^7L7TYPGlouWmJ+0gPtPh&x@S$?Ms~u6{K{Iv z5W$xHf*~rO<8Bl(kOd>XT@oxar-okh+8(QWoG4 zZq@ZN^YEq%@oGL)J}WWa3n~w^MsS=8xry!bN4TmEUgX<=#Zru>GI!_6m>+M1;Xv7?r&XPV=6pc0)$k$-+JvYIUsX zuxp*c(+|6@1$_ob#I5Qvik-8CYeg$W*2(14YuOrb8mB= z-!<*-Di=G4rTK6d!iVcvYD&fQj%Yn8?FDDJ`V51CBW6tekr1nok3u@uo-aV`RB>&q zVuG8pf0~zYnOOPhJ)`>6R<8cnPafk{q%PP30vqBR82>-DlzQ7r(C&}Vt79+G$Ee?kBTjiW_!W< zbn-+~!}OrO%S|I^8MrD=6O4n~T685y9Qp5!`hJ8P?tX|I6x6rF&p#a(43#Gb9>2Dk zxkUaoFl(fOcgyhIdX8JvzPbmH4%~*Q*Aa$Y z0`AK2^T<%1F~nzt9!5T)^@sMn_O0+xW;~b;n)LyD*&@n;GaqT{t{9^pmld#tm7w_t z!a|sXhrm+SZv#uofgiBqz?)}~{K1!M9vYM^8j>?15~ywRXNJxk;_TPl5(HcW z_1JOFDof}X9FGw`?P4ln(usC z8Yj|&Y#_zn(PJHVN}80D zMJJ&c3SS6e*rHJy`;hZ-3OOJ7eoDi2=4M&bIk70UTcgHb-+b;AcIja=3xD18$T)9Q z2>ML;?#Q-;DlEkSH3`hv4x)@m&0~(;*E|X4Q(k-%A?8z~ZPHJ(sBy6|`Sm-Yi_o0@ zl2t$59D%i-T!)*))oD5nx-`^I@Xr`|o0{pYv877(gA?ZDP~23_+2u)E<->jD@uS>L z(EK-jI4(rD{xT;B?RT2rJ&CImztzqTkXgAzL@+W0b~uFfEnhw;8jFZHRtcG&G49ia 
zosEqV(pUSs^U=#EkK-nF<+Q={90nrr^LzuKmh>(#~qZL-7Ii@ z`yHXzRFe<9@}j+ftN!xwx;=ZPwB;Ylj`juO!vk+Tdbz`*eJb-x-Xq|ob-qSqjkJ%h zGk0p!Rwozsm_#x>SJ5Ew9?D7w2N$+xiTGgoK*P|dSHd$#Y6Fs>=`c@t4SlqOYk(bx zg~{X*)HUEU-rn5l(uSjyx5r{Tyv@(-pZ2XvB9b3ARcAQ$R+&?re6Yg@>d5tD&e!XR zMAUSL(xjZTf_xw1P80IY`h1I+yMq7p=CXSTdd8=_R(bdo2YxI8)tGb_p6sz`uyG#< zL(B**DKO{{&<7cI-8#zOb6XtTjCdWmiH@G1EjvYbpMj8~cP;++6*M$uo*4?fl@2G&-< zi+RZhhE1+ftf>y10MRo4-_x@{_AD9ko-QwjQ=`#)dm2~<17>`8jRm;aWbiduPhgo5 z%WWZ6|7a|8EF*%lB;>$TFxcLP05)vZawYPq$u?8Nl8X3NaLF8;@)liakiZ%NBVpY% zYT9J8h4E`2+*;Y8drLEOolj2ouV5ztmqz){H&rE}P>u~8HBjIK{=xeiKpvcn-y1iU z-l0HDE)QH|=ipqMlpK4liDwC8jmR=QwX4aQ+`yKAdgH89>;^m&`aEb!_Re$*4AF{F&95Ve1FHYFQ^Hqe9u@0mQI^?qqgTK)Wu> zxxLw$=-5;xuNWqycWVM;;8XsiRG*QEK3*3ZJy;QNvesyqgipv#{>cKR;XvDr@fLS; z#V24m`}*P?6u*HvD7!}Ai1TLU*#on_*pmX)@*nVVm0V*2K7CbNcN&zlMZb1VFGvV0 zk8hn8_{}_=Fu#&RzNNXvSZZT3D5Q0X*yl3U=A%}dBvRj{UF3mc%x9eW{Q-Kz5v|X03@b3; zClj7w{_ktSX@H9+Vf7NFVqVrQ{pom+Bt0U<{!z^Mwb8 zt+rA#*|6i22KDQY*zXVu8BSFmt$)g(yd`qSqpe=d&6VjyXQF<%--NqF=J=cZ4DhWL zLLoh+`z1mqOZS&7_$Ju7_Nvt69QCH@(!h(_T2EBiE|sOkC=tUg!$g``~gg~-yYf$Cl_=HyMwnQ z54YuOZc^3`jhzGi2kO$Bq<7`2^r<%RJMmF2CdMAVfb&?*@?o87_FD&iTkAwZoh+MO zsl{1x@9Ns=ZOq1T4XGINwp#JR(UtiV-@Y>jDD=m6#1R{FQ?5ViC|05Nsa;6_E|?tK zntemXWzUzC8)a?cSDGcp2I~gz7p^ucA?{ks)om@8VEpxSK5OJ@K9PiiQ11pkTXEgX zt@nmmNL4kVUY5_kEy-ZvF+07a(V*!5t#>1@6pv+gF{HDNkKP!3e$V-&ZL?7EQ8jCv zaPFjB-Go7-k5kH^n48auYMbLVmrwMc_|b2hoa54e%=M=H`pLdB`LX*o!m8`u-;oQN zcxo{G8O+)kyWfdxHgyJ#R$b=)O(V-DGPmEj`*X^PZL$&-&v}JJR5W~G%~@-!)uk!y zcr~`{+1)j=^u4{F28abkL}lx>&p>Q?XFGDZ}vZp^uNs-;(ism1=K z6Dt4oiYS?TY26*R`vhOFWs2FdVIa}0vh2w1y{j6%W8_MDI2>~~U$^i-myG?EmfaX7 zt^8iW^}B>0N3jJ3ch9mkYSOFqtNFK09iR33w0in&y(TFA7X(K3n-rKER9uv`h7&ysNZ5mek*ffZ3dRi;YYrBQXAg;m67|2_$2K=nG*Sz&=jECL zyoNJ1)TrCHp!4r3$*nvdA>1)3f30LJ=yb-2=~KR9gUw>+kH0J7t4e6y+8^|pSukdG zr0jTd|IO_Uq!uwpr;+0v+ehQ3sGV=mKT1mb~~`x zwU=IpggWf_Je@ySd+HfsZ=|Q>O35Dl`hdLD?oqVfW{JDp*9|r~ejfFZY$-{8z0a_- zSo04)aZgDb9*LiE^)apY^6X5s9_CNBnuO+uz3?LU9)DUmbT5HqpQ9C?aLh2**lWFeZ 
z@~)-pZ|7n^rleoFl@-O5uG3#`6>{^;#MFELW1Vt8OV*PPF^-k4`ty^y8SlM-b#1Pg zAKOw_bKymsF7vFTTy3KKKka?zTT@%muO9mm^$1FlCI^sCq*uWM3IZyEQbLD75CVjj zPy>1t6eUFIouddS5F&)o0s#>SEkr2^AwZCTlq4W6lu$0d?;mi#+>dvE*w0@3*=yFC z-#pLEtXXT%$X6{~<~__;v%Avm!)OT=(kn9U~6`yWHbFxQwvmN?n8aU30BGJEb~~+3eN}{+nC!G#u#<3m^8nQ&{l`*NzfDVDME?@+;TP8PQ95aK;neH72`G`K_j>R)A?E3b%>kWOSq_o)#$NSv^{2)$EdOy*xbwIG zTI0bogT!<`iSo9McGF{ER`*bxiz&~5VjUfzYE1)KHw{HPRzW6l74%D2^g-tg(S~ma zRHVRnEz>h`qi(%BSmBFgya8{mz_;URC`O} z70E>MWi_o7uF|tJQ?lj@zu{=V$hP#7aeeU1 zLN_$n5G zrsx>det#dL0;~_x&2$^QykE(MR%|EbO$zs=raZ$2Vb5q-^H1YIvM8nYY;Hs7jh2Du zElLp0Ie>-u6@d4KA$0tM<7IFz;B3z2z>FDH-eJTT3YOkd)N?(R>blNWiVT` zAPCnc$E<%TWBWcyXlCzJsOz1R!-VDL@J*wW{!zc2#x|9Kx)meK-zJ#0PcHwF;v;pZ zN!Ms+4YVNZin|rXjMY#gJr<1_IB!*#|KcX>S5Sa93*uHmybgf`=R9{Z@s{7J#JOto z{>z;LjHf#aoeZn!Vsq>`V~@)F;?9|&mO`K9UCSDHhOhF|zg`R%l;n{3#aK?w_Il(c zu{>PWjS^tN_J5yhl<#kV&1F9U5mJ_LQVz<(a)f`jy721;NF(@IfasVHCL~1aB3AfS z81^U!D18xxG-iGo<2!&!6ZHDXXLXWJtb#PG6eUJd-$+%_FM&Tn{pYHFg%3sjAP-#8 zaWm@-$I1A^(;#^xWcB-#!cD5fUM+FW50b*FWd6m9a!`tADsRt@t3|um8Wh&ZZ4M}c za$rBk;Py@MWJLKVO}EoTZ5=S^K{b^UH%mKd=-&f z_V#Hk-)At+Lj@leH$1la4!03>#bbkbldty5hGs>7Gowy$9&`Ky3u1J?K{&c{sfp<4 zd1Gm@`a$&W=hVlJ^{M#@5VMWZ)6@!5aRuuu`s`6kl(dJF=2(m^6!~+%_oS8_E{vL; za|z%4-4I``hH*{fUdjt}YITnu^PB(ap8s?R!^+I?@x1QPa=rsH{7tJRy;9<^7~}Z@ zxv$8omd?(M(TIuc>X=Mw6l>vnNcD<`f|0OVp_9NQUhA#LKCOovi-g}rG{yh zTG#EkCeIO*g-)(oICvGX6Khu5O*O-(p{|-&lHRL5duqpuQ4;!$6{nt3Al0A=1q2)h zjpMWW(gX9Yd?$$m*Vm>(_e?f%xRe^Ab4A1))Aiff&JC0*BgxH33vR!n-<4;W5dB1qtL_&e|C-X zkGE4p;`}uwSAF~Yx+&5qX6>tRvzw;z{$xSg-{5FvGeef_G_x%^hMqEm1nCi3yU zK@ _p!iZt;T)ir0v{q$bCJ>Kj(hfX!wiH5!S>PqbBsrQtqA zSLMWllTsr<91MCt?AdahKF6ulD%stPQI-c1Y{3k}${?cL-X&u&ir6*S=}(qNVe zR;?ef?(3B)y+`=)s%&23O_#}tfr`8#_$P4oh>`!fBbE^_6j{&uuYCDx@k1snA{D6e zrgGdu#HsZ$gmhtVUjJ-ZJ@{ru3VN4Anm7Q+3ZtBLlxm*LSO$v-z)kWpTA$qP6E)ZkN@GlkSSzmpl>>xtp9(D0Cs|NpJsbT5q51eg9`kh!#FYcZMJN zOu-~VwPJ(a#3s|cKcZMYZySCFZQ7=Iox$E0I;JschAE38t<8)!MApqU8BdzK7_Ptk z*f1Iy9FSTmxaOuK_)(b|;*u?8EGa6q6#Ug!BYmo}(^PsaKyCFJSvd%5Xj@k48 
znrEiowElP3m^2r?GvVhln5*Stbvmny{}@0C|2E1MHk3yOs+}TdRY3SgzZdm~=D+?|-tFd2(SS*Zveyj|@1>K#}C10~+2a;IV zmw+!AW9eOIOISDk&b;Xg4KbLw+44n?8Tle2h^_AC$Y-b!>T;(Ys$_jNC9XW!V`6KX zo0a-iI`g8)-Iu-qbGX1-8%YrwS9vm6nt&PW!Rzdb8~09*8q}&<9c&(hq}aKu+r)GP zW*uo>ZGW}#@@Q|DXQ?D-6eV$Pz4&(EwI6N8&h(4_e1Fgm8H!BXg$Kl6xwAu3{IX7~ z66)<`7L-{_xAiW==PbIPp5UF@45QLONDze9@@CrG{n*pq+|)AKHX({=z-a9aJx`RT zcqnENluPEeZ{PmgZAkagyBboh%PnPo zq2v;IuTCh^HNeQR#pw5@*hdi}Q1D)O_1Ep$kFYAm09I(gzv$4af(y#lp|cS+Ga~{0 zjh-HeNuW}-eH6naFzG2!dZjfh%)a4WviR}sG+#XJl*?kW`C)$t`*-ShWL&VRN!tcH$!x6rGn;LDBxF=jy)K|(MNivs9px`nwbU?qDfM7xG=ylgmr+tjX0T)qEv?jo0-V zVe{Fz8V9@>2958MroJ@5^yg;JwSJH8&^fyZHnk|+jg@o{e{`nXOx6|hWm1cO8XH%D zHlL-vV9lwFlkco-O@^5LTxpxL59#6*V+Pc3sLC7zgDAcvioEeFR7Yt%6|qUX8KBJ# z?CxCi<;;R;p~`*Nf8}K@*8Q5!0@h#10$)sVDP;8uDANMAeLQ(O!1nTnfRB9?%1?d zQx*_gJ#;)}x9P7enZ9emmjoiwVPD%Cyhn1+O+>4guV}WFUMdsXxn0EPhALQL#K*TH zO3I}Dm#LqUUsKSV5y>r2$L@4XEH`O(zhu_#q~hAoHVqKRH9uCKa*pu=+czn=oQM0GG8hfDY`OY8zDhC2Ax|S7DRaI#TVKf9>f)Oij?0)4hUr2di z(|zpugZ;r1H5(MND|qT$OUP;y)X?0xS0HmjZDRBFJ%#R`oe9GjX|}pSoG-1-j|R2Y z&US^PzDR7c);f=Rv(Ax$w%Z?I|12&gOKe1V;Tes0PJdv~V0v#U9$HpAK|74)0f#(S zUsW(_Wn{_Lz446>SP)~HIx}O#l@F3tYwBYs1MBXOmsfoC2}DkHt;F*$hKLMomOORL z@A<5nzr6NzFXO6=Rr&fH&kjy-Urr4d-!{Rx3QSk0Q1_UOYIh376HfZ?;u};nZ+(=w zuy!=E&TNL3Eh>r*L(HB48q(rW!9 z<8s4A=g}@DI?$O1%W9I$2}gHzmSkN`dfizKB0iJOvn_(+fiyUtehZu=A>jN)(UO9Cp}Ut9RniX;;dqaDXV z?igGLgf$+F4C%>+VaypH$cx@0L?x;);1Nb|TfY&ff0(^yy}Fz3!#+1miIietSyfBc zOf7JK$Q6N4h#O$y!u!y3F{^n$*68h5=F2klk#jGZl`0VI68TDXTe*EvMxGwX-KhCb zmLB;bA~9~fZH;#2qyXbxeV?r`?Vr%^{QcA7 z36R^48KNYt>ObpQYIlo?>b#G7OKegx6l@JL6pS5=uGKydcjg(VD+h?VsMt z@cbY}>nr_`Io|I5xWGiyOkEV9#Bq31jKFOAN{xEw;CI~yFUSQ{D-C4U-M^IZbhJq_ zJ=suh1KKPh77SY<)wEuuw62$5f8(ShAsJhPD51Ugd_q`a2#-VEJrT{*MsRwAYV!uZ z;U2I+d^VnX;Tj%c6u8k?wTxkICU9u8PHHTc-ejA9EO+{+eQNy}?x?`e5#$wjTaO76RCVrb+ zMAVI8H&5ojewVvOq&-RA8K>R@`mx zM~}-!3cJp-#`q?brSS5d0||M_uXj7WOm!Y$(uzLL0PFCDWKjn8%8;@D-lt5uNJxq> zZhyNinYu{Ir>x2Y+3JvF3msFP)&|DW{r5!(?HU0R-T+F($HrZqZ`S>bXP~<9AEen8 zZiB~8EYtF2>L2gHrNE+WAgi^&^6RPG62arM?@d 
zGUfIt$ZR8C2T;o_fs4ctXfWQ_<^1uP*bzypX%Et*##1evfC(6?#M|dTH_xvUJ4Myg zwj?fnTQU45hid1W85(SSZ^d80VU`3O33l&#rXyxOp)1(XBaC_Ei`;6|-bb!XDyR5H z`1J|T&B1I$)&_XE*tM-2LF8|dQeEFmWIHXHS(fTGBF|F9v-pyQQv5l#y&~yc4KbpQ z2q6FP4H(3`6Sr%K-o2$uNP()m&37SQ$BHjdwxuDPG>L(W<~(6QdZ@YSVaRm3zn#q* zdb_Q0jB zy2jy-k=BB4 z$ml;C?cO$3vl7 z(%$awE8wtuSANiMs5PC4(*p-kGu5JRJ(V zM>FaXkvxjBn5cF2G$8LV?)5arAxTSXe&9)E-qP(N7q(IB0Qbk^fCAWt#^PJ{(hA3} zyk3n_P>1H}sf-5;UP5(7qCWY0;m(QaeDGrdTkF{^U6D z*op@MpZ42pjuf*nkWHVh@!RpI8TDd7)LN`y;936!B?kUZD(8GU~#cl=207@ocTceAzQizdU@ zNn~~d?QktyD9lR3sC^P77+Wv823QvdQ6}y^$Esn~tQc=cCT@!pOXOFXEun@gtB6H` zitTgHN8_ZVT6&E0>R}4ND{-uVyL+8+shf=T*1ZGCFy_^P-K98Vou=iQ<6Sm1kK2?EF_`*LGdi-OU$zV^8}pci-&!PT{P;r4V~ z&--O-Qm(1=nTPZ#d=qdcsbSVgRGI?PW`T+EjjV{7S^>7CS}D-F*fb-TQ1bx!9`{t=l= zj9v~peu-6Kyz~{Bc<)EZft||s6}s;sSIovt;Q{^$&`w)rcyo9>K~eTI$T)@JZeR}$ zWoHbi{N^d`TeLuQmOOCN;+@f3yO=Ncm6Fm)SCwVOc&$pZ!9uuOHe37me&DNQydfIm z_$FOZ9{>9?++qEJE}zG!G*yd8^oP0TsmDjOtpnd0`%9$c4YFqj z`NjhSC0Cg;afEPova7ZBFg=4t3ONbiUDLHP*j-URD17smVNAG|TC88eQ1&g={+hxO ziZ^v`aumbLUAfB0NeVVR!fsT90NvkUvhL?CFn4@_c0Fr17`?p5BYZln$ zX;GwHd1F9K^2LRCxI&1CfOCe{eTM|%m;-W#T$kFGKigB5{ib}E3x4sOJx{AO$mZ1W znLNf9_iDR2F|{jrw(5pO7sb=ESYz-H%bZj!La!%nrX+_PQtIy)Z-pu%9*?cRV9?tA zskSHq-V&bdP)=^3Jxi~fHhY%Qo8 zpNjY!ae#SA34dacZt@N8Z}6{4=WXa@k1h&i;$$9r!daCoCYWjEX4`*t;~%xw?%{Gm zfq$SgDswmu-e2aUMJq6QYFM74hW6SrzE<-V=lieNq8jsQD)RYe&FUaV39HR(LiG>= zv=@q&gdv zW9-@8#9J!78CWUjTH{bUI_Q&gK$zclLV)6@J#$mX2P3JS6zFa|!1&o?#S+BA4=80BIi zR%ZUo+7eKX|7Xv2f$pi7IAnQnL-HYtb}xf=)4iAHcJN32exzO%4z>^}7Q;s7%$`34 zIGAOLa;ZZu*y{nj^QV297W!5Q#GTkYTw#3$`H_LR z=Ixsxl3L+pQl%NznAt2A{nXy?r2lNfqY+pnQ90uq6m;iE5$YpzQF(?Uy;?%c1!eKN z-HhsmX>!EoZxAXjyCR7dfS4?P8W^B@_JKeCGg=NuCW0EdCYb9(JI%ptl9N^FZ|RRW zQYybKEo%d9Kjwb!NyTHA*3cUsRwyZ`Kg!1;wQA(}VQl}OV=SWT3ra-*8+xd*gN=UC zDAMzC6UEDN?+8xVOQ^MI2u^s%f6ufYtz8^oAI&zD2~?Fets%b5(>QHv!afYwE-^5Q8ppDbA6QYK#Ho^wyMbM< zTf5Oz^8(XBma?;SI7RlIJ-&Zb3C+u)Lb-#*UjO3gR8q>B1*JgxomVPW*h^Si0j_De zwYV`en6QVw2iyXFzWABavzh<$jaaEsnBhTt0%Vzsp^C34+@Bvvx=6KKDEiS$N`X=SUB^?0$V+ 
zUS{0O>-wIn4hwpjAK16G-TN9L7*f#xrj2%}P!F*}FIy`D*8Q(>s<77r3XlH@%WnNR zANhj&@*pnbAcC{W%le0tJEb_hX5n){DG@;}FiMP3Ju7dfoXeIUx4bFWTx(S4X$&+Q zvZ&{fL`HX3^|@=VYu;{tInS<(h7}x|TWDnykW^M8d)Z4I&imXw1{lzk+%u^rR4HQ9 z=V*Q3C{YYi>~V8Lx1IQ*deg(-HWXvyh7j-^+-zs=XB|DR39pJM_S;1|67hFDD5@v` zZD?skPE`{sLKQ(|k6=Gcq|I1pSFFkJPR@UTY~j<|nL8s|7EC^Zt1o31ySqBW)=D+O z?(%j%j1;f-Pu_laV_;P#*fD0UGL-L&uP(VFa7HQ%Xw|rO+o^~&8 z`P(7}5VO@Jw_%|DPrgya5Vv8;h|$UXv7p;h!I&)B>ykJdV;E!xb+7EBM-Ll|m2GlI zkNt+^@uF{+z1w`97(PzvAHB0^0t!feqqJ xtWQ_Nq7@u+RUqQ_|I_dP-~L}B@T6zwK-5RMSJ>vL>~Fdjx9{JoGV%EL{{W~*q~ibp literal 0 HcmV?d00001 From f9f259cd89d0f294baf94e27b5ed119014ba2875 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Fri, 25 Jun 2021 21:56:34 +0200 Subject: [PATCH 14/32] Remove outdated figures --- protocol/conceptual_model_df_memory.png | Bin 21396 -> 0 bytes protocol/conceptual_model_df_memory.svg | 627 ------------------------ 2 files changed, 627 deletions(-) delete mode 100644 protocol/conceptual_model_df_memory.png delete mode 100644 protocol/conceptual_model_df_memory.svg diff --git a/protocol/conceptual_model_df_memory.png b/protocol/conceptual_model_df_memory.png deleted file mode 100644 index b5bbd31033213a83a89c254fdbbdc8ff9746bab3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 21396 zcmc%xcTiK``vnRI8y`TyhDcWly$MKfHcF_{J18yEd*}oUB2o>#1(g5-(tAe{kQSsv zsM10U9YW7NiGF|czBBj#>&W2boU_j^&$FJj*4{DfnTFC8N(M?02y_La{8SqRI(G^J zk!AgN3HZcr2XX`WMd7M!c*f~3R zSiNwy5pZ#brmo8{fIxRZkf)DyebUw@y?t&Djkj(2i>FdkuwS^$b@hX%!ZQ?w3OVCF z9>MpP3cBA$rN0E<;rV2El}uro>-D{RUpycAePpc|lj2pf z=|Q*LpbiaOdL#Zaam;r{ja_R0KcJTnJLc5nf!``2%+i&Khc73pdA)IGykdVgn(`%? 
zs8`?hjGObX?em*FfJ@1;u{{<>JBGe}i&9X~lNv6y$>-O+;%lqps;nGlG26%H=p1!n#xy1H3b z5-R1U`>fg<0i ztMZt8w>On%wu9(p94?YBaeJd0Du4QO&dK!Hbbl4`mH4dOu_3~9AX4_oJkX{L8b0KM z-DGJac0KNXYJs&&%*9;7q<4XN?QY_fgMPs`LA&w8S;Y10nbKa%`+;vRU5P@<1xmS2 zr5UAj1nl-%?X8Fh7zYE?daEy6uHW z%kLlPjE`4r#N1@K1+)K-rz#9QYc&w7-42uY3&v34s<5UHc`CrsO9y=0v7+1 zTpVI>J!M}N+Kcq7e4N+vxjTV0()4IQJB4-(ozbCb^MzIp&R#&%ad6-7) zz1mgRE61x}n4u<}X}C4e5<|=1JxP2MbV+YJ-xvO;EhYED9YkUR*!ef>c#nMbs!QJ1! z-8V2*8_F}9aF6 z6I1-Bu&67ZH_<2wm#vnJd5T~VwX4Sl2yHY-*V;XP{1`{m7I;waaMB1CZTQyb0Btu$ z?KHYkl^gf^;KCuXmiUeFk$}xP%_yer?lKe8)$s~D$%CIAv3-YD$NQ^IyERZ`<9ckg zd|uhhzG#CBbu8Vvc8D9-e9-__lmZ zin;2!yVNIj_w#4yT9r$a^qe9!oY#3cVtJ&kwN=!4w0i5q`(M8%W?9BVzOVvImi1WQ zHrwmuz-^Cx*+<(w4%}6jsU#YQ;!sZk!`22Rzfa0(Yi1>QIK^(_r>{v@xY z{MHZY6zOyyj^vu*H~TGuDEu8aGjWQMLHH_QHT7?A&(GogF@@QQqOvl$F2TJYJH4>< zgTwia{q@NkVh0}|otBlAwV(x#*mIr6VX951u=PmcNMAF1OQvq{uT#BSW zMSC2M)HFH|ET+^jOM4;boLQBkS?uoH?n++)ci2}fDkv(l*!i{6sshuspObG=DW&zh zQm}Em^PNQ^znLU-IO2G*C)ukCh`yKZhYPCb&YhEy8VFLF4kEWjrGa9 z39FmD#mIQ9)Zn*gz%1#9Y8%_ql-=ph+h`&bx`a^DQcw1;4*O_E;T{u_Az1rvw!PYL ze^%Ee2HROj$5@}aDC^& zuWpv1QOq}P+_?U_UnA!fyI$V`fqaM6$$s)m6=TU*=W^vNc_H$M+MVc4sj2L#6U zyt0;-)>J;6X-*Wscv5%q(j{9N!jCRq!j9!NBfd5@EfNSlV8hKpJX4pG0xIItgKC*K9e1 zwO?*pMQ_BE&plqNYaYJ}6p)$6pDsM);9` zg^5a7sEhmB18MSJt<6NmfXWL%R3C5yF!E7ifQa8bsI2Tu-^?5GzKr?$@Q79Fr&KL= zg<_L(fbjox)97fme+ilqwB2**3d7B1fzm99rR5j?N*L~9Y-5sC@0#iVxXTxtt^}j- zTl_o@FqT$PuNB?E!-WU$q7&~G{*gU-2E0^-iLUO8b@e+uY5L0`cew@A~-9`#3-%gFQ>70};ViH#L>6!Ul&-`R3b z_?z?RcQBPs{^gJ6IC2jyG*#8FB%Sq)i0WvhQ#y|{1_qSb_UKSzrgP(Nn*1oq{9fX< zf-)%}qaHc>`S$In@&P+CQHM6$G-kRgd4b*I4Xi5@|jKT%Gp! 
zj$0`0N)bScLe~wx-@S`HxovY`(vqT?E_L|Frl~04J|XbJ`pR_G#-D?OQFRmVdOpaL z(8l4}RJ(yoXBJICtUU*dsbn=ZHFpsY{$y@71qvN@aX0wy1xTeht+OYCe~1BXw!ZhV@A0H+z9~= zl-desH8CHm-^!!ql~y)hiZngiU6#(AHuK&{MA66}&9}eXTbW3UjlCfgIJ}k&kO(#a zHFot2OaxBt`o4^NxV!Vcdi9EMnZ9G$WP;^NIY@hQ))k0+~)4;+j4c9+XlanE z_a^Z*$k33Gg^+je(j&rhYvl=hAU%QFrRquND%?a(rrMcPs%g;S)QN?Xl7WVbrl#OV zWc`Ou+X^YLDNmuazSt(+1Mz)A>>bg64eaAI)1g&G}S`m2iC zWlpjE1;541FC zF8jz7QLAXxe!->_0rd4~5}$|-9n7t*=l|4^Up_EK+`Gr-!Ksdvvizl^xeGAa zHd$X^-&fbJT~oR=;W0f*XC7qp#z9_7ON*07KjH}c&KO`kzD4!hn9(Yw`oN8!n)38a zH*X>^H6A&>c6N3zLZg`qS3cxFyUSDXSa<@5@o6G{w6MAJJXOF68hn0idsh8{Hd8|{;=_lX74f|HY7L)&<(+{)8w!if-d&%_ z>Y^97Pmw3og)jCX54XQ4gx9;@&7;rONY4{&N)z**rHjS-Siy;W8$hi6lYtjVoUY1; zg0J3n>YAD~uCyy>&da^wnMe2y@$kQzGrlv9f3@V5I$L7FGn=OnY z-T&&+toM`abt1EEGdr zsc`=t{u4@zh&|0v$LZ1<$0~ICda^VWHiVE_mu6FX(D>)b**jrgsDacXNN(;|iPdM5 zD(6O6FH)9PWxIO3(y_1+&>@xVzG^OCmdJrtPvSk0K?3uDx$rQzY;7bVm!ADXr%T_%R5XdXjSs6|+i6O-;H_1OFP95EbFm;6vtyaW zrx!1UjG7(udZq2`k|@ikL5X;TWnINc&v}^R^wcoGFfU%XU=2U%J1AZsv5;Sw)8r6< zwc9okiM7rIrw-lW3qM1a${MD}qZdQUB!oK}%s)+)OhR6MWIcDQm z=(jzKo@|BMyG+&20Y&Ke%-=vQb${VW)r8+FbQ+!Rp9oeWO1-x+PpWu@|85EA6mNk7yl~R_qxB1-o&l*u)~&Dkk6TE4S2ivTk8`Y#R^S^0 z$!A%a2z^Su=0hi)r-yS1r~9jWczKD0-YCjMZ^tDMI%D?LxJ^9lZYM@M^pr0+(9cyGLk-P6C4 zJSm4TaZAAT89yMkzqStXi`Rua4SwpQhn{8f>PaC#e!TWEHrA>`{#{dXt=H=HM8fL( zU+?E2VPQ};s+HpL^R=+`O;cN2*a6l;jR8}uUN`@RrJe8c1~3pVqg$aZ8hvnTZf@@H zh?}%}%jI>WF~!AcCl)6B`k~-(S;3;_je;lGpcY!A!pHmc$7RMi>UOv#KRrGDHL+CD zI`^#3N8Szix+Q|DQmMW|57G=hhMXGh&foni!wFtd(voEw*0`<~J(cy*^S) z^vc4bql-2=f=nvx15f@o)=9aO^&{i2#(ND&kR`YQAr}IHlsK7KoUX;>WH_E4ITSru z4eaWvl0DjNzGsFSU#%RcN#HdQ2--&VZUk+IeRpDIJaMfX_uC5)0!v5?4an@!oi+I5 z{b18m4KI-EJCEJ2XpvHx%MDg`gR24w--?gayh1Ph7-0iwPH)ZRlhy_|^4XO^m)}e8 z!SO)b(i|MN;|lS-ISbPQ9N8;jNl8gg!@;0XFrYA`6r=p`r|t`#%&Fo~jiwCvQLfU+ z`ec*Kc;)1*{ZJn66k2C5#A#(|Ss70u7@G)(16ot|x=sP65k8m4p%?)(*Iy)O>Nk7Fb;pLL<`wvGb@+Jzp#KHr0b$9z%gN#oOD@MwL!2JCD z!UP9dzlq|nK`rknG8L(tcGgaNPWO-d1RHh(LY+)1O_A`xNCr{w-ZT*x#qdS-jIwbY z*`Y<U+S=MZ#$GEYAK`)4 zhrb41eE#nHaiNPdy+PgZGP 
zPcM>_kF&9L<(xv{3tj05)Hd+&0#a{RNm0>Oa(kPP*ch{Hz=QqKk8DHX7?`O(MN%w=OdeJEc)VS^azW+R0?pUlOleUu=q(nfQBk3kb)Q#mnQM=I@}cb1duP!+Eg|8S>}-0U;Hd>*P1L7bK>pn>wf)4qtM;EMeoPy4EW{Fp7U$sW7FbpO2F3q6SLBkDeAHVj~fikef73$W*d)T z=G`en`GV+Q-T&DeHl6jIonN11ynTBq26gZx^l_#07~Azti?+#V(_IFiBC@(XYo>qa+m2wN(+s%ADDyf zuGrPVTz@}wH+`aM;NB30zR4%6S!jkBx~GfcHqi)$9O0^LXyHbs)wZ2sam6VW_U!w( zaTiU%gMbTUYY>I4ICP)A08@S{%`4q?)QSpI2QPmoCwGb{(Bm~j$YmTD6%?3=w?D8S zoc|?@pQ^&I!LR^vmOJ$MkyQs<|9V|ZKsc~D__(sRQkIp(zYzvqt#;`O^hri`T9q%FHTzsd+&ccLp-oL+O2~c+m5m?f8Z^=-4D2UgKbw< zBYS!}D6UXZ$s^NanU7kr8Hsj5LwfSv>EZ+j2!mvRbu@0gs`4YMSN_=g6W=W*ytAyW zjEv7F2AcujkIXM}mL&9EfmeEu;>ZBiWyGc!W$xP>d^SlZXgtUX#JK4F`&?|Yr=G1e z?UAxV>n&R@-fM%Rj}T{=J%eOY8-U7(*EvtJG6QCpyNHNL7dl-QJBaGAs0ZS>Z5JwOnZ)A%}v{;ne>GrT< zVcOj+Bm_JuXk%0(x%{67eYV5ekDxX-@xJi213K6mW@m6Wxh!L16PrL=$J z0b7&@QZ)^0kK-SQO*L7KAah_FlLS}%YEee5GY+$ULq7SL3F6jC^~>$%S8v)N{iu#u ziwWZM)zKU$vr=b(Y3A98M;^;fdfz$w9-mL}Q9%?o)-J4Tat3YfB2UT5aibO3+}wdi zU!FV738xs|qFrfeg4gQV(&lzoQ52>^e7&!?DQR0x!&&3kDSqnM)Z85Dv@^_1vo;2I zeYUe`a8~TTu+1pugt`3(Oa-LS`0RuEFB7Cbgn;?haQE>?5ri&_N3%g zH#VL|$&oXP+SkuMH#JTF!L(X%jm<#msp6sv0x0hX2eDz{M3HIm&Cuv8{p%I+m&LOo zi56pbAccRVX=Dz5uNO|%p5i{#ET+IZaD`6zfxb*7LhJx-Pb$H^%412RT_gR5K(2XU z*YZFPZ{X1ws-v);<@UjELBJKQwBGun3bs2Ex3F*r?ByfE$OhW3a8CC_6q-e94JRlf z+}{O1T$tafO#dSbfbr&sU+&j(`ujUMYoleVhIkEh-6fdFdQ!yYt(g3C&}RDqdkb+Q zmg3HabcaXzMSG)FE-t47IU=3o%)Vm&=H|~`T{oA3A?}1_45i4}Y2#5SlznpY@-^xX zL~87fl%yoXlKbN$g?gOpYhGKS)FCqXwE?qAm+|2T{5a2$iTa5V0E-m` zxSh~i%hka=4=zF_AoH`|hJ-Np%8NCbpB`}|cJU=||AQFUxcjO@j4Lb^V$G{2Jlj%l zgtuBwm-&0!K*oEOJ0_ZM`Sb;9U!u;z)@`;bUko2DnWeqHS+JF5<|5nQJ2IFt`v*!Ur zMAqT!R_QYM^-(DpH(8eR1r=DhmwI-x6LEOhltD1-&eh2g@WWM%STx~Zz4A9*%~4C9 z;D%2ewv`ZypFbLiiHXrsO*q?sN-$t$Rk2E9>r9Wb-0+uOw(_bSKb{Jp!$ zo9c!M^*&mev+FyKeR^LsRr)8-ub<|A)p;*W^p`%}^f|3CNb$mXP6pay|Af7@qR^tC zXwRedKF&8TmttXIL7sFWTAg-^GUQHEMH)3^y?6`})`wMblchEuW3v7*@h(;a970u_ zeR;I6#b&3d7mcPwc(?pN}Cgr}vY70)grz5bbumJ|hNrO=4#(1qw z*N<=Cx<6UK%`ogTGABHQX8Tw#Or z0I;E5>%H!S`*e%n_^bv#J(b5Ob}}Z%$G7&>&7;Vo>5;Lg=kEGOW2(2m-_deRb93{= 
zG@MY-Fl2wakv{q%m$>0P?X%Nkfld}(ok}ne4SzYj*6H==?jy-iYNq;^T?zdB?eN_# z6Tpp-*-4=!nFL95rY-)@eV~I$w1CqyoIxkJzCBrAli2q2=b9f-9JBx*H^|@7(Gg+Y zZ7AY6I-c1U;ft>fp`x3(e*KnTo=&2`bYe6g9I!O?66^M90U{y}J308n8cEo!ookOa zDDQgF(}^yn4qDq{1`_H7aGh^?0Vc#SGcAw4%WX$|CcIqmEHb z#fbh_L+&nMe>=Ifrli85D6+t4PG0;nyw4cN7sLgiZPvpY_XTY>Hn!+wz}=BMv!1mq^JuOixixX;&~_|vZ$`rpdmH1} zwkM&~=D=PQ?OZqdBJ?l2yomI-RqnD#;A^b(>q)6ITV)V)tOjfY*69bXGpFJi%!FTT z(R`C-4&F7aaO1%Tgrm|wHkxob-_9S8Xk>ri6%s0Wd^TcS>DbVqgGG~1`GLi4}9<^v2J5!WNvqP0IFs;R$vk>pKnm)X8+^u7LMJn zcYX3o!L9HS+x%DmHc`&eCoezB&CR8)a-K5X%7}-Bva_*Stq%937RgBm5vESz0JSUY z94i6@^5R=mh>5t1b6Qf8kK5Tur^3^0F}r@QEz@JwgX5+Cg7Hc-_k9Prk`im&ZdZbO zYN0YN{KhGia+W%hIp&iW(QZZviHPz7NDf5-g{JJWup?Q-gm|9 z7Dvc=TavjUw!}B9>PHe35ZS8sg1K-qH);0BIPdZ8*C;FDqr8=iKLh#fB7qy4z>zV! zPud7YI~{Qkl&74!I1&1)wW`{6=Vncv?648CiAC)Eu-L>VMsDs-^=@*6n2O&s?eKH6 zja~v~;7Rj`6C98Y)x4b7$GbNnn#zB;Jwl&@B_Mvr{ciLXYTbvgnL-RxB&-bT!Wd^F7_l7A z^&-x}sntX1>Ne+D_nivParHGDbhV!I?7CZ9q}L7X(OmVo`b5g(N_IgDlhTQw=IaqA z%N*!E+xhe2ZiVtubrU}eyxK&09-9bH{uJ|SD{#Gm<*A>nXx(e?n^vD-MOVLbo?Sb{ zAl#5i5`WxiPXaxE`DF6Z6E0AGP<3k?qb$)JVQf`*o;6M1d@AmindbIp;wQgXTZQWl z?@oHUXYVTqz1o~TZ(#jA77F<@)7@xl!j6aNz2j4>b)jh$SDk7G>Fd6$PY@*MpUCe+E(FlBXK5zSLbczn&*v?0eXCVs>Ddg6K|)(K6>xS)as{DQ=hw- zSt&^QT!7GF?|XS}ZY3;i^iV?OV$&0|g@ClXbBsQ6sXMLm#t+k0X;z9;t4}VDSqYz& zAT{fE+~O#M46ycHYV=K`*s|8cYK(eRt6kR|kEw}b{!HaJPdRj4-=DPTe13S?64qlJ|}Yr?uSaQG#t$I#6}&86OUuOBN=+LN6q54m;s8{U}1q-72b zVKj$o8)NkL#h*18VMrUKc*m{6CM?yeU6R$iE=<*1?p7AR%h#F8>L5bX?1&{% zKh*S3xaQQ*=Qn-DPNjPvfP8q&2^v-uAfz!)?!v5s4hIapNe8ZV=qo#pafB}=__XZ05*iCem`n!c-EPc{@=_hX+sgT>Crtv zlpvAbzIl605JHV57513MMVZjjB;y;J=Vx9k0}(%6&F zc&xIS3UTdu_QOy!GhSwXbha3Ttv-!9m|(V=28^NQt)aE1wURk5FyrBd4+PfHHucU!bTT#+|}m7v`E9UDFd3l8)c$RFTb^PT<$+4<4ax z(wfj1G7&LQFIo-wzPTNx=z1>2!rj8cyM6FQ6_2E$Z?Gv;lM%Yd!dAKyD6NSn2! 
zDS?WJ4}XB$5kjESOqY?@as(b`D~P*4r#I8>s}9ty%VkyOF+qKi_a3XN#lL$|yPWN; z((AeOy)Tz#!6UHi&Sa3;`&-}8!{VyKvY3&jM>aRjrt%iL5*_+ddzEjV2KQpvaz7`L zQ3ZR~^H>F)SpiYr(k5GT|$q-LNd!Fd*`aeNVYVly$A%au%h2cDYP>Zpe?P@bR zz4Dd)Rg}}QEMcrk?2akGi7_FZ!q$UaK)ZC-su*Y3bHf zjhfD3Iw{fxe=X@8C8hrd%An)JdScR+$z|wxa+6V9q;RkYuJWV;YcoxECsY2TdcPx9 zWpRmoCU;W!C~l9GjeK!JX3*to%+1XgVvwqo^b4S-lI9vvu=r(RX|iUEQLvYXoCUbU z*-`)$VBrP10%ndY@0m9R9W=raKftFP%jF$4Ah{l%xy-T<#A7vLBCc|0%KLV&Jp~UH6Sp5CE#m1A4Kjxn^B}+XHd?6v88+|+0DJ?u?s(J=4oM&n%=;Bq7D{I zPLAHxXHnXpB z^=)oiR~7;`b^eGON-4c!mBTX1m*upn7={a>B)g z(hA3QT^-t!m~ghY1j^NtwI>BS4jP2NgCV@&0@1k{(1}Vy1Vf1Q_Br#IdsY+N00Qt3!>Tj3 zOp#kufk0j+vfGuh2!HFRlluJ1(qe-{3JvGj-e!_+5w7=!h()ySCp!2nTXvSikk}4+ z|3r{k@sa2KJ*izDg!#ufzGWMc8?Wy2w|owANr+n2l=IL2_T^rK_s$N*w9Qhx=psJe z%;bXIOyra_)%r3a_+e^a`@$lmsu3}(B_L~jY58nbU9x3k8;-yzr*PZlCIc|;`wB6Y zsG`-^6l4(9Xr@(OQb_mMQ7za2pO~`FMGH)iOx=@L=Jk%wJyHEzA`aSJyB2el2o{9r z!<(mBk6{|ihjqty$|7zqH=H_6#G3G6^s~O;r+z8-XVl5AYMOFBScCP#_vw`UtDIVY zn*$gOBUU%)Ih#SQQi#;NX$R9ONXAP=F$3WUBUYss^aJ6kgc9MIIY)$mkTeA-XwCze z!KvSvwF@Bli+J5WAa8OWTkmrIA*EJaY+sI#_Zt7E{4XBvj+(d00PIeSKU0JYw~>+~ z5=ahJpvr7|Wm^;wPwvh9_;)lQZP|nQT+mE4il#L}6jW~o)E;}WWMY;lvSliMULvob zNkWD8Xq*e4KEM!P>OTKX3)s);RZv;s=2q|Li}fIzIX3RBcJ(Si;Q+*LuDYmPs{^!N z|2Yo@5nlhL+DFr@bKZL5Bc1}Nmo?gE4D9V!i5wch5%L418_GW zU9TU94oMCG`K)NupHk=~o!flJXuZq*ku+k`nGetjlY%^P$+5Tl&-*ynMWk8)m@6}5 z3ToC_b|2;ffE>6=ylJM!524W|F-HSJGVo?<`i>G~r~Vnhoa_MO30c-z?W>jq%=0GI zcj2ShheX`p;d3A66oa@(87w9xP=nwlii#QAp`?3&QYXmW2Q*;;@P|?oqsb?+fjV1G z&;eB{iZi`eJ|@nA1ot|S5fu#IGL#fe>+x<@g@RyaadFz_sxfdROfG}??H8mSXQt+% zF5sZb;7d^}h@iMI1$cDAqdZ;oNsXp`>jfV#_$#q+x!b10`Nw>PoeK^4(<@X7?Kpe= zk?OSytFk-Mcs=g^lWFEKU*cH`bd2}0l^-Yf!wrptsWn}6+?|HUuNtb|Y?68vopQXUNXlw+;A zU1y%z_&}=^NSnJcnu=xA7)RLk5Urf|TUtU=w}*7d1(k)(b<37g(;D-_T~4}JQoLUh z1MC=F5ff)aT3r)_T&!m^Y4X+RMJa)WxtNR?CKb$USkq^w)a<{$WD>*o zNE`D7tZc)q5JyR=lpy&{6p9fX8T(*--ZVg`s*|}}gN%#)wTYHusx85-oz;ymqJc8K8Q8-fqHHXfEY=`7L-!34AVTTVXpy_&N^1HgS29$d!vdyt zEanUA=7bg`T1D#b96f-f+FW*c09=N`8RJhw%bK%sUnk4rTw 
zYHGkVj9km=0K#v+VM*XHgDpncGc{kJ1Dl^rd<4#A0y2uUj}n|=HNihq6N|`JQF%ZF z-WTzxtB`Tx6TMyWTEx6CcYUf2FN+1uh;vBNfi`t3(?|(bQ;Mklm6Uo$P6C|5n47$I znJC$RvP2aQ2?K-H#+JI}fP5k<`E3H&Z?tLOi(vc$BFqDKwVemIIKQYKYwi z)J-BJ(qSN!aisLMfZmw9NamyYf$MYG2^-wT7$IOPQh%v}7NE%7NKm{6 z9FRQoDi=P z7AAx0IaSp%uQdB>vjbIY>X|{m5Cl%tElrzI5poMbq`7(w%ziuY#RVULGL#;0u>^ZR z?Of9NL5iF*AS$h6cN;xyqU@GIn{>o_%1esE15@pbkPcF5O+~enDz)JvP)8Yo12QB9 zBRdlyr=^VyBLZL~xFW9nNx%Ewk`|+@->(MY1nh%NJADYNbmDW;=Ta(1#P}qJ4ROjX zyNMjFn|KTg80|m1f<#y(=v$110dpc4?qksJ38@2a1Y$hyZ;ZzX*oBb_tTfRw=oElP z0`5bz2c7DXMt=^81j+E(HKG_v!=pm8)>(Q)>+G-_#IL>3@RIP+@Q&b&LjL zpll4bE9j^>(JU=*cr*}Y{qk&(^~}s@s6N0nNHaDl!(OHrFc^L@Zs==k!JZvn52;)( z99K1}121od5acxLc2#l}-`sOzbv4rrti@Mn*I*W3)Y-PHmZeWSXyW&N1uae_mEPhDPUB!HH(mB<} zeLIVQna8}X|Eg4W&jsP$m1*osroEN-65=HtzpM2I5cH)T5;#F2C7|dmbS&w(lYFLm zAgI4;foR6SzTUyNg9$cPtvSq~`kW|Y6@N|KQEhG-b;t1XzK>1WlozXO7)|X=>QqA9 z%R)3(0QM@Z7@rgKsA-{%Yx({{>t87t@!s z$$byx@*kov=UFC*Wj3mwgj{ouOfxk)h(#5Z`@}5&qX{4v3n?4jfovqY5Iqt&>~qaDG6Hi^ z^_t3ly#V(ah=#+ta88Kk9KLKc> z3fqfE*&q56RZU$Le1dnQ}n|_g;4YRvQfNF=A%F2 z%9?tk({80kDtITFHR{lQ9{u=xJU9JPgAXPtdwXhwjeM1ATeBxt_S7UR)%)GCJJbjj zf;ZJeyJ3@lD@0@bZWO65h_IW1b2CuY2&tD#$C@v9Gx5unTrSmbXNX@{QpGioXrY8=!Dv@)>a_Ep-+r!dV=n1^Oy3j7~W)Xtfya| zDofOVu*791|LOO5C^h^UD@`C|vd-H8X=dCoSq9U~x7OP0yu}Pj*k|2ywCGtezIolc zZ)GUoYbX6QPTD3@>6lj_6*Y{;l4VBD2ne(*9QnVXAqyq(WHs|5pfkxH(d@9YLTQ6vY1*KwBG$ z(jg&LNJmn(u$fvhS+mCiJ8X{}$7gI^d??a@@F(l}D0g#fpfApga!mt#8bh0G^ z4Qs&=-{Y>K6b23$`+Io;05|m#4xSN8y33%u; z$NzKo=07aYgZ^=l(0^t!)?O9EwroJWk97(_-w7vo766jpN1R#H0K4D47a@7V?oMnOYXqcdDI{gdw8O_5gv9UwN^FwE6kWgy5UCe`*-LFNDSIYP5>vO(wGcbuI3w_gC3+k~O+N>^L`p&!hmn64RG)S%id*;sF0 zBoKY^IU>lWfE1UK|4QO@_J)`LC)1FfAIpd?j*P)0Z8*KiaDX!jM9CqM?EJCi--&L9 zSSm`%R1@Tr{Q)I}pnebdQds9i&n6Tn@Fjp|kw-wQ|7?VGbppD#m%otA$Nn3N0^$C5 z7IhH{+6h1h0ABDR5HuBwCGsJzeKNrk6{&So;-O(pSuSp)en9mDG}#d-4ub&6BTiYo zw_H^c(F?iCVaN7vC&3862RY7$p&UyD+?OfA63V|Y=_?UjV1+tc8y&#A+XFWvoJ z(r>@rek`DUS$o3~zcT6En2{$?0n0!hHi(qQZD<$hO{yR+jX=Qv`+}bluJt 
z_!jU2n~XCHgO9*^KQiAUOYeJKg=CF9S>n_<$~q@QwU#RN7S$;sFMx;}ai3*sg*3%Y zKYLF1Be|GGIZZ+P&`F_sX?mbpHLOgTr%LL6jkkifoXI(FvLH_l4roNjiY#tE~rhTFDLAt={ zkit3e$cCk9YZ%`ITYAKe2>6{W{tiypuus{-x!z_{;3yMt^U0YbFPiD?T~W-z4#{2n$V%j4M0FlA%$sDP~d!)oa4 zLLvLYjzP0qGk^0%;?SG&U%54xycNkRpbL-A_+GKPiav6X*gQQo9=Dxkkr8sE3A%sh z6{Tl4DnZ=n0q&eF^~1QwPVy8Q(%`X4@!%-~wUP{a$PpLj^>3ZwaX}~~exLriM}22i zTq^mt3IYtF6_<#bis>vMH@r|EEA<|wV)%H8d)ZbQW`PZQzb%Ss- za;MtYIL=Kr$L79K&U>p$A-jMA!Xn{2Vd_N)$)I^xp*e zM4Uu8YFnEA_7&6r>W8rfwheyw?gs@6LzW*H@XrNroz{MJKQAFtnDHhy2y(avS>iUB zL0M=AO@mE8;-Tfj%Q=T$&)Z*xf!THQ`8n41hlOe81 zErbT0(fWvb#YE>7E{D1fp6QJGcVa(tgn*44&FZ0FiTpo}a9_;F*xph_->r zzCUbna`bFnHuY@n#y6Ea#-ud_Dj)OMZ{0ncS|40A$XC1=g>NL`RP$)5>*JI_c z%(EI~7C?Z%nwpWck{QQSC9+~0FVM+CZ)`?RTnfh>`m}!#(?>Vhc_xfBN%<0gH1nv} zY6hNCPrFcprjrj@pcXskm}kv&`z${M(Nmz{8jp5YMft@qEw!~%tp86HXBy6C-i7g~ zqH1X@8MVezRZHzft!<`4s$?uJ#wbHAt(K~Z*q33tA&9k>s6j0?ww4e}GO99-7S)ur zgoLqc>|1Q_lg#^G*Sw#S56|`dbI!T%`**HvBTcNc-&;*|mx}WS9$xUJ#ld7jc|~}o zb&s(w(WU0TemTN~ z3XQ9X;z48}$hb$R?%H=YPS?#H=G4QuN-&l<+;Pt~ee4o0!dT%JSwE$Wu-<_$fSj%y zCL~R<&y#AB-OuDq{ zs+%?Fe70OvIj1;aIrHCR#x*%pmiZWVt;FhbpPvNBc*gPoXAF5bPu=bdag(W;F) zfyi0iYqfQ{X~7l0I?+DiRl-Q(r%&)q7ap(UTl{qb!AKy;+M0Q#68ARjtWS%`3CU%0 z97cOI=o}8n+G)O7g8i;h#QXC8@3V7x*(1s@WTZu?DB|;q+~9G7vp^=L?s98B<_v^c z1DqtbOVkBfdJ&cat1DdGUAg z7o~D6Cam<{@xKQX_!~lh^1|u^_3sGYmuD(oK~RFEx>#iJXJn>JsshXks0HH9qkEBapq|RV0R;9odoCNH#W{0Z zuRByvXHt6W%(OFa__6Kzyr&HiU8(#W&IEU~ODkjaI6yPb9z@QV{x8GiBu6@*W6C?? 
z1kHKO3prPsh|hK7l&6P5!3a)lYCQ&s!bDf|+Ft(%ic=U#SCo64-=nIQ&LAJ@ntuj< zLN=H2V<{g0&Hr9U=qLDZRTRRCrB65|}hh_U9P1W&Iz zRZ!CI_$Gy(veqH-a9xKithr&dS2h>-VD+_F5gK+BFF?JcgtUo1za8eaLCfxYOLAk} zV(j*~mpcW4oTu2XwkGq?VRv7*fK8cuouaas+L=;`H;;zB12=I7(@p0~D?6;a5%GWA zpqI4Q^~-odmU65QMXMIVj>J&lhKo#wKjI=yn%i-^aNOZ_`<-@`^xt1;9+0U)>*HoA zbGJh*CW@k2{)NSABA0%j>}X&-%f;U|VrIM%tt@*L1`>z(u8xL$G`N$s8H=~~2Gowe z`u99}<`lc6eHIsYa%%@>H@acS{Hq>nGqNJao2@}qUDke435{l3?2`_X4ljD_kkxLK zuzU{On|qZJksJMvL-y>v_;qvKvi6^XNd$Py4dPyaJAr9}lgl@J`}IY$SBqw)7v+_O zCjYr>iSX2$6Wb=xK#vJ@&4p`YNsv!E>f{0XaLL{3m@k5}E0ayzoXbo-Vv=qRS!x{( z|27@b-!h7dotO#*fYgyIK6`Gy?yvT}>|1Dy=e@`ic94VJu4)NeXz=@p9cp5=vv8`X zRhH?Wq|WKoG}f(t~%JL)H^ zQ*O_60WOY;Z}iZF%ml(U}vI$5r z9_(#`Xto()gcvpR&B`iUqeV7_?;{OsiM##nKZM+S`t|NI1Jkx15n$UI6*(TdX*oP) zGfn)X9eU@%_V+8~-Sw(wb`his-fFHx()C7^;EOgXW+X+z2T^}_{XZ{OYjAols6;{uLR=i9lgDD*)OnI5gH_>(o3UT-`u*Qe9~N23PE`ZaI6&x?4>iL zVv-0jo|f)$G2*ay-yn3fVclZU)eH9Ih#>nQzB|m`&cRT)GRY zC+kG5f)SDDl{O^PLo!^FjT3$*QK*;8n}R^Hl^hOflhM1XkN`Xt_&BRdVtfldqJXdt z1hWw~oK}_s1>gAxf#WGD67h)0V%s@2DF7_h;tDDo*+?$rrDlOju^Pc2oWn&uePh*` zq;aK7qoyrhq8nfb0csb9013UFf^RdjSTzcY$Ic=z`1X#EEwX6?jkU$T?(H{vC&~R# zYVQK_R5Wzmlv&1Zb_c$c$o{)=d-Kyf!bHbh?7=S$EN98>t~i<6=#ewzreKWUw;!cb zEUVDQ&;&n{MicPgDcw9I_w-$gTj3_#9kMLjFjm8;ZKcI$QLY1i9&v9n|TnlKYkOs)_Eb%*vB&f?kPdj zAMWp~DwJ8QdsnshbFGATT zA&U=R>*hW(SU5a8d=L$%r@Eni;Pen*`roMpysxJXw72PcZS1)+)`}bYI@~iP{x6xn zZ^;Ul5;UpvjHL9BiL22^n@STNVHXXC4}9VDdLMe1G-`lM5HVONNM6K)%!?h-Kh*9; z!s%$XNp7JJeO7j*;z^l3qCF|!5BK@O6wE=^ih`$T$V|5r{Ir?m0 cRJIQd!7Z++g2Rx&`)VMBwZoMz>% diff --git a/protocol/conceptual_model_df_memory.svg b/protocol/conceptual_model_df_memory.svg deleted file mode 100644 index 9081e42f..00000000 --- a/protocol/conceptual_model_df_memory.svg +++ /dev/null @@ -1,627 +0,0 @@ - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - 1-D array - column - chunk - dataframe - - - - From a545faa1fdc6fa926866600fb989826a6b9721e3 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Fri, 25 Jun 2021 22:02:32 +0200 Subject: [PATCH 15/32] Document that strided buffers do not need to be supported --- protocol/dataframe_protocol_summary.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/protocol/dataframe_protocol_summary.md b/protocol/dataframe_protocol_summary.md index 41f1421a..9b6647a4 100644 --- a/protocol/dataframe_protocol_summary.md +++ b/protocol/dataframe_protocol_summary.md @@ -172,7 +172,13 @@ We'll also list some things that were discussed but are not requirements: 3. Extension dtypes, i.e. a way to extend the set of dtypes that is explicitly support, are out of scope. _Rationale: complex to support, not used enough to justify that complexity._ -4. "virtual columns", i.e. columns for which the data is not yet in memory +4. Support for strided storage in buffers. + _Rationale: this is supported by a subset of dataframes only, mainly those + that use NumPy arrays. In many real-world use cases, strided arrays will + force a copy at some point, so requiring contiguous memory layout (and hence + an extra copy at the moment `__dataframe__` is used) is considered a good + trade-off for reduced implementation complexity._ +5. "virtual columns", i.e. columns for which the data is not yet in memory because it uses lazy evaluation, are not supported other than through letting the producer materialize the data in memory when the consumer calls `__dataframe__`. 
From 6010ae7aaafb0782221b4accbfdc36423d850277 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 28 Jun 2021 10:48:16 -0700 Subject: [PATCH 16/32] Add todo --- protocol/pandas_implementation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index b8065b70..48903f2b 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -511,6 +511,8 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype # Marshal the strings from a NumPy object array into a byte array buf = self._col.to_numpy() b = bytearray() + + # TODO: this for-loop is slow; can be implemented in Cython/C/C++ later for i in range(buf.size): if type(buf[i]) == str: b.extend(buf[i].encode(encoding="utf-8")) From 89a799644e4880d84901ab68ac207c4d3e03c390 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 28 Jun 2021 11:56:07 -0700 Subject: [PATCH 17/32] Remove colons --- protocol/pandas_implementation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 48903f2b..2ea10b85 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -193,13 +193,13 @@ def convert_string_column(col : ColumnObject) -> np.ndarray: # Extract a range of code units units = dbuf[obuf[i]:obuf[i+1]]; - # Convert the list of code units to bytes: + # Convert the list of code units to bytes b = bytes(units) # Create the string s = b.decode(encoding="utf-8") - # Add to our list of strings: + # Add to our list of strings str_list.append(s) # Convert the string list to a NumPy array From a3ff4e74aa2688c77334c1a35c14bb8b34cb2805 Mon Sep 17 00:00:00 2001 From: Athan Date: Thu, 8 Jul 2021 08:09:43 -0700 Subject: [PATCH 18/32] Fix grammar Co-authored-by: Joris Van den Bossche --- protocol/dataframe_protocol_summary.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/protocol/dataframe_protocol_summary.md b/protocol/dataframe_protocol_summary.md index c9cf2350..aa33b9a3 100644 --- a/protocol/dataframe_protocol_summary.md +++ b/protocol/dataframe_protocol_summary.md @@ -220,7 +220,7 @@ except `__dataframe__` is a Python-level rather than C-level interface. The data types format specification of that interface is something that could be used unchanged. -The main limitation is to be that it does not have device support +The main limitation is that it does not have device support -- `@kkraus14` will bring this up on the Arrow dev mailing list. Another identified issue is that the "deleter" on the Arrow C struct is present at the column level, and there are use cases for having it at the buffer level @@ -374,4 +374,4 @@ The `=`, `<`, `>` are denoting endianness; Arrow only supports native endianness - [`__array_interface__` protocol](https://numpy.org/devdocs/reference/arrays.interface.html) - [Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) - [DLPack](https://github.com/dmlc/dlpack) -- [Array data interchange in API standard](https://data-apis.github.io/array-api/latest/design_topics/data_interchange.html) \ No newline at end of file +- [Array data interchange in API standard](https://data-apis.github.io/array-api/latest/design_topics/data_interchange.html) From ff84e8c396ea7bfd37b1028d81c83945b59798a1 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 8 Jul 2021 09:20:42 -0700 Subject: [PATCH 19/32] Rename methods --- protocol/dataframe_protocol.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index cb97c6cd..a7a5fbe5 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -306,7 +306,7 @@ def get_data_buffer(self) -> Tuple[Buffer, Any]: """ pass - def get_mask(self) -> Tuple[Buffer, Any]: + def get_validity_buffer(self) -> Tuple[Buffer, Any]: """ Return the buffer containing 
the mask values indicating missing data and the buffer's associated dtype. @@ -315,7 +315,7 @@ def get_mask(self) -> Tuple[Buffer, Any]: """ pass - def get_offsets(self) -> Tuple[Buffer, Any]: + def get_offsets_buffer(self) -> Tuple[Buffer, Any]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) and the buffer's associated dtype. From c954f3c258d57f4ed593f5e8cb30671f5f6913a5 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 8 Jul 2021 09:21:17 -0700 Subject: [PATCH 20/32] Rename methods --- protocol/pandas_implementation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 2ea10b85..119be47c 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -170,10 +170,10 @@ def convert_string_column(col : ColumnObject) -> np.ndarray: dbuffer, bdtype = col.get_data_buffer() # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string - obuffer, odtype = col.get_offsets() + obuffer, odtype = col.get_offsets_buffer() # Retrieve the mask buffer indicating the presence of missing values: - mbuffer, mdtype = col.get_mask() + mbuffer, mdtype = col.get_validity_buffer() # Convert the buffers to NumPy arrays dt = (_DtypeKind.UINT, 8, None, None) # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) @@ -527,7 +527,7 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype return buffer, dtype - def get_mask(self) -> Tuple[_PandasBuffer, Any]: + def get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. 
@@ -564,7 +564,7 @@ def get_mask(self) -> Tuple[_PandasBuffer, Any]: raise RuntimeError(msg) - def get_offsets(self) -> Tuple[_PandasBuffer, Any]: + def get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) and the buffer's associated dtype. From ed64fb7f4fec862dbe9cf67bf4829a00c5cc9e2d Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Thu, 8 Jul 2021 09:26:25 -0700 Subject: [PATCH 21/32] Update describe_null to indicate a byte array for string dtype --- protocol/pandas_implementation.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 119be47c..34e07a10 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -467,8 +467,7 @@ def describe_null(self) -> Tuple[int, Any]: null = 2 value = -1 elif kind == _k.STRING: - # For Pandas string extension dtype, use of `np.nan` for missing values may change! 
- null = 1 # np.nan + null = 4 else: raise NotImplementedError(f"Data type {self.dtype} not yet supported") @@ -714,7 +713,7 @@ def test_string_dtype(): col = df.__dataframe__().get_column_by_name("B") assert col.dtype[0] == _DtypeKind.STRING assert col.null_count == 1 - assert col.describe_null == (1, None) + assert col.describe_null == (4, None) assert col.num_chunks() == 1 df2 = from_dataframe(df) From 9b9aecf92bc4fd1ebacf794187590d75566d1c48 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 19 Jul 2021 09:20:37 -0700 Subject: [PATCH 22/32] Return encoding for missing values --- protocol/dataframe_protocol.py | 4 +++- protocol/pandas_implementation.py | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index a7a5fbe5..ab754a0e 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -273,7 +273,9 @@ def describe_null(self) -> Tuple[int, Any]: - 3 : bit mask - 4 : byte mask - Value : if kind is "sentinel value", the actual value. None otherwise. + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or byte mask, the value (0 or 1) indicating a missing value. None + otherwise. """ pass diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 34e07a10..afd4cc69 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -448,7 +448,9 @@ def describe_null(self) -> Tuple[int, Any]: - 3 : bit mask - 4 : byte mask - Value : if kind is "sentinel value", the actual value. None otherwise. + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or byte mask, the value (0 or 1) indicating a missing value. None + otherwise. 
""" _k = _DtypeKind kind = self.dtype[0] @@ -468,6 +470,7 @@ def describe_null(self) -> Tuple[int, Any]: value = -1 elif kind == _k.STRING: null = 4 + value = 0 else: raise NotImplementedError(f"Data type {self.dtype} not yet supported") From 40269002aa65a34c920abff883a1814aa4bbb8f8 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 19 Jul 2021 09:22:36 -0700 Subject: [PATCH 23/32] Update test --- protocol/pandas_implementation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index afd4cc69..1f1c8ddd 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -716,7 +716,7 @@ def test_string_dtype(): col = df.__dataframe__().get_column_by_name("B") assert col.dtype[0] == _DtypeKind.STRING assert col.null_count == 1 - assert col.describe_null == (4, None) + assert col.describe_null == (4, 0) assert col.num_chunks() == 1 df2 = from_dataframe(df) From 87d7143945af15bffcc69e37601bbbd9190bb3e4 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 19 Jul 2021 09:30:57 -0700 Subject: [PATCH 24/32] Use invalid value encoding --- protocol/pandas_implementation.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 1f1c8ddd..32175bb7 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -175,6 +175,9 @@ def convert_string_column(col : ColumnObject) -> np.ndarray: # Retrieve the mask buffer indicating the presence of missing values: mbuffer, mdtype = col.get_validity_buffer() + # Retrieve the missing value encoding: + null_value = col.describe_null[1] + # Convert the buffers to NumPy arrays dt = (_DtypeKind.UINT, 8, None, None) # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) dbuf = buffer_to_ndarray(dbuffer, dt) @@ -186,7 +189,7 @@ def 
convert_string_column(col : ColumnObject) -> np.ndarray: str_list = [] for i in range(obuf.size-1): # Check for missing values - if mbuf[i] == 0: # FIXME: we need to account for a mask buffer which is a bit array + if mbuf[i] == null_value: # FIXME: we need to account for a mask buffer which is a bit array str_list.append(np.nan) continue @@ -470,7 +473,7 @@ def describe_null(self) -> Tuple[int, Any]: value = -1 elif kind == _k.STRING: null = 4 - value = 0 + value = 0 # follow Arrow in using 1 as valid value and 0 for missing/null value else: raise NotImplementedError(f"Data type {self.dtype} not yet supported") @@ -536,15 +539,25 @@ def get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]: Raises RuntimeError if null representation is not a bit or byte mask. """ + null, invalid = self.describe_null + _k = _DtypeKind if self.dtype[0] == _k.STRING: # For now, have the mask array be comprised of bytes, rather than a bit array buf = self._col.to_numpy() mask = [] + + # Determine the encoding for valid values + if invalid == 0: + valid = 1 + else: + valid = 0 + for i in range(buf.size): - v = 0; if type(buf[i]) == str: - v += 1; # follows Arrow where a valid value is 1 and null is 0 + v = valid; + else: + v = invalid; mask.append(v) @@ -556,7 +569,6 @@ def get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]: return buffer, dtype - null, value = self.describe_null if null == 0: msg = "This column is non-nullable so does not have a mask" elif null == 1: From 56ee2da7ba058728b34df4846e4ab12788f764a5 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 19 Jul 2021 09:34:20 -0700 Subject: [PATCH 25/32] Update copy --- protocol/dataframe_protocol.py | 2 +- protocol/pandas_implementation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index ab754a0e..7c02a25c 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -274,7 +274,7 @@ def describe_null(self) 
-> Tuple[int, Any]: - 4 : byte mask Value : if kind is "sentinel value", the actual value. If kind is a bit - mask or byte mask, the value (0 or 1) indicating a missing value. None + mask or a byte mask, the value (0 or 1) indicating a missing value. None otherwise. """ pass diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 32175bb7..8aa607b2 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -452,7 +452,7 @@ def describe_null(self) -> Tuple[int, Any]: - 4 : byte mask Value : if kind is "sentinel value", the actual value. If kind is a bit - mask or byte mask, the value (0 or 1) indicating a missing value. None + mask or a byte mask, the value (0 or 1) indicating a missing value. None otherwise. """ _k = _DtypeKind From 0035c903920b29cd4dce4af336d84e929727c932 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 19 Jul 2021 09:45:09 -0700 Subject: [PATCH 26/32] Use Arrow format strings --- protocol/pandas_implementation.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 8aa607b2..19f7404c 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -371,7 +371,7 @@ def dtype(self) -> Tuple[enum.IntEnum, int, str, str]: # For now, assume that, if the column dtype is 'O' (i.e., `object`), then we have an array of strings if not isinstance(dtype, pd.CategoricalDtype) and dtype.kind == 'O': - return (_DtypeKind.STRING, 8, '=U1', '=') + return (_DtypeKind.STRING, 8, 'u', '=') return self._dtype_from_pandasdtype(dtype) @@ -383,9 +383,9 @@ def _dtype_from_pandasdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]: # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) not handled # datetime and timedelta both map to datetime (is timedelta handled?) 
_k = _DtypeKind - _np_kinds = {'i': _k.INT, 'u': _k.UINT, 'f': _k.FLOAT, 'b': _k.BOOL, - 'U': _k.STRING, - 'M': _k.DATETIME, 'm': _k.DATETIME} + _np_kinds = {"i": _k.INT, "u": _k.UINT, "f": _k.FLOAT, "b": _k.BOOL, + "U": _k.STRING, + "M": _k.DATETIME, "m": _k.DATETIME} kind = _np_kinds.get(dtype.kind, None) if kind is None: # Not a NumPy dtype. Check if it's a categorical maybe @@ -526,7 +526,7 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype buffer = _PandasBuffer(np.frombuffer(b, dtype="uint8")) # Define the dtype for the returned buffer - dtype = (_k.STRING, 8, "=U1", "=") # note: currently only support native endianness + dtype = (_k.STRING, 8, "u", "=") # note: currently only support native endianness else: raise NotImplementedError(f"Data type {self._col.dtype} not handled yet") @@ -565,7 +565,7 @@ def get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]: buffer = _PandasBuffer(np.asarray(mask, dtype="uint8")) # Define the dtype of the returned buffer - dtype = (_k.UINT, 8, "=B", "=") + dtype = (_k.UINT, 8, "C", "=") return buffer, dtype @@ -607,8 +607,7 @@ def get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]: buffer = _PandasBuffer(buf) # Assemble the buffer dtype info - bdtype = buf.dtype; - dtype = (_k.INT, bdtype.itemsize*8, bdtype.str, "=") # note: currently only support native endianness + dtype = (_k.INT, 64, 'l', "=") # note: currently only support native endianness else: raise RuntimeError("This column has a fixed-length dtype so does not have an offsets buffer") From 91ed6a1c24cf05eb3eda384d7bb5187ecdeacc5f Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 19 Jul 2021 09:56:17 -0700 Subject: [PATCH 27/32] Add `get_buffers` method to the protocol --- protocol/dataframe_protocol.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 7c02a25c..68dd0909 100644 --- a/protocol/dataframe_protocol.py +++ 
b/protocol/dataframe_protocol.py @@ -302,6 +302,29 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]: """ pass + def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]: + """ + Return the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a tuple + containing the buffer containing the data and whose second + element is the data buffer's associated dtype. + - "validity": a two-element tuple whose first element is a tuple + containing the buffer containing mask values + indicating missing data and whose second element is + the mask value buffer's associated dtype. None if the + null representation is not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a tuple + containing the buffer containing the offset values for + variable-size binary data (e.g., variable-length + strings) and whose second element is the offsets + buffer's associated dtype. None if the data buffer does + not have an associated offsets buffer. + """ + pass + def get_data_buffer(self) -> Tuple[Buffer, Any]: """ Return the buffer containing the data and the buffer's associated dtype. @@ -321,6 +344,9 @@ def get_offsets_buffer(self) -> Tuple[Buffer, Any]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) and the buffer's associated dtype. + + Raises RuntimeError if the data buffer does not have an associated + offsets buffer. 
""" pass From 26fb48d46f553d83ca5b04c924cf1f03376db12a Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 19 Jul 2021 09:56:43 -0700 Subject: [PATCH 28/32] Remove individual methods --- protocol/dataframe_protocol.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 68dd0909..d6efbe28 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -325,31 +325,6 @@ def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], """ pass - def get_data_buffer(self) -> Tuple[Buffer, Any]: - """ - Return the buffer containing the data and the buffer's associated dtype. - """ - pass - - def get_validity_buffer(self) -> Tuple[Buffer, Any]: - """ - Return the buffer containing the mask values indicating missing data and - the buffer's associated dtype. - - Raises RuntimeError if null representation is not a bit or byte mask. - """ - pass - - def get_offsets_buffer(self) -> Tuple[Buffer, Any]: - """ - Return the buffer containing the offset values for variable-size binary - data (e.g., variable-length strings) and the buffer's associated dtype. - - Raises RuntimeError if the data buffer does not have an associated - offsets buffer. 
- """ - pass - # def get_children(self) -> Iterable[Column]: # """ # Children columns underneath the column, each object in this iterator From 0d0e94b4fedc903a16e5adbf1952cd32b4986a89 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 19 Jul 2021 09:57:07 -0700 Subject: [PATCH 29/32] Update copy --- protocol/dataframe_protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index d6efbe28..34ef2619 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -304,7 +304,7 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable[Column]: def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], Optional[Tuple[Buffer, Any]]]: """ - Return the underlying buffers. + Return a dictionary containing the underlying buffers. The returned dictionary has the following contents: From 9ec830cc495199c438a832e16ade227ded6ed75b Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 19 Jul 2021 10:20:26 -0700 Subject: [PATCH 30/32] Refactor to return a dictionary of buffers --- protocol/pandas_implementation.py | 54 ++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 19f7404c..4a609c29 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -99,7 +99,7 @@ def convert_column_to_ndarray(col : ColumnObject) -> np.ndarray: raise NotImplementedError("Null values represented as masks or " "sentinel values not handled yet") - _buffer, _dtype = col.get_data_buffer() + _buffer, _dtype = col.get_buffers()["data"] return buffer_to_ndarray(_buffer, _dtype) @@ -143,7 +143,7 @@ def convert_categorical_column(col : ColumnObject) -> pd.Series: # categories = col._col.values.categories.values # codes = col._col.values.codes categories = np.asarray(list(mapping.values())) - codes_buffer, codes_dtype = 
col.get_data_buffer() + codes_buffer, codes_dtype = col.get_buffers()["data"] codes = buffer_to_ndarray(codes_buffer, codes_dtype) values = categories[codes] @@ -166,14 +166,17 @@ def convert_string_column(col : ColumnObject) -> np.ndarray: """ Convert a string column to a NumPy array. """ + # Retrieve the data buffers: + buffers = col.get_buffers() + # Retrieve the data buffer containing the UTF-8 code units - dbuffer, bdtype = col.get_data_buffer() + dbuffer, bdtype = buffers["data"] # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string - obuffer, odtype = col.get_offsets_buffer() + obuffer, odtype = buffers["offsets"] # Retrieve the mask buffer indicating the presence of missing values: - mbuffer, mdtype = col.get_validity_buffer() + mbuffer, mdtype = buffers["validity"] # Retrieve the missing value encoding: null_value = col.describe_null[1] @@ -500,7 +503,42 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasColumn """ return (self,) - def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple + def get_buffers(self) -> Dict[str, Any]: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a tuple + containing the buffer containing the data and whose second + element is the data buffer's associated dtype. + - "validity": a two-element tuple whose first element is a tuple + containing the buffer containing mask values + indicating missing data and whose second element is + the mask value buffer's associated dtype. None if the + null representation is not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a tuple + containing the buffer containing the offset values for + variable-size binary data (e.g., variable-length + strings) and whose second element is the offsets + buffer's associated dtype. 
None if the data buffer does + not have an associated offsets buffer. + """ + buffers = {} + buffers["data"] = self._get_data_buffer() + try: + buffers["validity"] = self._get_validity_buffer() + except: + buffers["validity"] = None + + try: + buffers["offsets"] = self._get_offsets_buffer() + except: + buffers["offsets"] = None + + return buffers + + def _get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype tuple """ Return the buffer containing the data and the buffer's associated dtype. """ @@ -532,7 +570,7 @@ def get_data_buffer(self) -> Tuple[_PandasBuffer, Any]: # Any is for self.dtype return buffer, dtype - def get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]: + def _get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. @@ -578,7 +616,7 @@ def get_validity_buffer(self) -> Tuple[_PandasBuffer, Any]: raise RuntimeError(msg) - def get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]: + def _get_offsets_buffer(self) -> Tuple[_PandasBuffer, Any]: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) and the buffer's associated dtype. From 0dd4e2c3e5bc6343ddafb1f76e0a84e67f3e94ba Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 19 Jul 2021 11:28:11 -0700 Subject: [PATCH 31/32] Update comments --- protocol/pandas_implementation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 4a609c29..112f035b 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -166,7 +166,7 @@ def convert_string_column(col : ColumnObject) -> np.ndarray: """ Convert a string column to a NumPy array. 
""" - # Retrieve the data buffers: + # Retrieve the data buffers buffers = col.get_buffers() # Retrieve the data buffer containing the UTF-8 code units @@ -175,10 +175,10 @@ def convert_string_column(col : ColumnObject) -> np.ndarray: # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string obuffer, odtype = buffers["offsets"] - # Retrieve the mask buffer indicating the presence of missing values: + # Retrieve the mask buffer indicating the presence of missing values mbuffer, mdtype = buffers["validity"] - # Retrieve the missing value encoding: + # Retrieve the missing value encoding null_value = col.describe_null[1] # Convert the buffers to NumPy arrays From ade0d76587263c6552925e11a3c940aaa41d5402 Mon Sep 17 00:00:00 2001 From: Athan Reines Date: Mon, 19 Jul 2021 11:28:33 -0700 Subject: [PATCH 32/32] Fix copy --- protocol/dataframe_protocol.py | 28 ++++++++++++++-------------- protocol/pandas_implementation.py | 28 ++++++++++++++-------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/protocol/dataframe_protocol.py b/protocol/dataframe_protocol.py index 34ef2619..41b2e6e4 100644 --- a/protocol/dataframe_protocol.py +++ b/protocol/dataframe_protocol.py @@ -308,20 +308,20 @@ def get_buffers(self) -> dict[Tuple[Buffer, Any], Optional[Tuple[Buffer, Any]], The returned dictionary has the following contents: - - "data": a two-element tuple whose first element is a tuple - containing the buffer containing the data and whose second - element is the data buffer's associated dtype. - - "validity": a two-element tuple whose first element is a tuple - containing the buffer containing mask values - indicating missing data and whose second element is - the mask value buffer's associated dtype. None if the - null representation is not a bit or byte mask. 
- - "offsets": a two-element tuple whose first element is a tuple - containing the buffer containing the offset values for - variable-size binary data (e.g., variable-length - strings) and whose second element is the offsets - buffer's associated dtype. None if the data buffer does - not have an associated offsets buffer. + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. """ pass diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 112f035b..0eea3aae 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -509,20 +509,20 @@ def get_buffers(self) -> Dict[str, Any]: The returned dictionary has the following contents: - - "data": a two-element tuple whose first element is a tuple - containing the buffer containing the data and whose second - element is the data buffer's associated dtype. - - "validity": a two-element tuple whose first element is a tuple - containing the buffer containing mask values - indicating missing data and whose second element is - the mask value buffer's associated dtype. None if the - null representation is not a bit or byte mask. 
- - "offsets": a two-element tuple whose first element is a tuple - containing the buffer containing the offset values for - variable-size binary data (e.g., variable-length - strings) and whose second element is the offsets - buffer's associated dtype. None if the data buffer does - not have an associated offsets buffer. + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. """ buffers = {} buffers["data"] = self._get_data_buffer()