From f7b42e1d04594b5d95ba0a49b8aa6e4cf83a9998 Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 17 Aug 2021 16:25:54 +0000 Subject: [PATCH 1/5] __dataframe__ test --- protocol/pandas_implementation.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index e3e3e62e..8ed31dd6 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -538,9 +538,18 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasDataFr # Roundtrip testing # ----------------- +def test__dataframe__(df:pd.DataFrame, dfo: DataFrameObject): + assert dfo.num_columns() == len(df.columns) + assert dfo.num_rows() == len(df) + assert dfo.num_chunks() == 1 + assert dfo.column_names() == list(df.columns) + + + def test_float_only(): df = pd.DataFrame(data=dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8])) df2 = from_dataframe(df) + test__dataframe__(df, df.__dataframe__()) tm.assert_frame_equal(df, df2) @@ -548,6 +557,7 @@ def test_mixed_intfloat(): df = pd.DataFrame(data=dict(a=[1, 2, 3], b=[3, 4, 5], c=[1.5, 2.5, 3.5], d=[9, 10, 11])) df2 = from_dataframe(df) + test__dataframe__(df, df.__dataframe__()) tm.assert_frame_equal(df, df2) @@ -575,6 +585,7 @@ def test_categorical_dtype(): assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) df2 = from_dataframe(df) + test__dataframe__(df, df.__dataframe__()) tm.assert_frame_equal(df, df2) From bfe643216ca99230020c6e1ed16ab60e3404bbd8 Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 17 Aug 2021 18:08:43 +0000 Subject: [PATCH 2/5] add some column tests for the protocol --- protocol/pandas_implementation.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 8ed31dd6..78f27d0f 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -538,13 +538,18 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasDataFr # Roundtrip testing # ----------------- +def test_column(pdcol:pd.Series, col: _PandasColumn): + assert pdcol.size == col.size + assert col.offset == 0 + assert pdcol.isnull().sum() == col.null_count + def test__dataframe__(df:pd.DataFrame, dfo: DataFrameObject): assert dfo.num_columns() == len(df.columns) assert dfo.num_rows() == len(df) assert dfo.num_chunks() == 1 assert dfo.column_names() == list(df.columns) - - + for col in df.columns: + test_column(df[col], dfo.get_column_by_name(col)) def test_float_only(): df = pd.DataFrame(data=dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8])) From 7718c76f465bd83e1a921f66cf484178d47028f3 Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 24 Aug 2021 08:12:20 +0000 Subject: [PATCH 3/5] refactors helper function names and signatures --- protocol/pandas_implementation.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 78f27d0f..129691af 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -538,23 +538,25 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasDataFr # Roundtrip testing # ----------------- -def test_column(pdcol:pd.Series, col: _PandasColumn): - assert pdcol.size == col.size +def assert_column_equal(col: _PandasColumn, pdcol:pd.Series): + assert col.size == pdcol.size assert col.offset == 0 - assert pdcol.isnull().sum() == col.null_count + assert col.null_count == pdcol.isnull().sum() + assert col.num_chunks() == 1 + pytest.raises(RuntimeError, col.get_mask) -def test__dataframe__(df:pd.DataFrame, dfo: DataFrameObject): +def assert_dataframe_equal(dfo: DataFrameObject, df:pd.DataFrame): assert dfo.num_columns() == len(df.columns) assert dfo.num_rows() == len(df) assert dfo.num_chunks() == 1 assert dfo.column_names() == list(df.columns) for col in df.columns: - test_column(df[col], dfo.get_column_by_name(col)) + assert_column_equal(dfo.get_column_by_name(col), df[col]) def test_float_only(): df = pd.DataFrame(data=dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8])) df2 = from_dataframe(df) - test__dataframe__(df, df.__dataframe__()) + assert_dataframe_equal(df.__dataframe__(), df) tm.assert_frame_equal(df, df2) @@ -562,7 +564,7 @@ def test_mixed_intfloat(): df = pd.DataFrame(data=dict(a=[1, 2, 3], b=[3, 4, 5], c=[1.5, 2.5, 3.5], d=[9, 10, 11])) df2 = from_dataframe(df) - test__dataframe__(df, df.__dataframe__()) + assert_dataframe_equal(df.__dataframe__(), df) tm.assert_frame_equal(df, df2) @@ -590,7 +592,7 @@ def test_categorical_dtype(): assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) df2 = from_dataframe(df) - test__dataframe__(df, df.__dataframe__()) + assert_dataframe_equal(df.__dataframe__(), df) tm.assert_frame_equal(df, df2) From 011dd5d0dd0ebff8cc1d6c9942d9b911676fc81a Mon Sep 17 00:00:00 2001 From: iskode Date: Tue, 24 Aug 2021 10:13:42 +0000 Subject: [PATCH 4/5] add few tests on the buffer --- protocol/pandas_implementation.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 129691af..825d055f 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -538,12 +538,29 @@ def get_chunks(self, n_chunks : Optional[int] = None) -> Iterable['_PandasDataFr # Roundtrip testing # ----------------- +def assert_buffer_equal(buffer_dtype: Tuple[_PandasBuffer, Any], pdcol:pd.Series): + buf, dtype = buffer_dtype + pytest.raises(NotImplementedError, buf.__dlpack__) + assert buf.__dlpack_device__() == (1, None) + # It seems that `bitwidth` is handled differently for `int` and `category` + # assert dtype[1] == pdcol.dtype.itemsize * 8, f"{dtype[1]} is not {pdcol.dtype.itemsize}" + # print(pdcol) + # if isinstance(pdcol, pd.CategoricalDtype): + # col = pdcol.values.codes + # else: + # col = pdcol + + # assert dtype[1] == col.dtype.itemsize * 8, f"{dtype[1]} is not {col.dtype.itemsize * 8}" + # assert dtype[2] == col.dtype.str, f"{dtype[2]} is not {col.dtype.str}" + + def assert_column_equal(col: _PandasColumn, pdcol:pd.Series): assert col.size == pdcol.size assert col.offset == 0 assert col.null_count == pdcol.isnull().sum() assert col.num_chunks() == 1 pytest.raises(RuntimeError, col.get_mask) + assert_buffer_equal(col.get_data_buffer(), pdcol) def assert_dataframe_equal(dfo: DataFrameObject, df:pd.DataFrame): assert dfo.num_columns() == len(df.columns) From 1c2f0392d50e82d8e3a80d1968545a6f33893677 Mon Sep 17 00:00:00 2001 From: iskode Date: Thu, 26 Aug 2021 17:24:15 +0000 Subject: [PATCH 5/5] update assert_dataframe_equal with recent changes --- protocol/pandas_implementation.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/protocol/pandas_implementation.py b/protocol/pandas_implementation.py index 5fef41e0..344cb107 100644 --- a/protocol/pandas_implementation.py +++ b/protocol/pandas_implementation.py @@ -786,8 +786,9 @@ def assert_column_equal(col: _PandasColumn, pdcol:pd.Series): assert col.offset == 0 assert col.null_count == pdcol.isnull().sum() assert col.num_chunks() == 1 - pytest.raises(RuntimeError, col.get_mask) - assert_buffer_equal(col.get_data_buffer(), pdcol) + if col.dtype[0] != _DtypeKind.STRING: + pytest.raises(RuntimeError, col._get_validity_buffer) + assert_buffer_equal(col._get_data_buffer(), pdcol) def assert_dataframe_equal(dfo: DataFrameObject, df:pd.DataFrame): assert dfo.num_columns() == len(df.columns) @@ -817,6 +818,7 @@ def test_noncontiguous_columns(): df = pd.DataFrame(arr, columns=['a', 'b', 'c']) assert df['a'].to_numpy().strides == (24,) df2 = from_dataframe(df) # uses default of allow_copy=True + assert_dataframe_equal(df.__dataframe__(), df) tm.assert_frame_equal(df, df2) with pytest.raises(RuntimeError): @@ -853,6 +855,8 @@ def test_string_dtype(): assert col.describe_null == (4, 0) assert col.num_chunks() == 1 + assert_dataframe_equal(df.__dataframe__(), df) + def test_metadata(): df = pd.DataFrame({'A': [1, 2, 3, 4],'B': [1, 2, 3, 4]}) @@ -869,6 +873,7 @@ def test_metadata(): assert col_metadata[key] == expected[key] df2 = from_dataframe(df) + assert_dataframe_equal(df.__dataframe__(), df) tm.assert_frame_equal(df, df2)