From 3bf21ef45a090de52907ba224676996c45acb336 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 28 Feb 2024 18:58:13 +0000 Subject: [PATCH 1/5] BUG: interchange protocol with nullable datatypes a non-null validity provides nonsense results --- pandas/core/interchange/column.py | 18 ++++++++++++++++- pandas/tests/interchange/test_impl.py | 29 +++++++++++++++++++++------ 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 7effc42d5ba28..e149a767ed0d9 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -190,6 +190,10 @@ def describe_categorical(self): @property def describe_null(self): + if isinstance(self._col.dtype, BaseMaskedDtype): + column_null_dtype = ColumnNullType.USE_BYTEMASK + null_value = 1 + return column_null_dtype, null_value kind = self.dtype[0] try: null, value = _NULL_DESCRIPTION[kind] @@ -298,7 +302,13 @@ def _get_data_buffer( DtypeKind.FLOAT, DtypeKind.BOOL, ): - np_arr = self._col.to_numpy() + arr = self._col.array + if isinstance(self._col.dtype, BaseMaskedDtype): + np_arr = arr._data + elif isinstance(self._col.dtype, ArrowDtype): + raise NotImplementedError("ArrowDtype not handled yet") + else: + np_arr = arr._ndarray buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: @@ -341,6 +351,12 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]: """ null, invalid = self.describe_null + if isinstance(self._col.dtype, BaseMaskedDtype): + mask = self._col.array._mask + buffer = PandasBuffer(mask) + dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE) + return buffer, dtype + if self.dtype[0] == DtypeKind.STRING: # For now, use byte array as the mask. # TODO: maybe store as bit array to save space?.. 
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index e4fa6e4451a4c..8519f3995f197 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -8,7 +8,6 @@ is_ci_environment, is_platform_windows, ) -import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -417,17 +416,35 @@ def test_non_str_names_w_duplicates(): pd.api.interchange.from_dataframe(dfi, allow_copy=False) -@pytest.mark.parametrize( - "dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))] -) -def test_nullable_integers(dtype: str) -> None: +def test_nullable_integers() -> None: + # https://github.com/pandas-dev/pandas/issues/55069 + df = pd.DataFrame({"a": [1]}, dtype="Int8") + expected = pd.DataFrame({"a": [1]}, dtype="int8") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/57664") +def test_nullable_integers_pyarrow() -> None: # https://github.com/pandas-dev/pandas/issues/55069 - df = pd.DataFrame({"a": [1]}, dtype=dtype) + df = pd.DataFrame({"a": [1]}, dtype="Int8[pyarrow]") expected = pd.DataFrame({"a": [1]}, dtype="int8") result = pd.api.interchange.from_dataframe(df.__dataframe__()) tm.assert_frame_equal(result, expected) +def test_nullable_integers_w_missing_values() -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + pytest.importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + df = pd.DataFrame({"a": [1, 2, None]}, dtype="Int64") + result = pai.from_dataframe(df.__dataframe__())["a"] + assert result[0].as_py() == 1 + assert result[1].as_py() == 2 + assert result[2].as_py() is None + + def test_empty_dataframe(): # https://github.com/pandas-dev/pandas/issues/56700 df = pd.DataFrame({"a": []}, dtype="int8") From b99da09614c7d586c91980d3dbfd3770cf79453c Mon Sep 17 00:00:00 2001 From: Marco 
Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 28 Feb 2024 19:30:43 +0000 Subject: [PATCH 2/5] whatsnew --- doc/source/whatsnew/v2.2.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index 310dd921e44f6..fb4ecd8008e69 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -19,6 +19,7 @@ Enhancements Fixed regressions ~~~~~~~~~~~~~~~~~ +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the a column's type was a pandas nullable on with missing values (:issue:`56702`) - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression causing overflow for near-minimum timestamps (:issue:`57150`) From 9ccec2d5748fac979bc1236454ef18b9fdb77e47 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 28 Feb 2024 19:55:39 +0000 Subject: [PATCH 3/5] :label: typing --- pandas/core/interchange/column.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index e149a767ed0d9..bf20f0b5433cd 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -304,11 +304,11 @@ def _get_data_buffer( ): arr = self._col.array if isinstance(self._col.dtype, BaseMaskedDtype): - np_arr = arr._data + np_arr = arr._data # type: ignore[attr-defined] elif isinstance(self._col.dtype, ArrowDtype): raise NotImplementedError("ArrowDtype not handled yet") else: - np_arr = arr._ndarray + np_arr = arr._ndarray # type: ignore[attr-defined] buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: @@ -352,7 +352,7 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]: null, invalid = self.describe_null if isinstance(self._col.dtype, 
BaseMaskedDtype): - mask = self._col.array._mask + mask = self._col.array._mask # type: ignore[attr-defined] buffer = PandasBuffer(mask) dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE) return buffer, dtype From 8650b6fb482f9b2ea0e0d0bb0e717de0e0270c56 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 29 Feb 2024 08:43:31 +0000 Subject: [PATCH 4/5] parametrise over more types --- pandas/tests/interchange/test_impl.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 8519f3995f197..94b2da894ad0f 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -433,15 +433,30 @@ def test_nullable_integers_pyarrow() -> None: tm.assert_frame_equal(result, expected) -def test_nullable_integers_w_missing_values() -> None: +@pytest.mark.parametrize( + ("data", "dtype", "expected_dtype"), + [ + ([1, 2, None], "Int64", "int64"), + ( + [1, 2, None], + "UInt64", + "uint64", + ), + ([1.0, 2.25, None], "Float32", "float32"), + ], +) +def test_pandas_nullable_w_missing_values( + data: list, dtype: str, expected_dtype: str +) -> None: # https://github.com/pandas-dev/pandas/issues/57643 pytest.importorskip("pyarrow", "11.0.0") import pyarrow.interchange as pai - df = pd.DataFrame({"a": [1, 2, None]}, dtype="Int64") + df = pd.DataFrame({"a": data}, dtype=dtype) result = pai.from_dataframe(df.__dataframe__())["a"] - assert result[0].as_py() == 1 - assert result[1].as_py() == 2 + assert result.type == expected_dtype + assert result[0].as_py() == data[0] + assert result[1].as_py() == data[1] assert result[2].as_py() is None From b9c15c675c4feac016413b2616c5b0cfdc20b9d3 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 7 Mar 2024 12:55:05 +0000 Subject: [PATCH 5/5] move whatsnew --- 
doc/source/whatsnew/v2.2.1.rst | 1 - doc/source/whatsnew/v2.2.2.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst index fb4ecd8008e69..310dd921e44f6 100644 --- a/doc/source/whatsnew/v2.2.1.rst +++ b/doc/source/whatsnew/v2.2.1.rst @@ -19,7 +19,6 @@ Enhancements Fixed regressions ~~~~~~~~~~~~~~~~~ -- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the a column's type was a pandas nullable on with missing values (:issue:`56702`) - Fixed memory leak in :func:`read_csv` (:issue:`57039`) - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`) - Fixed regression causing overflow for near-minimum timestamps (:issue:`57150`) diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 058f7aebcd538..96f210ce6b7b9 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -13,6 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable one with missing values (:issue:`56702`) - .. ---------------------------------------------------------------------------