diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 058f7aebcd538..96f210ce6b7b9 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -13,6 +13,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when the a column's type was a pandas nullable on with missing values (:issue:`56702`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/interchange/column.py b/pandas/core/interchange/column.py index 7effc42d5ba28..bf20f0b5433cd 100644 --- a/pandas/core/interchange/column.py +++ b/pandas/core/interchange/column.py @@ -190,6 +190,10 @@ def describe_categorical(self): @property def describe_null(self): + if isinstance(self._col.dtype, BaseMaskedDtype): + column_null_dtype = ColumnNullType.USE_BYTEMASK + null_value = 1 + return column_null_dtype, null_value kind = self.dtype[0] try: null, value = _NULL_DESCRIPTION[kind] @@ -298,7 +302,13 @@ def _get_data_buffer( DtypeKind.FLOAT, DtypeKind.BOOL, ): - np_arr = self._col.to_numpy() + arr = self._col.array + if isinstance(self._col.dtype, BaseMaskedDtype): + np_arr = arr._data # type: ignore[attr-defined] + elif isinstance(self._col.dtype, ArrowDtype): + raise NotImplementedError("ArrowDtype not handled yet") + else: + np_arr = arr._ndarray # type: ignore[attr-defined] buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy) dtype = self.dtype elif self.dtype[0] == DtypeKind.CATEGORICAL: @@ -341,6 +351,12 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]: """ null, invalid = self.describe_null + if isinstance(self._col.dtype, BaseMaskedDtype): + mask = self._col.array._mask # type: ignore[attr-defined] + buffer = PandasBuffer(mask) + dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE) + return buffer, dtype + if self.dtype[0] == DtypeKind.STRING: # For now, use byte array as the mask. # TODO: maybe store as bit array to save space?.. diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index e4fa6e4451a4c..94b2da894ad0f 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -8,7 +8,6 @@ is_ci_environment, is_platform_windows, ) -import pandas.util._test_decorators as td import pandas as pd import pandas._testing as tm @@ -417,17 +416,50 @@ def test_non_str_names_w_duplicates(): pd.api.interchange.from_dataframe(dfi, allow_copy=False) -@pytest.mark.parametrize( - "dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))] -) -def test_nullable_integers(dtype: str) -> None: +def test_nullable_integers() -> None: # https://github.com/pandas-dev/pandas/issues/55069 - df = pd.DataFrame({"a": [1]}, dtype=dtype) + df = pd.DataFrame({"a": [1]}, dtype="Int8") expected = pd.DataFrame({"a": [1]}, dtype="int8") result = pd.api.interchange.from_dataframe(df.__dataframe__()) tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/57664") +def test_nullable_integers_pyarrow() -> None: + # https://github.com/pandas-dev/pandas/issues/55069 + df = pd.DataFrame({"a": [1]}, dtype="Int8[pyarrow]") + expected = pd.DataFrame({"a": [1]}, dtype="int8") + result = pd.api.interchange.from_dataframe(df.__dataframe__()) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("data", "dtype", "expected_dtype"), + [ + ([1, 2, None], "Int64", "int64"), + ( + [1, 2, None], + "UInt64", + "uint64", + ), + ([1.0, 2.25, None], "Float32", "float32"), + ], +) +def test_pandas_nullable_w_missing_values( + data: list, dtype: str, expected_dtype: str +) -> None: + # https://github.com/pandas-dev/pandas/issues/57643 + pytest.importorskip("pyarrow", "11.0.0") + import pyarrow.interchange as pai + + df = pd.DataFrame({"a": data}, dtype=dtype) + result = pai.from_dataframe(df.__dataframe__())["a"] + assert result.type == expected_dtype + assert result[0].as_py() == data[0] + assert result[1].as_py() == data[1] + assert result[2].as_py() is None + + def test_empty_dataframe(): # https://github.com/pandas-dev/pandas/issues/56700 df = pd.DataFrame({"a": []}, dtype="int8")