Skip to content

Backport PR #57665 on branch 2.2.x (BUG: interchange protocol with nullable datatypes a non-null validity) #57769

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ including other versions of pandas.

Fixed regressions
~~~~~~~~~~~~~~~~~
- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable one with missing values (:issue:`56702`)
-

.. ---------------------------------------------------------------------------
Expand Down
18 changes: 17 additions & 1 deletion pandas/core/interchange/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,10 @@ def describe_categorical(self):

@property
def describe_null(self):
if isinstance(self._col.dtype, BaseMaskedDtype):
column_null_dtype = ColumnNullType.USE_BYTEMASK
null_value = 1
return column_null_dtype, null_value
kind = self.dtype[0]
try:
null, value = _NULL_DESCRIPTION[kind]
Expand Down Expand Up @@ -290,7 +294,13 @@ def _get_data_buffer(
if self.dtype[0] == DtypeKind.DATETIME and len(self.dtype[2]) > 4:
np_arr = self._col.dt.tz_convert(None).to_numpy()
else:
np_arr = self._col.to_numpy()
arr = self._col.array
if isinstance(self._col.dtype, BaseMaskedDtype):
np_arr = arr._data # type: ignore[attr-defined]
elif isinstance(self._col.dtype, ArrowDtype):
raise NotImplementedError("ArrowDtype not handled yet")
else:
np_arr = arr._ndarray # type: ignore[attr-defined]
buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
dtype = self.dtype
elif self.dtype[0] == DtypeKind.CATEGORICAL:
Expand Down Expand Up @@ -328,6 +338,12 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
"""
null, invalid = self.describe_null

if isinstance(self._col.dtype, BaseMaskedDtype):
mask = self._col.array._mask # type: ignore[attr-defined]
buffer = PandasBuffer(mask)
dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)
return buffer, dtype

if self.dtype[0] == DtypeKind.STRING:
# For now, use byte array as the mask.
# TODO: maybe store as bit array to save space?..
Expand Down
44 changes: 38 additions & 6 deletions pandas/tests/interchange/test_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
is_platform_windows,
)
from pandas.compat.numpy import np_version_lt1p23
import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm
Expand Down Expand Up @@ -404,17 +403,50 @@ def test_non_str_names_w_duplicates():
pd.api.interchange.from_dataframe(dfi, allow_copy=False)


@pytest.mark.parametrize(
"dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))]
)
def test_nullable_integers(dtype: str) -> None:
def test_nullable_integers() -> None:
# https://github.com/pandas-dev/pandas/issues/55069
# Round-trip a one-row frame with the nullable "Int8" dtype (no missing
# values) through ``__dataframe__`` / ``from_dataframe``; the frame that
# comes back is expected to hold the same value with the "int8" dtype.
df = pd.DataFrame({"a": [1]}, dtype="Int8")
expected = pd.DataFrame({"a": [1]}, dtype="int8")
result = pd.api.interchange.from_dataframe(df.__dataframe__())
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/57664")
def test_nullable_integers_pyarrow() -> None:
# https://github.com/pandas-dev/pandas/issues/55069
df = pd.DataFrame({"a": [1]}, dtype=dtype)
df = pd.DataFrame({"a": [1]}, dtype="Int8[pyarrow]")
expected = pd.DataFrame({"a": [1]}, dtype="int8")
result = pd.api.interchange.from_dataframe(df.__dataframe__())
tm.assert_frame_equal(result, expected)


# Checks that pandas nullable columns containing a missing value are consumed
# correctly by pyarrow through the interchange protocol. Each case supplies the
# raw values, the pandas nullable dtype used to build the column, and the type
# the pyarrow result is expected to report.
@pytest.mark.parametrize(
("data", "dtype", "expected_dtype"),
[
([1, 2, None], "Int64", "int64"),
(
[1, 2, None],
"UInt64",
"uint64",
),
([1.0, 2.25, None], "Float32", "float32"),
],
)
def test_pandas_nullable_w_missing_values(
data: list, dtype: str, expected_dtype: str
) -> None:
# https://github.com/pandas-dev/pandas/issues/57643
# Skip when pyarrow is unavailable or older than 11.0.0; the consumer used
# below lives in ``pyarrow.interchange``, so it is imported only afterwards.
pytest.importorskip("pyarrow", "11.0.0")
import pyarrow.interchange as pai

df = pd.DataFrame({"a": data}, dtype=dtype)
# Hand the pandas interchange object to pyarrow and take column "a".
result = pai.from_dataframe(df.__dataframe__())["a"]
assert result.type == expected_dtype
# The two present values must survive unchanged ...
assert result[0].as_py() == data[0]
assert result[1].as_py() == data[1]
# ... and the missing entry must come back as a null, not a data value.
assert result[2].as_py() is None


def test_empty_dataframe():
# https://github.com/pandas-dev/pandas/issues/56700
df = pd.DataFrame({"a": []}, dtype="int8")
Expand Down