Skip to content

Commit 03717bc

Browse files
authored
BUG: interchange protocol with nullable datatypes and a non-null validity (pandas-dev#57665)
* BUG: interchange protocol with nullable datatypes and a non-null validity provides nonsense results * whatsnew * 🏷️ typing * parametrise over more types * move whatsnew
1 parent c9f876c commit 03717bc

File tree

3 files changed

+56
-7
lines changed

3 files changed

+56
-7
lines changed

doc/source/whatsnew/v2.2.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ including other versions of pandas.
1313

1414
Fixed regressions
1515
~~~~~~~~~~~~~~~~~
16+
- :meth:`DataFrame.__dataframe__` was producing incorrect data buffers when a column's type was a pandas nullable one with missing values (:issue:`56702`)
1617
-
1718

1819
.. ---------------------------------------------------------------------------

pandas/core/interchange/column.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,10 @@ def describe_categorical(self):
190190

191191
@property
192192
def describe_null(self):
193+
if isinstance(self._col.dtype, BaseMaskedDtype):
194+
column_null_dtype = ColumnNullType.USE_BYTEMASK
195+
null_value = 1
196+
return column_null_dtype, null_value
193197
kind = self.dtype[0]
194198
try:
195199
null, value = _NULL_DESCRIPTION[kind]
@@ -298,7 +302,13 @@ def _get_data_buffer(
298302
DtypeKind.FLOAT,
299303
DtypeKind.BOOL,
300304
):
301-
np_arr = self._col.to_numpy()
305+
arr = self._col.array
306+
if isinstance(self._col.dtype, BaseMaskedDtype):
307+
np_arr = arr._data # type: ignore[attr-defined]
308+
elif isinstance(self._col.dtype, ArrowDtype):
309+
raise NotImplementedError("ArrowDtype not handled yet")
310+
else:
311+
np_arr = arr._ndarray # type: ignore[attr-defined]
302312
buffer = PandasBuffer(np_arr, allow_copy=self._allow_copy)
303313
dtype = self.dtype
304314
elif self.dtype[0] == DtypeKind.CATEGORICAL:
@@ -341,6 +351,12 @@ def _get_validity_buffer(self) -> tuple[PandasBuffer, Any]:
341351
"""
342352
null, invalid = self.describe_null
343353

354+
if isinstance(self._col.dtype, BaseMaskedDtype):
355+
mask = self._col.array._mask # type: ignore[attr-defined]
356+
buffer = PandasBuffer(mask)
357+
dtype = (DtypeKind.BOOL, 8, ArrowCTypes.BOOL, Endianness.NATIVE)
358+
return buffer, dtype
359+
344360
if self.dtype[0] == DtypeKind.STRING:
345361
# For now, use byte array as the mask.
346362
# TODO: maybe store as bit array to save space?..

pandas/tests/interchange/test_impl.py

+38-6
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
is_ci_environment,
99
is_platform_windows,
1010
)
11-
import pandas.util._test_decorators as td
1211

1312
import pandas as pd
1413
import pandas._testing as tm
@@ -417,17 +416,50 @@ def test_non_str_names_w_duplicates():
417416
pd.api.interchange.from_dataframe(dfi, allow_copy=False)
418417

419418

420-
@pytest.mark.parametrize(
421-
"dtype", ["Int8", pytest.param("Int8[pyarrow]", marks=td.skip_if_no("pyarrow"))]
422-
)
423-
def test_nullable_integers(dtype: str) -> None:
419+
def test_nullable_integers() -> None:
424420
# https://github.com/pandas-dev/pandas/issues/55069
425-
df = pd.DataFrame({"a": [1]}, dtype=dtype)
421+
df = pd.DataFrame({"a": [1]}, dtype="Int8")
426422
expected = pd.DataFrame({"a": [1]}, dtype="int8")
427423
result = pd.api.interchange.from_dataframe(df.__dataframe__())
428424
tm.assert_frame_equal(result, expected)
429425

430426

427+
@pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/57664")
428+
def test_nullable_integers_pyarrow() -> None:
429+
# https://github.com/pandas-dev/pandas/issues/55069
430+
df = pd.DataFrame({"a": [1]}, dtype="Int8[pyarrow]")
431+
expected = pd.DataFrame({"a": [1]}, dtype="int8")
432+
result = pd.api.interchange.from_dataframe(df.__dataframe__())
433+
tm.assert_frame_equal(result, expected)
434+
435+
436+
@pytest.mark.parametrize(
437+
("data", "dtype", "expected_dtype"),
438+
[
439+
([1, 2, None], "Int64", "int64"),
440+
(
441+
[1, 2, None],
442+
"UInt64",
443+
"uint64",
444+
),
445+
([1.0, 2.25, None], "Float32", "float32"),
446+
],
447+
)
448+
def test_pandas_nullable_w_missing_values(
449+
data: list, dtype: str, expected_dtype: str
450+
) -> None:
451+
# https://github.com/pandas-dev/pandas/issues/57643
452+
pytest.importorskip("pyarrow", "11.0.0")
453+
import pyarrow.interchange as pai
454+
455+
df = pd.DataFrame({"a": data}, dtype=dtype)
456+
result = pai.from_dataframe(df.__dataframe__())["a"]
457+
assert result.type == expected_dtype
458+
assert result[0].as_py() == data[0]
459+
assert result[1].as_py() == data[1]
460+
assert result[2].as_py() is None
461+
462+
431463
def test_empty_dataframe():
432464
# https://github.com/pandas-dev/pandas/issues/56700
433465
df = pd.DataFrame({"a": []}, dtype="int8")

0 commit comments

Comments
 (0)