Skip to content

API: membership checks on ExtensionArray containing NA values #37867

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Nov 29, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -752,6 +752,7 @@ ExtensionArray
- Fixed bug when applying a NumPy ufunc with multiple outputs to an :class:`.IntegerArray` returning None (:issue:`36913`)
- Fixed an inconsistency in :class:`.PeriodArray`'s ``__init__`` signature to those of :class:`.DatetimeArray` and :class:`.TimedeltaArray` (:issue:`37289`)
- Reductions for :class:`.BooleanArray`, :class:`.Categorical`, :class:`.DatetimeArray`, :class:`.FloatingArray`, :class:`.IntegerArray`, :class:`.PeriodArray`, :class:`.TimedeltaArray`, and :class:`.PandasArray` are now keyword-only methods (:issue:`37541`)
- Fixed a bug where a ``TypeError`` was wrongly raised if a membership check was made on an ``ExtensionArray`` containing nan-like values (:issue:`37867`)

Other
^^^^^
Expand Down
18 changes: 18 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
is_array_like,
is_dtype_equal,
is_list_like,
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
Expand Down Expand Up @@ -354,6 +355,23 @@ def __iter__(self):
for i in range(len(self)):
yield self[i]

def __contains__(self, item) -> bool:
"""
Return for `item in self`.
"""
# GH37867
# comparisons of any item to pd.NA always return pd.NA, so e.g. "a" in [pd.NA]
# would raise a TypeError. The implementation below works around that.
if is_scalar(item) and isna(item):
if not self._can_hold_na:
return False
elif item is self.dtype.na_value or isinstance(item, self.dtype.type):
return self.isna().any()
else:
return False
else:
return (item == self).any()

def __eq__(self, other: Any) -> ArrayLike:
"""
Return for `self == other` (element-wise equality).
Expand Down
4 changes: 4 additions & 0 deletions pandas/tests/extension/arrow/test_bool.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ def test_view(self, data):
# __setitem__ does not work, so we only have a smoke-test
data.view()

@pytest.mark.xfail(raises=AssertionError, reason="Not implemented yet")
def test_contains(self, data, data_missing, nulls_fixture):
super().test_contains(data, data_missing, nulls_fixture)


class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
def test_from_dtype(self, data):
Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/extension/base/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,29 @@ def test_can_hold_na_valid(self, data):
# GH-20761
assert data._can_hold_na is True

def test_contains(self, data, data_missing, nulls_fixture):
# GH-37867
# Tests for membership checks. Membership checks for nan-likes is tricky and
# the settled on rule is: `nan_like in arr` is True if nan_like is
# arr.dtype.na_value and arr.isna().any() is True. Else the check returns False.

na_value = data.dtype.na_value
# ensure data without missing values
data = data[~data.isna()]

# first elements are non-missing
assert data[0] in data
assert data_missing[0] in data_missing

# check the presence of na_value
assert na_value in data_missing
assert na_value not in data

if nulls_fixture is not na_value:
# the data can never contain other nan-likes than na_value
assert nulls_fixture not in data
assert nulls_fixture not in data_missing

def test_memory_usage(self, data):
s = pd.Series(data)
result = s.memory_usage(index=False)
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/extension/decimal/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,14 @@ def __setitem__(self, key, value):
def __len__(self) -> int:
return len(self._data)

def __contains__(self, item) -> bool:
if not isinstance(item, decimal.Decimal):
return False
elif item.is_nan():
return self.isna().any()
else:
return super().__contains__(item)

@property
def nbytes(self) -> int:
n = len(self)
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/extension/json/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,13 @@ def test_custom_asserts(self):
with pytest.raises(AssertionError, match=msg):
self.assert_frame_equal(a.to_frame(), b.to_frame())

@pytest.mark.xfail(
reason="comparison method not implemented for JSONArray (GH-37867)"
)
def test_contains(self, data):
# GH-37867
super().test_contains(data)


class TestConstructors(BaseJSON, base.BaseConstructorsTests):
@pytest.mark.skip(reason="not implemented constructor from dtype")
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/extension/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,28 @@ def test_memory_usage(self, data):
# Is this deliberate?
super().test_memory_usage(data)

def test_contains(self, data, data_missing, nulls_fixture):
# GH-37867
# na value handling in Categorical.__contains__ is deprecated.
# See base.BaseInterFaceTests.test_contains for more details.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This duplicates what you have in tests/arrays/categorical/test_operators.py ?

Copy link
Contributor Author

@topper-123 topper-123 Nov 28, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately not: Categorical already had a __contains__ method and it's more permissive than the new one. So, in this file we have (below) assert na_value_type in data_missing, while the base tests method is assert na_value_type not in data_missing (notice the not).

na values is also more complicated in categoricals, because in some cases we want to accept pd.NaT and in other cases not. I'd like to take it in another round (or let it slide)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be clear, I was not referring to the base tests that this is overriding, but the original test you added to tests/arrays/categorical/test_operators.py which also tests this more permissive behaviour?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. I'll delete the ones in tests/arrays/categorical/test_operators.py.


na_value = data.dtype.na_value
# ensure data without missing values
data = data[~data.isna()]

# first elements are non-missing
assert data[0] in data
assert data_missing[0] in data_missing

# check the presence of na_value
assert na_value in data_missing
assert na_value not in data

# Categoricals can contain other nan-likes than na_value
if nulls_fixture is not na_value:
assert nulls_fixture not in data
assert nulls_fixture in data_missing # this line differs from super method


class TestConstructors(base.BaseConstructorsTests):
pass
Expand Down