Skip to content

REGR: isin with nullable types with missing values raising #42473

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Jul 14, 2021
Merged
18 changes: 3 additions & 15 deletions asv_bench/benchmarks/algos/isin.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
import numpy as np

try:
from pandas.compat import np_version_under1p20
except ImportError:
from pandas.compat.numpy import _np_version_under1p20 as np_version_under1p20

from pandas import (
Categorical,
NaT,
Expand Down Expand Up @@ -283,10 +278,6 @@ class IsInLongSeriesLookUpDominates:
def setup(self, dtype, MaxNumber, series_type):
N = 10 ** 7

# https://github.com/pandas-dev/pandas/issues/39844
if not np_version_under1p20 and dtype in ("Int64", "Float64"):
raise NotImplementedError

if series_type == "random_hits":
array = np.random.randint(0, MaxNumber, N)
if series_type == "random_misses":
Expand All @@ -297,7 +288,8 @@ def setup(self, dtype, MaxNumber, series_type):
array = np.arange(N) + MaxNumber

self.series = Series(array).astype(dtype)
self.values = np.arange(MaxNumber).astype(dtype)

self.values = np.arange(MaxNumber).astype(dtype.lower())

def time_isin(self, dtypes, MaxNumber, series_type):
self.series.isin(self.values)
Expand All @@ -313,16 +305,12 @@ class IsInLongSeriesValuesDominate:
def setup(self, dtype, series_type):
N = 10 ** 7

# https://github.com/pandas-dev/pandas/issues/39844
if not np_version_under1p20 and dtype in ("Int64", "Float64"):
raise NotImplementedError

if series_type == "random":
vals = np.random.randint(0, 10 * N, N)
if series_type == "monotone":
vals = np.arange(N)

self.values = vals.astype(dtype)
self.values = vals.astype(dtype.lower())
M = 10 ** 6 + 1
self.series = Series(np.arange(M)).astype(dtype)

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Fixed regressions
- Performance regression in :class:`DataFrame` in reduction operations requiring casting such as :meth:`DataFrame.mean` on integer data (:issue:`38592`)
- Performance regression in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when the ``orient`` argument is one of "records", "dict", or "split" (:issue:`42352`)
- Fixed regression in indexing with a ``list`` subclass incorrectly raising ``TypeError`` (:issue:`42433`, :issue:`42461`)
- Fixed regression in :meth:`DataFrame.isin` and :meth:`Series.isin` raising ``TypeError`` with nullable data containing at least one missing value (:issue:`42405`)
-

.. ---------------------------------------------------------------------------
Expand Down
18 changes: 13 additions & 5 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,12 +403,20 @@ def isin(self, values) -> BooleanArray: # type: ignore[override]

from pandas.core.arrays import BooleanArray

result = isin(self._data, values)
# algorithms.isin will eventually convert values to an ndarray, so there is no
# extra cost to doing the conversion here first
values_arr = np.asarray(values)
result = isin(self._data, values_arr)

if self._hasna:
if libmissing.NA in values:
result += self._mask
else:
result *= np.invert(self._mask)
values_have_NA = is_object_dtype(values_arr.dtype) and any(
val is self.dtype.na_value for val in values_arr
)

# For now, NA does not propagate so set result according to presence of NA,
# see https://github.com/pandas-dev/pandas/pull/38379 for some discussion
result[self._mask] = values_have_NA
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line should be equivalent to the deleted if/else AFAICT, but I think it makes the logic easier to follow.


mask = np.zeros_like(self, dtype=bool)
return BooleanArray(result, mask, copy=False)

Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/series/methods/test_isin.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,27 @@ def test_isin_float_in_int_series(self, values):
expected = Series([True, False])
tm.assert_series_equal(result, expected)

# Regression test for ``Series.isin`` on nullable dtypes ("boolean", "Int64",
# "Float64"), with and without missing values in the data and in ``values``.
# Each case is (data, values, expected): per the cases below, a pd.NA element
# in the data is expected to be True only when pd.NA itself is among
# ``values``; np.nan, pd.NaT and None in ``values`` do not match it.
@pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"])
@pytest.mark.parametrize(
"data,values,expected",
[
([0, 1, 0], [1], [False, True, False]),
([0, 1, 0], [1, pd.NA], [False, True, False]),
([0, pd.NA, 0], [1, 0], [True, False, True]),
([0, 1, pd.NA], [1, pd.NA], [False, True, True]),
([0, 1, pd.NA], [1, np.nan], [False, True, False]),
([0, pd.NA, pd.NA], [np.nan, pd.NaT, None], [False, False, False]),
],
)
def test_isin_masked_types(self, dtype, data, values, expected):
# GH#42405
# Build the nullable Series under test from the parametrized data.
ser = Series(data, dtype=dtype)

result = ser.isin(values)
# The expected result is a "boolean"-dtype Series holding only True/False.
expected = Series(expected, dtype="boolean")

tm.assert_series_equal(result, expected)


@pytest.mark.slow
def test_isin_large_series_mixed_dtypes_and_nan():
Expand Down