Skip to content

Backport PR #42473: REGR: isin with nullable types with missing values raising #42543

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 3 additions & 12 deletions asv_bench/benchmarks/algos/isin.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import numpy as np

from pandas.compat.numpy import np_version_under1p20

from pandas import (
Categorical,
NaT,
Expand Down Expand Up @@ -280,10 +278,6 @@ class IsInLongSeriesLookUpDominates:
def setup(self, dtype, MaxNumber, series_type):
N = 10 ** 7

# https://github.com/pandas-dev/pandas/issues/39844
if not np_version_under1p20 and dtype in ("Int64", "Float64"):
raise NotImplementedError

if series_type == "random_hits":
array = np.random.randint(0, MaxNumber, N)
if series_type == "random_misses":
Expand All @@ -294,7 +288,8 @@ def setup(self, dtype, MaxNumber, series_type):
array = np.arange(N) + MaxNumber

self.series = Series(array).astype(dtype)
self.values = np.arange(MaxNumber).astype(dtype)

self.values = np.arange(MaxNumber).astype(dtype.lower())

# Benchmark body: calls ``isin`` on the series built in ``setup`` against the
# lookup values ``setup`` stored on ``self``. The parameters are not used here.
# NOTE(review): presumably timed by asv (file lives under asv_bench) - confirm.
def time_isin(self, dtypes, MaxNumber, series_type):
self.series.isin(self.values)
Expand All @@ -310,16 +305,12 @@ class IsInLongSeriesValuesDominate:
def setup(self, dtype, series_type):
N = 10 ** 7

# https://github.com/pandas-dev/pandas/issues/39844
if not np_version_under1p20 and dtype in ("Int64", "Float64"):
raise NotImplementedError

if series_type == "random":
vals = np.random.randint(0, 10 * N, N)
if series_type == "monotone":
vals = np.arange(N)

self.values = vals.astype(dtype)
self.values = vals.astype(dtype.lower())
M = 10 ** 6 + 1
self.series = Series(np.arange(M)).astype(dtype)

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Fixed regressions
- Performance regression in :class:`DataFrame` in reduction operations requiring casting such as :meth:`DataFrame.mean` on integer data (:issue:`38592`)
- Performance regression in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when the ``orient`` argument is one of "records", "dict", or "split" (:issue:`42352`)
- Fixed regression in indexing with a ``list`` subclass incorrectly raising ``TypeError`` (:issue:`42433`, :issue:`42461`)
- Fixed regression in :meth:`DataFrame.isin` and :meth:`Series.isin` raising ``TypeError`` with nullable data containing at least one missing value (:issue:`42405`)
-

.. ---------------------------------------------------------------------------
Expand Down
18 changes: 13 additions & 5 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,12 +403,20 @@ def isin(self, values) -> BooleanArray: # type: ignore[override]

from pandas.core.arrays import BooleanArray

result = isin(self._data, values)
# algorithms.isin will eventually convert values to an ndarray, so no extra
# cost to doing it here first
values_arr = np.asarray(values)
result = isin(self._data, values_arr)

if self._hasna:
if libmissing.NA in values:
result += self._mask
else:
result *= np.invert(self._mask)
values_have_NA = is_object_dtype(values_arr.dtype) and any(
val is self.dtype.na_value for val in values_arr
)

# For now, NA does not propagate so set result according to presence of NA,
# see https://github.com/pandas-dev/pandas/pull/38379 for some discussion
result[self._mask] = values_have_NA

mask = np.zeros_like(self, dtype=bool)
return BooleanArray(result, mask, copy=False)

Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/series/methods/test_isin.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,27 @@ def test_isin_float_in_int_series(self, values):
expected = Series([True, False])
tm.assert_series_equal(result, expected)

# Regression test for GH#42405: ``Series.isin`` on masked (nullable) dtypes
# must not raise when missing values are involved, and the result is compared
# against a Series of dtype "boolean". Every data/values case below is run
# once for each of the three nullable dtypes.
@pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"])
@pytest.mark.parametrize(
"data,values,expected",
[
# no missing values on either side
([0, 1, 0], [1], [False, True, False]),
# pd.NA only in ``values``: no additional element matches
([0, 1, 0], [1, pd.NA], [False, True, False]),
# pd.NA only in the data: the missing element is reported as False
([0, pd.NA, 0], [1, 0], [True, False, True]),
# pd.NA in both the data and ``values``: the missing element is True
([0, 1, pd.NA], [1, pd.NA], [False, True, True]),
# np.nan in ``values`` does not count as a match for pd.NA in the data
([0, 1, pd.NA], [1, np.nan], [False, True, False]),
# np.nan, pd.NaT and None in ``values`` do not match pd.NA in the data
([0, pd.NA, pd.NA], [np.nan, pd.NaT, None], [False, False, False]),
],
)
def test_isin_masked_types(self, dtype, data, values, expected):
# GH#42405
ser = Series(data, dtype=dtype)

result = ser.isin(values)
# the expected list is wrapped with the nullable "boolean" dtype so that
# assert_series_equal compares like with like
expected = Series(expected, dtype="boolean")

tm.assert_series_equal(result, expected)


@pytest.mark.slow
def test_isin_large_series_mixed_dtypes_and_nan():
Expand Down