Skip to content

Commit f8855eb

Browse files
authored
REGR: isin with nullable types with missing values raising (#42473)
1 parent 2001798 commit f8855eb

File tree

4 files changed

+38
-20
lines changed

4 files changed

+38
-20
lines changed

asv_bench/benchmarks/algos/isin.py

+3-15
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,5 @@
11
import numpy as np
22

3-
try:
4-
from pandas.compat import np_version_under1p20
5-
except ImportError:
6-
from pandas.compat.numpy import _np_version_under1p20 as np_version_under1p20
7-
83
from pandas import (
94
Categorical,
105
NaT,
@@ -283,10 +278,6 @@ class IsInLongSeriesLookUpDominates:
283278
def setup(self, dtype, MaxNumber, series_type):
284279
N = 10 ** 7
285280

286-
# https://github.com/pandas-dev/pandas/issues/39844
287-
if not np_version_under1p20 and dtype in ("Int64", "Float64"):
288-
raise NotImplementedError
289-
290281
if series_type == "random_hits":
291282
array = np.random.randint(0, MaxNumber, N)
292283
if series_type == "random_misses":
@@ -297,7 +288,8 @@ def setup(self, dtype, MaxNumber, series_type):
297288
array = np.arange(N) + MaxNumber
298289

299290
self.series = Series(array).astype(dtype)
300-
self.values = np.arange(MaxNumber).astype(dtype)
291+
292+
self.values = np.arange(MaxNumber).astype(dtype.lower())
301293

302294
def time_isin(self, dtypes, MaxNumber, series_type):
303295
self.series.isin(self.values)
@@ -313,16 +305,12 @@ class IsInLongSeriesValuesDominate:
313305
def setup(self, dtype, series_type):
314306
N = 10 ** 7
315307

316-
# https://github.com/pandas-dev/pandas/issues/39844
317-
if not np_version_under1p20 and dtype in ("Int64", "Float64"):
318-
raise NotImplementedError
319-
320308
if series_type == "random":
321309
vals = np.random.randint(0, 10 * N, N)
322310
if series_type == "monotone":
323311
vals = np.arange(N)
324312

325-
self.values = vals.astype(dtype)
313+
self.values = vals.astype(dtype.lower())
326314
M = 10 ** 6 + 1
327315
self.series = Series(np.arange(M)).astype(dtype)
328316

doc/source/whatsnew/v1.3.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ Fixed regressions
2222
- Performance regression in :class:`DataFrame` in reduction operations requiring casting such as :meth:`DataFrame.mean` on integer data (:issue:`38592`)
2323
- Performance regression in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when the ``orient`` argument is one of "records", "dict", or "split" (:issue:`42352`)
2424
- Fixed regression in indexing with a ``list`` subclass incorrectly raising ``TypeError`` (:issue:`42433`, :issue:`42461`)
25+
- Fixed regression in :meth:`DataFrame.isin` and :meth:`Series.isin` raising ``TypeError`` with nullable data containing at least one missing value (:issue:`42405`)
2526
-
2627

2728
.. ---------------------------------------------------------------------------

pandas/core/arrays/masked.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -403,12 +403,20 @@ def isin(self, values) -> BooleanArray: # type: ignore[override]
403403

404404
from pandas.core.arrays import BooleanArray
405405

406-
result = isin(self._data, values)
406+
# algorithms.isin will eventually convert values to an ndarray, so no extra
407+
# cost to doing it here first
408+
values_arr = np.asarray(values)
409+
result = isin(self._data, values_arr)
410+
407411
if self._hasna:
408-
if libmissing.NA in values:
409-
result += self._mask
410-
else:
411-
result *= np.invert(self._mask)
412+
values_have_NA = is_object_dtype(values_arr.dtype) and any(
413+
val is self.dtype.na_value for val in values_arr
414+
)
415+
416+
# For now, NA does not propagate so set result according to presence of NA,
417+
# see https://github.com/pandas-dev/pandas/pull/38379 for some discussion
418+
result[self._mask] = values_have_NA
419+
412420
mask = np.zeros_like(self, dtype=bool)
413421
return BooleanArray(result, mask, copy=False)
414422

pandas/tests/series/methods/test_isin.py

+21
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,27 @@ def test_isin_float_in_int_series(self, values):
156156
expected = Series([True, False])
157157
tm.assert_series_equal(result, expected)
158158

159+
@pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"])
160+
@pytest.mark.parametrize(
161+
"data,values,expected",
162+
[
163+
([0, 1, 0], [1], [False, True, False]),
164+
([0, 1, 0], [1, pd.NA], [False, True, False]),
165+
([0, pd.NA, 0], [1, 0], [True, False, True]),
166+
([0, 1, pd.NA], [1, pd.NA], [False, True, True]),
167+
([0, 1, pd.NA], [1, np.nan], [False, True, False]),
168+
([0, pd.NA, pd.NA], [np.nan, pd.NaT, None], [False, False, False]),
169+
],
170+
)
171+
def test_isin_masked_types(self, dtype, data, values, expected):
172+
# GH#42405
173+
ser = Series(data, dtype=dtype)
174+
175+
result = ser.isin(values)
176+
expected = Series(expected, dtype="boolean")
177+
178+
tm.assert_series_equal(result, expected)
179+
159180

160181
@pytest.mark.slow
161182
def test_isin_large_series_mixed_dtypes_and_nan():

0 commit comments

Comments
 (0)