diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 427af9307f2c9..5d7a76bc01d49 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -1,7 +1,5 @@ import numpy as np -from pandas.compat.numpy import np_version_under1p20 - from pandas import ( Categorical, NaT, @@ -280,10 +278,6 @@ class IsInLongSeriesLookUpDominates: def setup(self, dtype, MaxNumber, series_type): N = 10 ** 7 - # https://github.com/pandas-dev/pandas/issues/39844 - if not np_version_under1p20 and dtype in ("Int64", "Float64"): - raise NotImplementedError - if series_type == "random_hits": array = np.random.randint(0, MaxNumber, N) if series_type == "random_misses": @@ -294,7 +288,8 @@ def setup(self, dtype, MaxNumber, series_type): array = np.arange(N) + MaxNumber self.series = Series(array).astype(dtype) - self.values = np.arange(MaxNumber).astype(dtype) + + self.values = np.arange(MaxNumber).astype(dtype.lower()) def time_isin(self, dtypes, MaxNumber, series_type): self.series.isin(self.values) @@ -310,16 +305,12 @@ class IsInLongSeriesValuesDominate: def setup(self, dtype, series_type): N = 10 ** 7 - # https://github.com/pandas-dev/pandas/issues/39844 - if not np_version_under1p20 and dtype in ("Int64", "Float64"): - raise NotImplementedError - if series_type == "random": vals = np.random.randint(0, 10 * N, N) if series_type == "monotone": vals = np.arange(N) - self.values = vals.astype(dtype) + self.values = vals.astype(dtype.lower()) M = 10 ** 6 + 1 self.series = Series(np.arange(M)).astype(dtype) diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst index 3dba36bf5b933..8ba0986a2ab7b 100644 --- a/doc/source/whatsnew/v1.3.1.rst +++ b/doc/source/whatsnew/v1.3.1.rst @@ -22,6 +22,7 @@ Fixed regressions - Performance regression in :class:`DataFrame` in reduction operations requiring casting such as :meth:`DataFrame.mean` on integer data (:issue:`38592`) - Performance regression in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when ``orient`` argument one of "records", "dict", or "split" (:issue:`42352`) - Fixed regression in indexing with a ``list`` subclass incorrectly raising ``TypeError`` (:issue:`42433`, :issue:`42461`) +- Fixed regression in :meth:`DataFrame.isin` and :meth:`Series.isin` raising ``TypeError`` with nullable data containing at least one missing value (:issue:`42405`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index c4b9fab28c27e..3a152bd5889b7 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -403,12 +403,20 @@ def isin(self, values) -> BooleanArray: # type: ignore[override] from pandas.core.arrays import BooleanArray - result = isin(self._data, values) + # algorithms.isin will eventually convert values to an ndarray, so no extra + # cost to doing it here first + values_arr = np.asarray(values) + result = isin(self._data, values_arr) + if self._hasna: - if libmissing.NA in values: - result += self._mask - else: - result *= np.invert(self._mask) + values_have_NA = is_object_dtype(values_arr.dtype) and any( + val is self.dtype.na_value for val in values_arr + ) + + # For now, NA does not propagate so set result according to presence of NA, + # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion + result[self._mask] = values_have_NA + mask = np.zeros_like(self, dtype=bool) return BooleanArray(result, mask, copy=False) diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 898a769dfac48..d3a3434872826 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -156,6 +156,27 @@ def test_isin_float_in_int_series(self, values): expected = Series([True, False]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"]) + @pytest.mark.parametrize( + "data,values,expected", + [ + ([0, 1, 0], [1], [False, True, False]), + ([0, 1, 0], [1, pd.NA], [False, True, False]), + ([0, pd.NA, 0], [1, 0], [True, False, True]), + ([0, 1, pd.NA], [1, pd.NA], [False, True, True]), + ([0, 1, pd.NA], [1, np.nan], [False, True, False]), + ([0, pd.NA, pd.NA], [np.nan, pd.NaT, None], [False, False, False]), + ], + ) + def test_isin_masked_types(self, dtype, data, values, expected): + # GH#42405 + ser = Series(data, dtype=dtype) + + result = ser.isin(values) + expected = Series(expected, dtype="boolean") + + tm.assert_series_equal(result, expected) + @pytest.mark.slow def test_isin_large_series_mixed_dtypes_and_nan():