diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 3351bb7cac7d6..10817a6ae61bd 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -288,10 +288,12 @@ cdef class IndexEngine: object val int count = 0, count_missing = 0 Py_ssize_t i, j, n, n_t, n_alloc + bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True self._ensure_mapping_populated() values = np.array(self._get_index_values(), copy=False) stargets = set(targets) + n = len(values) n_t = len(targets) if n > 10_000: @@ -321,6 +323,7 @@ cdef class IndexEngine: if stargets: # otherwise, map by iterating through all items in the index + for i in range(n): val = values[i] if val in stargets: @@ -328,12 +331,27 @@ cdef class IndexEngine: d[val] = [] d[val].append(i) + elif util.is_nan(val): + # GH#35392 + if need_nan_check: + # Do this check only once + stargets_has_nan = any(util.is_nan(x) for x in stargets) + need_nan_check = False + + if stargets_has_nan: + if not d_has_nan: + # use a canonical nan object + d[np.nan] = [] + d_has_nan = True + d[np.nan].append(i) + for i in range(n_t): val = targets[i] # found - if val in d: - for j in d[val]: + if val in d or (d_has_nan and util.is_nan(val)): + key = val if not util.is_nan(val) else np.nan + for j in d[key]: # realloc if needed if count >= n_alloc: diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index a683e9faed1f2..b5b32771d201e 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -64,6 +64,27 @@ def test_get_indexer_with_NA_values( tm.assert_numpy_array_equal(result, expected) +class TestGetIndexerNonUnique: + # TODO: test with matching-but-not-identical nans + def test_get_indexer_non_unique_nas(self, nulls_fixture): + # even though this isn't non-unique, this should still work + index = Index(["a", "b", nulls_fixture]) + indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + + expected_indexer = np.array([2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + # actually non-unique + index = Index(["a", nulls_fixture, "b", nulls_fixture]) + indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + + expected_indexer = np.array([1, 3], dtype=np.intp) + tm.assert_numpy_array_equal(indexer, expected_indexer) + tm.assert_numpy_array_equal(missing, expected_missing) + + class TestSliceLocs: @pytest.mark.parametrize( "in_slice,expected", diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index 5f6d0155ae6cf..6ca3bc0b155ee 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -25,6 +25,7 @@ Int64Index, IntervalIndex, MultiIndex, + NaT, PeriodIndex, RangeIndex, Series, @@ -294,3 +295,50 @@ def test_maybe_cast_slice_bound_kind_deprecated(index): with tm.assert_produces_warning(FutureWarning): # pass as positional index._maybe_cast_slice_bound(index[0], "left", "loc") + + +@pytest.mark.parametrize( + "idx,target,expected", + [ + ([np.nan, "var1", np.nan], [np.nan], np.array([0, 2], dtype=np.intp)), + ( + [np.nan, "var1", np.nan], + [np.nan, "var1"], + np.array([0, 2, 1], dtype=np.intp), + ), + ( + np.array([np.nan, "var1", np.nan], dtype=object), + [np.nan], + np.array([0, 2], dtype=np.intp), + ), + ( + DatetimeIndex(["2020-08-05", NaT, NaT]), + [NaT], + np.array([1, 2], dtype=np.intp), + ), + ( + np.array([np.nan, -1, np.nan], dtype=object), + [np.nan], + np.array([0, 2], dtype=np.intp), + ), + ( + np.array([np.nan, -1, np.nan], dtype=object), + [-1], + np.array([1], dtype=np.intp), + ), + ( + np.array([np.nan, -1, np.nan], dtype=object), + [np.nan, -1], + np.array([0, 2, 1], dtype=np.intp), + ), + ( + [float("NaN"), "var1", float("NaN")], + [float("NaN")], + np.array([0, 2], dtype=np.intp), + ), + ], +) +def test_get_indexer_non_unique_multiple_nans(idx, target, expected): + # GH#35392 + actual = Index(idx).get_indexer_for(target) + tm.assert_numpy_array_equal(actual, expected)