Skip to content

Commit 88c999c

Browse files
alexhlim authored and feefladder committed
BUG: Index.get_indexer_non_unique misbehaves when index contains multiple nan (pandas-dev#35392) (pandas-dev#35498)
1 parent 9eda74a commit 88c999c

File tree

5 files changed

+89
-2
lines changed

5 files changed

+89
-2
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -236,6 +236,7 @@ Indexing
236236
- Bug in indexing on a :class:`MultiIndex` failing to drop scalar levels when the indexer is a tuple containing a datetime-like string (:issue:`42476`)
237237
- Bug in :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` either failing to raise or incorrectly raising ``ValueError`` for some values of ``ascending`` (:issue:`41634`)
238238
- Bug in updating values of :class:`pandas.Series` using a boolean index created with :meth:`pandas.DataFrame.pop` (:issue:`42530`)
239+
- Bug in :meth:`Index.get_indexer_non_unique` when the index contains multiple ``np.nan`` values (:issue:`35392`)
239240
- Bug in :meth:`DataFrame.query` not handling the degree sign in a backticked column name, such as \`Temp(°C)\`, when used in an expression to query a dataframe (:issue:`42826`)
240241
-
241242

pandas/_libs/index.pyx

+20-2
Original file line number | Diff line number | Diff line change
@@ -288,10 +288,12 @@ cdef class IndexEngine:
288288
object val
289289
int count = 0, count_missing = 0
290290
Py_ssize_t i, j, n, n_t, n_alloc
291+
bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True
291292

292293
self._ensure_mapping_populated()
293294
values = np.array(self._get_index_values(), copy=False)
294295
stargets = set(targets)
296+
295297
n = len(values)
296298
n_t = len(targets)
297299
if n > 10_000:
@@ -321,19 +323,35 @@ cdef class IndexEngine:
321323

322324
if stargets:
323325
# otherwise, map by iterating through all items in the index
326+
324327
for i in range(n):
325328
val = values[i]
326329
if val in stargets:
327330
if val not in d:
328331
d[val] = []
329332
d[val].append(i)
330333

334+
elif util.is_nan(val):
335+
# GH#35392
336+
if need_nan_check:
337+
# Do this check only once
338+
stargets_has_nan = any(util.is_nan(val) for x in stargets)
339+
need_nan_check = False
340+
341+
if stargets_has_nan:
342+
if not d_has_nan:
343+
# use a canonical nan object
344+
d[np.nan] = []
345+
d_has_nan = True
346+
d[np.nan].append(i)
347+
331348
for i in range(n_t):
332349
val = targets[i]
333350

334351
# found
335-
if val in d:
336-
for j in d[val]:
352+
if val in d or (d_has_nan and util.is_nan(val)):
353+
key = val if not util.is_nan(val) else np.nan
354+
for j in d[key]:
337355

338356
# realloc if needed
339357
if count >= n_alloc:

pandas/core/indexes/base.py

+6
Original file line number | Diff line number | Diff line change
@@ -5383,6 +5383,12 @@ def get_indexer_for(self, target) -> npt.NDArray[np.intp]:
53835383
-------
53845384
np.ndarray[np.intp]
53855385
List of indices.
5386+
5387+
Examples
5388+
--------
5389+
>>> idx = pd.Index([np.nan, 'var1', np.nan])
5390+
>>> idx.get_indexer_for([np.nan])
5391+
array([0, 2])
53865392
"""
53875393
if self._index_as_unique:
53885394
return self.get_indexer(target)

pandas/tests/indexes/object/test_indexing.py

+31
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
import numpy as np
22
import pytest
33

4+
from pandas._libs.missing import is_matching_na
5+
46
import pandas as pd
57
from pandas import Index
68
import pandas._testing as tm
@@ -66,6 +68,35 @@ def test_get_indexer_with_NA_values(
6668
tm.assert_numpy_array_equal(result, expected)
6769

6870

71+
class TestGetIndexerNonUnique:
72+
def test_get_indexer_non_unique_nas(self, nulls_fixture):
73+
# even though this isn't non-unique, this should still work
74+
index = Index(["a", "b", nulls_fixture])
75+
indexer, missing = index.get_indexer_non_unique([nulls_fixture])
76+
77+
expected_indexer = np.array([2], dtype=np.intp)
78+
expected_missing = np.array([], dtype=np.intp)
79+
tm.assert_numpy_array_equal(indexer, expected_indexer)
80+
tm.assert_numpy_array_equal(missing, expected_missing)
81+
82+
# actually non-unique
83+
index = Index(["a", nulls_fixture, "b", nulls_fixture])
84+
indexer, missing = index.get_indexer_non_unique([nulls_fixture])
85+
86+
expected_indexer = np.array([1, 3], dtype=np.intp)
87+
tm.assert_numpy_array_equal(indexer, expected_indexer)
88+
tm.assert_numpy_array_equal(missing, expected_missing)
89+
90+
# matching-but-not-identical nans
91+
if is_matching_na(nulls_fixture, float("NaN")):
92+
index = Index(["a", float("NaN"), "b", float("NaN")])
93+
indexer, missing = index.get_indexer_non_unique([nulls_fixture])
94+
95+
expected_indexer = np.array([1, 3], dtype=np.intp)
96+
tm.assert_numpy_array_equal(indexer, expected_indexer)
97+
tm.assert_numpy_array_equal(missing, expected_missing)
98+
99+
69100
class TestSliceLocs:
70101
@pytest.mark.parametrize(
71102
"in_slice,expected",

pandas/tests/indexes/test_indexing.py

+31
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
take
88
where
99
get_indexer
10+
get_indexer_for
1011
slice_locs
1112
asof_locs
1213
@@ -25,6 +26,7 @@
2526
Int64Index,
2627
IntervalIndex,
2728
MultiIndex,
29+
NaT,
2830
PeriodIndex,
2931
RangeIndex,
3032
Series,
@@ -294,3 +296,32 @@ def test_maybe_cast_slice_bound_kind_deprecated(index):
294296
with tm.assert_produces_warning(FutureWarning):
295297
# pass as positional
296298
index._maybe_cast_slice_bound(index[0], "left", "loc")
299+
300+
301+
@pytest.mark.parametrize(
302+
"idx,target,expected",
303+
[
304+
([np.nan, "var1", np.nan], [np.nan], np.array([0, 2], dtype=np.intp)),
305+
(
306+
[np.nan, "var1", np.nan],
307+
[np.nan, "var1"],
308+
np.array([0, 2, 1], dtype=np.intp),
309+
),
310+
(
311+
np.array([np.nan, "var1", np.nan], dtype=object),
312+
[np.nan],
313+
np.array([0, 2], dtype=np.intp),
314+
),
315+
(
316+
DatetimeIndex(["2020-08-05", NaT, NaT]),
317+
[NaT],
318+
np.array([1, 2], dtype=np.intp),
319+
),
320+
(["a", "b", "a", np.nan], [np.nan], np.array([3], dtype=np.intp)),
321+
],
322+
)
323+
def test_get_indexer_non_unique_multiple_nans(idx, target, expected):
324+
# GH 35392
325+
axis = Index(idx)
326+
actual = axis.get_indexer_for(target)
327+
tm.assert_numpy_array_equal(actual, expected)

0 commit comments

Comments
 (0)