Skip to content

BUG: get_indexer_non_unique with np.nan #42289

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions pandas/_libs/index.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -288,10 +288,12 @@ cdef class IndexEngine:
object val
int count = 0, count_missing = 0
Py_ssize_t i, j, n, n_t, n_alloc
bint d_has_nan = False, stargets_has_nan = False, need_nan_check = True

self._ensure_mapping_populated()
values = np.array(self._get_index_values(), copy=False)
stargets = set(targets)

n = len(values)
n_t = len(targets)
if n > 10_000:
Expand Down Expand Up @@ -321,19 +323,35 @@ cdef class IndexEngine:

if stargets:
# otherwise, map by iterating through all items in the index

for i in range(n):
val = values[i]
if val in stargets:
if val not in d:
d[val] = []
d[val].append(i)

elif util.is_nan(val):
# GH#35392
if need_nan_check:
# Do this check only once
stargets_has_nan = any(util.is_nan(x) for x in stargets)
need_nan_check = False

if stargets_has_nan:
if not d_has_nan:
# use a canonical nan object
d[np.nan] = []
d_has_nan = True
d[np.nan].append(i)

for i in range(n_t):
val = targets[i]

# found
if val in d:
for j in d[val]:
if val in d or (d_has_nan and util.is_nan(val)):
key = val if not util.is_nan(val) else np.nan
for j in d[key]:

# realloc if needed
if count >= n_alloc:
Expand Down
21 changes: 21 additions & 0 deletions pandas/tests/indexes/object/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,27 @@ def test_get_indexer_with_NA_values(
tm.assert_numpy_array_equal(result, expected)


class TestGetIndexerNonUnique:
    """Tests for ``Index.get_indexer_non_unique`` with null-like labels."""

    # TODO: matching-but-not-identical nulls other than float NaN are not
    #  covered by the tests in this class

    def test_get_indexer_non_unique_nas(self, nulls_fixture):
        """Looking up a null returns every position that holds that null.

        Parameters
        ----------
        nulls_fixture : object
            The null-like value under test.  NOTE(review): presumably a
            pytest fixture defined outside this section that yields one
            null-like value per run -- confirm against the suite's conftest.
        """
        # Nothing is ever reported as missing: the single target is present
        # in both indexes built below.
        expected_missing = np.array([], dtype=np.intp)

        # even though this index is actually unique, this should still work.
        # dtype=object keeps the test on an object-dtype Index no matter
        # what dtype the constructor would otherwise infer.
        index = Index(["a", "b", nulls_fixture], dtype=object)
        indexer, missing = index.get_indexer_non_unique([nulls_fixture])

        expected_indexer = np.array([2], dtype=np.intp)
        tm.assert_numpy_array_equal(indexer, expected_indexer)
        tm.assert_numpy_array_equal(missing, expected_missing)

        # actually non-unique: both occurrences must be reported, in order
        index = Index(["a", nulls_fixture, "b", nulls_fixture], dtype=object)
        indexer, missing = index.get_indexer_non_unique([nulls_fixture])

        expected_indexer = np.array([1, 3], dtype=np.intp)
        tm.assert_numpy_array_equal(indexer, expected_indexer)
        tm.assert_numpy_array_equal(missing, expected_missing)

    def test_get_indexer_non_unique_distinct_nan_objects(self):
        """Float NaNs that are separate objects still match one another."""
        # GH#35392: three separate NaN objects, so neither identity nor
        # equality (nan != nan) can be what makes the lookup succeed.
        first_nan, second_nan, target_nan = (
            float("nan"),
            float("nan"),
            float("nan"),
        )
        index = Index(["a", first_nan, "b", second_nan], dtype=object)
        target = Index([target_nan], dtype=object)

        indexer, missing = index.get_indexer_non_unique(target)

        expected_indexer = np.array([1, 3], dtype=np.intp)
        expected_missing = np.array([], dtype=np.intp)
        tm.assert_numpy_array_equal(indexer, expected_indexer)
        tm.assert_numpy_array_equal(missing, expected_missing)


class TestSliceLocs:
@pytest.mark.parametrize(
"in_slice,expected",
Expand Down
48 changes: 48 additions & 0 deletions pandas/tests/indexes/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
Int64Index,
IntervalIndex,
MultiIndex,
NaT,
PeriodIndex,
RangeIndex,
Series,
Expand Down Expand Up @@ -294,3 +295,50 @@ def test_maybe_cast_slice_bound_kind_deprecated(index):
with tm.assert_produces_warning(FutureWarning):
# pass as positional
index._maybe_cast_slice_bound(index[0], "left", "loc")


# GH#35392 cases, each a tuple of
# (index data, lookup target, expected positions).
# Expected positions are grouped per target element, in target order.
_MULTIPLE_NANS_CASES = [
    # list input, NaN target only
    ([np.nan, "var1", np.nan], [np.nan], np.array([0, 2], dtype=np.intp)),
    # list input, NaN plus an ordinary label
    (
        [np.nan, "var1", np.nan],
        [np.nan, "var1"],
        np.array([0, 2, 1], dtype=np.intp),
    ),
    # object ndarray mixing NaN with a string
    (
        np.array([np.nan, "var1", np.nan], dtype=object),
        [np.nan],
        np.array([0, 2], dtype=np.intp),
    ),
    # datetime-like index with repeated NaT
    (
        DatetimeIndex(["2020-08-05", NaT, NaT]),
        [NaT],
        np.array([1, 2], dtype=np.intp),
    ),
    # object ndarray mixing NaN with an integer: NaN target
    (
        np.array([np.nan, -1, np.nan], dtype=object),
        [np.nan],
        np.array([0, 2], dtype=np.intp),
    ),
    # same data: integer target
    (
        np.array([np.nan, -1, np.nan], dtype=object),
        [-1],
        np.array([1], dtype=np.intp),
    ),
    # same data: both targets
    (
        np.array([np.nan, -1, np.nan], dtype=object),
        [np.nan, -1],
        np.array([0, 2, 1], dtype=np.intp),
    ),
    # NaNs created by separate float("NaN") calls
    (
        [float("NaN"), "var1", float("NaN")],
        [float("NaN")],
        np.array([0, 2], dtype=np.intp),
    ),
]


@pytest.mark.parametrize("idx,target,expected", _MULTIPLE_NANS_CASES)
def test_get_indexer_non_unique_multiple_nans(idx, target, expected):
    """Check ``Index.get_indexer_for`` when the index repeats NaN/NaT.

    Parameters
    ----------
    idx : list, ndarray or DatetimeIndex
        Data wrapped in ``Index`` before the lookup.
    target : list
        Labels to look up.
    expected : ndarray of intp
        Positions the lookup must return.
    """
    # GH#35392
    result = Index(idx).get_indexer_for(target)
    tm.assert_numpy_array_equal(result, expected)