Skip to content

Commit f2bac75

Browse files
committed
ENH: pd.MultiIndex.get_loc(np.nan) (pandas-dev#28919)
1 parent 1d36851 commit f2bac75

File tree

4 files changed

+139
-4
lines changed

4 files changed

+139
-4
lines changed

doc/source/whatsnew/v1.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -797,6 +797,7 @@ Indexing
797797
- Bug when indexing with ``.loc`` where the index was a :class:`CategoricalIndex` with non-string categories didn't work (:issue:`17569`, :issue:`30225`)
798798
- :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`)
799799
- Bug in :meth:`Float64Index.get_loc` incorrectly raising ``TypeError`` instead of ``KeyError`` (:issue:`29189`)
800+
- Bug in :meth:`MultiIndex.get_loc` where ``nan`` could not be found when the index contained missing values (:issue:`19132`)
800801

801802
Missing
802803
^^^^^^^

pandas/core/indexes/multi.py

+32-4
Original file line numberDiff line numberDiff line change
@@ -2507,7 +2507,7 @@ def _partial_tup_index(self, tup, side="left"):
25072507
for k, (lab, lev, labs) in enumerate(zipped):
25082508
section = labs[start:end]
25092509

2510-
if lab not in lev:
2510+
if lab not in lev and not isna(lab):
25112511
if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)):
25122512
raise TypeError(f"Level type mismatch: {lab}")
25132513

@@ -2517,13 +2517,39 @@ def _partial_tup_index(self, tup, side="left"):
25172517
loc -= 1
25182518
return start + section.searchsorted(loc, side=side)
25192519

2520-
idx = lev.get_loc(lab)
2520+
idx = self._get_loc_single_level_index(lev, lab)
25212521
if k < n - 1:
25222522
end = start + section.searchsorted(idx, side="right")
25232523
start = start + section.searchsorted(idx, side="left")
25242524
else:
25252525
return start + section.searchsorted(idx, side=side)
25262526

2527+
2528+
def _get_loc_single_level_index(self, level_index: Index, key) -> int:
2529+
"""
2530+
If key is NA value, location of index unify as -1.
2531+
2532+
Parameters
2533+
----------
2534+
level_index: Index
2535+
key : label
2536+
2537+
Returns
2538+
-------
2539+
loc : int
2540+
If key is NA value, loc is -1
2541+
Else, location of key in index
2542+
2543+
See Also
2544+
--------
2545+
Index.get_loc : The get_loc method for (single-level) index.
2546+
"""
2547+
2548+
if is_scalar(key) and isna(key):
2549+
return -1
2550+
else:
2551+
return level_index.get_loc(key)
2552+
25272553
def get_loc(self, key, method=None):
25282554
"""
25292555
Get location for a label or a tuple of labels as an integer, slice or
@@ -2622,7 +2648,9 @@ def _maybe_to_slice(loc):
26222648
loc = np.arange(start, stop, dtype="int64")
26232649

26242650
for i, k in enumerate(follow_key, len(lead_key)):
2625-
mask = self.codes[i][loc] == self.levels[i].get_loc(k)
2651+
mask = self.codes[i][loc] == self._get_loc_single_level_index(
2652+
self.levels[i], k
2653+
)
26262654
if not mask.all():
26272655
loc = loc[mask]
26282656
if not len(loc):
@@ -2850,7 +2878,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
28502878

28512879
else:
28522880

2853-
code = level_index.get_loc(key)
2881+
code = self._get_loc_single_level_index(level_index, key)
28542882

28552883
if level > 0 or self.lexsort_depth == 0:
28562884
# Desired level is not sorted

pandas/tests/indexes/multi/test_contains.py

+10
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,13 @@ def test_isin_level_kwarg():
9898

9999
with pytest.raises(KeyError, match="'Level C not found'"):
100100
idx.isin(vals_1, level="C")
101+
102+
103+
def test_contains_with_missing_value():
    """Check ``in`` on a MultiIndex when keys or levels hold NaN (issue 19132)."""
    one_level = MultiIndex.from_arrays([[1, np.nan, 2]])
    assert np.nan in one_level

    # Here NaN occurs only in the second level: the bare scalar is expected
    # to be absent while the full tuple is expected to be present.
    two_level = MultiIndex.from_arrays([[1, 2], [np.nan, 3]])
    assert np.nan not in two_level
    assert (1, np.nan) in two_level

pandas/tests/indexes/multi/test_indexing.py

+96
Original file line numberDiff line numberDiff line change
@@ -437,3 +437,99 @@ def test_timestamp_multiindex_indexer():
437437
)
438438
should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo")
439439
tm.assert_series_equal(result, should_be)
440+
441+
442+
def test_get_loc_with_values_including_missing_values():
    """Check ``MultiIndex.get_loc`` with NaN scalar and tuple keys (issue 19132)."""
    # Two-level product index: a scalar NaN key is expected to give a slice.
    product_2 = MultiIndex.from_product([[np.nan, 1]] * 2)
    assert product_2.get_loc(np.nan) == slice(0, 2)

    # Single level with NaN at both ends: a boolean mask is expected.
    scattered = MultiIndex.from_arrays([[np.nan, 1, 2, np.nan]])
    mask = np.array([True, False, False, True])
    tm.assert_numpy_array_equal(scattered.get_loc(np.nan), mask)

    # Three-level product index looked up with a partial tuple key.
    product_3 = MultiIndex.from_product([[np.nan, 1]] * 3)
    assert product_3.get_loc((np.nan, 1)) == slice(2, 4)
455+
456+
457+
@pytest.mark.parametrize(
    "index_arr,labels,expected",
    [
        # NaN in the first level, bare scalar labels: all -1 expected.
        (
            [[1, np.nan, 2], [3, 4, 5]],
            [1, np.nan, 2],
            np.array([-1, -1, -1], dtype="int64"),
        ),
        # NaN in the first level, full tuple label.
        (
            [[1, np.nan, 2], [3, 4, 5]],
            [(np.nan, 4)],
            np.array([1], dtype="int64"),
        ),
        # NaN in the second level, full tuple label.
        (
            [[1, 2, 3], [np.nan, 4, 5]],
            [(1, np.nan)],
            np.array([0], dtype="int64"),
        ),
        # NaN in the second level, bare scalar labels: all -1 expected.
        (
            [[1, 2, 3], [np.nan, 4, 5]],
            [np.nan, 4, 5],
            np.array([-1, -1, -1], dtype="int64"),
        ),
    ],
)
def test_get_indexer_with_missing_value(index_arr, labels, expected):
    """Check ``MultiIndex.get_indexer`` with NaN in index or labels (issue 19132)."""
    multi = MultiIndex.from_arrays(index_arr)
    positions = multi.get_indexer(labels).astype("int64")
    tm.assert_numpy_array_equal(positions, expected)
487+
488+
489+
@pytest.mark.parametrize(
    "index_arr,expected,target,algo",
    [
        ([[np.nan, "a", "b"], ["c", "d", "e"]], 0, np.nan, "left"),
        ([[np.nan, "a", "b"], ["c", "d", "e"]], 1, (np.nan, "c"), "right"),
        ([["a", "b", "c"], ["d", np.nan, "d"]], 1, ("b", np.nan), "left"),
    ],
)
def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo):
    """Check ``MultiIndex.get_slice_bound`` with NaN targets (issue 19132)."""
    multi = MultiIndex.from_arrays(index_arr)
    # ``algo`` carries the ``side`` argument ("left" or "right").
    bound = multi.get_slice_bound(target, side=algo, kind="loc")
    assert bound == expected
502+
503+
504+
@pytest.mark.parametrize(
    "index_arr,expected,start_idx,end_idx",
    [
        ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 2), np.nan, 1),
        ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 3), np.nan, (2, 5)),
        ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3), (2, np.nan), 3),
        ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3), (2, np.nan), (3, 5)),
    ],
)
def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_idx):
    """Check ``MultiIndex.slice_indexer`` with NaN slice bounds (issue 19132)."""
    multi = MultiIndex.from_arrays(index_arr)
    indexer = multi.slice_indexer(start=start_idx, end=end_idx)
    assert indexer == expected
518+
519+
520+
@pytest.mark.parametrize(
    "index_arr,expected,start_idx,end_idx",
    [
        # NaN in the first level used as the start bound.
        ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, None),
        ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, "b"),
        ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, ("b", "e")),
        # NaN in the second level, start bound given as a full tuple.
        ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), None),
        ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), "c"),
        ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), ("c", "e")),
    ],
)
def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx):
    """Check ``MultiIndex.slice_locs`` with NaN slice bounds (issue 19132)."""
    multi = MultiIndex.from_arrays(index_arr)
    bounds = multi.slice_locs(start=start_idx, end=end_idx)
    assert bounds == expected

0 commit comments

Comments
 (0)