From 6951b04801107c10e29f25a55893178b7ffdbd88 Mon Sep 17 00:00:00 2001 From: proost Date: Sat, 28 Dec 2019 13:40:16 +0900 Subject: [PATCH] ENH: pd.MultiIndex.get_loc(np.nan) (#28919) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/indexes/multi.py | 46 ++++++++--- pandas/tests/indexes/multi/test_contains.py | 24 ++++++ pandas/tests/indexes/multi/test_indexing.py | 88 +++++++++++++++++++++ 4 files changed, 148 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 96ea682dd3caf..7ea1685b13328 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -797,6 +797,7 @@ Indexing - Bug when indexing with ``.loc`` where the index was a :class:`CategoricalIndex` with non-string categories didn't work (:issue:`17569`, :issue:`30225`) - :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`) - Bug in :meth:`Float64Index.get_loc` incorrectly raising ``TypeError`` instead of ``KeyError`` (:issue:`29189`) +- Bug in :meth:`MultiIndex.get_loc` where missing values could not be found when the input includes missing values (:issue:`19132`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index dac9b20104c36..e73ed9084faa0 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,7 +1,7 @@ from collections import OrderedDict import datetime from sys import getsizeof -from typing import List, Optional +from typing import Hashable, List, Optional import warnings import numpy as np @@ -2507,7 +2507,7 @@ def _partial_tup_index(self, tup, side="left"): for k, (lab, lev, labs) in enumerate(zipped): section = labs[start:end] - if lab not in lev: + if lab not in lev and not isna(lab): if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)): raise TypeError(f"Level type mismatch: {lab}") @@ -2517,13 +2517,38 @@ def _partial_tup_index(self, tup, side="left"): loc
-= 1 return start + section.searchsorted(loc, side=side) - idx = lev.get_loc(lab) + idx = self._get_loc_single_level_index(lev, lab) if k < n - 1: end = start + section.searchsorted(idx, side="right") start = start + section.searchsorted(idx, side="left") else: return start + section.searchsorted(idx, side=side) + def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int: + """ + Return the location of key in level_index, using -1 if key is an NA value. + + Parameters + ---------- + level_index : Index + key : label + + Returns + ------- + loc : int + If key is an NA value, loc is -1. + Otherwise, the location of key in level_index. + + See Also + -------- + Index.get_loc : The get_loc method for (single-level) index. + """ + + if is_scalar(key) and isna(key): + return -1 + else: + return level_index.get_loc(key) + def get_loc(self, key, method=None): """ Get location for a label or a tuple of labels as an integer, slice or @@ -2622,7 +2647,9 @@ def _maybe_to_slice(loc): loc = np.arange(start, stop, dtype="int64") for i, k in enumerate(follow_key, len(lead_key)): - mask = self.codes[i][loc] == self.levels[i].get_loc(k) + mask = self.codes[i][loc] == self._get_loc_single_level_index( + self.levels[i], k + ) if not mask.all(): loc = loc[mask] if not len(loc): @@ -2850,7 +2877,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): else: - code = level_index.get_loc(key) + code = self._get_loc_single_level_index(level_index, key) if level > 0 or self.lexsort_depth == 0: # Desired level is not sorted @@ -3345,14 +3372,11 @@ def isin(self, values, level=None): return algos.isin(self.values, values) else: num = self._get_level_number(level) - levs = self.levels[num] - level_codes = self.codes[num] + levs = self.get_level_values(num) - sought_labels = levs.isin(values).nonzero()[0] if levs.size == 0: - return np.zeros(len(level_codes), dtype=np.bool_) - else: - return np.lib.arraysetops.in1d(level_codes, sought_labels) + return np.zeros(len(levs), dtype=np.bool_) +
return levs.isin(values) MultiIndex._add_numeric_methods_disabled() diff --git a/pandas/tests/indexes/multi/test_contains.py b/pandas/tests/indexes/multi/test_contains.py index 64d2859cd13db..b5b717c4453e5 100644 --- a/pandas/tests/indexes/multi/test_contains.py +++ b/pandas/tests/indexes/multi/test_contains.py @@ -98,3 +98,27 @@ def test_isin_level_kwarg(): with pytest.raises(KeyError, match="'Level C not found'"): idx.isin(vals_1, level="C") + + +def test_contains_with_missing_value(): + # issue 19132 + idx = MultiIndex.from_arrays([[1, np.nan, 2]]) + assert np.nan in idx + + idx = MultiIndex.from_arrays([[1, 2], [np.nan, 3]]) + assert np.nan not in idx + assert (1, np.nan) in idx + + +@pytest.mark.parametrize( + "labels,expected,level", + [ + ([("b", np.nan)], np.array([False, False, True]), None,), + ([np.nan, "a"], np.array([True, True, False]), 0), + (["d", np.nan], np.array([False, True, True]), 1), + ], +) +def test_isin_multi_index_with_missing_value(labels, expected, level): + # GH 19132 + midx = MultiIndex.from_arrays([[np.nan, "a", "b"], ["c", "d", np.nan]]) + tm.assert_numpy_array_equal(midx.isin(labels, level=level), expected) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 9ef2a77205acc..50905f7122186 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -437,3 +437,91 @@ def test_timestamp_multiindex_indexer(): ) should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo") tm.assert_series_equal(result, should_be) + + +def test_get_loc_with_values_including_missing_values(): + # issue 19132 + idx = MultiIndex.from_product([[np.nan, 1]] * 2) + expected = slice(0, 2, None) + assert idx.get_loc(np.nan) == expected + + idx = MultiIndex.from_arrays([[np.nan, 1, 2, np.nan]]) + expected = np.array([True, False, False, True]) + tm.assert_numpy_array_equal(idx.get_loc(np.nan), expected) + + idx = 
MultiIndex.from_product([[np.nan, 1]] * 3) + expected = slice(2, 4, None) + assert idx.get_loc((np.nan, 1)) == expected + + +@pytest.mark.parametrize( + "index_arr,labels,expected", + [ + ( + [[1, np.nan, 2], [3, 4, 5]], + [1, np.nan, 2], + np.array([-1, -1, -1], dtype=np.intp), + ), + ([[1, np.nan, 2], [3, 4, 5]], [(np.nan, 4)], np.array([1], dtype=np.intp)), + ([[1, 2, 3], [np.nan, 4, 5]], [(1, np.nan)], np.array([0], dtype=np.intp)), + ( + [[1, 2, 3], [np.nan, 4, 5]], + [np.nan, 4, 5], + np.array([-1, -1, -1], dtype=np.intp), + ), + ], +) +def test_get_indexer_with_missing_value(index_arr, labels, expected): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.get_indexer(labels) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "index_arr,expected,target,algo", + [ + ([[np.nan, "a", "b"], ["c", "d", "e"]], 0, np.nan, "left"), + ([[np.nan, "a", "b"], ["c", "d", "e"]], 1, (np.nan, "c"), "right"), + ([["a", "b", "c"], ["d", np.nan, "d"]], 1, ("b", np.nan), "left"), + ], +) +def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.get_slice_bound(target, side=algo, kind="loc") + assert result == expected + + +@pytest.mark.parametrize( + "index_arr,expected,start_idx,end_idx", + [ + ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 2, None), np.nan, 1), + ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 3, None), np.nan, (2, 5)), + ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), 3), + ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), (3, 5)), + ], +) +def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_idx): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.slice_indexer(start=start_idx, end=end_idx) + assert result == expected + + +@pytest.mark.parametrize( + "index_arr,expected,start_idx,end_idx", + [ + ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, None), + 
([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, "b"), + ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, ("b", "e")), + ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), None), + ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), "c"), + ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), ("c", "e")), + ], +) +def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.slice_locs(start=start_idx, end=end_idx) + assert result == expected