Skip to content

ENH: pd.MultiIndex.get_loc(np.nan) #28919

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 9, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,7 @@ Indexing
- Bug when indexing with ``.loc`` where the index was a :class:`CategoricalIndex` with non-string categories didn't work (:issue:`17569`, :issue:`30225`)
- :meth:`Index.get_indexer_non_unique` could fail with ``TypeError`` in some cases, such as when searching for ints in a string index (:issue:`28257`)
- Bug in :meth:`Float64Index.get_loc` incorrectly raising ``TypeError`` instead of ``KeyError`` (:issue:`29189`)
- Bug in :meth:`MultiIndex.get_loc` where missing values could not be found even when the index contained missing values (:issue:`19132`)
- Bug in :meth:`Series.__setitem__` incorrectly assigning values with boolean indexer when the length of new data matches the number of ``True`` values and new data is not a ``Series`` or an ``np.array`` (:issue:`30567`)

Missing
Expand Down
44 changes: 34 additions & 10 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2539,7 +2539,7 @@ def _partial_tup_index(self, tup, side="left"):
for k, (lab, lev, labs) in enumerate(zipped):
section = labs[start:end]

if lab not in lev:
if lab not in lev and not isna(lab):
if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)):
raise TypeError(f"Level type mismatch: {lab}")

Expand All @@ -2549,13 +2549,38 @@ def _partial_tup_index(self, tup, side="left"):
loc -= 1
return start + section.searchsorted(loc, side=side)

idx = lev.get_loc(lab)
idx = self._get_loc_single_level_index(lev, lab)
if k < n - 1:
end = start + section.searchsorted(idx, side="right")
start = start + section.searchsorted(idx, side="left")
else:
return start + section.searchsorted(idx, side=side)

def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int:
    """
    Locate ``key`` within a single level, reporting NA keys as -1.

    Parameters
    ----------
    level_index : Index
        The level to search.
    key : label
        The label to locate.

    Returns
    -------
    loc : int
        -1 when ``key`` is a scalar NA value; otherwise the result of
        ``level_index.get_loc(key)``.

    See Also
    --------
    Index.get_loc : The get_loc method for (single-level) index.
    """
    key_is_na = is_scalar(key) and isna(key)
    # NA keys are never handed to ``level_index.get_loc``; they always map to -1.
    return -1 if key_is_na else level_index.get_loc(key)

def get_loc(self, key, method=None):
"""
Get location for a label or a tuple of labels as an integer, slice or
Expand Down Expand Up @@ -2654,7 +2679,9 @@ def _maybe_to_slice(loc):
loc = np.arange(start, stop, dtype="int64")

for i, k in enumerate(follow_key, len(lead_key)):
mask = self.codes[i][loc] == self.levels[i].get_loc(k)
mask = self.codes[i][loc] == self._get_loc_single_level_index(
self.levels[i], k
)
if not mask.all():
loc = loc[mask]
if not len(loc):
Expand Down Expand Up @@ -2882,7 +2909,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):

else:

code = level_index.get_loc(key)
code = self._get_loc_single_level_index(level_index, key)

if level > 0 or self.lexsort_depth == 0:
# Desired level is not sorted
Expand Down Expand Up @@ -3377,14 +3404,11 @@ def isin(self, values, level=None):
return algos.isin(self.values, values)
else:
num = self._get_level_number(level)
levs = self.levels[num]
level_codes = self.codes[num]
levs = self.get_level_values(num)

sought_labels = levs.isin(values).nonzero()[0]
if levs.size == 0:
return np.zeros(len(level_codes), dtype=np.bool_)
else:
return np.lib.arraysetops.in1d(level_codes, sought_labels)
return np.zeros(len(levs), dtype=np.bool_)
return levs.isin(values)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the edits from L3408 down to here look like they are just nice cleanups independent of the rest of this PR. is that accurate?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel
This change is for NA values; it will become more accurate once #30677 is fixed. `Index.isin` still has a bug, but because this version makes it possible to check for NA values, it is nonetheless more accurate.



MultiIndex._add_numeric_methods_disabled()
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/indexes/multi/test_contains.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,27 @@ def test_isin_level_kwarg():

with pytest.raises(KeyError, match="'Level C not found'"):
idx.isin(vals_1, level="C")


def test_contains_with_missing_value():
    # GH 19132
    single_level = MultiIndex.from_arrays([[1, np.nan, 2]])
    assert np.nan in single_level

    # Here NaN occurs only in the second level.
    two_level = MultiIndex.from_arrays([[1, 2], [np.nan, 3]])
    assert np.nan not in two_level
    assert (1, np.nan) in two_level


@pytest.mark.parametrize(
"labels,expected,level",
[
([("b", np.nan)], np.array([False, False, True]), None,),
([np.nan, "a"], np.array([True, True, False]), 0),
(["d", np.nan], np.array([False, True, True]), 1),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the issue specific to np.nan, or are there other NA values worth testing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel
xref #30677. Yes, it is specific to np.nan. In `Index`, np.nan, pd.NaT and None are distinguished from one another rather than all being treated as a single NA value. So if a MultiIndex mixes np.nan, pd.NaT and None together, the result of `.isin` differs from what we would expect.

],
)
def test_isin_multi_index_with_missing_value(labels, expected, level):
    # GH 19132
    # Each of the two levels holds exactly one NaN.
    index_with_nans = MultiIndex.from_arrays(
        [[np.nan, "a", "b"], ["c", "d", np.nan]]
    )
    result = index_with_nans.isin(labels, level=level)
    tm.assert_numpy_array_equal(result, expected)
88 changes: 88 additions & 0 deletions pandas/tests/indexes/multi/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,3 +437,91 @@ def test_timestamp_multiindex_indexer():
)
should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo")
tm.assert_series_equal(result, should_be)


def test_get_loc_with_values_including_missing_values():
    # GH 19132
    two_levels = MultiIndex.from_product([[np.nan, 1]] * 2)
    assert two_levels.get_loc(np.nan) == slice(0, 2, None)

    # NaN sits at both ends here, and the expected location is a boolean mask.
    scattered = MultiIndex.from_arrays([[np.nan, 1, 2, np.nan]])
    tm.assert_numpy_array_equal(
        scattered.get_loc(np.nan), np.array([True, False, False, True])
    )

    # The key covers only the first two of the three levels.
    three_levels = MultiIndex.from_product([[np.nan, 1]] * 3)
    assert three_levels.get_loc((np.nan, 1)) == slice(2, 4, None)


@pytest.mark.parametrize(
    "index_arr,labels,expected",
    [
        # scalar labels, NaN in the first level: every position is -1
        (
            [[1, np.nan, 2], [3, 4, 5]],
            [1, np.nan, 2],
            np.array([-1, -1, -1], dtype=np.intp),
        ),
        # tuple label whose first element is NaN
        ([[1, np.nan, 2], [3, 4, 5]], [(np.nan, 4)], np.array([1], dtype=np.intp)),
        # tuple label whose second element is NaN
        ([[1, 2, 3], [np.nan, 4, 5]], [(1, np.nan)], np.array([0], dtype=np.intp)),
        # scalar labels, NaN in the second level: every position is -1
        (
            [[1, 2, 3], [np.nan, 4, 5]],
            [np.nan, 4, 5],
            np.array([-1, -1, -1], dtype=np.intp),
        ),
    ],
)
def test_get_indexer_with_missing_value(index_arr, labels, expected):
    # GH 19132
    mi = MultiIndex.from_arrays(index_arr)
    tm.assert_numpy_array_equal(mi.get_indexer(labels), expected)


@pytest.mark.parametrize(
    "index_arr,expected,target,algo",
    [
        # scalar NaN target, left bound
        ([[np.nan, "a", "b"], ["c", "d", "e"]], 0, np.nan, "left"),
        # tuple target with NaN in the first position, right bound
        ([[np.nan, "a", "b"], ["c", "d", "e"]], 1, (np.nan, "c"), "right"),
        # tuple target with NaN in the second position, left bound
        ([["a", "b", "c"], ["d", np.nan, "d"]], 1, ("b", np.nan), "left"),
    ],
)
def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo):
    # GH 19132
    mi = MultiIndex.from_arrays(index_arr)
    bound = mi.get_slice_bound(target, side=algo, kind="loc")
    assert bound == expected


@pytest.mark.parametrize(
    "index_arr,expected,start_idx,end_idx",
    [
        # start is a scalar NaN
        ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 2, None), np.nan, 1),
        ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 3, None), np.nan, (2, 5)),
        # start is a tuple with NaN in the second position
        ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), 3),
        ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), (3, 5)),
    ],
)
def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_idx):
    # GH 19132
    mi = MultiIndex.from_arrays(index_arr)
    indexer = mi.slice_indexer(start=start_idx, end=end_idx)
    assert indexer == expected


@pytest.mark.parametrize(
    "index_arr,expected,start_idx,end_idx",
    [
        # start is a scalar NaN; end is None, a scalar, or a tuple
        ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, None),
        ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, "b"),
        ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, ("b", "e")),
        # start is a tuple with NaN in the second position
        ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), None),
        ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), "c"),
        ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), ("c", "e")),
    ],
)
def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx):
    # GH 19132
    mi = MultiIndex.from_arrays(index_arr)
    locs = mi.slice_locs(start=start_idx, end=end_idx)
    assert locs == expected