diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index df190a4df393c..2ddc8f478f41d 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -534,6 +534,7 @@ Missing MultiIndex ^^^^^^^^^^ +- Bug in :meth:`MultiIndex.get_indexer` not matching ``NaN`` values (:issue:`37222`) - Bug in :meth:`MultiIndex.argsort` raising ``TypeError`` when index contains :attr:`NA` (:issue:`48495`) - Bug in :meth:`MultiIndex.difference` losing extension array dtype (:issue:`48606`) - Bug in :class:`MultiIndex.set_levels` raising ``IndexError`` when setting empty level (:issue:`48636`) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f968e879498b2..27edc83c6f329 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -36,6 +36,9 @@ from pandas._libs.missing cimport ( is_matching_na, ) +# Defines shift of MultiIndex codes to avoid negative codes (missing values) +multiindex_nulls_shift = 2 + cdef inline bint is_definitely_invalid_key(object val): try: @@ -648,10 +651,13 @@ cdef class BaseMultiIndexCodesEngine: self.levels = levels self.offsets = offsets - # Transform labels in a single array, and add 1 so that we are working - # with positive integers (-1 for NaN becomes 0): - codes = (np.array(labels, dtype='int64').T + 1).astype('uint64', - copy=False) + # Transform labels in a single array, and add 2 so that we are working + # with positive integers (-1 for NaN becomes 1). This enables us to + # differentiate between values that are missing in other and matching + # NaNs. We will set values that are not found to 0 later: + labels_arr = np.array(labels, dtype='int64').T + multiindex_nulls_shift + codes = labels_arr.astype('uint64', copy=False) + self.level_has_nans = [-1 in lab for lab in labels] # Map each codes combination in the index to an integer unambiguously # (no collisions possible), based on the "offsets", which describe the @@ -680,8 +686,13 @@ cdef class BaseMultiIndexCodesEngine: Integers representing one combination each """ zt = [target._get_level_values(i) for i in range(target.nlevels)] - level_codes = [lev.get_indexer_for(codes) + 1 for lev, codes - in zip(self.levels, zt)] + level_codes = [] + for i, (lev, codes) in enumerate(zip(self.levels, zt)): + result = lev.get_indexer_for(codes) + 1 + result[result > 0] += 1 + if self.level_has_nans[i] and codes.hasnans: + result[codes.isna()] += 1 + level_codes.append(result) return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) def get_indexer(self, target: np.ndarray) -> np.ndarray: @@ -792,7 +803,7 @@ cdef class BaseMultiIndexCodesEngine: if not isinstance(key, tuple): raise KeyError(key) try: - indices = [0 if checknull(v) else lev.get_loc(v) + 1 + indices = [1 if checknull(v) else lev.get_loc(v) + multiindex_nulls_shift for lev, v in zip(self.levels, key)] except KeyError: raise KeyError(key) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6fc458cf2f478..8d5653f16bbf6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1087,8 +1087,18 @@ def set_codes(self, codes, *, level=None, verify_integrity: bool = True): @cache_readonly def _engine(self): # Calculate the number of bits needed to represent labels in each - # level, as log2 of their sizes (including -1 for NaN): - sizes = np.ceil(np.log2([len(level) + 1 for level in self.levels])) + # level, as log2 of their sizes: + # NaN values are shifted to 1 and missing values in other while + # calculating the indexer are shifted to 0 + sizes = np.ceil( + np.log2( + [ + len(level) + + libindex.multiindex_nulls_shift # type: ignore[attr-defined] + for level in self.levels + ] + ) + ) # Sum bit counts, starting from the _right_.... lev_bits = np.cumsum(sizes[::-1])[::-1] diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index 47959ec0a4a57..4bfba07332313 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -32,16 +32,16 @@ def test_drop(idx): tm.assert_index_equal(dropped, expected) index = MultiIndex.from_tuples([("bar", "two")]) - with pytest.raises(KeyError, match=r"^10$"): + with pytest.raises(KeyError, match=r"^15$"): idx.drop([("bar", "two")]) - with pytest.raises(KeyError, match=r"^10$"): + with pytest.raises(KeyError, match=r"^15$"): idx.drop(index) with pytest.raises(KeyError, match=r"^'two'$"): idx.drop(["foo", "two"]) # partially correct argument mixed_index = MultiIndex.from_tuples([("qux", "one"), ("bar", "two")]) - with pytest.raises(KeyError, match=r"^10$"): + with pytest.raises(KeyError, match=r"^15$"): idx.drop(mixed_index) # error='ignore' diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index fce3da6dd6aee..337f91e0f89b4 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -471,6 +471,16 @@ def test_get_indexer_kwarg_validation(self): with pytest.raises(ValueError, match=msg): mi.get_indexer(mi[:-1], tolerance="piano") + def test_get_indexer_nan(self): + # GH#37222 + idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]) + idx2 = MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"]) + expected = np.array([-1, 1]) + result = idx2.get_indexer(idx1) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + result = idx1.get_indexer(idx2) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + def test_getitem(idx): # scalar @@ -527,7 +537,7 @@ class TestGetLoc: def test_get_loc(self, idx): assert idx.get_loc(("foo", "two")) == 1 assert idx.get_loc(("baz", "two")) == 3 - with pytest.raises(KeyError, match=r"^10$"): + with pytest.raises(KeyError, match=r"^15$"): idx.get_loc(("bar", "two")) with pytest.raises(KeyError, match=r"^'quux'$"): idx.get_loc("quux") diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index eaa4e0a7b5256..3a882b0c34b67 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -659,9 +659,8 @@ def test_union_keep_ea_dtype_with_na(any_numeric_ea_dtype): midx = MultiIndex.from_arrays([arr1, [2, 1]], names=["a", None]) midx2 = MultiIndex.from_arrays([arr2, [1, 2]]) result = midx.union(midx2) - # Expected is actually off and should contain (1, 1) too. See GH#37222 expected = MultiIndex.from_arrays( - [Series([4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [2, 1, 2]] + [Series([1, 4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [1, 2, 1, 2]] ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 08e15545cb998..157f0de632e18 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -162,10 +162,10 @@ def test_rename_multiindex_with_duplicates(self): [1, 2], ], [ - [(81.0, np.nan), (np.nan, np.nan)], - [(81.0, np.nan), (np.nan, np.nan)], - [1, 2], - [1, 1], + [[81, 82.0, np.nan], Series([np.nan, np.nan, np.nan])], + [[81, 82.0, np.nan], Series([np.nan, np.nan, np.nan])], + [1, np.nan, 2], + [np.nan, 2, 1], ], ), ( @@ -176,8 +176,8 @@ def test_rename_multiindex_with_duplicates(self): [1, 2], ], [ - [(81.0, np.nan), (np.nan, np.nan)], - [(81.0, np.nan), (np.nan, np.nan)], + [[81.0, np.nan], Series([np.nan, np.nan])], + [[81.0, np.nan], Series([np.nan, np.nan])], [1, 2], [2, 1], ], @@ -188,28 +188,17 @@ def test_subtracting_two_series_with_unordered_index_and_all_nan_index( self, data_result, data_expected ): # GH 38439 + # TODO: Refactor. This is impossible to understand GH#49443 a_index_result = MultiIndex.from_tuples(data_result[0]) b_index_result = MultiIndex.from_tuples(data_result[1]) a_series_result = Series(data_result[2], index=a_index_result) b_series_result = Series(data_result[3], index=b_index_result) result = a_series_result.align(b_series_result) - a_index_expected = MultiIndex.from_tuples(data_expected[0]) - b_index_expected = MultiIndex.from_tuples(data_expected[1]) + a_index_expected = MultiIndex.from_arrays(data_expected[0]) + b_index_expected = MultiIndex.from_arrays(data_expected[1]) a_series_expected = Series(data_expected[2], index=a_index_expected) b_series_expected = Series(data_expected[3], index=b_index_expected) - a_series_expected.index = a_series_expected.index.set_levels( - [ - a_series_expected.index.levels[0].astype("float"), - a_series_expected.index.levels[1].astype("float"), - ] - ) - b_series_expected.index = b_series_expected.index.set_levels( - [ - b_series_expected.index.levels[0].astype("float"), - b_series_expected.index.levels[1].astype("float"), - ] - ) tm.assert_series_equal(result[0], a_series_expected) tm.assert_series_equal(result[1], b_series_expected)