From 77b4e0991d511bfe2dc2bf673e7ea4736d9b08db Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 1 Nov 2022 14:35:01 +0100 Subject: [PATCH 1/7] BUG: MultiIndex.get_indexer not matching nan values --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/index.pyx | 20 +++++++++---- pandas/core/indexes/multi.py | 6 ++-- pandas/tests/indexes/multi/test_drop.py | 6 ++-- pandas/tests/indexes/multi/test_indexing.py | 12 +++++++- pandas/tests/indexes/multi/test_setops.py | 3 +- .../indexing/multiindex/test_multiindex.py | 29 ++++++------------- 7 files changed, 43 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 5614b7a2c0846..05220bd5885ca 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -382,6 +382,7 @@ Missing MultiIndex ^^^^^^^^^^ +- Bug in :meth:`MultiIndex.get_indexer` not matching ``NaN`` values (:issue:`37222`) - Bug in :meth:`MultiIndex.argsort` raising ``TypeError`` when index contains :attr:`NA` (:issue:`48495`) - Bug in :meth:`MultiIndex.difference` losing extension array dtype (:issue:`48606`) - Bug in :class:`MultiIndex.set_levels` raising ``IndexError`` when setting empty level (:issue:`48636`) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f968e879498b2..4e29c67f99dcf 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -648,10 +648,13 @@ cdef class BaseMultiIndexCodesEngine: self.levels = levels self.offsets = offsets - # Transform labels in a single array, and add 1 so that we are working - # with positive integers (-1 for NaN becomes 0): - codes = (np.array(labels, dtype='int64').T + 1).astype('uint64', + # Transform labels in a single array, and add 2 so that we are working + # with positive integers (-1 for NaN becomes 1). This enables us to + # differentiate between values that are missing in other and matching + # NaNs. We will set values that are not found to 0 later: + codes = (np.array(labels, dtype='int64').T + 2).astype('uint64', copy=False) + self.level_has_nans = [-1 in lab for lab in labels] # Map each codes combination in the index to an integer unambiguously # (no collisions possible), based on the "offsets", which describe the @@ -680,8 +683,13 @@ cdef class BaseMultiIndexCodesEngine: Integers representing one combination each """ zt = [target._get_level_values(i) for i in range(target.nlevels)] - level_codes = [lev.get_indexer_for(codes) + 1 for lev, codes - in zip(self.levels, zt)] + level_codes = [] + for i, (lev, codes) in enumerate(zip(self.levels, zt)): + result = lev.get_indexer_for(codes) + 1 + result[result > 0] += 1 + if self.level_has_nans[i] and codes.hasnans: + result[codes.isna()] += 1 + level_codes.append(result) return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) def get_indexer(self, target: np.ndarray) -> np.ndarray: @@ -792,7 +800,7 @@ cdef class BaseMultiIndexCodesEngine: if not isinstance(key, tuple): raise KeyError(key) try: - indices = [0 if checknull(v) else lev.get_loc(v) + 1 + indices = [1 if checknull(v) else lev.get_loc(v) + 2 for lev, v in zip(self.levels, key)] except KeyError: raise KeyError(key) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 27058cb54dddf..04582a08353f3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1088,8 +1088,10 @@ def set_codes(self, codes, *, level=None, verify_integrity: bool = True): @cache_readonly def _engine(self): # Calculate the number of bits needed to represent labels in each - # level, as log2 of their sizes (including -1 for NaN): - sizes = np.ceil(np.log2([len(level) + 1 for level in self.levels])) + # level, as log2 of their sizes: + # NaN values are shifted to 1 and missing values in other while + # calculating the indexer are shifted to 0 + sizes = np.ceil(np.log2([len(level) + 2 for level in self.levels])) # Sum bit counts, starting from the _right_.... lev_bits = np.cumsum(sizes[::-1])[::-1] diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index 47959ec0a4a57..4bfba07332313 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -32,16 +32,16 @@ def test_drop(idx): tm.assert_index_equal(dropped, expected) index = MultiIndex.from_tuples([("bar", "two")]) - with pytest.raises(KeyError, match=r"^10$"): + with pytest.raises(KeyError, match=r"^15$"): idx.drop([("bar", "two")]) - with pytest.raises(KeyError, match=r"^10$"): + with pytest.raises(KeyError, match=r"^15$"): idx.drop(index) with pytest.raises(KeyError, match=r"^'two'$"): idx.drop(["foo", "two"]) # partially correct argument mixed_index = MultiIndex.from_tuples([("qux", "one"), ("bar", "two")]) - with pytest.raises(KeyError, match=r"^10$"): + with pytest.raises(KeyError, match=r"^15$"): idx.drop(mixed_index) # error='ignore' diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index fce3da6dd6aee..6b1c06bb7aba3 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -471,6 +471,16 @@ def test_get_indexer_kwarg_validation(self): with pytest.raises(ValueError, match=msg): mi.get_indexer(mi[:-1], tolerance="piano") + def test_get_indexer_nan(self): + # GH#37222 + idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]) + idx2 = MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"]) + expected = np.array([-1, 1]) + result = idx2.get_indexer(idx1) + tm.assert_numpy_array_equal(result, expected) + result = idx1.get_indexer(idx2) + tm.assert_numpy_array_equal(result, expected) + def test_getitem(idx): # scalar @@ -527,7 +537,7 @@ class TestGetLoc: def test_get_loc(self, idx): assert idx.get_loc(("foo", "two")) == 1 assert idx.get_loc(("baz", "two")) == 3 - with pytest.raises(KeyError, match=r"^10$"): + with pytest.raises(KeyError, match=r"^15$"): idx.get_loc(("bar", "two")) with pytest.raises(KeyError, match=r"^'quux'$"): idx.get_loc("quux") diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index eaa4e0a7b5256..3a882b0c34b67 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -659,9 +659,8 @@ def test_union_keep_ea_dtype_with_na(any_numeric_ea_dtype): midx = MultiIndex.from_arrays([arr1, [2, 1]], names=["a", None]) midx2 = MultiIndex.from_arrays([arr2, [1, 2]]) result = midx.union(midx2) - # Expected is actually off and should contain (1, 1) too. See GH#37222 expected = MultiIndex.from_arrays( - [Series([4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [2, 1, 2]] + [Series([1, 4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [1, 2, 1, 2]] ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 08e15545cb998..67cd307052a48 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -162,10 +162,10 @@ def test_rename_multiindex_with_duplicates(self): [1, 2], ], [ - [(81.0, np.nan), (np.nan, np.nan)], - [(81.0, np.nan), (np.nan, np.nan)], - [1, 2], - [1, 1], + [[81, 82.0, np.nan], Series([np.nan, np.nan, np.nan])], + [[81, 82.0, np.nan], Series([np.nan, np.nan, np.nan])], + [1, np.nan, 2], + [np.nan, 2, 1], ], ), ( @@ -176,8 +176,8 @@ def test_rename_multiindex_with_duplicates(self): [1, 2], ], [ - [(81.0, np.nan), (np.nan, np.nan)], - [(81.0, np.nan), (np.nan, np.nan)], + [[81.0, np.nan], Series([np.nan, np.nan])], + [[81.0, np.nan], Series([np.nan, np.nan])], [1, 2], [2, 1], ], @@ -188,28 +188,17 @@ def test_subtracting_two_series_with_unordered_index_and_all_nan_index( self, data_result, data_expected ): # GH 38439 + # TODO: Refactor. This is impossible to understand a_index_result = MultiIndex.from_tuples(data_result[0]) b_index_result = MultiIndex.from_tuples(data_result[1]) a_series_result = Series(data_result[2], index=a_index_result) b_series_result = Series(data_result[3], index=b_index_result) result = a_series_result.align(b_series_result) - a_index_expected = MultiIndex.from_tuples(data_expected[0]) - b_index_expected = MultiIndex.from_tuples(data_expected[1]) + a_index_expected = MultiIndex.from_arrays(data_expected[0]) + b_index_expected = MultiIndex.from_arrays(data_expected[1]) a_series_expected = Series(data_expected[2], index=a_index_expected) b_series_expected = Series(data_expected[3], index=b_index_expected) - a_series_expected.index = a_series_expected.index.set_levels( - [ - a_series_expected.index.levels[0].astype("float"), - a_series_expected.index.levels[1].astype("float"), - ] - ) - b_series_expected.index = b_series_expected.index.set_levels( - [ - b_series_expected.index.levels[0].astype("float"), - b_series_expected.index.levels[1].astype("float"), - ] - ) tm.assert_series_equal(result[0], a_series_expected) tm.assert_series_equal(result[1], b_series_expected) From f1f300672b0531630f36876523fbd1ef8e2ab03c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 1 Nov 2022 15:45:34 +0100 Subject: [PATCH 2/7] Fix tests --- pandas/tests/indexes/multi/test_indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 6b1c06bb7aba3..337f91e0f89b4 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -477,9 +477,9 @@ def test_get_indexer_nan(self): idx2 = MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"]) expected = np.array([-1, 1]) result = idx2.get_indexer(idx1) - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) result = idx1.get_indexer(idx2) - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) def test_getitem(idx): From da727a22840d905c26fb6db921ced015453285f4 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 1 Nov 2022 19:04:57 +0100 Subject: [PATCH 3/7] Use variable for shifts --- pandas/_libs/index.pyx | 7 +++++-- pandas/core/indexes/multi.py | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 4e29c67f99dcf..4f7ebfb666edd 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -36,6 +36,8 @@ from pandas._libs.missing cimport ( is_matching_na, ) +multiindex_nulls_shift = 2 + cdef inline bint is_definitely_invalid_key(object val): try: @@ -652,8 +654,9 @@ cdef class BaseMultiIndexCodesEngine: # with positive integers (-1 for NaN becomes 1). This enables us to # differentiate between values that are missing in other and matching # NaNs. We will set values that are not found to 0 later: - codes = (np.array(labels, dtype='int64').T + 2).astype('uint64', - copy=False) + codes = (np.array( + labels, dtype='int64').T + multiindex_nulls_shift + ).astype('uint64', copy=False) self.level_has_nans = [-1 in lab for lab in labels] # Map each codes combination in the index to an integer unambiguously diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 04582a08353f3..9a32fd057307b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -28,6 +28,7 @@ lib, ) from pandas._libs.hashtable import duplicated +from pandas._libs.index import multiindex_nulls_shift from pandas._typing import ( AnyAll, AnyArrayLike, @@ -1091,7 +1092,9 @@ def _engine(self): # level, as log2 of their sizes: # NaN values are shifted to 1 and missing values in other while # calculating the indexer are shifted to 0 - sizes = np.ceil(np.log2([len(level) + 2 for level in self.levels])) + sizes = np.ceil( + np.log2([len(level) + multiindex_nulls_shift for level in self.levels]) + ) # Sum bit counts, starting from the _right_.... lev_bits = np.cumsum(sizes[::-1])[::-1] From 7f9b6ffe5cb743982fbdafc767ffa0f534941752 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 1 Nov 2022 20:58:12 +0100 Subject: [PATCH 4/7] Fix mypy --- pandas/core/indexes/multi.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9a32fd057307b..64c00e44f99bd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -28,7 +28,6 @@ lib, ) from pandas._libs.hashtable import duplicated -from pandas._libs.index import multiindex_nulls_shift from pandas._typing import ( AnyAll, AnyArrayLike, @@ -1093,7 +1092,13 @@ def _engine(self): # NaN values are shifted to 1 and missing values in other while # calculating the indexer are shifted to 0 sizes = np.ceil( - np.log2([len(level) + multiindex_nulls_shift for level in self.levels]) + np.log2( + [ + len(level) + + libindex.multiindex_nulls_shift # type: ignore[attr-defined] + for level in self.levels + ] + ) ) # Sum bit counts, starting from the _right_.... From 0ae08278d21e85aceea76b4d7be0fcc9312f7dfb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 2 Nov 2022 19:41:07 +0100 Subject: [PATCH 5/7] Add gh ref --- pandas/tests/indexing/multiindex/test_multiindex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 67cd307052a48..157f0de632e18 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -188,7 +188,7 @@ def test_subtracting_two_series_with_unordered_index_and_all_nan_index( self, data_result, data_expected ): # GH 38439 - # TODO: Refactor. This is impossible to understand + # TODO: Refactor. This is impossible to understand GH#49443 a_index_result = MultiIndex.from_tuples(data_result[0]) b_index_result = MultiIndex.from_tuples(data_result[1]) a_series_result = Series(data_result[2], index=a_index_result) From f7e92839e3787ff207199ede8671543bcbbd9805 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Wed, 2 Nov 2022 19:41:38 +0100 Subject: [PATCH 6/7] Update pandas/_libs/index.pyx Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_libs/index.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 4f7ebfb666edd..f8fe5be670749 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -803,7 +803,7 @@ cdef class BaseMultiIndexCodesEngine: if not isinstance(key, tuple): raise KeyError(key) try: - indices = [1 if checknull(v) else lev.get_loc(v) + 2 + indices = [1 if checknull(v) else lev.get_loc(v) + multiindex_nulls_shift for lev, v in zip(self.levels, key)] except KeyError: raise KeyError(key) From 00cc5cd2c465c5614dee878e3b28ec9a5927a817 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 3 Nov 2022 23:46:28 +0100 Subject: [PATCH 7/7] Adress review --- pandas/_libs/index.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f8fe5be670749..27edc83c6f329 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -36,6 +36,7 @@ from pandas._libs.missing cimport ( is_matching_na, ) +# Defines shift of MultiIndex codes to avoid negative codes (missing values) multiindex_nulls_shift = 2 @@ -654,9 +655,8 @@ cdef class BaseMultiIndexCodesEngine: # with positive integers (-1 for NaN becomes 1). This enables us to # differentiate between values that are missing in other and matching # NaNs. We will set values that are not found to 0 later: - codes = (np.array( - labels, dtype='int64').T + multiindex_nulls_shift - ).astype('uint64', copy=False) + labels_arr = np.array(labels, dtype='int64').T + multiindex_nulls_shift + codes = labels_arr.astype('uint64', copy=False) self.level_has_nans = [-1 in lab for lab in labels] # Map each codes combination in the index to an integer unambiguously