From 497c60d5138254f153d4b4decdb6f7011e17a487 Mon Sep 17 00:00:00 2001 From: Johannes Mueller Date: Fri, 12 Nov 2021 10:26:19 +0100 Subject: [PATCH 1/5] BUG: .get_indexer_non_unique() must return an array of ints (#44084) GH#44084 boils down to the following. According to the docs `.get_indexer_non_unique()` is supposed to return "integers from 0 to n - 1 indicating that the index at these positions matches the corresponding target values". However, for an index that is non unique and non monotonic it returns a boolean mask. That is because it uses `.get_loc()` which for non unique, non monotonic indexes returns a boolean mask. This patch catches that case and converts the boolean mask from `.get_loc()` into the corresponding array of integers if the index is not unique and not monotonic. --- pandas/core/indexes/interval.py | 2 ++ pandas/tests/indexes/interval/test_indexing.py | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 5791f89828ca3..885c922d1ee0f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -727,6 +727,8 @@ def _get_indexer_pointwise( if isinstance(locs, slice): # Only needed for get_indexer_non_unique locs = np.arange(locs.start, locs.stop, locs.step, dtype="intp") + elif not self.is_unique and not self.is_monotonic: + locs = np.where(locs)[0] locs = np.array(locs, ndmin=1) except KeyError: missing.append(i) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 8df8eef69e9c9..0bc98ee02789d 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -373,6 +373,16 @@ def test_get_indexer_with_nans(self): expected = np.array([0, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + def test_get_index_non_unique_non_monotonic(self): + # GH#44084 + index = IntervalIndex.from_tuples( + [(0.0, 1.0), (1.0, 2.0), (0.0, 1.0), (1.0, 2.0)] + ) + + result, _ = index.get_indexer_non_unique([Interval(1.0, 2.0)]) + expected = np.array([1, 3]) + tm.assert_numpy_array_equal(result, expected) + class TestSliceLocs: def test_slice_locs_with_interval(self): From ed708251fb2d191312b9131929bb4f45b8405553 Mon Sep 17 00:00:00 2001 From: Johannes Mueller Date: Fri, 12 Nov 2021 11:23:58 +0100 Subject: [PATCH 2/5] Add whatsnew entry about fix for (#44084) --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index d1e209adb1b8f..2d70f361ba9cd 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -515,7 +515,7 @@ Strings Interval ^^^^^^^^ -- +- Bug in :meth:`IntervalIndex.get_indexer_non_unique` returning boolean mask instead of array of integers for a non unique and non monotonic index (:issue:`44084`) - Indexing From 0163419d79ac7a6a8d08f4348d8f3ab68d4ab713 Mon Sep 17 00:00:00 2001 From: Johannes Mueller Date: Fri, 12 Nov 2021 11:59:31 +0100 Subject: [PATCH 3/5] Add MultiIndex test case, as shown in the bug report (#44084) --- pandas/tests/indexes/interval/test_indexing.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 0bc98ee02789d..f9e1ea03b99a8 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -8,8 +8,10 @@ from pandas import ( NA, CategoricalIndex, + Index, Interval, IntervalIndex, + MultiIndex, NaT, Timedelta, date_range, @@ -374,7 +376,7 @@ def test_get_indexer_with_nans(self): tm.assert_numpy_array_equal(result, expected) def test_get_index_non_unique_non_monotonic(self): - # GH#44084 + # GH#44084 (root cause) index = IntervalIndex.from_tuples( [(0.0, 1.0), (1.0, 2.0), (0.0, 1.0), (1.0, 2.0)] ) @@ -383,6 +385,20 @@ def test_get_index_non_unique_non_monotonic(self): expected = np.array([1, 3]) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_multiindex_with_intervals(self): + # GH#44084 (MultiIndex case as reported) + interval_index = IntervalIndex.from_tuples( + [(2.0, 3.0), (0.0, 1.0), (1.0, 2.0)], name="interval" + ) + foo_index = Index([1, 2, 3], name="foo") + + multi_index = MultiIndex.from_product([foo_index, interval_index]) + + chosen_interval_indexer = multi_index.get_level_values( + "interval" + ).get_indexer_for([Interval(0.0, 1.0)]) + tm.assert_numpy_array_equal(chosen_interval_indexer, np.array([1, 4, 7])) + class TestSliceLocs: def test_slice_locs_with_interval(self): From a3099977dcebee3512b2fff454a5b09e27a3f66b Mon Sep 17 00:00:00 2001 From: Johannes Mueller Date: Fri, 12 Nov 2021 12:18:17 +0100 Subject: [PATCH 4/5] Explicitly expect int64 arrays from .get_indexer_non_unique() On Windows `np.array([1, 3])` is obviously int32 and thus the comparison to the int64 array fails due to dtype mismatch. --- pandas/tests/indexes/interval/test_indexing.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index f9e1ea03b99a8..68120b9620ed3 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -382,7 +382,7 @@ def test_get_index_non_unique_non_monotonic(self): ) result, _ = index.get_indexer_non_unique([Interval(1.0, 2.0)]) - expected = np.array([1, 3]) + expected = np.array([1, 3], dtype=np.int64) tm.assert_numpy_array_equal(result, expected) def test_get_indexer_multiindex_with_intervals(self): @@ -394,10 +394,11 @@ def test_get_indexer_multiindex_with_intervals(self): multi_index = MultiIndex.from_product([foo_index, interval_index]) - chosen_interval_indexer = multi_index.get_level_values( - "interval" - ).get_indexer_for([Interval(0.0, 1.0)]) - tm.assert_numpy_array_equal(chosen_interval_indexer, np.array([1, 4, 7])) + result = multi_index.get_level_values("interval").get_indexer_for( + [Interval(0.0, 1.0)] + ) + expected = np.array([1, 4, 7], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) class TestSliceLocs: From 6910216eaed00f29987a7d41e6a55d9f065b6072 Mon Sep 17 00:00:00 2001 From: Johannes Mueller Date: Fri, 12 Nov 2021 13:35:06 +0100 Subject: [PATCH 5/5] Correctly choose the int dtype to be expected from indexing methods Sometimes the world out there is a bit more complicated than what you have on your cozy desktop :) --- pandas/tests/indexes/interval/test_indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 68120b9620ed3..75f7c69ce5300 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -382,7 +382,7 @@ def test_get_index_non_unique_non_monotonic(self): ) result, _ = index.get_indexer_non_unique([Interval(1.0, 2.0)]) - expected = np.array([1, 3], dtype=np.int64) + expected = np.array([1, 3], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) def test_get_indexer_multiindex_with_intervals(self): @@ -397,7 +397,7 @@ def test_get_indexer_multiindex_with_intervals(self): result = multi_index.get_level_values("interval").get_indexer_for( [Interval(0.0, 1.0)] ) - expected = np.array([1, 4, 7], dtype=np.int64) + expected = np.array([1, 4, 7], dtype=np.intp) tm.assert_numpy_array_equal(result, expected)