diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 793f0c7c03c77..18dbb7eae0615 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -74,10 +74,38 @@ def setup(self): ], dtype=object, ) + self.other_mi_many_mismatches = MultiIndex.from_tuples( + [ + (-7, 41), + (-2, 3), + (-0.7, 5), + (0, 0), + (0, 1.5), + (0, 340), + (0, 1001), + (1, -4), + (1, 20), + (1, 1040), + (432, -5), + (432, 17), + (439, 165.5), + (998, -4), + (998, 24065), + (999, 865.2), + (999, 1000), + (1045, -843), + ] + ) def time_get_indexer(self): self.mi_int.get_indexer(self.obj_index) + def time_get_indexer_and_backfill(self): + self.mi_int.get_indexer(self.other_mi_many_mismatches, method="backfill") + + def time_get_indexer_and_pad(self): + self.mi_int.get_indexer(self.other_mi_many_mismatches, method="pad") + def time_is_monotonic(self): self.mi_int.is_monotonic diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 6f2b9b4f946c7..f3d4c8c557dd8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -115,6 +115,67 @@ Backwards incompatible API changes Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) - :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) - Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`) + +``MultiIndex.get_indexer`` interprets `method` argument differently +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This restores the behavior of :meth:`MultiIndex.get_indexer` with ``method='backfill'`` or 
``method='pad'`` to the behavior before pandas 0.23.0. In particular, MultiIndexes are treated as a list of tuples and padding or backfilling is done with respect to the ordering of these lists of tuples (:issue:`29896`). + +As an example of this, given: + +.. ipython:: python + + df = pd.DataFrame({ + 'a': [0, 0, 0, 0], + 'b': [0, 2, 3, 4], + 'c': ['A', 'B', 'C', 'D'], + }).set_index(['a', 'b']) + mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]]) + +The differences in reindexing ``df`` with ``mi_2`` and using ``method='backfill'`` can be seen here: + +*pandas >= 0.23, < 1.1.0* + +.. code-block:: ipython + + In [1]: df.reindex(mi_2, method='backfill') + Out[1]: + c + 0 -1 A + 0 A + 1 D + 3 A + 4 A + 5 C + +*pandas < 0.23 or >= 1.1.0* + +.. ipython:: python + + df.reindex(mi_2, method='backfill') + +And the differences in reindexing ``df`` with ``mi_2`` and using ``method='pad'`` can be seen here: + +*pandas >= 0.23, < 1.1.0* + +.. code-block:: ipython + + In [1]: df.reindex(mi_2, method='pad') + Out[1]: + c + 0 -1 NaN + 0 NaN + 1 D + 3 NaN + 4 A + 5 C + +*pandas < 0.23 or >= 1.1.0* + +.. ipython:: python + + df.reindex(mi_2, method='pad') + - ..
_whatsnew_110.api_breaking.indexing_raises_key_errors: diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 4a9b504ffb0d9..d8e0d9c6bd7ab 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -612,25 +612,113 @@ cdef class BaseMultiIndexCodesEngine: in zip(self.levels, zip(*target))] return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) - def get_indexer(self, object target, object method=None, - object limit=None): + def get_indexer_no_fill(self, object target) -> np.ndarray: + """ + Returns an array giving the positions of each value of `target` in + `self.values`, where -1 represents a value in `target` which does not + appear in `self.values` + + Parameters + ---------- + target : list-like of keys + Each key is a tuple, with a label for each level of the index + + Returns + ------- + np.ndarray[int64_t, ndim=1] of the indexer of `target` into + `self.values` + """ lab_ints = self._extract_level_codes(target) + return self._base.get_indexer(self, lab_ints) - # All methods (exact, backfill, pad) directly map to the respective - # methods of the underlying (integers) index... - if method is not None: - # but underlying backfill and pad methods require index and keys - # to be sorted. 
The index already is (checked in - # Index._get_fill_indexer), sort (integer representations of) keys: - order = np.argsort(lab_ints) - lab_ints = lab_ints[order] - indexer = (getattr(self._base, f'get_{method}_indexer') - (self, lab_ints, limit=limit)) - indexer = indexer[order] - else: - indexer = self._base.get_indexer(self, lab_ints) + def get_indexer(self, object target, object values = None, + object method = None, object limit = None) -> np.ndarray: + """ + Returns an array giving the positions of each value of `target` in + `values`, where -1 represents a value in `target` which does not + appear in `values` - return indexer + If `method` is "backfill" then the position for a value in `target` + which does not appear in `values` is that of the next greater value + in `values` (if one exists), and -1 if there is no such value. + + Similarly, if the method is "pad" then the position for a value in + `target` which does not appear in `values` is that of the next smaller + value in `values` (if one exists), and -1 if there is no such value. + + Parameters + ---------- + target: list-like of tuples + need not be sorted, but all must have the same length, which must be + the same as the length of all tuples in `values` + values : list-like of tuples + must be sorted and all have the same length. Should be the set of + the MultiIndex's values. 
Needed only if `method` is not None + method: string + "backfill" or "pad" + limit: int, optional + if provided, limit the number of fills to this value + + Returns + ------- + np.ndarray[int64_t, ndim=1] of the indexer of `target` into `values`, + filled with the `method` (and optionally `limit`) specified + """ + if method is None: + return self.get_indexer_no_fill(target) + + assert method in ("backfill", "pad") + cdef: + int64_t i, j, next_code + int64_t num_values, num_target_values + ndarray[int64_t, ndim=1] target_order + ndarray[object, ndim=1] target_values + ndarray[int64_t, ndim=1] new_codes, new_target_codes + ndarray[int64_t, ndim=1] sorted_indexer + + target_order = np.argsort(target.values).astype('int64') + target_values = target.values[target_order] + num_values, num_target_values = len(values), len(target_values) + new_codes, new_target_codes = ( + np.empty((num_values,)).astype('int64'), + np.empty((num_target_values,)).astype('int64'), + ) + + # `values` and `target_values` are both sorted, so we walk through them + # and memoize the (ordered) set of indices in the (implicit) merged-and + # sorted list of the two which belong to each of them + # the effect of this is to create a factorization for the (sorted) + # merger of the index values, where `new_codes` and `new_target_codes` + # are the subset of the factors which appear in `values` and `target`, + # respectively + i, j, next_code = 0, 0, 0 + while i < num_values and j < num_target_values: + val, target_val = values[i], target_values[j] + if val <= target_val: + new_codes[i] = next_code + i += 1 + if target_val <= val: + new_target_codes[j] = next_code + j += 1 + next_code += 1 + + # at this point, at least one should have reached the end + # the remaining values of the other should be added to the end + assert i == num_values or j == num_target_values + while i < num_values: + new_codes[i] = next_code + i += 1 + next_code += 1 + while j < num_target_values: + new_target_codes[j] = 
next_code + j += 1 + next_code += 1 + + # get the indexer, and undo the sorting of `target.values` + sorted_indexer = ( + algos.backfill if method == "backfill" else algos.pad + )(new_codes, new_target_codes, limit=limit).astype('int64') + return sorted_indexer[np.argsort(target_order)] def get_loc(self, object key): if is_definitely_invalid_key(key): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4e2d07ddf9225..7aa1456846612 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2455,7 +2455,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): raise NotImplementedError( "tolerance not implemented yet for MultiIndex" ) - indexer = self._engine.get_indexer(target, method, limit) + indexer = self._engine.get_indexer( + values=self.values, target=target, method=method, limit=limit + ) elif method == "nearest": raise NotImplementedError( "method='nearest' not implemented yet " diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index a71b4a0983c63..c3b9a7bf05c7b 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1432,6 +1432,81 @@ def test_set_value_resize(self, float_frame): with pytest.raises(ValueError, match=msg): res._set_value("foobar", "baz", "sam") + def test_reindex_with_multi_index(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests for reindexing a multi-indexed DataFrame with a new MultiIndex + # + # confirms that we can reindex a multi-indexed DataFrame with a new + # MultiIndex object correctly when using no filling, backfilling, and + # padding + # + # The DataFrame, `df`, used in this test is: + # c + # a b + # -1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 0 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # 1 0 A + # 1 B + # 2 C + # 3 D + # 4 E + # 5 F + # 6 G + # + # and the other MultiIndex, `new_multi_index`, is: + # 0: 
0 0.5 + # 1: 2.0 + # 2: 5.0 + # 3: 5.8 + df = pd.DataFrame( + { + "a": [-1] * 7 + [0] * 7 + [1] * 7, + "b": list(range(7)) * 3, + "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, + } + ).set_index(["a", "b"]) + new_index = [0.5, 2.0, 5.0, 5.8] + new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) + + # reindexing w/o a `method` value + reindexed = df.reindex(new_multi_index) + expected = pd.DataFrame( + {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]} + ).set_index(["a", "b"]) + tm.assert_frame_equal(expected, reindexed) + + # reindexing with backfilling + expected = pd.DataFrame( + {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]} + ).set_index(["a", "b"]) + reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") + tm.assert_frame_equal(expected, reindexed_with_backfilling) + + # reindexing with padding + expected = pd.DataFrame( + {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]} + ).set_index(["a", "b"]) + reindexed_with_padding = df.reindex(new_multi_index, method="pad") + tm.assert_frame_equal(expected, reindexed_with_padding) + + reindexed_with_padding = df.reindex(new_multi_index, method="ffill") + tm.assert_frame_equal(expected, reindexed_with_padding) + def test_set_value_with_index_dtype_change(self): df_orig = DataFrame(np.random.randn(3, 3), index=range(3), columns=list("ABC")) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index d104c773227d5..8c0dae433c8f4 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -239,6 +239,203 @@ def test_get_indexer_with_missing_value(self, index_arr, labels, expected): result = idx.get_indexer(labels) tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_methods(self): + # 
https://github.com/pandas-dev/pandas/issues/29896 + # test getting an indexer for another index with different methods + # confirms that getting an indexer without a filling method, getting an + # indexer and backfilling, and getting an indexer and padding all behave + # correctly in the case where all of the target values fall in between + # several levels in the MultiIndex into which they are getting an indexer + # + # visually, the MultiIndexes used in this test are: + # mult_idx_1: + # 0: -1 0 + # 1: 2 + # 2: 3 + # 3: 4 + # 4: 0 0 + # 5: 2 + # 6: 3 + # 7: 4 + # 8: 1 0 + # 9: 2 + # 10: 3 + # 11: 4 + # + # mult_idx_2: + # 0: 0 1 + # 1: 3 + # 2: 4 + mult_idx_1 = MultiIndex.from_product([[-1, 0, 1], [0, 2, 3, 4]]) + mult_idx_2 = MultiIndex.from_product([[0], [1, 3, 4]]) + + indexer = mult_idx_1.get_indexer(mult_idx_2) + expected = np.array([-1, 6, 7], dtype=indexer.dtype) + tm.assert_almost_equal(expected, indexer) + + backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="backfill") + expected = np.array([5, 6, 7], dtype=backfill_indexer.dtype) + tm.assert_almost_equal(expected, backfill_indexer) + + # ensure the legacy "bfill" option functions identically to "backfill" + backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill") + expected = np.array([5, 6, 7], dtype=backfill_indexer.dtype) + tm.assert_almost_equal(expected, backfill_indexer) + + pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="pad") + expected = np.array([4, 6, 7], dtype=pad_indexer.dtype) + tm.assert_almost_equal(expected, pad_indexer) + + # ensure the legacy "ffill" option functions identically to "pad" + pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill") + expected = np.array([4, 6, 7], dtype=pad_indexer.dtype) + tm.assert_almost_equal(expected, pad_indexer) + + def test_get_indexer_three_or_more_levels(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests get_indexer() on MultiIndexes with 3+ levels + # visually, these are + # 
mult_idx_1: + # 0: 1 2 5 + # 1: 7 + # 2: 4 5 + # 3: 7 + # 4: 6 5 + # 5: 7 + # 6: 3 2 5 + # 7: 7 + # 8: 4 5 + # 9: 7 + # 10: 6 5 + # 11: 7 + # + # mult_idx_2: + # 0: 1 1 8 + # 1: 1 5 9 + # 2: 1 6 7 + # 3: 2 1 6 + # 4: 2 7 7 + # 5: 2 7 8 + # 6: 3 6 8 + mult_idx_1 = pd.MultiIndex.from_product([[1, 3], [2, 4, 6], [5, 7]]) + mult_idx_2 = pd.MultiIndex.from_tuples( + [ + (1, 1, 8), + (1, 5, 9), + (1, 6, 7), + (2, 1, 6), + (2, 7, 7), + (2, 7, 8), + (3, 6, 8), + ] + ) + # sanity check + assert mult_idx_1.is_monotonic + assert mult_idx_1.is_unique + assert mult_idx_2.is_monotonic + assert mult_idx_2.is_unique + + # show the relationships between the two + assert mult_idx_2[0] < mult_idx_1[0] + assert mult_idx_1[3] < mult_idx_2[1] < mult_idx_1[4] + assert mult_idx_1[5] == mult_idx_2[2] + assert mult_idx_1[5] < mult_idx_2[3] < mult_idx_1[6] + assert mult_idx_1[5] < mult_idx_2[4] < mult_idx_1[6] + assert mult_idx_1[5] < mult_idx_2[5] < mult_idx_1[6] + assert mult_idx_1[-1] < mult_idx_2[6] + + indexer_no_fill = mult_idx_1.get_indexer(mult_idx_2) + expected = np.array([-1, -1, 5, -1, -1, -1, -1], dtype=indexer_no_fill.dtype) + tm.assert_almost_equal(expected, indexer_no_fill) + + # test with backfilling + indexer_backfilled = mult_idx_1.get_indexer(mult_idx_2, method="backfill") + expected = np.array([0, 4, 5, 6, 6, 6, -1], dtype=indexer_backfilled.dtype) + tm.assert_almost_equal(expected, indexer_backfilled) + + # now, the same thing, but forward-filled (aka "padded") + indexer_padded = mult_idx_1.get_indexer(mult_idx_2, method="pad") + expected = np.array([-1, 3, 5, 5, 5, 5, 11], dtype=indexer_padded.dtype) + tm.assert_almost_equal(expected, indexer_padded) + + # now, do the indexing in the other direction + assert mult_idx_2[0] < mult_idx_1[0] < mult_idx_2[1] + assert mult_idx_2[0] < mult_idx_1[1] < mult_idx_2[1] + assert mult_idx_2[0] < mult_idx_1[2] < mult_idx_2[1] + assert mult_idx_2[0] < mult_idx_1[3] < mult_idx_2[1] + assert mult_idx_2[1] < mult_idx_1[4] < mult_idx_2[2]
+ assert mult_idx_2[2] == mult_idx_1[5] + assert mult_idx_2[5] < mult_idx_1[6] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[7] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[8] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[9] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[10] < mult_idx_2[6] + assert mult_idx_2[5] < mult_idx_1[11] < mult_idx_2[6] + + indexer = mult_idx_2.get_indexer(mult_idx_1) + expected = np.array( + [-1, -1, -1, -1, -1, 2, -1, -1, -1, -1, -1, -1], dtype=indexer.dtype + ) + tm.assert_almost_equal(expected, indexer) + + backfill_indexer = mult_idx_2.get_indexer(mult_idx_1, method="bfill") + expected = np.array( + [1, 1, 1, 1, 2, 2, 6, 6, 6, 6, 6, 6], dtype=backfill_indexer.dtype + ) + tm.assert_almost_equal(expected, backfill_indexer) + + pad_indexer = mult_idx_2.get_indexer(mult_idx_1, method="pad") + expected = np.array( + [0, 0, 0, 0, 1, 2, 5, 5, 5, 5, 5, 5], dtype=pad_indexer.dtype + ) + tm.assert_almost_equal(expected, pad_indexer) + + def test_get_indexer_crossing_levels(self): + # https://github.com/pandas-dev/pandas/issues/29896 + # tests a corner case with get_indexer() with MultiIndexes where, when we + # need to "carry" across levels, proper tuple ordering is respected + # + # the MultiIndexes used in this test, visually, are: + # mult_idx_1: + # 0: 1 1 1 1 + # 1: 2 + # 2: 2 1 + # 3: 2 + # 4: 1 2 1 1 + # 5: 2 + # 6: 2 1 + # 7: 2 + # 8: 2 1 1 1 + # 9: 2 + # 10: 2 1 + # 11: 2 + # 12: 2 2 1 1 + # 13: 2 + # 14: 2 1 + # 15: 2 + # + # mult_idx_2: + # 0: 1 3 2 2 + # 1: 2 3 2 2 + mult_idx_1 = pd.MultiIndex.from_product([[1, 2]] * 4) + mult_idx_2 = pd.MultiIndex.from_tuples([(1, 3, 2, 2), (2, 3, 2, 2)]) + + # show the tuple orderings, which get_indexer() should respect + assert mult_idx_1[7] < mult_idx_2[0] < mult_idx_1[8] + assert mult_idx_1[-1] < mult_idx_2[1] + + indexer = mult_idx_1.get_indexer(mult_idx_2) + expected = np.array([-1, -1], dtype=indexer.dtype) + tm.assert_almost_equal(expected, indexer) + + 
backfill_indexer = mult_idx_1.get_indexer(mult_idx_2, method="bfill") + expected = np.array([8, -1], dtype=backfill_indexer.dtype) + tm.assert_almost_equal(expected, backfill_indexer) + + pad_indexer = mult_idx_1.get_indexer(mult_idx_2, method="ffill") + expected = np.array([7, 15], dtype=pad_indexer.dtype) + tm.assert_almost_equal(expected, pad_indexer) + def test_getitem(idx): # scalar