BUG: Fix reindexing with multi-indexed DataFrames (#30766)

ChrisRobo · web-flow · commit 7222318fb1cc · 2020-04-08T13:22:43.000-04:00
diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py
@@ -74,10 +74,38 @@ def setup(self):
             ],
             dtype=object,
         )
+        self.other_mi_many_mismatches = MultiIndex.from_tuples(
+            [
+                (-7, 41),
+                (-2, 3),
+                (-0.7, 5),
+                (0, 0),
+                (0, 1.5),
+                (0, 340),
+                (0, 1001),
+                (1, -4),
+                (1, 20),
+                (1, 1040),
+                (432, -5),
+                (432, 17),
+                (439, 165.5),
+                (998, -4),
+                (998, 24065),
+                (999, 865.2),
+                (999, 1000),
+                (1045, -843),
+            ]
+        )
 
     def time_get_indexer(self):
         self.mi_int.get_indexer(self.obj_index)
 
+    def time_get_indexer_and_backfill(self):
+        self.mi_int.get_indexer(self.other_mi_many_mismatches, method="backfill")
+
+    def time_get_indexer_and_pad(self):
+        self.mi_int.get_indexer(self.other_mi_many_mismatches, method="pad")
+
     def time_is_monotonic(self):
         self.mi_int.is_monotonic
 
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -121,6 +121,67 @@ Backwards incompatible API changes
   Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`)
 - :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`)
 - Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`)
+
+``MultiIndex.get_indexer`` interprets `method` argument differently
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This restores the behavior of :meth:`MultiIndex.get_indexer` with ``method='backfill'`` or ``method='pad'`` to the behavior before pandas 0.23.0. In particular, MultiIndexes are treated as a list of tuples and padding or backfilling is done with respect to the ordering of these lists of tuples (:issue:`29896`).
+
+As an example of this, given:
+
+.. ipython:: python
+
+        df = pd.DataFrame({
+            'a': [0, 0, 0, 0],
+            'b': [0, 2, 3, 4],
+            'c': ['A', 'B', 'C', 'D'],
+        }).set_index(['a', 'b'])
+        mi_2 = pd.MultiIndex.from_product([[0], [-1, 0, 1, 3, 4, 5]])
+
+The differences in reindexing ``df`` with ``mi_2`` and using ``method='backfill'`` can be seen here:
+
+*pandas >= 0.23, < 1.1.0*
+
+.. code-block:: ipython
+
+    In [1]: df.reindex(mi_2, method='backfill')
+    Out[1]:
+          c
+    0 -1  A
+       0  A
+       1  D
+       3  A
+       4  A
+       5  C
+
+*pandas < 0.23, >= 1.1.0*
+
+.. ipython:: python
+
+        df.reindex(mi_2, method='backfill')
+
+And the differences in reindexing ``df`` with ``mi_2`` and using ``method='pad'`` can be seen here:
+
+*pandas >= 0.23, < 1.1.0*
+
+.. code-block:: ipython
+
+    In [1]: df.reindex(mi_2, method='pad')
+    Out[1]:
+            c
+    0 -1  NaN
+       0  NaN
+       1    D
+       3  NaN
+       4    A
+       5    C
+
+*pandas < 0.23, >= 1.1.0*
+
+.. ipython:: python
+
+        df.reindex(mi_2, method='pad')
+
 -
 
 .. _whatsnew_110.api_breaking.indexing_raises_key_errors:
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -612,25 +612,113 @@ cdef class BaseMultiIndexCodesEngine:
                        in zip(self.levels, zip(*target))]
         return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
 
-    def get_indexer(self, object target, object method=None,
-                    object limit=None):
+    def get_indexer_no_fill(self, object target) -> np.ndarray:
+        """
+        Returns an array giving the positions of each value of `target` in
+        `self.values`, where -1 represents a value in `target` which does not
+        appear in `self.values`
+
+        Parameters
+        ----------
+        target : list-like of keys
+            Each key is a tuple, with a label for each level of the index
+
+        Returns
+        -------
+        np.ndarray[int64_t, ndim=1] of the indexer of `target` into
+        `self.values`
+        """
         lab_ints = self._extract_level_codes(target)
+        return self._base.get_indexer(self, lab_ints)
 
-        # All methods (exact, backfill, pad) directly map to the respective
-        # methods of the underlying (integers) index...
-        if method is not None:
-            # but underlying backfill and pad methods require index and keys
-            # to be sorted. The index already is (checked in
-            # Index._get_fill_indexer), sort (integer representations of) keys:
-            order = np.argsort(lab_ints)
-            lab_ints = lab_ints[order]
-            indexer = (getattr(self._base, f'get_{method}_indexer')
-                       (self, lab_ints, limit=limit))
-            indexer = indexer[order]
-        else:
-            indexer = self._base.get_indexer(self, lab_ints)
+    def get_indexer(self, object target, object values = None,
+                    object method = None, object limit = None) -> np.ndarray:
+        """
+        Returns an array giving the positions of each value of `target` in
+        `values`, where -1 represents a value in `target` which does not
+        appear in `values`
 
-        return indexer
+        If `method` is "backfill" then the position for a value in `target`
+        which does not appear in `values` is that of the next greater value
+        in `values` (if one exists), and -1 if there is no such value.
+
+        Similarly, if the method is "pad" then the position for a value in
+        `target` which does not appear in `values` is that of the next smaller
+        value in `values` (if one exists), and -1 if there is no such value.
+
+        Parameters
+        ----------
+        target : list-like of tuples
+            need not be sorted, but all must have the same length, which must be
+            the same as the length of all tuples in `values`
+        values : list-like of tuples, optional
+            must be sorted and all have the same length.  Should be the set of
+            the MultiIndex's values.  Needed only if `method` is not None
+        method : str, optional
+            "backfill" or "pad"; if None, only exact matches are located
+        limit : int, optional
+            if provided, limit the number of fills to this value
+
+        Returns
+        -------
+        np.ndarray[int64_t, ndim=1] of the indexer of `target` into `values`,
+        filled with the `method` (and optionally `limit`) specified
+        """
+        if method is None:
+            return self.get_indexer_no_fill(target)
+
+        assert method in ("backfill", "pad")
+        cdef:
+            int64_t i, j, next_code
+            int64_t num_values, num_target_values
+            ndarray[int64_t, ndim=1] target_order
+            ndarray[object, ndim=1] target_values
+            ndarray[int64_t, ndim=1] new_codes, new_target_codes
+            ndarray[int64_t, ndim=1] sorted_indexer
+
+        target_order = np.argsort(target.values).astype('int64')
+        target_values = target.values[target_order]
+        num_values, num_target_values = len(values), len(target_values)
+        new_codes, new_target_codes = (
+            np.empty((num_values,)).astype('int64'),
+            np.empty((num_target_values,)).astype('int64'),
+        )
+
+        # `values` and `target_values` are both sorted, so we walk through them
+        # and memoize the (ordered) set of indices in the (implicit) merged and
+        # sorted list of the two which belong to each of them.
+        # The effect of this is to create a factorization for the (sorted)
+        # merger of the index values, where `new_codes` and `new_target_codes`
+        # are the subset of the factors which appear in `values` and `target`,
+        # respectively.
+        i, j, next_code = 0, 0, 0
+        while i < num_values and j < num_target_values:
+            val, target_val = values[i], target_values[j]
+            if val <= target_val:
+                new_codes[i] = next_code
+                i += 1
+            if target_val <= val:
+                new_target_codes[j] = next_code
+                j += 1
+            next_code += 1
+
+        # at this point, at least one should have reached the end
+        # the remaining values of the other should be added to the end
+        assert i == num_values or j == num_target_values
+        while i < num_values:
+            new_codes[i] = next_code
+            i += 1
+            next_code += 1
+        while j < num_target_values:
+            new_target_codes[j] = next_code
+            j += 1
+            next_code += 1
+
+        # get the indexer, and undo the sorting of `target.values`
+        sorted_indexer = (
+            algos.backfill if method == "backfill" else algos.pad
+        )(new_codes, new_target_codes, limit=limit).astype('int64')
+        return sorted_indexer[np.argsort(target_order)]
 
     def get_loc(self, object key):
         if is_definitely_invalid_key(key):
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -2455,7 +2455,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
                 raise NotImplementedError(
                     "tolerance not implemented yet for MultiIndex"
                 )
-            indexer = self._engine.get_indexer(target, method, limit)
+            indexer = self._engine.get_indexer(
+                values=self.values, target=target, method=method, limit=limit
+            )
         elif method == "nearest":
             raise NotImplementedError(
                 "method='nearest' not implemented yet "
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -1432,6 +1432,81 @@ def test_set_value_resize(self, float_frame):
         with pytest.raises(ValueError, match=msg):
             res._set_value("foobar", "baz", "sam")
 
+    def test_reindex_with_multi_index(self):
+        # https://github.com/pandas-dev/pandas/issues/29896
+        # tests for reindexing a multi-indexed DataFrame with a new MultiIndex
+        #
+        # confirms that we can reindex a multi-indexed DataFrame with a new
+        # MultiIndex object correctly when using no filling, backfilling, and
+        # padding
+        #
+        # The DataFrame, `df`, used in this test is:
+        #       c
+        #  a b
+        # -1 0  A
+        #    1  B
+        #    2  C
+        #    3  D
+        #    4  E
+        #    5  F
+        #    6  G
+        #  0 0  A
+        #    1  B
+        #    2  C
+        #    3  D
+        #    4  E
+        #    5  F
+        #    6  G
+        #  1 0  A
+        #    1  B
+        #    2  C
+        #    3  D
+        #    4  E
+        #    5  F
+        #    6  G
+        #
+        # and the other MultiIndex, `new_multi_index`, is:
+        # 0: 0 0.5
+        # 1:   2.0
+        # 2:   5.0
+        # 3:   5.8
+        df = pd.DataFrame(
+            {
+                "a": [-1] * 7 + [0] * 7 + [1] * 7,
+                "b": list(range(7)) * 3,
+                "c": ["A", "B", "C", "D", "E", "F", "G"] * 3,
+            }
+        ).set_index(["a", "b"])
+        new_index = [0.5, 2.0, 5.0, 5.8]
+        new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"])
+
+        # reindexing w/o a `method` value
+        reindexed = df.reindex(new_multi_index)
+        expected = pd.DataFrame(
+            {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]}
+        ).set_index(["a", "b"])
+        tm.assert_frame_equal(expected, reindexed)
+
+        # reindexing with backfilling
+        expected = pd.DataFrame(
+            {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]}
+        ).set_index(["a", "b"])
+        reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill")
+        tm.assert_frame_equal(expected, reindexed_with_backfilling)
+
+        reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill")
+        tm.assert_frame_equal(expected, reindexed_with_backfilling)
+
+        # reindexing with padding
+        expected = pd.DataFrame(
+            {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]}
+        ).set_index(["a", "b"])
+        reindexed_with_padding = df.reindex(new_multi_index, method="pad")
+        tm.assert_frame_equal(expected, reindexed_with_padding)
+
+        reindexed_with_padding = df.reindex(new_multi_index, method="ffill")
+        tm.assert_frame_equal(expected, reindexed_with_padding)
+
     def test_set_value_with_index_dtype_change(self):
         df_orig = DataFrame(np.random.randn(3, 3), index=range(3), columns=list("ABC"))
 
diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py