diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 607a2c02944b4..7071289ef3243 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -161,8 +161,14 @@ Missing MultiIndex ^^^^^^^^^^ +- Bug in :meth:`Dataframe.loc` when used with a :class:`MultiIndex`. The returned values were not in the same order as the given inputs (:issue:`22797`) -- +.. ipython:: python + + df = pd.DataFrame(np.arange(4), + index=[["a", "a", "b", "b"], [1, 2, 1, 2]]) + # Rows are now ordered as the requested keys + df.loc[(['b', 'a'], [2, 1]), :] - I/O diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4af9901d79a46..c560d81ba95f6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,6 +1,6 @@ import datetime from sys import getsizeof -from typing import Any, Hashable, List, Optional, Sequence, Union +from typing import Any, Hashable, Iterable, List, Optional, Sequence, Tuple, Union import warnings import numpy as np @@ -9,6 +9,7 @@ from pandas._libs import Timestamp, algos as libalgos, index as libindex, lib, tslibs from pandas._libs.hashtable import duplicated_int64 +from pandas._typing import AnyArrayLike, ArrayLike, Scalar from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly @@ -3081,9 +3082,69 @@ def _update_indexer(idxr, indexer=indexer): # empty indexer if indexer is None: return Int64Index([])._ndarray_values + + indexer = self._reorder_indexer(seq, indexer) + return indexer._ndarray_values - # -------------------------------------------------------------------- + def _reorder_indexer( + self, seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...], indexer: ArrayLike + ) -> ArrayLike: + """ + Reorder an indexer of a MultiIndex (self) so that the label are in the + same order as given in seq + + Parameters + ---------- + seq : label/slice/list/mask or a sequence of such + indexer: an Int64Index indexer of self + + Returns + ------- + indexer : a sorted Int64Index indexer of self ordered as seq + """ + # If the index is lexsorted and the list_like label in seq are sorted + # then we do not need to sort + if self.is_lexsorted(): + need_sort = False + for i, k in enumerate(seq): + if is_list_like(k): + if not need_sort: + k_codes = self.levels[i].get_indexer(k) + k_codes = k_codes[k_codes >= 0] # Filter absent keys + # True if the given codes are not ordered + need_sort = (k_codes[:-1] > k_codes[1:]).any() + # Bail out if both index and seq are sorted + if not need_sort: + return indexer + + n = len(self) + keys: Tuple[np.ndarray, ...] = tuple() + # For each level of the sequence in seq, map the level codes with the + # order they appears in a list-like sequence + # This mapping is then use to reorder the indexer + for i, k in enumerate(seq): + if com.is_bool_indexer(k): + new_order = np.arange(n)[indexer] + elif is_list_like(k): + # Generate a map with all level codes as sorted initially + key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len( + self.levels[i] + ) + # Set order as given in the indexer list + level_indexer = self.levels[i].get_indexer(k) + level_indexer = level_indexer[level_indexer >= 0] # Filter absent keys + key_order_map[level_indexer] = np.arange(len(level_indexer)) + + new_order = key_order_map[self.codes[i][indexer]] + else: + # For all other case, use the same order as the level + new_order = np.arange(n)[indexer] + keys = (new_order,) + keys + + # Find the reordering using lexsort on the keys mapping + ind = np.lexsort(keys) + return indexer[ind] def truncate(self, before=None, after=None): """ diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 640cd8faf6811..b377ca2869bd3 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2534,3 +2534,29 @@ def test_sort_ascending_list(self): result = s.sort_index(level=["third", "first"], ascending=[False, True]) expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]] tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "keys, expected", + [ + (["b", "a"], [["b", "b", "a", "a"], [1, 2, 1, 2]]), + (["a", "b"], [["a", "a", "b", "b"], [1, 2, 1, 2]]), + ((["a", "b"], [1, 2]), [["a", "a", "b", "b"], [1, 2, 1, 2]]), + ((["a", "b"], [2, 1]), [["a", "a", "b", "b"], [2, 1, 2, 1]]), + ((["b", "a"], [2, 1]), [["b", "b", "a", "a"], [2, 1, 2, 1]]), + ((["b", "a"], [1, 2]), [["b", "b", "a", "a"], [1, 2, 1, 2]]), + ((["c", "a"], [2, 1]), [["c", "a", "a"], [1, 2, 1]]), + ], + ) + @pytest.mark.parametrize("dim", ["index", "columns"]) + def test_multilevel_index_loc_order(self, dim, keys, expected): + # GH 22797 + # Try to respect order of keys given for MultiIndex.loc + kwargs = {dim: [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]} + df = pd.DataFrame(np.arange(25).reshape(5, 5), **kwargs,) + exp_index = MultiIndex.from_arrays(expected) + if dim == "index": + res = df.loc[keys, :] + tm.assert_index_equal(res.index, exp_index) + elif dim == "columns": + res = df.loc[:, keys] + tm.assert_index_equal(res.columns, exp_index)