Skip to content

BUG: Preserve key order when using loc on MultiIndex DataFrame #28933

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,14 @@ Missing

MultiIndex
^^^^^^^^^^
- Bug in :meth:`DataFrame.loc` when used with a :class:`MultiIndex`: the returned values were not in the same order as the given keys (:issue:`22797`)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a mini-example (in a separate sub-section)? I think this is very hard to visualize from the text alone, but the basic example we are using is very clear.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure thing.


-
.. ipython:: python

    df = pd.DataFrame(np.arange(4),
                      index=[["a", "a", "b", "b"], [1, 2, 1, 2]])
    # Rows are now ordered as the requested keys
    df.loc[(['b', 'a'], [2, 1]), :]
-

I/O
Expand Down
65 changes: 63 additions & 2 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import datetime
from sys import getsizeof
from typing import Any, Hashable, List, Optional, Sequence, Union
from typing import Any, Hashable, Iterable, List, Optional, Sequence, Tuple, Union
import warnings

import numpy as np
Expand All @@ -9,6 +9,7 @@

from pandas._libs import Timestamp, algos as libalgos, index as libindex, lib, tslibs
from pandas._libs.hashtable import duplicated_int64
from pandas._typing import AnyArrayLike, ArrayLike, Scalar
from pandas.compat.numpy import function as nv
from pandas.errors import PerformanceWarning, UnsortedIndexError
from pandas.util._decorators import Appender, cache_readonly
Expand Down Expand Up @@ -3081,9 +3082,69 @@ def _update_indexer(idxr, indexer=indexer):
# empty indexer
if indexer is None:
return Int64Index([])._ndarray_values

indexer = self._reorder_indexer(seq, indexer)

return indexer._ndarray_values

# --------------------------------------------------------------------
def _reorder_indexer(
    self, seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...], indexer: ArrayLike
) -> ArrayLike:
    """
    Reorder an indexer of this MultiIndex so that the selected labels
    follow the order in which they are given in ``seq``.

    Parameters
    ----------
    seq : label/slice/list/mask or a sequence of such
        One entry per level; only list-like entries impose an ordering.
    indexer : an Int64Index indexer of self

    Returns
    -------
    indexer : an Int64Index indexer of self ordered as seq
    """
    # Fast path: when the index is lexsorted and every list-like entry of
    # seq lists its labels in level order, the indexer is already ordered
    # as requested and is returned unchanged.
    if self.is_lexsorted():
        for level_pos, labels in enumerate(seq):
            if not is_list_like(labels):
                continue
            requested = self.levels[level_pos].get_indexer(labels)
            requested = requested[requested >= 0]  # Drop labels absent from level
            if (requested[:-1] > requested[1:]).any():
                # Codes are not ascending: a reorder is required
                break
        else:
            # No level asked for an out-of-order selection
            return indexer

    size = len(self)
    # One sort key per level, collected in level order. Each key ranks the
    # selected rows according to the position requested for that level.
    per_level: List[np.ndarray] = []
    for level_pos, labels in enumerate(seq):
        if com.is_bool_indexer(labels) or not is_list_like(labels):
            # Masks, scalars and slices impose no order of their own:
            # rank the rows by their current position in the index
            rank = np.arange(size)[indexer]
        else:
            n_labels = len(self.levels[level_pos])
            # Every code starts with the largest rank (sorts last) ...
            rank_of_code = np.full(n_labels, n_labels, dtype=np.uint64)
            # ... then the requested labels get ranks 0, 1, 2, ... in the
            # order they were given
            requested = self.levels[level_pos].get_indexer(labels)
            requested = requested[requested >= 0]  # Drop labels absent from level
            rank_of_code[requested] = np.arange(len(requested))

            rank = rank_of_code[self.codes[level_pos][indexer]]
        per_level.append(rank)

    # np.lexsort treats its LAST key as the primary one, so the keys are
    # passed in reverse to make the first level the most significant.
    order = np.lexsort(tuple(reversed(per_level)))
    return indexer[order]

def truncate(self, before=None, after=None):
"""
Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2534,3 +2534,29 @@ def test_sort_ascending_list(self):
result = s.sort_index(level=["third", "first"], ascending=[False, True])
expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]]
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize(
    "keys, expected",
    [
        (["b", "a"], [["b", "b", "a", "a"], [1, 2, 1, 2]]),
        (["a", "b"], [["a", "a", "b", "b"], [1, 2, 1, 2]]),
        ((["a", "b"], [1, 2]), [["a", "a", "b", "b"], [1, 2, 1, 2]]),
        ((["a", "b"], [2, 1]), [["a", "a", "b", "b"], [2, 1, 2, 1]]),
        ((["b", "a"], [2, 1]), [["b", "b", "a", "a"], [2, 1, 2, 1]]),
        ((["b", "a"], [1, 2]), [["b", "b", "a", "a"], [1, 2, 1, 2]]),
        ((["c", "a"], [2, 1]), [["c", "a", "a"], [1, 2, 1]]),
    ],
)
@pytest.mark.parametrize("dim", ["index", "columns"])
def test_multilevel_index_loc_order(self, dim, keys, expected):
    """
    Check that ``.loc`` on a MultiIndex axis returns labels in the order of
    the requested keys, for both the row and the column axis.
    """
    # GH 22797
    # The same two-level labels are placed on whichever axis is under test
    labels = [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]
    frame = pd.DataFrame(np.arange(25).reshape(5, 5), **{dim: labels})
    wanted = MultiIndex.from_arrays(expected)

    if dim == "index":
        selected = frame.loc[keys, :]
        tm.assert_index_equal(selected.index, wanted)
    elif dim == "columns":
        selected = frame.loc[:, keys]
        tm.assert_index_equal(selected.columns, wanted)