From 2753c797b6ff4eb3df56f484ad866c5c1d17b807 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sun, 29 Sep 2019 17:24:12 +0200 Subject: [PATCH 01/20] TST: Test for issue #22797 Testing return order of MultiIndex.loc MultiIndex.loc try to return the result in the same order as the key given. --- pandas/tests/indexes/multi/test_indexing.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index b08280a712642..ebe23919a9d5c 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -438,6 +438,7 @@ def test_timestamp_multiindex_indexer(): ) should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo") tm.assert_series_equal(result, should_be) +<<<<<<< HEAD def test_get_loc_with_values_including_missing_values(): @@ -526,3 +527,21 @@ def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx): idx = MultiIndex.from_arrays(index_arr) result = idx.slice_locs(start=start_idx, end=end_idx) assert result == expected + + +def test_multiindex_loc_order(): + # GH 22797 + # Try to respect order of keys given for MultiIndex.loc + df = pd.DataFrame( + np.arange(12).reshape((4, 3)), + index=[["a", "a", "b", "b"], [1, 2, 1, 2]], + columns=[["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], + ) + + res = df.loc[["b", "a"], :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["b", "a"], [2, 1]), :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [2, 1, 2, 1]]) + tm.assert_index_equal(res.index, exp_index) From af5c678fcecdb595ffc5304b922bb789a0a62833 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Tue, 8 Oct 2019 00:55:39 +0200 Subject: [PATCH 02/20] BUG: sort MultiIndex DataFrame loc result 
From issue #22797. When given a list-like object as indexer, the returned result did not respect the order of the indexer, but the order of the MultiIndex levels. --- pandas/core/indexes/multi.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4af9901d79a46..083c5874ea0a3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3081,6 +3081,30 @@ def _update_indexer(idxr, indexer=indexer): # empty indexer if indexer is None: return Int64Index([])._ndarray_values + + # Generate tuples of keys by wich to order the results + keys = tuple() + for i, k in enumerate(seq): + if com.is_bool_indexer(k): + new_order = np.arange(n)[indexer] + elif is_list_like(k): + # Generate a map with all level codes as sorted initially + key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len( + self.levels[i] + ) + # Set order as given in the indexer list + for p, e in enumerate(k): + if e in self.levels[i]: + key_order_map[self.levels[i].get_loc(e)] = p + new_order = key_order_map[self.codes[i][indexer]] + else: + # For all other case, use the same order as the level + new_order = np.arange(n)[indexer] + keys = (new_order,) + keys + if len(keys) > 0: + ind = np.lexsort(keys) + indexer = indexer[ind] + return indexer._ndarray_values # -------------------------------------------------------------------- From dd53a91e3f09c31d90604532107dd472e563e432 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Tue, 8 Oct 2019 22:00:52 +0200 Subject: [PATCH 03/20] PERF: Skip sort of MultiIndex DataFrame loc result if not needed Test whether the result of the loc function needs to be sorted so that it is returned in the same order as the indexer. If not, skip the sort to improve performance. 
--- pandas/core/indexes/multi.py | 52 ++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 083c5874ea0a3..6777a6e415300 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3028,6 +3028,7 @@ def _update_indexer(idxr, indexer=indexer): return indexer return indexer & idxr + need_sort = False for i, k in enumerate(seq): if com.is_bool_indexer(k): @@ -3039,12 +3040,21 @@ def _update_indexer(idxr, indexer=indexer): # a collection of labels to include from this level (these # are or'd) indexers = None + start_pos = 0 for x in k: try: idxrs = _convert_to_indexer( self._get_level_indexer(x, level=i, indexer=indexer) ) indexers = idxrs if indexers is None else indexers | idxrs + + if not need_sort: + next_key_pos = self.levels[i].get_loc(x) + if next_key_pos < start_pos: + need_sort = True + else: + start_pos = next_key_pos + except KeyError: # ignore not founds @@ -3082,26 +3092,28 @@ def _update_indexer(idxr, indexer=indexer): if indexer is None: return Int64Index([])._ndarray_values - # Generate tuples of keys by wich to order the results - keys = tuple() - for i, k in enumerate(seq): - if com.is_bool_indexer(k): - new_order = np.arange(n)[indexer] - elif is_list_like(k): - # Generate a map with all level codes as sorted initially - key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len( - self.levels[i] - ) - # Set order as given in the indexer list - for p, e in enumerate(k): - if e in self.levels[i]: - key_order_map[self.levels[i].get_loc(e)] = p - new_order = key_order_map[self.codes[i][indexer]] - else: - # For all other case, use the same order as the level - new_order = np.arange(n)[indexer] - keys = (new_order,) + keys - if len(keys) > 0: + # Generate tuples of keys by which to order the results + if need_sort: + keys = tuple() + for i, k in enumerate(seq): + if com.is_bool_indexer(k): + new_order = 
np.arange(n)[indexer] + elif is_list_like(k): + # Generate a map with all level codes as sorted initially + key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len( + self.levels[i] + ) + # Set order as given in the indexer list + for p, e in enumerate(k): + if e in self.levels[i]: + key_order_map[self.levels[i].get_loc(e)] = p + new_order = key_order_map[self.codes[i][indexer]] + # Testing if the sort order of the result shoud be modified + else: + # For all other case, use the same order as the level + new_order = np.arange(n)[indexer] + keys = (new_order,) + keys + ind = np.lexsort(keys) indexer = indexer[ind] From 97b952d1c4b580851e5442f2da75c910efdedbac Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Wed, 23 Oct 2019 20:14:32 +0200 Subject: [PATCH 04/20] CLN: Some code simplification --- pandas/core/indexes/multi.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 6777a6e415300..f67511ae4f761 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3039,8 +3039,13 @@ def _update_indexer(idxr, indexer=indexer): elif is_list_like(k): # a collection of labels to include from this level (these # are or'd) + # Find out if the list_like label are sorted as the levels or not + if not need_sort: + k_codes = np.array( + [self.levels[i].get_loc(e) for e in k if e in self.levels[i]] + ) + need_sort = not (k_codes[:-1] < k_codes[1:]).all() indexers = None - start_pos = 0 for x in k: try: idxrs = _convert_to_indexer( @@ -3048,13 +3053,6 @@ def _update_indexer(idxr, indexer=indexer): ) indexers = idxrs if indexers is None else indexers | idxrs - if not need_sort: - next_key_pos = self.levels[i].get_loc(x) - if next_key_pos < start_pos: - need_sort = True - else: - start_pos = next_key_pos - except KeyError: # ignore not founds @@ -3108,7 +3106,6 @@ def _update_indexer(idxr, indexer=indexer): if e in 
self.levels[i]: key_order_map[self.levels[i].get_loc(e)] = p new_order = key_order_map[self.codes[i][indexer]] - # Testing if the sort order of the result shoud be modified else: # For all other case, use the same order as the level new_order = np.arange(n)[indexer] From 3fa3c6dcffab189dd1fb599e3bc08839ed1928f5 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Mon, 18 Nov 2019 15:51:49 +0100 Subject: [PATCH 05/20] CLN: Move code into separate function Move code from get_locs to _reorder_indexer. Better use of get_indexer to get level_code location. --- pandas/core/indexes/multi.py | 71 +++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 26 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f67511ae4f761..f92bb943a674f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -42,6 +42,7 @@ ensure_index, ) from pandas.core.indexes.frozen import FrozenList +from pandas.core.indexes.numeric import Int64Index import pandas.core.missing as missing from pandas.core.sorting import ( get_group_index, @@ -3041,9 +3042,8 @@ def _update_indexer(idxr, indexer=indexer): # are or'd) # Find out if the list_like label are sorted as the levels or not if not need_sort: - k_codes = np.array( - [self.levels[i].get_loc(e) for e in k if e in self.levels[i]] - ) + k_codes = self.levels[i].get_indexer(k) + k_codes = k_codes[k_codes >= 0] # Filter absent keys need_sort = not (k_codes[:-1] < k_codes[1:]).all() indexers = None for x in k: @@ -3090,33 +3090,52 @@ def _update_indexer(idxr, indexer=indexer): if indexer is None: return Int64Index([])._ndarray_values - # Generate tuples of keys by which to order the results if need_sort: - keys = tuple() - for i, k in enumerate(seq): - if com.is_bool_indexer(k): - new_order = np.arange(n)[indexer] - elif is_list_like(k): - # Generate a map with all level codes as sorted initially - key_order_map = np.ones(len(self.levels[i]), 
dtype=np.uint64) * len( - self.levels[i] - ) - # Set order as given in the indexer list - for p, e in enumerate(k): - if e in self.levels[i]: - key_order_map[self.levels[i].get_loc(e)] = p - new_order = key_order_map[self.codes[i][indexer]] - else: - # For all other case, use the same order as the level - new_order = np.arange(n)[indexer] - keys = (new_order,) + keys - - ind = np.lexsort(keys) - indexer = indexer[ind] + indexer = self._reorder_indexer(seq, indexer) return indexer._ndarray_values - # -------------------------------------------------------------------- + def _reorder_indexer(self, seq, indexer: Int64Index) -> Int64Index: + """ + Reorder an indexer of a MultiIndex (self) so that the label are in the + same order as given in seq + + Parameters + ---------- + seq : label/slice/list/mask or a sequence of such + indexer: an Int64Index for element of self + + Returns + ------- + indexer : a sorted Int64Index of element of self ordered as seq + """ + n = len(self) + keys = tuple() + # For each level of the sequence in seq, map the level codes with the + # order they appears in a list-like sequence + # This mapping is then use to reorder the indexer + for i, k in enumerate(seq): + if com.is_bool_indexer(k): + new_order = np.arange(n)[indexer] + elif is_list_like(k): + # Generate a map with all level codes as sorted initially + key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len( + self.levels[i] + ) + # Set order as given in the indexer list + level_indexer = self.levels[i].get_indexer(k) + level_indexer = level_indexer[level_indexer >= 0] # Filter absent keys + key_order_map[level_indexer] = np.arange(len(level_indexer)) + + new_order = key_order_map[self.codes[i][indexer]] + else: + # For all other case, use the same order as the level + new_order = np.arange(n)[indexer] + keys = (new_order,) + keys + + # Find the reordering using lexsort on the keys mapping + ind = np.lexsort(keys) + return indexer[ind] def truncate(self, before=None, 
after=None): """ From 8b5ec481b2f94533c1b956bf4642eeefaafb538f Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Mon, 18 Nov 2019 18:13:29 +0100 Subject: [PATCH 06/20] CLN: More typing and linting --- pandas/core/indexes/multi.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f92bb943a674f..7d3487600e444 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3109,8 +3109,10 @@ def _reorder_indexer(self, seq, indexer: Int64Index) -> Int64Index: ------- indexer : a sorted Int64Index of element of self ordered as seq """ + from typing import Tuple + n = len(self) - keys = tuple() + keys = tuple() # type: Tuple[np.ndarray, ...] # For each level of the sequence in seq, map the level codes with the # order they appears in a list-like sequence # This mapping is then use to reorder the indexer From fb336279d2acafeee42364fe8ac92543827467ee Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Mon, 18 Nov 2019 20:03:47 +0100 Subject: [PATCH 07/20] CLN: Improve readability and doc --- pandas/core/indexes/multi.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7d3487600e444..ac3eb1f8009e7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3044,7 +3044,9 @@ def _update_indexer(idxr, indexer=indexer): if not need_sort: k_codes = self.levels[i].get_indexer(k) k_codes = k_codes[k_codes >= 0] # Filter absent keys - need_sort = not (k_codes[:-1] < k_codes[1:]).all() + # True if the given codes are not ordered + need_sort = (k_codes[:-1] > k_codes[1:]).any() + indexers = None for x in k: try: @@ -3103,11 +3105,11 @@ def _reorder_indexer(self, seq, indexer: Int64Index) -> Int64Index: Parameters ---------- seq : label/slice/list/mask or a sequence of such - indexer: an Int64Index 
for element of self + indexer: an Int64Index indexer of self Returns ------- - indexer : a sorted Int64Index of element of self ordered as seq + indexer : a sorted Int64Index indexer of self ordered as seq """ from typing import Tuple From 2c6195f53bc6ee5f8af92dba3ae8e8e27f2041c5 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Mon, 18 Nov 2019 20:34:40 +0100 Subject: [PATCH 08/20] DOC: Add a whatsnew --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 6597b764581a4..bcb627995e8a5 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1160,7 +1160,7 @@ MultiIndex - Constructor for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`) - Series and MultiIndex `.drop` with `MultiIndex` raise exception if labels not in given in level (:issue:`8594`) -- +- Bug in :meth:`Dataframe.loc` when used with a :class:`MultiIndex`. 
The returned values were not in the same order as the given inputs (:issue:`22797`) I/O ^^^ From d911110d837691dc9d9639568055c1508735cac0 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Wed, 20 Nov 2019 23:00:33 +0100 Subject: [PATCH 09/20] CLN: More typing for _reorder_indexer --- pandas/core/indexes/multi.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ac3eb1f8009e7..ff95708022324 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,6 +1,6 @@ import datetime from sys import getsizeof -from typing import Any, Hashable, List, Optional, Sequence, Union +from typing import Any, Hashable, Iterable, List, Optional, Sequence, Tuple, Union import warnings import numpy as np @@ -9,6 +9,7 @@ from pandas._libs import Timestamp, algos as libalgos, index as libindex, lib, tslibs from pandas._libs.hashtable import duplicated_int64 +from pandas._typing import AnyArrayLike, ArrayLike, Scalar from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly @@ -42,7 +43,6 @@ ensure_index, ) from pandas.core.indexes.frozen import FrozenList -from pandas.core.indexes.numeric import Int64Index import pandas.core.missing as missing from pandas.core.sorting import ( get_group_index, @@ -3097,7 +3097,9 @@ def _update_indexer(idxr, indexer=indexer): return indexer._ndarray_values - def _reorder_indexer(self, seq, indexer: Int64Index) -> Int64Index: + def _reorder_indexer( + self, seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...], indexer: ArrayLike + ) -> ArrayLike: """ Reorder an indexer of a MultiIndex (self) so that the label are in the same order as given in seq @@ -3111,10 +3113,8 @@ def _reorder_indexer(self, seq, indexer: Int64Index) -> Int64Index: ------- indexer : a sorted Int64Index indexer of self 
ordered as seq """ - from typing import Tuple - n = len(self) - keys = tuple() # type: Tuple[np.ndarray, ...] + keys = tuple() # For each level of the sequence in seq, map the level codes with the # order they appears in a list-like sequence # This mapping is then use to reorder the indexer From 81edea5705fad93f8f6339f1cc642c44d75de92b Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Wed, 20 Nov 2019 23:38:17 +0100 Subject: [PATCH 10/20] TST: Add more test cases to test_multiindex_loc_order --- pandas/tests/indexes/multi/test_indexing.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index ebe23919a9d5c..7270ff42ba3b8 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -542,6 +542,22 @@ def test_multiindex_loc_order(): exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) tm.assert_index_equal(res.index, exp_index) + res = df.loc[["a", "b"], :] + exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["a", "b"], [1, 2]), :] + exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["a", "b"], [2, 1]), :] + exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [2, 1, 2, 1]]) + tm.assert_index_equal(res.index, exp_index) + res = df.loc[(["b", "a"], [2, 1]), :] exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [2, 1, 2, 1]]) tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["b", "a"], [1, 2]), :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) From f1407f1c1af9cecc96e4be31c359eb3e1090d20a Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> 
Date: Fri, 3 Jan 2020 19:14:40 +0100 Subject: [PATCH 11/20] TST: move test_multiindex_loc_order to tests/test_multilevel.py --- pandas/tests/indexes/multi/test_indexing.py | 35 --------------------- pandas/tests/test_multilevel.py | 34 ++++++++++++++++++++ 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 7270ff42ba3b8..b08280a712642 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -438,7 +438,6 @@ def test_timestamp_multiindex_indexer(): ) should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo") tm.assert_series_equal(result, should_be) -<<<<<<< HEAD def test_get_loc_with_values_including_missing_values(): @@ -527,37 +526,3 @@ def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx): idx = MultiIndex.from_arrays(index_arr) result = idx.slice_locs(start=start_idx, end=end_idx) assert result == expected - - -def test_multiindex_loc_order(): - # GH 22797 - # Try to respect order of keys given for MultiIndex.loc - df = pd.DataFrame( - np.arange(12).reshape((4, 3)), - index=[["a", "a", "b", "b"], [1, 2, 1, 2]], - columns=[["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], - ) - - res = df.loc[["b", "a"], :] - exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[["a", "b"], :] - exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[(["a", "b"], [1, 2]), :] - exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[(["a", "b"], [2, 1]), :] - exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [2, 1, 2, 1]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[(["b", "a"], [2, 1]), 
:] - exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [2, 1, 2, 1]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[(["b", "a"], [1, 2]), :] - exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 640cd8faf6811..109543f8d3b5d 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2534,3 +2534,37 @@ def test_sort_ascending_list(self): result = s.sort_index(level=["third", "first"], ascending=[False, True]) expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]] tm.assert_series_equal(result, expected) + + +def test_multiindex_loc_order(): + # GH 22797 + # Try to respect order of keys given for MultiIndex.loc + df = pd.DataFrame( + np.arange(12).reshape((4, 3)), + index=[["a", "a", "b", "b"], [1, 2, 1, 2]], + columns=[["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], + ) + + res = df.loc[["b", "a"], :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[["a", "b"], :] + exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["a", "b"], [1, 2]), :] + exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["a", "b"], [2, 1]), :] + exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [2, 1, 2, 1]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["b", "a"], [2, 1]), :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [2, 1, 2, 1]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["b", "a"], [1, 2]), :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) From 
4c667a7733112cd5c342fd61391d4b5ffc5c904e Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Fri, 3 Jan 2020 19:21:00 +0100 Subject: [PATCH 12/20] CLN: Move need_sort test in _reorder_indexer --- pandas/core/indexes/multi.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ff95708022324..bce927c7a508b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3029,7 +3029,6 @@ def _update_indexer(idxr, indexer=indexer): return indexer return indexer & idxr - need_sort = False for i, k in enumerate(seq): if com.is_bool_indexer(k): @@ -3040,13 +3039,6 @@ def _update_indexer(idxr, indexer=indexer): elif is_list_like(k): # a collection of labels to include from this level (these # are or'd) - # Find out if the list_like label are sorted as the levels or not - if not need_sort: - k_codes = self.levels[i].get_indexer(k) - k_codes = k_codes[k_codes >= 0] # Filter absent keys - # True if the given codes are not ordered - need_sort = (k_codes[:-1] > k_codes[1:]).any() - indexers = None for x in k: try: @@ -3054,7 +3046,6 @@ def _update_indexer(idxr, indexer=indexer): self._get_level_indexer(x, level=i, indexer=indexer) ) indexers = idxrs if indexers is None else indexers | idxrs - except KeyError: # ignore not founds @@ -3092,8 +3083,7 @@ def _update_indexer(idxr, indexer=indexer): if indexer is None: return Int64Index([])._ndarray_values - if need_sort: - indexer = self._reorder_indexer(seq, indexer) + indexer = self._reorder_indexer(seq, indexer) return indexer._ndarray_values @@ -3113,8 +3103,21 @@ def _reorder_indexer( ------- indexer : a sorted Int64Index indexer of self ordered as seq """ + # Find out if the list_like label are sorted as the levels or not + need_sort = False + for i, k in enumerate(seq): + if is_list_like(k): + if not need_sort: + k_codes = self.levels[i].get_indexer(k) + k_codes = 
k_codes[k_codes >= 0] # Filter absent keys + # True if the given codes are not ordered + need_sort = (k_codes[:-1] > k_codes[1:]).any() + # Bail out if no need to sort + if not need_sort: + return indexer + n = len(self) - keys = tuple() + keys: Tuple[np.ndarray, ...] = tuple() # For each level of the sequence in seq, map the level codes with the # order they appears in a list-like sequence # This mapping is then use to reorder the indexer From ee89a333aa9ea4b2d2c2fff053539b59635a5aae Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Fri, 3 Jan 2020 20:43:39 +0100 Subject: [PATCH 13/20] TST: Test also for columns --- pandas/tests/test_multilevel.py | 63 +++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 109543f8d3b5d..804d71ae0e676 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2535,36 +2535,47 @@ def test_sort_ascending_list(self): expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]] tm.assert_series_equal(result, expected) + def test_multiindex_loc_order(self): + # GH 22797 + # Try to respect order of keys given for MultiIndex.loc + df = pd.DataFrame( + np.arange(12).reshape((4, 3)), + index=[["a", "a", "b", "b"], [1, 2, 1, 2]], + columns=[["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], + ) -def test_multiindex_loc_order(): - # GH 22797 - # Try to respect order of keys given for MultiIndex.loc - df = pd.DataFrame( - np.arange(12).reshape((4, 3)), - index=[["a", "a", "b", "b"], [1, 2, 1, 2]], - columns=[["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], - ) + res = df.loc[["b", "a"], :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) - res = df.loc[["b", "a"], :] - exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) + 
res = df.loc[["a", "b"], :] + exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) - res = df.loc[["a", "b"], :] - exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) + res = df.loc[(["a", "b"], [1, 2]), :] + exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) - res = df.loc[(["a", "b"], [1, 2]), :] - exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) + res = df.loc[(["a", "b"], [2, 1]), :] + exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [2, 1, 2, 1]]) + tm.assert_index_equal(res.index, exp_index) - res = df.loc[(["a", "b"], [2, 1]), :] - exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [2, 1, 2, 1]]) - tm.assert_index_equal(res.index, exp_index) + res = df.loc[(["b", "a"], [2, 1]), :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [2, 1, 2, 1]]) + tm.assert_index_equal(res.index, exp_index) - res = df.loc[(["b", "a"], [2, 1]), :] - exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [2, 1, 2, 1]]) - tm.assert_index_equal(res.index, exp_index) + res = df.loc[(["b", "a"], [1, 2]), :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) - res = df.loc[(["b", "a"], [1, 2]), :] - exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) + res = df.loc[:, ["Colorado", "Ohio"]] + exp_columns = pd.MultiIndex.from_arrays( + [["Colorado", "Ohio", "Ohio"], ["Green", "Green", "Red"]] + ) + tm.assert_index_equal(res.columns, exp_columns) + + res = df.loc[:, (["Colorado", "Ohio"], ["Red", "Green"])] + exp_columns = pd.MultiIndex.from_arrays( + [["Colorado", "Ohio", "Ohio"], ["Green", "Red", "Green"]] + ) + 
tm.assert_index_equal(res.columns, exp_columns) From c18d60d8e1dae6577badbbf5e202125ef5496dd5 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Fri, 3 Jan 2020 20:44:00 +0100 Subject: [PATCH 14/20] FIX: The need-sort test only works on lexsorted indexes --- pandas/core/indexes/multi.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index bce927c7a508b..26dff1d9c6e32 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3113,7 +3113,8 @@ def _reorder_indexer( # True if the given codes are not ordered need_sort = (k_codes[:-1] > k_codes[1:]).any() # Bail out if no need to sort - if not need_sort: + # This is only true for a lexsorted index + if not need_sort and self.is_lexsorted(): return indexer n = len(self) From 1717a147e5d55d66a0eca733e9a8bddf318acebb Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sun, 19 Jan 2020 14:41:59 +0100 Subject: [PATCH 15/20] Minor change in how to determine if sorting is needed --- pandas/core/indexes/multi.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 26dff1d9c6e32..c560d81ba95f6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3103,19 +3103,20 @@ def _reorder_indexer( ------- indexer : a sorted Int64Index indexer of self ordered as seq """ - # Find out if the list_like label are sorted as the levels or not - need_sort = False - for i, k in enumerate(seq): - if is_list_like(k): - if not need_sort: - k_codes = self.levels[i].get_indexer(k) - k_codes = k_codes[k_codes >= 0] # Filter absent keys - # True if the given codes are not ordered - need_sort = (k_codes[:-1] > k_codes[1:]).any() - # Bail out if no need to sort - # This is only true for a lexsorted index - if not need_sort and self.is_lexsorted(): - 
return indexer + # If the index is lexsorted and the list_like label in seq are sorted + # then we do not need to sort + if self.is_lexsorted(): + need_sort = False + for i, k in enumerate(seq): + if is_list_like(k): + if not need_sort: + k_codes = self.levels[i].get_indexer(k) + k_codes = k_codes[k_codes >= 0] # Filter absent keys + # True if the given codes are not ordered + need_sort = (k_codes[:-1] > k_codes[1:]).any() + # Bail out if both index and seq are sorted + if not need_sort: + return indexer n = len(self) keys: Tuple[np.ndarray, ...] = tuple() From 2ab8e305da603a10a54f77623f73dfc49890ea51 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sun, 19 Jan 2020 15:50:38 +0100 Subject: [PATCH 16/20] PERF: Delete flag for sorting multiindex loc call. --- pandas/core/indexes/multi.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c560d81ba95f6..7a6414b01d261 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3103,21 +3103,6 @@ def _reorder_indexer( ------- indexer : a sorted Int64Index indexer of self ordered as seq """ - # If the index is lexsorted and the list_like label in seq are sorted - # then we do not need to sort - if self.is_lexsorted(): - need_sort = False - for i, k in enumerate(seq): - if is_list_like(k): - if not need_sort: - k_codes = self.levels[i].get_indexer(k) - k_codes = k_codes[k_codes >= 0] # Filter absent keys - # True if the given codes are not ordered - need_sort = (k_codes[:-1] > k_codes[1:]).any() - # Bail out if both index and seq are sorted - if not need_sort: - return indexer - n = len(self) keys: Tuple[np.ndarray, ...] 
= tuple() # For each level of the sequence in seq, map the level codes with the From edde717d96a177a947710a022b6a2a6bef348f0e Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sun, 19 Jan 2020 18:11:00 +0100 Subject: [PATCH 17/20] TST: Parametrize tests --- pandas/tests/test_multilevel.py | 65 ++++++++++++--------------------- 1 file changed, 23 insertions(+), 42 deletions(-) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 804d71ae0e676..b377ca2869bd3 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2535,47 +2535,28 @@ def test_sort_ascending_list(self): expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]] tm.assert_series_equal(result, expected) - def test_multiindex_loc_order(self): + @pytest.mark.parametrize( + "keys, expected", + [ + (["b", "a"], [["b", "b", "a", "a"], [1, 2, 1, 2]]), + (["a", "b"], [["a", "a", "b", "b"], [1, 2, 1, 2]]), + ((["a", "b"], [1, 2]), [["a", "a", "b", "b"], [1, 2, 1, 2]]), + ((["a", "b"], [2, 1]), [["a", "a", "b", "b"], [2, 1, 2, 1]]), + ((["b", "a"], [2, 1]), [["b", "b", "a", "a"], [2, 1, 2, 1]]), + ((["b", "a"], [1, 2]), [["b", "b", "a", "a"], [1, 2, 1, 2]]), + ((["c", "a"], [2, 1]), [["c", "a", "a"], [1, 2, 1]]), + ], + ) + @pytest.mark.parametrize("dim", ["index", "columns"]) + def test_multilevel_index_loc_order(self, dim, keys, expected): # GH 22797 # Try to respect order of keys given for MultiIndex.loc - df = pd.DataFrame( - np.arange(12).reshape((4, 3)), - index=[["a", "a", "b", "b"], [1, 2, 1, 2]], - columns=[["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], - ) - - res = df.loc[["b", "a"], :] - exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[["a", "b"], :] - exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[(["a", "b"], [1, 2]), :] - 
exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[(["a", "b"], [2, 1]), :] - exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [2, 1, 2, 1]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[(["b", "a"], [2, 1]), :] - exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [2, 1, 2, 1]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[(["b", "a"], [1, 2]), :] - exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[:, ["Colorado", "Ohio"]] - exp_columns = pd.MultiIndex.from_arrays( - [["Colorado", "Ohio", "Ohio"], ["Green", "Green", "Red"]] - ) - tm.assert_index_equal(res.columns, exp_columns) - - res = df.loc[:, (["Colorado", "Ohio"], ["Red", "Green"])] - exp_columns = pd.MultiIndex.from_arrays( - [["Colorado", "Ohio", "Ohio"], ["Green", "Red", "Green"]] - ) - tm.assert_index_equal(res.columns, exp_columns) + kwargs = {dim: [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]} + df = pd.DataFrame(np.arange(25).reshape(5, 5), **kwargs,) + exp_index = MultiIndex.from_arrays(expected) + if dim == "index": + res = df.loc[keys, :] + tm.assert_index_equal(res.index, exp_index) + elif dim == "columns": + res = df.loc[:, keys] + tm.assert_index_equal(res.columns, exp_index) From 82e51097f0b4e63e70012cc6b07e4434940b4871 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sun, 19 Jan 2020 18:15:57 +0100 Subject: [PATCH 18/20] DOC: Move whatsnew entry from v1.0.0 to v1.1.0 --- doc/source/whatsnew/v1.0.0.rst | 2 +- doc/source/whatsnew/v1.1.0.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index bcb627995e8a5..6597b764581a4 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1160,7 +1160,7 @@ MultiIndex - 
Constructor for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`) - Series and MultiIndex `.drop` with `MultiIndex` raise exception if labels not in given in level (:issue:`8594`) -- Bug in :meth:`Dataframe.loc` when used with a :class:`MultiIndex`. The returned values were not in the same order as the given inputs (:issue:`22797`) +- I/O ^^^ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 607a2c02944b4..951e484d7da7e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -161,7 +161,7 @@ Missing MultiIndex ^^^^^^^^^^ - +- Bug in :meth:`Dataframe.loc` when used with a :class:`MultiIndex`. The returned values were not in the same order as the given inputs (:issue:`22797`) - - From 33671097edf0cd582be13fc5cbc69676c243e756 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sun, 19 Jan 2020 18:18:57 +0100 Subject: [PATCH 19/20] Revert "PERF: Delete flag for sorting multiindex loc call." This reverts commit 7fee53c6234e20830f61ab096b332f8c7e421359. 
--- pandas/core/indexes/multi.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 7a6414b01d261..c560d81ba95f6 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3103,6 +3103,21 @@ def _reorder_indexer( ------- indexer : a sorted Int64Index indexer of self ordered as seq """ + # If the index is lexsorted and the list_like label in seq are sorted + # then we do not need to sort + if self.is_lexsorted(): + need_sort = False + for i, k in enumerate(seq): + if is_list_like(k): + if not need_sort: + k_codes = self.levels[i].get_indexer(k) + k_codes = k_codes[k_codes >= 0] # Filter absent keys + # True if the given codes are not ordered + need_sort = (k_codes[:-1] > k_codes[1:]).any() + # Bail out if both index and seq are sorted + if not need_sort: + return indexer + n = len(self) keys: Tuple[np.ndarray, ...] = tuple() # For each level of the sequence in seq, map the level codes with the From 025d304ae21f1003bc3dedff732ab9f16da77dee Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sun, 26 Jan 2020 19:48:03 +0100 Subject: [PATCH 20/20] DOC: Add mini example to whatsnew --- doc/source/whatsnew/v1.1.0.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 951e484d7da7e..7071289ef3243 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -162,7 +162,13 @@ Missing MultiIndex ^^^^^^^^^^ - Bug in :meth:`Dataframe.loc` when used with a :class:`MultiIndex`. The returned values were not in the same order as the given inputs (:issue:`22797`) -- + +.. ipython:: python + + df = pd.DataFrame(np.arange(4), + index=[["a", "a", "b", "b"], [1, 2, 1, 2]]) + # Rows are now ordered as the requested keys + df.loc[(['b', 'a'], [2, 1]), :] - I/O