From 4bd7e62041a44a69978694c8ab12c7f3e600a483 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sun, 29 Sep 2019 17:24:12 +0200 Subject: [PATCH 1/9] TST: Test for issue #22797 Testing return order of MultiIndex.loc MultiIndex.loc tries to return the result in the same order as the keys given. --- pandas/tests/indexes/multi/test_indexing.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index ad6f06d065150..dd1da055bb757 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -525,3 +525,21 @@ def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx): idx = MultiIndex.from_arrays(index_arr) result = idx.slice_locs(start=start_idx, end=end_idx) assert result == expected + + +def test_multiindex_loc_order(): + # GH 22797 + # Try to respect order of keys given for MultiIndex.loc + df = pd.DataFrame( + np.arange(12).reshape((4, 3)), + index=[["a", "a", "b", "b"], [1, 2, 1, 2]], + columns=[["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], + ) + + res = df.loc[["b", "a"], :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["b", "a"], [2, 1]), :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [2, 1, 2, 1]]) + tm.assert_index_equal(res.index, exp_index) From 694c769ebe48cc6bbc5bfa6f7c69ed6fefc98094 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sun, 6 Oct 2019 23:04:24 +0200 Subject: [PATCH 2/9] BUG: loc() return in order for MultiIndex Dataframe From issue #22797. loc() did not respect the order of the given keys for a MultiIndex DataFrame, although it does for a single-level index. When possible, the returned rows now respect the given order of the keys. 
--- pandas/core/indexes/multi.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 84d7399cc4f2d..08f9635d5d4d0 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2993,9 +2993,10 @@ def _update_indexer(idxr, indexer=indexer): indexer = Index(np.arange(n)) if idxr is None: return indexer - return indexer & idxr + return idxr & indexer - for i, k in enumerate(seq): + for i, k in enumerate(reversed(seq)): + i = len(seq) - 1 - i # Counting in reverse if com.is_bool_indexer(k): # a boolean indexer, must be the same length! @@ -3011,21 +3012,31 @@ def _update_indexer(idxr, indexer=indexer): idxrs = _convert_to_indexer( self._get_level_indexer(x, level=i, indexer=indexer) ) - indexers = idxrs if indexers is None else indexers | idxrs + # We intersect with indexer to make idxrs + # ordered as previously seen indexes + if indexer is not None: + idxrs = indexer.intersection(idxrs) + + indexers = ( + idxrs + if indexers is None + else indexers.union(idxrs, sort=False) + ) except KeyError: - # ignore not founds continue if indexers is not None: - indexer = _update_indexer(indexers, indexer=indexer) + # No need to update anymore, as we intersect in main loop + indexer = indexers else: # no matches we are done return Int64Index([])._ndarray_values elif com.is_null_slice(k): # empty slice - indexer = _update_indexer(None, indexer=indexer) + # index is given to conserve the order of this level + indexer = _update_indexer(Int64Index(np.arange(n)), indexer=indexer) elif isinstance(k, slice): From 14372a50b2bb799c64d52be9437c0add3c3b545b Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Wed, 23 Oct 2019 19:40:35 +0200 Subject: [PATCH 3/9] PERF: Improve performance for sorted label input --- pandas/core/indexes/multi.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git 
a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 08f9635d5d4d0..0b7bd6bdc5e86 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2995,6 +2995,7 @@ def _update_indexer(idxr, indexer=indexer): return indexer return idxr & indexer + need_sort = False for i, k in enumerate(reversed(seq)): i = len(seq) - 1 - i # Counting in reverse @@ -3007,14 +3008,19 @@ def _update_indexer(idxr, indexer=indexer): # a collection of labels to include from this level (these # are or'd) indexers = None + # Find out if the list_like label are sorted as the levels or not + k_codes = np.array( + [self.levels[i].get_loc(e) for e in k if e in self.levels[i]] + ) + need_sort = not (k_codes[:-1] < k_codes[1:]).all() for x in k: try: idxrs = _convert_to_indexer( self._get_level_indexer(x, level=i, indexer=indexer) ) - # We intersect with indexer to make idxrs - # ordered as previously seen indexes - if indexer is not None: + if need_sort and indexer is not None: + # We intersect with indexer to make idxrs + # ordered as previously seen indexes idxrs = indexer.intersection(idxrs) indexers = ( @@ -3027,8 +3033,10 @@ def _update_indexer(idxr, indexer=indexer): continue if indexers is not None: - # No need to update anymore, as we intersect in main loop - indexer = indexers + if need_sort: + indexer = indexers + else: + indexer = _update_indexer(indexers, indexer=indexer) else: # no matches we are done return Int64Index([])._ndarray_values From a4a50347f5a7271f0f84201b95712cd027df0d6c Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sun, 15 Dec 2019 20:11:28 +0100 Subject: [PATCH 4/9] TST: Selection of columns and slice(None) --- pandas/tests/indexes/multi/test_indexing.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index dd1da055bb757..3125c28add8a3 100644 --- 
a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -543,3 +543,19 @@ def test_multiindex_loc_order(): res = df.loc[(["b", "a"], [2, 1]), :] exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [2, 1, 2, 1]]) tm.assert_index_equal(res.index, exp_index) + + res = df.loc[:, ["Colorado", "Ohio"]] + exp_columns = pd.MultiIndex.from_arrays( + [["Colorado", "Ohio", "Ohio"], ["Green", "Green", "Red"]] + ) + tm.assert_index_equal(res.columns, exp_columns) + + res = df.loc[:, (["Colorado", "Ohio"], ["Red", "Green"])] + exp_columns = pd.MultiIndex.from_arrays( + [["Colorado", "Ohio", "Ohio"], ["Green", "Red", "Green"]] + ) + tm.assert_index_equal(res.columns, exp_columns) + + res = df.loc[(slice(None), [2, 1]), :] + exp_index = pd.MultiIndex.from_arrays([["a", "b", "a", "b"], [2, 2, 1, 1]]) + tm.assert_index_equal(res.index, exp_index) From 854b5f30804aa39cd3e9de27c51fcd10a9891a04 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sun, 15 Dec 2019 20:14:19 +0100 Subject: [PATCH 5/9] =?UTF-8?q?FIX:=C2=A0Ensure=20indexer=20is=20sort=20wh?= =?UTF-8?q?en=20needed.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandas/core/indexes/multi.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0b7bd6bdc5e86..cf40134c0ce10 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3008,11 +3008,12 @@ def _update_indexer(idxr, indexer=indexer): # a collection of labels to include from this level (these # are or'd) indexers = None - # Find out if the list_like label are sorted as the levels or not - k_codes = np.array( - [self.levels[i].get_loc(e) for e in k if e in self.levels[i]] - ) - need_sort = not (k_codes[:-1] < k_codes[1:]).all() + if not need_sort: + # Find out if the list_like label are sorted as the levels or not + 
k_codes = np.array( + [self.levels[i].get_loc(e) for e in k if e in self.levels[i]] + ) + need_sort = not (k_codes[:-1] < k_codes[1:]).all() for x in k: try: idxrs = _convert_to_indexer( @@ -3044,7 +3045,7 @@ def _update_indexer(idxr, indexer=indexer): elif com.is_null_slice(k): # empty slice # index is given to conserve the order of this level - indexer = _update_indexer(Int64Index(np.arange(n)), indexer=indexer) + indexer = _update_indexer(None, indexer=indexer) elif isinstance(k, slice): From 4ea8570b34b74da878845aac9787034f4824c692 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sun, 15 Dec 2019 21:50:06 +0100 Subject: [PATCH 6/9] Revert slice(None) test We need to order the slice(None) to comply with test from pandas/tests/indexing/multiindex/test_slice.py test_per_axis_per_level_doc_examples --- pandas/core/indexes/multi.py | 2 +- pandas/tests/indexes/multi/test_indexing.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index cf40134c0ce10..dfbc9cf4ddeb5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3045,7 +3045,7 @@ def _update_indexer(idxr, indexer=indexer): elif com.is_null_slice(k): # empty slice # index is given to conserve the order of this level - indexer = _update_indexer(None, indexer=indexer) + indexer = _update_indexer(Int64Index(np.arange(n)), indexer=indexer) elif isinstance(k, slice): diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 3125c28add8a3..4fc3035994eba 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -555,7 +555,3 @@ def test_multiindex_loc_order(): [["Colorado", "Ohio", "Ohio"], ["Green", "Red", "Green"]] ) tm.assert_index_equal(res.columns, exp_columns) - - res = df.loc[(slice(None), [2, 1]), :] - exp_index = pd.MultiIndex.from_arrays([["a", "b", "a", "b"], 
[2, 2, 1, 1]]) - tm.assert_index_equal(res.index, exp_index) From 07133b0be03293d66a1aa57a8cfbce85a8ebca3b Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Mon, 16 Dec 2019 19:38:39 +0100 Subject: [PATCH 7/9] CLN: Remove _update_indexer function --- pandas/core/indexes/multi.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index dfbc9cf4ddeb5..707bbdaff1852 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2996,18 +2996,19 @@ def _update_indexer(idxr, indexer=indexer): return idxr & indexer need_sort = False + indexer = Index(np.arange(n)) for i, k in enumerate(reversed(seq)): i = len(seq) - 1 - i # Counting in reverse if com.is_bool_indexer(k): # a boolean indexer, must be the same length! k = np.asarray(k) - indexer = _update_indexer(_convert_to_indexer(k), indexer=indexer) + indexer = _convert_to_indexer(k) & indexer elif is_list_like(k): # a collection of labels to include from this level (these # are or'd) - indexers = None + indexers_union = None if not need_sort: # Find out if the list_like label are sorted as the levels or not k_codes = np.array( @@ -3024,20 +3025,20 @@ def _update_indexer(idxr, indexer=indexer): # ordered as previously seen indexes idxrs = indexer.intersection(idxrs) - indexers = ( + indexers_union = ( idxrs - if indexers is None - else indexers.union(idxrs, sort=False) + if indexers_union is None + else indexers_union.union(idxrs, sort=False) ) except KeyError: # ignore not founds continue - if indexers is not None: + if indexers_union is not None: if need_sort: - indexer = indexers + indexer = indexers_union else: - indexer = _update_indexer(indexers, indexer=indexer) + indexer = indexers_union & indexer else: # no matches we are done return Int64Index([])._ndarray_values @@ -3045,24 +3046,24 @@ def _update_indexer(idxr, indexer=indexer): elif 
com.is_null_slice(k): # empty slice # index is given to conserve the order of this level - indexer = _update_indexer(Int64Index(np.arange(n)), indexer=indexer) + # See test TestMultiIndexSlicers.test_per_axis_per_level_doc_examples in tests/indexings + indexer = Int64Index(np.arange(n)) & indexer elif isinstance(k, slice): - # a slice, include BOTH of the labels - indexer = _update_indexer( + indexer = ( _convert_to_indexer( self._get_level_indexer(k, level=i, indexer=indexer) - ), - indexer=indexer, + ) + & indexer ) else: # a single label - indexer = _update_indexer( + indexer = ( _convert_to_indexer( self.get_loc_level(k, level=i, drop_level=False)[0] - ), - indexer=indexer, + ) + & indexer ) # empty indexer From 0c9c37a8ed53bfedb78ecada3dd70533d1c08b4c Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Mon, 16 Dec 2019 20:18:05 +0100 Subject: [PATCH 8/9] CLN: Fix a pep8 issue --- pandas/core/indexes/multi.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 707bbdaff1852..a577cd49978b5 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3046,7 +3046,6 @@ def _update_indexer(idxr, indexer=indexer): elif com.is_null_slice(k): # empty slice # index is given to conserve the order of this level - # See test TestMultiIndexSlicers.test_per_axis_per_level_doc_examples in tests/indexings indexer = Int64Index(np.arange(n)) & indexer elif isinstance(k, slice): From 43df8813d165f8fcf1e13dc0f5d372d89c457a6f Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Sat, 11 Jan 2020 11:36:17 +0100 Subject: [PATCH 9/9] TST: Move test to test_multilevel.py --- pandas/tests/indexes/multi/test_indexing.py | 30 -------------- pandas/tests/test_multilevel.py | 45 +++++++++++++++++++++ 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/pandas/tests/indexes/multi/test_indexing.py 
b/pandas/tests/indexes/multi/test_indexing.py index 4fc3035994eba..ad6f06d065150 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -525,33 +525,3 @@ def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx): idx = MultiIndex.from_arrays(index_arr) result = idx.slice_locs(start=start_idx, end=end_idx) assert result == expected - - -def test_multiindex_loc_order(): - # GH 22797 - # Try to respect order of keys given for MultiIndex.loc - df = pd.DataFrame( - np.arange(12).reshape((4, 3)), - index=[["a", "a", "b", "b"], [1, 2, 1, 2]], - columns=[["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], - ) - - res = df.loc[["b", "a"], :] - exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[(["b", "a"], [2, 1]), :] - exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [2, 1, 2, 1]]) - tm.assert_index_equal(res.index, exp_index) - - res = df.loc[:, ["Colorado", "Ohio"]] - exp_columns = pd.MultiIndex.from_arrays( - [["Colorado", "Ohio", "Ohio"], ["Green", "Green", "Red"]] - ) - tm.assert_index_equal(res.columns, exp_columns) - - res = df.loc[:, (["Colorado", "Ohio"], ["Red", "Green"])] - exp_columns = pd.MultiIndex.from_arrays( - [["Colorado", "Ohio", "Ohio"], ["Green", "Red", "Green"]] - ) - tm.assert_index_equal(res.columns, exp_columns) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 5382ad84bcca2..427a0bc0c9869 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2471,3 +2471,48 @@ def test_sort_ascending_list(self): result = s.sort_index(level=["third", "first"], ascending=[False, True]) expected = s.iloc[[0, 4, 1, 5, 2, 6, 3, 7]] tm.assert_series_equal(result, expected) + + def test_multiindex_loc_order(self): + # GH 22797 + # Try to respect order of keys given for MultiIndex.loc + df = pd.DataFrame( + 
np.arange(12).reshape((4, 3)), + index=[["a", "a", "b", "b"], [1, 2, 1, 2]], + columns=[["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], + ) + + res = df.loc[["b", "a"], :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[["a", "b"], :] + exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["a", "b"], [1, 2]), :] + exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["a", "b"], [2, 1]), :] + exp_index = pd.MultiIndex.from_arrays([["a", "a", "b", "b"], [2, 1, 2, 1]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["b", "a"], [2, 1]), :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [2, 1, 2, 1]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[(["b", "a"], [1, 2]), :] + exp_index = pd.MultiIndex.from_arrays([["b", "b", "a", "a"], [1, 2, 1, 2]]) + tm.assert_index_equal(res.index, exp_index) + + res = df.loc[:, ["Colorado", "Ohio"]] + exp_columns = pd.MultiIndex.from_arrays( + [["Colorado", "Ohio", "Ohio"], ["Green", "Green", "Red"]] + ) + tm.assert_index_equal(res.columns, exp_columns) + + res = df.loc[:, (["Colorado", "Ohio"], ["Red", "Green"])] + exp_columns = pd.MultiIndex.from_arrays( + [["Colorado", "Ohio", "Ohio"], ["Green", "Red", "Green"]] + ) + tm.assert_index_equal(res.columns, exp_columns)