From db1503610cd2512928d09abacfccc1a1a33681df Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Tue, 1 Oct 2019 19:10:02 +0200 Subject: [PATCH 1/8] TST: Add tests for issue #28518 * test_lexsort_depth verifies that lexsort_depth returns the correct depth when sortorder is passed to the MultiIndex constructor * test_raise_invalid_sortorder tests that the MultiIndex constructor raises when passed an incorrect sortorder * test_merge_multiindex_columns tests the original issue --- pandas/tests/indexes/multi/test_indexing.py | 1 - pandas/tests/reshape/merge/test_merge.py | 29 +++++++++++++ pandas/tests/test_multilevel.py | 48 +++++++++++++++++++++ 3 files changed, 77 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index d366dbd8bc0a8..ec2e8aa6564a8 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -96,7 +96,6 @@ def test_slice_locs_not_contained(): index = MultiIndex( levels=[[0, 2, 4, 6], [0, 2, 4]], codes=[[0, 0, 0, 1, 1, 2, 3, 3, 3], [0, 1, 2, 1, 2, 2, 0, 1, 2]], - sortorder=0, ) result = index.slice_locs((1, 0), (5, 2)) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index a04f093ee7818..084b8d306df0c 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2094,3 +2094,32 @@ def test_merge_equal_cat_dtypes2(): # Categorical is unordered, so don't check ordering.
tm.assert_frame_equal(result, expected, check_categorical=False) + +def test_merge_multiindex_columns(): + # Issue #28518 + # Verify that merging two dataframes give the expected labels + # The original cause of this issue come from a bug lexsort_depth and is tested in + # test_lexsort_depth + + index_tuples=[] + letters = ["a", "b", "c", "d"] + numbers = ["1", "2", "3"] + + for l in letters: + for n in numbers: + index_tuples.append([l, n]) + + index = pd.MultiIndex.from_tuples(index_tuples, names=["outer", "inner"]) + + frame_x = pd.DataFrame(columns = index) + frame_x["id"]="" + + frame_y = pd.DataFrame(columns = index) + frame_y["id"]="" + + l_suf = '_x' + r_suf = '_y' + expected_labels = sum(([l + l_suf, l + r_suf] for l in letters), []) + merged_frame = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf))).columns + for label in expected_labels: + assert label in merged_frame diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index dc4db6e7902a8..b666dc55bdb8a 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2056,6 +2056,54 @@ def test_is_lexsorted(self): assert not index.is_lexsorted() assert index.lexsort_depth == 0 + def test_raise_invalid_sortorder(self): + # Test that the MultiIndex constructor raise when a incorrect sortorder is given + # Issue #28518 + + levels = [[0, 1], [0, 1, 2]] + + # Correct sortorder + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + sortorder=2, + ) + + with pytest.raises(ValueError, match=r".* sortorder 2 with lexsort_depth 1.*"): + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], + sortorder=2, + ) + + with pytest.raises(ValueError, match=r".* sortorder 1 with lexsort_depth 0.*"): + index = MultiIndex( + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], + sortorder=1, + ) + + def test_lexsort_depth(self): + # Test that lexsort_depth return the correct sortorder when it 
was given to the MultiIndex const. + # Issue #28518 + + levels = [[0, 1], [0, 1, 2]] + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], + sortorder=2, + ) + assert index.lexsort_depth == 2 + + index = MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], + sortorder=1, + ) + assert index.lexsort_depth == 1 + + index = MultiIndex( + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], + sortorder=0, + ) + assert index.lexsort_depth == 0 + def test_sort_index_and_reconstruction(self): # 15622 From 565f9827a8e0d23d0ba5ece8d0218583017f318a Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Tue, 1 Oct 2019 19:10:02 +0200 Subject: [PATCH 2/8] BUG: lexsort_depth now returns sortorder if sortorder is not set to None This fixes issue #28518, where the labels of the merged index were invalid due to an inconsistent lexsort_depth property of the intersection of the indexes --- pandas/core/indexes/multi.py | 18 +++++++++++---- pandas/tests/reshape/merge/test_merge.py | 17 +++++++------- pandas/tests/test_multilevel.py | 29 ++++++++++++------------ 3 files changed, 36 insertions(+), 28 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3273c4f8cd13b..a587913675dca 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -365,6 +365,15 @@ def _verify_integrity(self, codes=None, levels=None): "Level values must be unique: {values} on " "level {level}".format(values=[value for value in level], level=i) ) + if self.sortorder is not None: + if int(self.sortorder) > self._lexsort_depth(): + raise ValueError( + "Value for sortorder must be inferior or equal " + "to actual lexsort_depth: " + "sortorder {sortorder} with lexsort_depth {lexsort_depth}".format( + sortorder=self.sortorder, lexsort_depth=self._lexsort_depth() + ) + ) codes = [ self._validate_codes(level, code) for level, code in zip(levels, codes) @@ -1783,16
+1792,15 @@ def is_lexsorted(self): @cache_readonly def lexsort_depth(self): if self.sortorder is not None: - if self.sortorder == 0: - return self.nlevels - else: - return 0 + return int(self.sortorder) + + return self._lexsort_depth() + def _lexsort_depth(self): int64_codes = [ensure_int64(level_codes) for level_codes in self.codes] for k in range(self.nlevels, 0, -1): if libalgos.is_lexsorted(int64_codes[:k]): return k - return 0 def _sort_levels_monotonic(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 084b8d306df0c..142ece9c1c05b 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2095,13 +2095,14 @@ def test_merge_equal_cat_dtypes2(): # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) + def test_merge_multiindex_columns(): # Issue #28518 # Verify that merging two dataframes give the expected labels # The original cause of this issue come from a bug lexsort_depth and is tested in # test_lexsort_depth - - index_tuples=[] + + index_tuples = [] letters = ["a", "b", "c", "d"] numbers = ["1", "2", "3"] @@ -2111,14 +2112,14 @@ def test_merge_multiindex_columns(): index = pd.MultiIndex.from_tuples(index_tuples, names=["outer", "inner"]) - frame_x = pd.DataFrame(columns = index) - frame_x["id"]="" + frame_x = pd.DataFrame(columns=index) + frame_x["id"] = "" - frame_y = pd.DataFrame(columns = index) - frame_y["id"]="" + frame_y = pd.DataFrame(columns=index) + frame_y["id"] = "" - l_suf = '_x' - r_suf = '_y' + l_suf = "_x" + r_suf = "_y" expected_labels = sum(([l + l_suf, l + r_suf] for l in letters), []) merged_frame = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf))).columns for label in expected_labels: diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index b666dc55bdb8a..4a60d3966a9bb 100644 --- a/pandas/tests/test_multilevel.py +++ 
b/pandas/tests/test_multilevel.py @@ -2063,44 +2063,43 @@ def test_raise_invalid_sortorder(self): levels = [[0, 1], [0, 1, 2]] # Correct sortorder - index = MultiIndex( - levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], - sortorder=2, + MultiIndex( + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2 ) with pytest.raises(ValueError, match=r".* sortorder 2 with lexsort_depth 1.*"): - index = MultiIndex( - levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], + MultiIndex( + levels=levels, + codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=2, ) with pytest.raises(ValueError, match=r".* sortorder 1 with lexsort_depth 0.*"): - index = MultiIndex( - levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], + MultiIndex( + levels=levels, + codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=1, - ) + ) def test_lexsort_depth(self): - # Test that lexsort_depth return the correct sortorder when it was given to the MultiIndex const. + # Test that lexsort_depth return the correct sortorder + # when it was given to the MultiIndex const. 
# Issue #28518 levels = [[0, 1], [0, 1, 2]] index = MultiIndex( - levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], - sortorder=2, + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]], sortorder=2 ) assert index.lexsort_depth == 2 index = MultiIndex( - levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], - sortorder=1, + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=1 ) assert index.lexsort_depth == 1 index = MultiIndex( - levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], - sortorder=0, + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=0 ) assert index.lexsort_depth == 0 From 67af7ec85d43e38590453203e222319fffde863e Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Wed, 2 Oct 2019 10:24:28 +0200 Subject: [PATCH 3/8] CLN: Remove unnecessary casting to int in lexsort_depth --- pandas/core/indexes/multi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a587913675dca..ee23c6a446b8a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -366,7 +366,7 @@ def _verify_integrity(self, codes=None, levels=None): "level {level}".format(values=[value for value in level], level=i) ) if self.sortorder is not None: - if int(self.sortorder) > self._lexsort_depth(): + if self.sortorder > self._lexsort_depth(): raise ValueError( "Value for sortorder must be inferior or equal " "to actual lexsort_depth: " @@ -1792,7 +1792,7 @@ def is_lexsorted(self): @cache_readonly def lexsort_depth(self): if self.sortorder is not None: - return int(self.sortorder) + return self.sortorder return self._lexsort_depth() From 0fce75ee18e64fdb4f64b3a4aee8cf0342c5724a Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Wed, 2 Oct 2019 10:25:29 +0200 Subject: [PATCH 4/8] TST: Update test_merge_multiindex_columns --- 
pandas/tests/reshape/merge/test_merge.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 142ece9c1c05b..63de9777756cc 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2102,25 +2102,25 @@ def test_merge_multiindex_columns(): # The original cause of this issue come from a bug lexsort_depth and is tested in # test_lexsort_depth - index_tuples = [] letters = ["a", "b", "c", "d"] numbers = ["1", "2", "3"] - - for l in letters: - for n in numbers: - index_tuples.append([l, n]) - - index = pd.MultiIndex.from_tuples(index_tuples, names=["outer", "inner"]) + index = pd.MultiIndex.from_product((letters, numbers), names=["outer", "inner"]) frame_x = pd.DataFrame(columns=index) frame_x["id"] = "" - frame_y = pd.DataFrame(columns=index) frame_y["id"] = "" l_suf = "_x" r_suf = "_y" - expected_labels = sum(([l + l_suf, l + r_suf] for l in letters), []) - merged_frame = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf))).columns - for label in expected_labels: - assert label in merged_frame + result = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf))) + + # Constructing the expected results + expected_labels = [l + l_suf for l in letters] + [l + r_suf for l in letters] + expected_index = pd.MultiIndex.from_product( + [expected_labels, numbers], names=["outer", "inner"] + ) + expected = pd.DataFrame(columns=expected_index) + expected["id"] = "" + + tm.assert_frame_equal(result, expected) From 25303357b5c342d19b4503ec4c4d83e63e5a8d1d Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Wed, 2 Oct 2019 14:45:22 +0200 Subject: [PATCH 5/8] DOC: whatsnew note in Reshaping and MultiIndex --- doc/source/whatsnew/v1.0.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index
b075a9d8b5e8b..5a0cecc02f025 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -247,6 +247,7 @@ Missing MultiIndex ^^^^^^^^^^ +- Constructior for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) - - @@ -288,6 +289,7 @@ Reshaping - Bug in :meth:`DataFrame.apply` that caused incorrect output with empty :class:`DataFrame` (:issue:`28202`, :issue:`21959`) - Bug in :meth:`DataFrame.stack` not handling non-unique indexes correctly when creating MultiIndex (:issue: `28301`) - Bug :func:`merge_asof` could not use :class:`datetime.timedelta` for ``tolerance`` kwarg (:issue:`28098`) +- Bug in :func:`merge`, did not append suffixes correctly with MultiIndex (:issue:`28518`) Sparse ^^^^^^ From 716820e86abd2ca4ef15774af82d918bd3e49407 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Wed, 2 Oct 2019 16:12:35 +0200 Subject: [PATCH 6/8] DOC: Add docstring to MultiIndex._lexsort_depth --- pandas/core/indexes/multi.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ee23c6a446b8a..0e7a72bcaedbd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1797,6 +1797,14 @@ def lexsort_depth(self): return self._lexsort_depth() def _lexsort_depth(self): + """ + Compute and return the lexsort_depth, the number of levels of the + MultiIndex that are sorted lexically + + Returns + ------- + int + """ int64_codes = [ensure_int64(level_codes) for level_codes in self.codes] for k in range(self.nlevels, 0, -1): if libalgos.is_lexsorted(int64_codes[:k]): From 426768b226e95a4777b5919305f29dfbcdb3ea12 Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Wed, 2 Oct 2019 20:07:00 +0200 Subject: [PATCH 7/8] CLN: Add type hint to _lexsort_depth function ---
pandas/core/indexes/multi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0e7a72bcaedbd..c02561087f83f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1796,7 +1796,7 @@ def lexsort_depth(self): return self._lexsort_depth() - def _lexsort_depth(self): + def _lexsort_depth(self) -> int: """ Compute and return the lexsort_depth, the number of levels of the MultiIndex that are sorted lexically From c09450684f5426cc9d51b622892369c6fffe040c Mon Sep 17 00:00:00 2001 From: nrebena <49879400+nrebena@users.noreply.github.com> Date: Thu, 3 Oct 2019 21:03:27 +0200 Subject: [PATCH 8/8] DOC: Add issue number in whatsnew --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 5a0cecc02f025..3b55901ec4740 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -247,7 +247,7 @@ Missing MultiIndex ^^^^^^^^^^ -- Constructior for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) +- Constructor for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`) - -