diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index d43d5bec7175f..150e86f564ad0 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -81,6 +81,7 @@ Other API Changes - Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`) - Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). - :func:`DataFrame.from_items` provides a more informative error message when passed scalar values (:issue:`17312`) +- When created with duplicate labels, ``MultiIndex`` now raises a ``ValueError``. (:issue:`17464`) .. _whatsnew_0220.deprecations: diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 456999b94c523..761dbc6086b53 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -177,8 +177,8 @@ def _verify_integrity(self, labels=None, levels=None): Raises ------ ValueError - * if length of levels and labels don't match or any label would - exceed level bounds + If length of levels and labels don't match, if any label would + exceed level bounds, or there are any duplicate levels. """ # NOTE: Currently does not check, among other things, that cached # nlevels matches nor that sortorder matches actually sortorder. @@ -198,6 +198,11 @@ def _verify_integrity(self, labels=None, levels=None): " level (%d). NOTE: this index is in an" " inconsistent state" % (i, label.max(), len(level))) + if not level.is_unique: + raise ValueError("Level values must be unique: {values} on " + "level {level}".format( + values=[value for value in level], + level=i)) @property def levels(self): diff --git a/pandas/tests/groupby/test_functional.py b/pandas/tests/groupby/test_functional.py index bc13d51c4f4f6..b9718663570bd 100644 --- a/pandas/tests/groupby/test_functional.py +++ b/pandas/tests/groupby/test_functional.py @@ -52,10 +52,10 @@ def test_frame_describe_multikey(self): desc_groups = [] for col in self.tsframe: group = grouped[col].describe() - group_col = pd.MultiIndex([[col] * len(group.columns), - group.columns], - [[0] * len(group.columns), - range(len(group.columns))]) + # GH 17464 - Remove duplicate MultiIndex levels + group_col = pd.MultiIndex( + levels=[[col], group.columns], + labels=[[0] * len(group.columns), range(len(group.columns))]) group = pd.DataFrame(group.values, columns=group_col, index=group.index) @@ -67,8 +67,9 @@ def test_frame_describe_multikey(self): 'C': 1, 'D': 1}, axis=1) result = groupedT.describe() expected = self.tsframe.describe().T - expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index], - [range(4), range(len(expected.index))]) + expected.index = pd.MultiIndex( + levels=[[0, 1], expected.index], + labels=[[0, 0, 1, 1], range(len(expected.index))]) tm.assert_frame_equal(result, expected) def test_frame_describe_tupleindex(self): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 5c2a0254b072b..a2c0a75e21f43 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1618,7 +1618,9 @@ def test_is_(self): # shouldn't change assert mi2.is_(mi) mi4 = mi3.view() - mi4.set_levels([[1 for _ in range(10)], lrange(10)], inplace=True) + + # GH 17464 - Remove duplicate MultiIndex levels + mi4.set_levels([lrange(10), lrange(10)], inplace=True) assert not mi4.is_(mi3) mi5 = mi.view() mi5.set_levels(mi5.levels, inplace=True) @@ -2450,13 +2452,11 @@ def test_isna_behavior(self): pd.isna(self.index) def test_level_setting_resets_attributes(self): - ind = MultiIndex.from_arrays([ + ind = pd.MultiIndex.from_arrays([ ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3] ]) assert ind.is_monotonic - ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], - inplace=True) - + ind.set_levels([['A', 'B'], [1, 3, 2]], inplace=True) # if this fails, probably didn't reset the cache correctly. assert not ind.is_monotonic @@ -3083,3 +3083,16 @@ def test_million_record_attribute_error(self): with tm.assert_raises_regex(AttributeError, "'Series' object has no attribute 'foo'"): df['a'].foo() + + def test_duplicate_multiindex_labels(self): + # GH 17464 + # Make sure that a MultiIndex with duplicate levels throws a ValueError + with pytest.raises(ValueError): + ind = pd.MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)]) + + # And that using set_levels with duplicate levels fails + ind = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'], + [1, 2, 1, 2, 3]]) + with pytest.raises(ValueError): + ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]], + inplace=True)