Skip to content

Commit 8172565

Browse files
cmazzullo authored and jreback committed
BUG: GH17464 MultiIndex now raises an error when levels aren't unique, tests changed (#17971)
1 parent 7a3f81a commit 8172565

File tree

4 files changed

+33
-13
lines changed

4 files changed

+33
-13
lines changed

doc/source/whatsnew/v0.22.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ Other API Changes
114114
- Inserting missing values into indexes will work for all types of indexes and automatically insert the correct type of missing value (``NaN``, ``NaT``, etc.) regardless of the type passed in (:issue:`18295`)
115115
- Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`).
116116
- :func:`DataFrame.from_items` provides a more informative error message when passed scalar values (:issue:`17312`)
117+
- When created with levels that contain duplicate values, ``MultiIndex`` now raises a ``ValueError``. (:issue:`17464`)
117118

118119
.. _whatsnew_0220.deprecations:
119120

pandas/core/indexes/multi.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,8 @@ def _verify_integrity(self, labels=None, levels=None):
177177
Raises
178178
------
179179
ValueError
180-
* if length of levels and labels don't match or any label would
181-
exceed level bounds
180+
If length of levels and labels don't match, if any label would
181+
exceed level bounds, or if any level contains duplicate values.
182182
"""
183183
# NOTE: Currently does not check, among other things, that cached
184184
# nlevels matches nor that sortorder matches actually sortorder.
@@ -198,6 +198,11 @@ def _verify_integrity(self, labels=None, levels=None):
198198
" level (%d). NOTE: this index is in an"
199199
" inconsistent state" % (i, label.max(),
200200
len(level)))
201+
if not level.is_unique:
202+
raise ValueError("Level values must be unique: {values} on "
203+
"level {level}".format(
204+
values=[value for value in level],
205+
level=i))
201206

202207
@property
203208
def levels(self):

pandas/tests/groupby/test_functional.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,10 @@ def test_frame_describe_multikey(self):
5252
desc_groups = []
5353
for col in self.tsframe:
5454
group = grouped[col].describe()
55-
group_col = pd.MultiIndex([[col] * len(group.columns),
56-
group.columns],
57-
[[0] * len(group.columns),
58-
range(len(group.columns))])
55+
# GH 17464 - Remove duplicate MultiIndex levels
56+
group_col = pd.MultiIndex(
57+
levels=[[col], group.columns],
58+
labels=[[0] * len(group.columns), range(len(group.columns))])
5959
group = pd.DataFrame(group.values,
6060
columns=group_col,
6161
index=group.index)
@@ -67,8 +67,9 @@ def test_frame_describe_multikey(self):
6767
'C': 1, 'D': 1}, axis=1)
6868
result = groupedT.describe()
6969
expected = self.tsframe.describe().T
70-
expected.index = pd.MultiIndex([[0, 0, 1, 1], expected.index],
71-
[range(4), range(len(expected.index))])
70+
expected.index = pd.MultiIndex(
71+
levels=[[0, 1], expected.index],
72+
labels=[[0, 0, 1, 1], range(len(expected.index))])
7273
tm.assert_frame_equal(result, expected)
7374

7475
def test_frame_describe_tupleindex(self):

pandas/tests/indexes/test_multi.py

+18-5
Original file line numberDiff line numberDiff line change
@@ -1618,7 +1618,9 @@ def test_is_(self):
16181618
# shouldn't change
16191619
assert mi2.is_(mi)
16201620
mi4 = mi3.view()
1621-
mi4.set_levels([[1 for _ in range(10)], lrange(10)], inplace=True)
1621+
1622+
# GH 17464 - Remove duplicate MultiIndex levels
1623+
mi4.set_levels([lrange(10), lrange(10)], inplace=True)
16221624
assert not mi4.is_(mi3)
16231625
mi5 = mi.view()
16241626
mi5.set_levels(mi5.levels, inplace=True)
@@ -2450,13 +2452,11 @@ def test_isna_behavior(self):
24502452
pd.isna(self.index)
24512453

24522454
def test_level_setting_resets_attributes(self):
2453-
ind = MultiIndex.from_arrays([
2455+
ind = pd.MultiIndex.from_arrays([
24542456
['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
24552457
])
24562458
assert ind.is_monotonic
2457-
ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
2458-
inplace=True)
2459-
2459+
ind.set_levels([['A', 'B'], [1, 3, 2]], inplace=True)
24602460
# if this fails, probably didn't reset the cache correctly.
24612461
assert not ind.is_monotonic
24622462

@@ -3083,3 +3083,16 @@ def test_million_record_attribute_error(self):
30833083
with tm.assert_raises_regex(AttributeError,
30843084
"'Series' object has no attribute 'foo'"):
30853085
df['a'].foo()
3086+
3087+
def test_duplicate_multiindex_labels(self):
3088+
# GH 17464
3089+
# Make sure that a MultiIndex with duplicate levels throws a ValueError
3090+
with pytest.raises(ValueError):
3091+
ind = pd.MultiIndex([['A'] * 10, range(10)], [[0] * 10, range(10)])
3092+
3093+
# And that using set_levels with duplicate levels fails
3094+
ind = MultiIndex.from_arrays([['A', 'A', 'B', 'B', 'B'],
3095+
[1, 2, 1, 2, 3]])
3096+
with pytest.raises(ValueError):
3097+
ind.set_levels([['A', 'B', 'A', 'A', 'B'], [2, 1, 3, -2, 5]],
3098+
inplace=True)

0 commit comments

Comments
 (0)