Skip to content

Commit bb89098

Browse files
API: re-allow duplicate index level names (pandas-dev#21423)
(cherry picked from commit 66b517c)
1 parent ee93f61 commit bb89098

File tree

9 files changed

+90
-38
lines changed

9 files changed

+90
-38
lines changed

doc/source/whatsnew/v0.23.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ Fixed Regressions
5353
~~~~~~~~~~~~~~~~~
5454

5555
- Fixed regression in :meth:`to_csv` where file-like objects were handled incorrectly (:issue:`21471`)
56+
- Re-allowed duplicate level names of a ``MultiIndex``. Accessing a level that has a duplicate name by name still raises an error (:issue:`19029`).
5657
- Bug where both :meth:`DataFrame.first_valid_index` and :meth:`Series.first_valid_index` raised for a row index having duplicate values (:issue:`21441`)
5758
- Fixed regression in unary negative operations with object dtype (:issue:`21380`)
5859
- Bug in :meth:`Timestamp.ceil` and :meth:`Timestamp.floor` when timestamp is a multiple of the rounding frequency (:issue:`21262`)

pandas/core/indexes/multi.py

+7-12
Original file line numberDiff line numberDiff line change
@@ -672,30 +672,18 @@ def _set_names(self, names, level=None, validate=True):
672672

673673
if level is None:
674674
level = range(self.nlevels)
675-
used = {}
676675
else:
677676
level = [self._get_level_number(l) for l in level]
678-
used = {self.levels[l].name: l
679-
for l in set(range(self.nlevels)) - set(level)}
680677

681678
# set the name
682679
for l, name in zip(level, names):
683680
if name is not None:
684-
685681
# GH 20527
686682
# All items in 'names' need to be hashable:
687683
if not is_hashable(name):
688684
raise TypeError('{}.name must be a hashable type'
689685
.format(self.__class__.__name__))
690-
691-
if name in used:
692-
raise ValueError(
693-
'Duplicated level name: "{}", assigned to '
694-
'level {}, is already used for level '
695-
'{}.'.format(name, l, used[name]))
696-
697686
self.levels[l].rename(name, inplace=True)
698-
used[name] = l
699687

700688
names = property(fset=_set_names, fget=_get_names,
701689
doc="Names of levels in MultiIndex")
@@ -2935,6 +2923,13 @@ def isin(self, values, level=None):
29352923
else:
29362924
return np.lib.arraysetops.in1d(labs, sought_labels)
29372925

2926+
def _reference_duplicate_name(self, name):
2927+
"""
2928+
Returns True if the name referred to in self.names is duplicated.
2929+
"""
2930+
# count the times name equals an element in self.names.
2931+
return sum(name == n for n in self.names) > 1
2932+
29382933

29392934
MultiIndex._add_numeric_methods_disabled()
29402935
MultiIndex._add_numeric_methods_add_sub_disabled()

pandas/core/reshape/reshape.py

+12
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,12 @@ def __init__(self, values, index, level=-1, value_columns=None,
115115

116116
self.index = index.remove_unused_levels()
117117

118+
if isinstance(self.index, MultiIndex):
119+
if index._reference_duplicate_name(level):
120+
msg = ("Ambiguous reference to {level}. The index "
121+
"names are not unique.".format(level=level))
122+
raise ValueError(msg)
123+
118124
self.level = self.index._get_level_number(level)
119125

120126
# when index includes `nan`, need to lift levels/strides by 1
@@ -528,6 +534,12 @@ def factorize(index):
528534

529535
N, K = frame.shape
530536

537+
if isinstance(frame.columns, MultiIndex):
538+
if frame.columns._reference_duplicate_name(level):
539+
msg = ("Ambiguous reference to {level}. The column "
540+
"names are not unique.".format(level=level))
541+
raise ValueError(msg)
542+
531543
# Will also convert negative level numbers and check if out of bounds.
532544
level_num = frame.columns._get_level_number(level)
533545

pandas/tests/frame/test_alter_axes.py

+29-8
Original file line numberDiff line numberDiff line change
@@ -130,19 +130,27 @@ def test_set_index2(self):
130130
result = df.set_index(df.C)
131131
assert result.index.name == 'C'
132132

133-
@pytest.mark.parametrize('level', ['a', pd.Series(range(3), name='a')])
133+
@pytest.mark.parametrize(
134+
'level', ['a', pd.Series(range(0, 8, 2), name='a')])
134135
def test_set_index_duplicate_names(self, level):
135-
# GH18872
136+
# GH18872 - GH19029
136137
df = pd.DataFrame(np.arange(8).reshape(4, 2), columns=['a', 'b'])
137138

138139
# Pass an existing level name:
139140
df.index.name = 'a'
140-
pytest.raises(ValueError, df.set_index, level, append=True)
141-
pytest.raises(ValueError, df.set_index, [level], append=True)
142-
143-
# Pass twice the same level name:
144-
df.index.name = 'c'
145-
pytest.raises(ValueError, df.set_index, [level, level])
141+
expected = pd.MultiIndex.from_tuples([(0, 0), (1, 2), (2, 4), (3, 6)],
142+
names=['a', 'a'])
143+
result = df.set_index(level, append=True)
144+
tm.assert_index_equal(result.index, expected)
145+
result = df.set_index([level], append=True)
146+
tm.assert_index_equal(result.index, expected)
147+
148+
# Pass the same level name twice (only works when passing actual data)
149+
if isinstance(level, pd.Series):
150+
result = df.set_index([level, level])
151+
expected = pd.MultiIndex.from_tuples(
152+
[(0, 0), (2, 2), (4, 4), (6, 6)], names=['a', 'a'])
153+
tm.assert_index_equal(result.index, expected)
146154

147155
def test_set_index_nonuniq(self):
148156
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
@@ -617,6 +625,19 @@ def test_reorder_levels(self):
617625
index=e_idx)
618626
assert_frame_equal(result, expected)
619627

628+
result = df.reorder_levels([0, 0, 0])
629+
e_idx = MultiIndex(levels=[['bar'], ['bar'], ['bar']],
630+
labels=[[0, 0, 0, 0, 0, 0],
631+
[0, 0, 0, 0, 0, 0],
632+
[0, 0, 0, 0, 0, 0]],
633+
names=['L0', 'L0', 'L0'])
634+
expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)},
635+
index=e_idx)
636+
assert_frame_equal(result, expected)
637+
638+
result = df.reorder_levels(['L0', 'L0', 'L0'])
639+
assert_frame_equal(result, expected)
640+
620641
def test_reset_index(self):
621642
stacked = self.frame.stack()[::2]
622643
stacked = DataFrame({'foo': stacked, 'bar': stacked})

pandas/tests/frame/test_reshape.py

+10
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,16 @@ def test_unstack_dtypes(self):
560560
assert left.shape == (3, 2)
561561
tm.assert_frame_equal(left, right)
562562

563+
def test_unstack_non_unique_index_names(self):
564+
idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
565+
names=['c1', 'c1'])
566+
df = DataFrame([1, 2], index=idx)
567+
with pytest.raises(ValueError):
568+
df.unstack('c1')
569+
570+
with pytest.raises(ValueError):
571+
df.T.stack('c1')
572+
563573
def test_unstack_unused_levels(self):
564574
# GH 17845: unused labels in index make unstack() cast int to float
565575
idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]

pandas/tests/groupby/test_categorical.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -555,15 +555,11 @@ def test_as_index():
555555
columns=['cat', 'A', 'B'])
556556
tm.assert_frame_equal(result, expected)
557557

558-
# another not in-axis grouper
559-
s = Series(['a', 'b', 'b'], name='cat2')
558+
# another not in-axis grouper (conflicting names in index)
559+
s = Series(['a', 'b', 'b'], name='cat')
560560
result = df.groupby(['cat', s], as_index=False, observed=True).sum()
561561
tm.assert_frame_equal(result, expected)
562562

563-
# GH18872: conflicting names in desired index
564-
with pytest.raises(ValueError):
565-
df.groupby(['cat', s.rename('cat')], observed=True).sum()
566-
567563
# is original index dropped?
568564
group_columns = ['cat', 'A']
569565
expected = DataFrame(

pandas/tests/indexes/test_multi.py

+15-10
Original file line numberDiff line numberDiff line change
@@ -655,22 +655,27 @@ def test_constructor_nonhashable_names(self):
655655
# With .set_names()
656656
tm.assert_raises_regex(TypeError, message, mi.set_names, names=renamed)
657657

658-
@pytest.mark.parametrize('names', [['a', 'b', 'a'], ['1', '1', '2'],
659-
['1', 'a', '1']])
658+
@pytest.mark.parametrize('names', [['a', 'b', 'a'], [1, 1, 2],
659+
[1, 'a', 1]])
660660
def test_duplicate_level_names(self, names):
661-
# GH18872
662-
pytest.raises(ValueError, pd.MultiIndex.from_product,
663-
[[0, 1]] * 3, names=names)
661+
# GH18872, GH19029
662+
mi = pd.MultiIndex.from_product([[0, 1]] * 3, names=names)
663+
assert mi.names == names
664664

665665
# With .rename()
666666
mi = pd.MultiIndex.from_product([[0, 1]] * 3)
667-
tm.assert_raises_regex(ValueError, "Duplicated level name:",
668-
mi.rename, names)
667+
mi = mi.rename(names)
668+
assert mi.names == names
669669

670670
# With .rename(., level=)
671-
mi.rename(names[0], level=1, inplace=True)
672-
tm.assert_raises_regex(ValueError, "Duplicated level name:",
673-
mi.rename, names[:2], level=[0, 2])
671+
mi.rename(names[1], level=1, inplace=True)
672+
mi = mi.rename([names[0], names[2]], level=[0, 2])
673+
assert mi.names == names
674+
675+
def test_duplicate_level_names_access_raises(self):
676+
self.index.names = ['foo', 'foo']
677+
tm.assert_raises_regex(KeyError, 'Level foo not found',
678+
self.index._get_level_number, 'foo')
674679

675680
def assert_multiindex_copied(self, copy, original):
676681
# Levels should be (at least, shallow copied)

pandas/tests/io/test_pytables.py

+6
Original file line numberDiff line numberDiff line change
@@ -1842,6 +1842,12 @@ def make_index(names=None):
18421842
'a', 'b'], index=make_index(['date', 'a', 't']))
18431843
pytest.raises(ValueError, store.append, 'df', df)
18441844

1845+
# dup within level
1846+
_maybe_remove(store, 'df')
1847+
df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'],
1848+
index=make_index(['date', 'date', 'date']))
1849+
pytest.raises(ValueError, store.append, 'df', df)
1850+
18451851
# fully names
18461852
_maybe_remove(store, 'df')
18471853
df = DataFrame(np.zeros((12, 2)), columns=[

pandas/tests/reshape/test_pivot.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -1729,9 +1729,15 @@ def test_crosstab_with_numpy_size(self):
17291729
tm.assert_frame_equal(result, expected)
17301730

17311731
def test_crosstab_dup_index_names(self):
1732-
# GH 13279, GH 18872
1732+
# GH 13279
17331733
s = pd.Series(range(3), name='foo')
1734-
pytest.raises(ValueError, pd.crosstab, s, s)
1734+
1735+
result = pd.crosstab(s, s)
1736+
expected_index = pd.Index(range(3), name='foo')
1737+
expected = pd.DataFrame(np.eye(3, dtype=np.int64),
1738+
index=expected_index,
1739+
columns=expected_index)
1740+
tm.assert_frame_equal(result, expected)
17351741

17361742
@pytest.mark.parametrize("names", [['a', ('b', 'c')],
17371743
[('a', 'b'), 'c']])

0 commit comments

Comments
 (0)