Skip to content

Commit fd6f23f

Browse files
authored
ENH: reset_index on a MultiIndex with duplicate levels raises a ValueError (#44755)
1 parent f4c5797 commit fd6f23f

File tree

5 files changed

+113
-16
lines changed

5 files changed

+113
-16
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ Other enhancements
3535
- :class:`StringArray` now accepts array-likes containing nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)
3636
- Improved the rendering of ``categories`` in :class:`CategoricalIndex` (:issue:`45218`)
3737
- :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`)
38+
- :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`)
3839
- :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now support `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`45428`)
3940
-
4041

pandas/core/frame.py

+23-3
Original file line numberDiff line numberDiff line change
@@ -4390,7 +4390,7 @@ def insert(
43904390
loc: int,
43914391
column: Hashable,
43924392
value: Scalar | AnyArrayLike,
4393-
allow_duplicates: bool = False,
4393+
allow_duplicates: bool | lib.NoDefault = lib.no_default,
43944394
) -> None:
43954395
"""
43964396
Insert column into DataFrame at specified location.
@@ -4405,7 +4405,7 @@ def insert(
44054405
column : str, number, or hashable object
44064406
Label of the inserted column.
44074407
value : Scalar, Series, or array-like
4408-
allow_duplicates : bool, optional default False
4408+
allow_duplicates : bool, optional, default lib.no_default
44094409
44104410
See Also
44114411
--------
@@ -4437,6 +4437,8 @@ def insert(
44374437
0 NaN 100 1 99 3
44384438
1 5.0 100 2 99 4
44394439
"""
4440+
if allow_duplicates is lib.no_default:
4441+
allow_duplicates = False
44404442
if allow_duplicates and not self.flags.allows_duplicate_labels:
44414443
raise ValueError(
44424444
"Cannot specify 'allow_duplicates=True' when "
@@ -5579,6 +5581,7 @@ def reset_index(
55795581
inplace: Literal[False] = ...,
55805582
col_level: Hashable = ...,
55815583
col_fill: Hashable = ...,
5584+
allow_duplicates: bool | lib.NoDefault = ...,
55825585
) -> DataFrame:
55835586
...
55845587

@@ -5590,6 +5593,7 @@ def reset_index(
55905593
inplace: Literal[True],
55915594
col_level: Hashable = ...,
55925595
col_fill: Hashable = ...,
5596+
allow_duplicates: bool | lib.NoDefault = ...,
55935597
) -> None:
55945598
...
55955599

@@ -5601,6 +5605,7 @@ def reset_index(
56015605
inplace: Literal[True],
56025606
col_level: Hashable = ...,
56035607
col_fill: Hashable = ...,
5608+
allow_duplicates: bool | lib.NoDefault = ...,
56045609
) -> None:
56055610
...
56065611

@@ -5612,6 +5617,7 @@ def reset_index(
56125617
inplace: Literal[True],
56135618
col_level: Hashable = ...,
56145619
col_fill: Hashable = ...,
5620+
allow_duplicates: bool | lib.NoDefault = ...,
56155621
) -> None:
56165622
...
56175623

@@ -5622,6 +5628,7 @@ def reset_index(
56225628
inplace: Literal[True],
56235629
col_level: Hashable = ...,
56245630
col_fill: Hashable = ...,
5631+
allow_duplicates: bool | lib.NoDefault = ...,
56255632
) -> None:
56265633
...
56275634

@@ -5633,6 +5640,7 @@ def reset_index(
56335640
inplace: bool = ...,
56345641
col_level: Hashable = ...,
56355642
col_fill: Hashable = ...,
5643+
allow_duplicates: bool | lib.NoDefault = ...,
56365644
) -> DataFrame | None:
56375645
...
56385646

@@ -5644,6 +5652,7 @@ def reset_index(
56445652
inplace: bool = False,
56455653
col_level: Hashable = 0,
56465654
col_fill: Hashable = "",
5655+
allow_duplicates: bool | lib.NoDefault = lib.no_default,
56475656
) -> DataFrame | None:
56485657
"""
56495658
Reset the index, or a level of it.
@@ -5669,6 +5678,10 @@ def reset_index(
56695678
col_fill : object, default ''
56705679
If the columns have multiple levels, determines how the other
56715680
levels are named. If None then the index name is repeated.
5681+
allow_duplicates : bool, optional, default lib.no_default
5682+
Allow duplicate column labels to be created.
5683+
5684+
.. versionadded:: 1.5.0
56725685
56735686
Returns
56745687
-------
@@ -5792,6 +5805,8 @@ class max type
57925805
new_obj = self
57935806
else:
57945807
new_obj = self.copy()
5808+
if allow_duplicates is not lib.no_default:
5809+
allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")
57955810

57965811
new_index = default_index(len(new_obj))
57975812
if level is not None:
@@ -5843,7 +5858,12 @@ class max type
58435858
level_values, lab, allow_fill=True, fill_value=lev._na_value
58445859
)
58455860

5846-
new_obj.insert(0, name, level_values)
5861+
new_obj.insert(
5862+
0,
5863+
name,
5864+
level_values,
5865+
allow_duplicates=allow_duplicates,
5866+
)
58475867

58485868
new_obj.index = new_index
58495869
if not inplace:

pandas/core/series.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -1359,7 +1359,14 @@ def repeat(self, repeats, axis=None) -> Series:
13591359
)
13601360

13611361
@deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"])
1362-
def reset_index(self, level=None, drop=False, name=lib.no_default, inplace=False):
1362+
def reset_index(
1363+
self,
1364+
level=None,
1365+
drop=False,
1366+
name=lib.no_default,
1367+
inplace=False,
1368+
allow_duplicates: bool = False,
1369+
):
13631370
"""
13641371
Generate a new DataFrame or Series with the index reset.
13651372
@@ -1381,6 +1388,10 @@ def reset_index(self, level=None, drop=False, name=lib.no_default, inplace=False
13811388
when `drop` is True.
13821389
inplace : bool, default False
13831390
Modify the Series in place (do not create a new object).
1391+
allow_duplicates : bool, default False
1392+
Allow duplicate column labels to be created.
1393+
1394+
.. versionadded:: 1.5.0
13841395
13851396
Returns
13861397
-------
@@ -1497,7 +1508,9 @@ def reset_index(self, level=None, drop=False, name=lib.no_default, inplace=False
14971508
name = self.name
14981509

14991510
df = self.to_frame(name)
1500-
return df.reset_index(level=level, drop=drop)
1511+
return df.reset_index(
1512+
level=level, drop=drop, allow_duplicates=allow_duplicates
1513+
)
15011514

15021515
# ----------------------------------------------------------------------
15031516
# Rendering Methods

pandas/tests/frame/methods/test_reset_index.py

+54-11
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@
2727
import pandas._testing as tm
2828

2929

30+
@pytest.fixture()
def multiindex_df():
    """Return a 2x2 DataFrame with two-level columns ("A", "") and ("B", "b")."""
    column_keys = [["A", ""], ["B", "b"]]
    columns = MultiIndex.from_tuples(column_keys)
    return DataFrame([[0, 2], [1, 3]], columns=columns)
34+
35+
3036
class TestResetIndex:
3137
def test_reset_index_empty_rangeindex(self):
3238
# GH#45230
@@ -381,33 +387,31 @@ def test_reset_index_range(self):
381387
)
382388
tm.assert_frame_equal(result, expected)
383389

384-
def test_reset_index_multiindex_columns(self):
385-
levels = [["A", ""], ["B", "b"]]
386-
df = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels))
387-
result = df[["B"]].rename_axis("A").reset_index()
388-
tm.assert_frame_equal(result, df)
390+
def test_reset_index_multiindex_columns(self, multiindex_df):
391+
result = multiindex_df[["B"]].rename_axis("A").reset_index()
392+
tm.assert_frame_equal(result, multiindex_df)
389393

390394
# GH#16120: already existing column
391395
msg = r"cannot insert \('A', ''\), already exists"
392396
with pytest.raises(ValueError, match=msg):
393-
df.rename_axis("A").reset_index()
397+
multiindex_df.rename_axis("A").reset_index()
394398

395399
# GH#16164: multiindex (tuple) full key
396-
result = df.set_index([("A", "")]).reset_index()
397-
tm.assert_frame_equal(result, df)
400+
result = multiindex_df.set_index([("A", "")]).reset_index()
401+
tm.assert_frame_equal(result, multiindex_df)
398402

399403
# with additional (unnamed) index level
400404
idx_col = DataFrame(
401405
[[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")])
402406
)
403-
expected = pd.concat([idx_col, df[[("B", "b"), ("A", "")]]], axis=1)
404-
result = df.set_index([("B", "b")], append=True).reset_index()
407+
expected = pd.concat([idx_col, multiindex_df[[("B", "b"), ("A", "")]]], axis=1)
408+
result = multiindex_df.set_index([("B", "b")], append=True).reset_index()
405409
tm.assert_frame_equal(result, expected)
406410

407411
# with index name which is a too long tuple...
408412
msg = "Item must have length equal to number of levels."
409413
with pytest.raises(ValueError, match=msg):
410-
df.rename_axis([("C", "c", "i")]).reset_index()
414+
multiindex_df.rename_axis([("C", "c", "i")]).reset_index()
411415

412416
# or too short...
413417
levels = [["A", "a", ""], ["B", "b", "i"]]
@@ -433,6 +437,45 @@ def test_reset_index_multiindex_columns(self):
433437
result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C")
434438
tm.assert_frame_equal(result, expected)
435439

440+
@pytest.mark.parametrize("flag", [False, True])
@pytest.mark.parametrize("allow_duplicates", [False, True])
def test_reset_index_duplicate_columns_allow(
    self, multiindex_df, flag, allow_duplicates
):
    """Check ``reset_index(allow_duplicates=...)`` against the labels flag.

    The index is named "A", so resetting it tries to insert a column that
    clashes with the existing ("A", "") column. The duplicate is only
    expected to be created when both ``flags.allows_duplicate_labels`` and
    ``allow_duplicates`` are True; every other combination must raise a
    ValueError.
    """
    # GH#44755 reset_index with duplicate column labels
    df = multiindex_df.rename_axis("A")
    df = df.set_flags(allows_duplicate_labels=flag)

    if flag and allow_duplicates:
        result = df.reset_index(allow_duplicates=allow_duplicates)
        levels = [["A", ""], ["A", ""], ["B", "b"]]
        expected = DataFrame(
            [[0, 0, 2], [1, 1, 3]], columns=MultiIndex.from_tuples(levels)
        )
        tm.assert_frame_equal(result, expected)
    else:
        if not flag and allow_duplicates:
            # The parentheses are required: without them the second literal
            # is a stand-alone expression statement and ``msg`` would hold
            # only the first half of the expected message.
            msg = (
                "Cannot specify 'allow_duplicates=True' when "
                "'self.flags.allows_duplicate_labels' is False"
            )
        else:
            msg = r"cannot insert \('A', ''\), already exists"
        with pytest.raises(ValueError, match=msg):
            df.reset_index(allow_duplicates=allow_duplicates)
464+
465+
@pytest.mark.parametrize("flag", [False, True])
def test_reset_index_duplicate_columns_default(self, multiindex_df, flag):
    """Default ``allow_duplicates`` rejects a clashing label for both flags."""
    # Naming the index "A" makes the label inserted by reset_index collide
    # with the existing ("A", "") column.
    frame = multiindex_df.rename_axis("A").set_flags(allows_duplicate_labels=flag)

    expected_error = r"cannot insert \('A', ''\), already exists"
    with pytest.raises(ValueError, match=expected_error):
        frame.reset_index()
473+
474+
@pytest.mark.parametrize("allow_duplicates", ["bad value"])
def test_reset_index_allow_duplicates_check(self, multiindex_df, allow_duplicates):
    """A non-boolean ``allow_duplicates`` must raise a ValueError."""
    expected_error = "expected type bool"
    with pytest.raises(ValueError, match=expected_error):
        multiindex_df.reset_index(allow_duplicates=allow_duplicates)
478+
436479
@pytest.mark.filterwarnings("ignore:Timestamp.freq is deprecated:FutureWarning")
437480
def test_reset_index_datetime(self, tz_naive_fixture):
438481
# GH#3950

pandas/tests/series/methods/test_reset_index.py

+20
Original file line numberDiff line numberDiff line change
@@ -186,3 +186,23 @@ def test_reset_index_dtypes_on_empty_series_with_multiindex(array, dtype):
186186
{"level_0": np.int64, "level_1": np.float64, "level_2": dtype, 0: object}
187187
)
188188
tm.assert_series_equal(result, expected)
189+
190+
191+
@pytest.mark.parametrize(
    "names, expected_names",
    [
        (["A", "A"], ["A", "A"]),
        (["level_1", None], ["level_1", "level_1"]),
    ],
)
@pytest.mark.parametrize("allow_duplicates", [False, True])
def test_column_name_duplicates(names, expected_names, allow_duplicates):
    """Series.reset_index creates duplicate column labels only on request."""
    # GH#44755 reset_index with duplicate column labels
    index = MultiIndex.from_arrays([[1], [1]], names=names)
    ser = Series([1], index=index)

    if not allow_duplicates:
        # This branch calls reset_index() with no argument, so it exercises
        # the default rather than an explicit allow_duplicates=False.
        with pytest.raises(ValueError, match="cannot insert"):
            ser.reset_index()
        return

    result = ser.reset_index(allow_duplicates=True)
    expected = DataFrame([[1, 1, 1]], columns=expected_names + [0])
    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)