ENH: reset_index on a MultiIndex with duplicate levels raises a ValueError (#44755)

johnzangwill · web-flow · commit fd6f23fba889 · 2022-01-30T13:36:59.000-05:00
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -35,6 +35,7 @@ Other enhancements
 - :class:`StringArray` now accepts array-likes containing nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)
 - Improved the rendering of ``categories`` in :class:`CategoricalIndex` (:issue:`45218`)
 - :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`)
+- :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`)
 - :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`45428`)
 -
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4390,7 +4390,7 @@ def insert(
         loc: int,
         column: Hashable,
         value: Scalar | AnyArrayLike,
-        allow_duplicates: bool = False,
+        allow_duplicates: bool | lib.NoDefault = lib.no_default,
     ) -> None:
         """
         Insert column into DataFrame at specified location.
@@ -4405,7 +4405,7 @@ def insert(
         column : str, number, or hashable object
             Label of the inserted column.
         value : Scalar, Series, or array-like
-        allow_duplicates : bool, optional default False
+        allow_duplicates : bool, optional, default lib.no_default
 
         See Also
         --------
@@ -4437,6 +4437,8 @@ def insert(
         0   NaN   100     1      99     3
         1   5.0   100     2      99     4
         """
+        if allow_duplicates is lib.no_default:
+            allow_duplicates = False
         if allow_duplicates and not self.flags.allows_duplicate_labels:
             raise ValueError(
                 "Cannot specify 'allow_duplicates=True' when "
@@ -5579,6 +5581,7 @@ def reset_index(
         inplace: Literal[False] = ...,
         col_level: Hashable = ...,
         col_fill: Hashable = ...,
+        allow_duplicates: bool | lib.NoDefault = ...,
     ) -> DataFrame:
         ...
 
@@ -5590,6 +5593,7 @@ def reset_index(
         inplace: Literal[True],
         col_level: Hashable = ...,
         col_fill: Hashable = ...,
+        allow_duplicates: bool | lib.NoDefault = ...,
     ) -> None:
         ...
 
@@ -5601,6 +5605,7 @@ def reset_index(
         inplace: Literal[True],
         col_level: Hashable = ...,
         col_fill: Hashable = ...,
+        allow_duplicates: bool | lib.NoDefault = ...,
     ) -> None:
         ...
 
@@ -5612,6 +5617,7 @@ def reset_index(
         inplace: Literal[True],
         col_level: Hashable = ...,
         col_fill: Hashable = ...,
+        allow_duplicates: bool | lib.NoDefault = ...,
     ) -> None:
         ...
 
@@ -5622,6 +5628,7 @@ def reset_index(
         inplace: Literal[True],
         col_level: Hashable = ...,
         col_fill: Hashable = ...,
+        allow_duplicates: bool | lib.NoDefault = ...,
     ) -> None:
         ...
 
@@ -5633,6 +5640,7 @@ def reset_index(
         inplace: bool = ...,
         col_level: Hashable = ...,
         col_fill: Hashable = ...,
+        allow_duplicates: bool | lib.NoDefault = ...,
     ) -> DataFrame | None:
         ...
 
@@ -5644,6 +5652,7 @@ def reset_index(
         inplace: bool = False,
         col_level: Hashable = 0,
         col_fill: Hashable = "",
+        allow_duplicates: bool | lib.NoDefault = lib.no_default,
     ) -> DataFrame | None:
         """
         Reset the index, or a level of it.
@@ -5669,6 +5678,10 @@ def reset_index(
         col_fill : object, default ''
             If the columns have multiple levels, determines how the other
             levels are named. If None then the index name is repeated.
+        allow_duplicates : bool, optional, default lib.no_default
+            Allow duplicate column labels to be created.
+
+            .. versionadded:: 1.5.0
 
         Returns
         -------
@@ -5792,6 +5805,8 @@ class    max    type
             new_obj = self
         else:
             new_obj = self.copy()
+        if allow_duplicates is not lib.no_default:
+            allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")
 
         new_index = default_index(len(new_obj))
         if level is not None:
@@ -5843,7 +5858,12 @@ class    max    type
                         level_values, lab, allow_fill=True, fill_value=lev._na_value
                     )
 
-                new_obj.insert(0, name, level_values)
+                new_obj.insert(
+                    0,
+                    name,
+                    level_values,
+                    allow_duplicates=allow_duplicates,
+                )
 
         new_obj.index = new_index
         if not inplace:
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1359,7 +1359,14 @@ def repeat(self, repeats, axis=None) -> Series:
         )
 
     @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"])
-    def reset_index(self, level=None, drop=False, name=lib.no_default, inplace=False):
+    def reset_index(
+        self,
+        level=None,
+        drop=False,
+        name=lib.no_default,
+        inplace=False,
+        allow_duplicates: bool = False,
+    ):
         """
         Generate a new DataFrame or Series with the index reset.
 
@@ -1381,6 +1388,10 @@ def reset_index(self, level=None, drop=False, name=lib.no_default, inplace=False
             when `drop` is True.
         inplace : bool, default False
             Modify the Series in place (do not create a new object).
+        allow_duplicates : bool, default False
+            Allow duplicate column labels to be created.
+
+            .. versionadded:: 1.5.0
 
         Returns
         -------
@@ -1497,7 +1508,9 @@ def reset_index(self, level=None, drop=False, name=lib.no_default, inplace=False
                     name = self.name
 
             df = self.to_frame(name)
-            return df.reset_index(level=level, drop=drop)
+            return df.reset_index(
+                level=level, drop=drop, allow_duplicates=allow_duplicates
+            )
 
     # ----------------------------------------------------------------------
     # Rendering Methods
diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py
@@ -27,6 +27,12 @@
 import pandas._testing as tm
 
 
+@pytest.fixture()
+def multiindex_df():
+    levels = [["A", ""], ["B", "b"]]
+    return DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels))
+
+
 class TestResetIndex:
     def test_reset_index_empty_rangeindex(self):
         # GH#45230
@@ -381,33 +387,31 @@ def test_reset_index_range(self):
         )
         tm.assert_frame_equal(result, expected)
 
-    def test_reset_index_multiindex_columns(self):
-        levels = [["A", ""], ["B", "b"]]
-        df = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels))
-        result = df[["B"]].rename_axis("A").reset_index()
-        tm.assert_frame_equal(result, df)
+    def test_reset_index_multiindex_columns(self, multiindex_df):
+        result = multiindex_df[["B"]].rename_axis("A").reset_index()
+        tm.assert_frame_equal(result, multiindex_df)
 
         # GH#16120: already existing column
         msg = r"cannot insert \('A', ''\), already exists"
         with pytest.raises(ValueError, match=msg):
-            df.rename_axis("A").reset_index()
+            multiindex_df.rename_axis("A").reset_index()
 
         # GH#16164: multiindex (tuple) full key
-        result = df.set_index([("A", "")]).reset_index()
-        tm.assert_frame_equal(result, df)
+        result = multiindex_df.set_index([("A", "")]).reset_index()
+        tm.assert_frame_equal(result, multiindex_df)
 
         # with additional (unnamed) index level
         idx_col = DataFrame(
             [[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")])
         )
-        expected = pd.concat([idx_col, df[[("B", "b"), ("A", "")]]], axis=1)
-        result = df.set_index([("B", "b")], append=True).reset_index()
+        expected = pd.concat([idx_col, multiindex_df[[("B", "b"), ("A", "")]]], axis=1)
+        result = multiindex_df.set_index([("B", "b")], append=True).reset_index()
         tm.assert_frame_equal(result, expected)
 
         # with index name which is a too long tuple...
         msg = "Item must have length equal to number of levels."
         with pytest.raises(ValueError, match=msg):
-            df.rename_axis([("C", "c", "i")]).reset_index()
+            multiindex_df.rename_axis([("C", "c", "i")]).reset_index()
 
         # or too short...
         levels = [["A", "a", ""], ["B", "b", "i"]]
@@ -433,6 +437,45 @@ def test_reset_index_multiindex_columns(self):
         result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C")
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize("flag", [False, True])
+    @pytest.mark.parametrize("allow_duplicates", [False, True])
+    def test_reset_index_duplicate_columns_allow(
+        self, multiindex_df, flag, allow_duplicates
+    ):
+        # GH#44755 reset_index with duplicate column labels
+        df = multiindex_df.rename_axis("A")
+        df = df.set_flags(allows_duplicate_labels=flag)
+
+        if flag and allow_duplicates:
+            result = df.reset_index(allow_duplicates=allow_duplicates)
+            levels = [["A", ""], ["A", ""], ["B", "b"]]
+            expected = DataFrame(
+                [[0, 0, 2], [1, 1, 3]], columns=MultiIndex.from_tuples(levels)
+            )
+            tm.assert_frame_equal(result, expected)
+        else:
+            if not flag and allow_duplicates:
+                msg = "Cannot specify 'allow_duplicates=True' when "
+                "'self.flags.allows_duplicate_labels' is False"
+            else:
+                msg = r"cannot insert \('A', ''\), already exists"
+            with pytest.raises(ValueError, match=msg):
+                df.reset_index(allow_duplicates=allow_duplicates)
+
+    @pytest.mark.parametrize("flag", [False, True])
+    def test_reset_index_duplicate_columns_default(self, multiindex_df, flag):
+        df = multiindex_df.rename_axis("A")
+        df = df.set_flags(allows_duplicate_labels=flag)
+
+        msg = r"cannot insert \('A', ''\), already exists"
+        with pytest.raises(ValueError, match=msg):
+            df.reset_index()
+
+    @pytest.mark.parametrize("allow_duplicates", ["bad value"])
+    def test_reset_index_allow_duplicates_check(self, multiindex_df, allow_duplicates):
+        with pytest.raises(ValueError, match="expected type bool"):
+            multiindex_df.reset_index(allow_duplicates=allow_duplicates)
+
     @pytest.mark.filterwarnings("ignore:Timestamp.freq is deprecated:FutureWarning")
     def test_reset_index_datetime(self, tz_naive_fixture):
         # GH#3950
diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py
@@ -186,3 +186,23 @@ def test_reset_index_dtypes_on_empty_series_with_multiindex(array, dtype):
         {"level_0": np.int64, "level_1": np.float64, "level_2": dtype, 0: object}
     )
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "names, expected_names",
+    [
+        (["A", "A"], ["A", "A"]),
+        (["level_1", None], ["level_1", "level_1"]),
+    ],
+)
+@pytest.mark.parametrize("allow_duplicates", [False, True])
+def test_column_name_duplicates(names, expected_names, allow_duplicates):
+    # GH#44755 reset_index with duplicate column labels
+    s = Series([1], index=MultiIndex.from_arrays([[1], [1]], names=names))
+    if allow_duplicates:
+        result = s.reset_index(allow_duplicates=True)
+        expected = DataFrame([[1, 1, 1]], columns=expected_names + [0])
+        tm.assert_frame_equal(result, expected)
+    else:
+        with pytest.raises(ValueError, match="cannot insert"):
+            s.reset_index()

Original file line number	Diff line number	Diff line change
`@@ -35,6 +35,7 @@ Other enhancements`
`35`	`35`	- :class:`StringArray` now accepts array-likes containing nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`)
`36`	`36`	- Improved the rendering of ``categories`` in :class:`CategoricalIndex` (:issue:`45218`)
`37`	`37`	- :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`)
	`38`	+- :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`)
`38`	`39`	- :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`45428`)
`39`	`40`	`-`
`40`	`41`