diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 1d4054d5ea0f1..c80c446eca6b7 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -35,6 +35,7 @@ Other enhancements - :class:`StringArray` now accepts array-likes containing nan-likes (``None``, ``np.nan``) for the ``values`` parameter in its constructor in addition to strings and :attr:`pandas.NA`. (:issue:`40839`) - Improved the rendering of ``categories`` in :class:`CategoricalIndex` (:issue:`45218`) - :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`) +- :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`) - :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`45428`) - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d76af1ce42546..d33ee0550ec8b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4392,7 +4392,7 @@ def insert( loc: int, column: Hashable, value: Scalar | AnyArrayLike, - allow_duplicates: bool = False, + allow_duplicates: bool | lib.NoDefault = lib.no_default, ) -> None: """ Insert column into DataFrame at specified location. @@ -4407,7 +4407,7 @@ def insert( column : str, number, or hashable object Label of the inserted column. value : Scalar, Series, or array-like - allow_duplicates : bool, optional default False + allow_duplicates : bool, optional, default lib.no_default See Also -------- @@ -4439,6 +4439,8 @@ def insert( 0 NaN 100 1 99 3 1 5.0 100 2 99 4 """ + if allow_duplicates is lib.no_default: + allow_duplicates = False if allow_duplicates and not self.flags.allows_duplicate_labels: raise ValueError( "Cannot specify 'allow_duplicates=True' when " @@ -5581,6 +5583,7 @@ def reset_index( inplace: Literal[False] = ..., col_level: Hashable = ..., col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -5592,6 +5595,7 @@ def reset_index( inplace: Literal[True], col_level: Hashable = ..., col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., ) -> None: ... @@ -5603,6 +5607,7 @@ def reset_index( inplace: Literal[True], col_level: Hashable = ..., col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., ) -> None: ... @@ -5614,6 +5619,7 @@ def reset_index( inplace: Literal[True], col_level: Hashable = ..., col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., ) -> None: ... @@ -5624,6 +5630,7 @@ def reset_index( inplace: Literal[True], col_level: Hashable = ..., col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., ) -> None: ... @@ -5635,6 +5642,7 @@ def reset_index( inplace: bool = ..., col_level: Hashable = ..., col_fill: Hashable = ..., + allow_duplicates: bool | lib.NoDefault = ..., ) -> DataFrame | None: ... @@ -5646,6 +5654,7 @@ def reset_index( inplace: bool = False, col_level: Hashable = 0, col_fill: Hashable = "", + allow_duplicates: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | None: """ Reset the index, or a level of it. @@ -5671,6 +5680,10 @@ def reset_index( col_fill : object, default '' If the columns have multiple levels, determines how the other levels are named. If None then the index name is repeated. + allow_duplicates : bool, optional, default lib.no_default + Allow duplicate column labels to be created. + + .. versionadded:: 1.5.0 Returns ------- @@ -5794,6 +5807,8 @@ class max type new_obj = self else: new_obj = self.copy() + if allow_duplicates is not lib.no_default: + allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates") new_index = default_index(len(new_obj)) if level is not None: @@ -5845,7 +5860,12 @@ class max type level_values, lab, allow_fill=True, fill_value=lev._na_value ) - new_obj.insert(0, name, level_values) + new_obj.insert( + 0, + name, + level_values, + allow_duplicates=allow_duplicates, + ) new_obj.index = new_index if not inplace: diff --git a/pandas/core/series.py b/pandas/core/series.py index 596953652d2ff..49c0263199318 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1359,7 +1359,14 @@ def repeat(self, repeats, axis=None) -> Series: ) @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"]) - def reset_index(self, level=None, drop=False, name=lib.no_default, inplace=False): + def reset_index( + self, + level=None, + drop=False, + name=lib.no_default, + inplace=False, + allow_duplicates: bool = False, + ): """ Generate a new DataFrame or Series with the index reset. @@ -1381,6 +1388,10 @@ def reset_index(self, level=None, drop=False, name=lib.no_default, inplace=False when `drop` is True. inplace : bool, default False Modify the Series in place (do not create a new object). + allow_duplicates : bool, default False + Allow duplicate column labels to be created. + + .. versionadded:: 1.5.0 Returns ------- @@ -1497,7 +1508,9 @@ def reset_index(self, level=None, drop=False, name=lib.no_default, inplace=False name = self.name df = self.to_frame(name) - return df.reset_index(level=level, drop=drop) + return df.reset_index( + level=level, drop=drop, allow_duplicates=allow_duplicates + ) # ---------------------------------------------------------------------- # Rendering Methods diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 8130c4fa41c12..7db72d9e35a30 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -27,6 +27,12 @@ import pandas._testing as tm +@pytest.fixture() +def multiindex_df(): + levels = [["A", ""], ["B", "b"]] + return DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) + + class TestResetIndex: def test_reset_index_empty_rangeindex(self): # GH#45230 @@ -381,33 +387,31 @@ def test_reset_index_range(self): ) tm.assert_frame_equal(result, expected) - def test_reset_index_multiindex_columns(self): - levels = [["A", ""], ["B", "b"]] - df = DataFrame([[0, 2], [1, 3]], columns=MultiIndex.from_tuples(levels)) - result = df[["B"]].rename_axis("A").reset_index() - tm.assert_frame_equal(result, df) + def test_reset_index_multiindex_columns(self, multiindex_df): + result = multiindex_df[["B"]].rename_axis("A").reset_index() + tm.assert_frame_equal(result, multiindex_df) # GH#16120: already existing column msg = r"cannot insert \('A', ''\), already exists" with pytest.raises(ValueError, match=msg): - df.rename_axis("A").reset_index() + multiindex_df.rename_axis("A").reset_index() # GH#16164: multiindex (tuple) full key - result = df.set_index([("A", "")]).reset_index() - tm.assert_frame_equal(result, df) + result = multiindex_df.set_index([("A", "")]).reset_index() + tm.assert_frame_equal(result, multiindex_df) # with additional (unnamed) index level idx_col = DataFrame( [[0], [1]], columns=MultiIndex.from_tuples([("level_0", "")]) ) - expected = pd.concat([idx_col, df[[("B", "b"), ("A", "")]]], axis=1) - result = df.set_index([("B", "b")], append=True).reset_index() + expected = pd.concat([idx_col, multiindex_df[[("B", "b"), ("A", "")]]], axis=1) + result = multiindex_df.set_index([("B", "b")], append=True).reset_index() tm.assert_frame_equal(result, expected) # with index name which is a too long tuple... msg = "Item must have length equal to number of levels." with pytest.raises(ValueError, match=msg): - df.rename_axis([("C", "c", "i")]).reset_index() + multiindex_df.rename_axis([("C", "c", "i")]).reset_index() # or too short... levels = [["A", "a", ""], ["B", "b", "i"]] @@ -433,6 +437,45 @@ def test_reset_index_multiindex_columns(self): result = df2.rename_axis([("c", "ii")]).reset_index(col_level=1, col_fill="C") tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("flag", [False, True]) + @pytest.mark.parametrize("allow_duplicates", [False, True]) + def test_reset_index_duplicate_columns_allow( + self, multiindex_df, flag, allow_duplicates + ): + # GH#44755 reset_index with duplicate column labels + df = multiindex_df.rename_axis("A") + df = df.set_flags(allows_duplicate_labels=flag) + + if flag and allow_duplicates: + result = df.reset_index(allow_duplicates=allow_duplicates) + levels = [["A", ""], ["A", ""], ["B", "b"]] + expected = DataFrame( + [[0, 0, 2], [1, 1, 3]], columns=MultiIndex.from_tuples(levels) + ) + tm.assert_frame_equal(result, expected) + else: + if not flag and allow_duplicates: + msg = "Cannot specify 'allow_duplicates=True' when " + "'self.flags.allows_duplicate_labels' is False" + else: + msg = r"cannot insert \('A', ''\), already exists" + with pytest.raises(ValueError, match=msg): + df.reset_index(allow_duplicates=allow_duplicates) + + @pytest.mark.parametrize("flag", [False, True]) + def test_reset_index_duplicate_columns_default(self, multiindex_df, flag): + df = multiindex_df.rename_axis("A") + df = df.set_flags(allows_duplicate_labels=flag) + + msg = r"cannot insert \('A', ''\), already exists" + with pytest.raises(ValueError, match=msg): + df.reset_index() + + @pytest.mark.parametrize("allow_duplicates", ["bad value"]) + def test_reset_index_allow_duplicates_check(self, multiindex_df, allow_duplicates): + with pytest.raises(ValueError, match="expected type bool"): + multiindex_df.reset_index(allow_duplicates=allow_duplicates) + @pytest.mark.filterwarnings("ignore:Timestamp.freq is deprecated:FutureWarning") def test_reset_index_datetime(self, tz_naive_fixture): # GH#3950 diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index f38491508cc23..e7340aaf376e5 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -186,3 +186,23 @@ def test_reset_index_dtypes_on_empty_series_with_multiindex(array, dtype): {"level_0": np.int64, "level_1": np.float64, "level_2": dtype, 0: object} ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "names, expected_names", + [ + (["A", "A"], ["A", "A"]), + (["level_1", None], ["level_1", "level_1"]), + ], +) +@pytest.mark.parametrize("allow_duplicates", [False, True]) +def test_column_name_duplicates(names, expected_names, allow_duplicates): + # GH#44755 reset_index with duplicate column labels + s = Series([1], index=MultiIndex.from_arrays([[1], [1]], names=names)) + if allow_duplicates: + result = s.reset_index(allow_duplicates=True) + expected = DataFrame([[1, 1, 1]], columns=expected_names + [0]) + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(ValueError, match="cannot insert"): + s.reset_index()