# Patch summary (preamble text placed ahead of the first "diff --git" header).
#
# Purpose: add an ``ignore_index`` keyword (default False) to
# ``DataFrame.drop_duplicates``.
#
# Files touched by the hunks below:
#   * doc/source/whatsnew/v1.0.0.rst -- one new "Other enhancements" bullet
#     referencing issue 30114.
#   * pandas/core/frame.py -- new ``ignore_index: bool = False`` parameter, its
#     docstring entry (marked versionadded 1.0.0), and the handling itself:
#       - inplace branch: ``new_data.axes[1]`` is replaced with
#         ``ibase.default_index(len(inds))`` before ``_update_inplace``;
#       - copy branch: the filtered frame is bound to ``result`` and its
#         ``index`` is replaced with ``ibase.default_index(len(result))``
#         before being returned.
#     When ``ignore_index`` is False neither branch touches the index.
#   * pandas/tests/frame/methods/test_drop_duplicates.py -- new parametrized
#     test ``test_drop_duplicates_ignore_index`` covering one- and two-column
#     frames with ``ignore_index`` True/False, for both ``inplace=False`` and
#     ``inplace=True``, and asserting the source frame is left unmodified.
#
# NOTE(review): file headers, hunk headers and hunk bodies all sit on the same
# physical lines here, so the patch's original line breaks appear to have been
# lost. Presumably they must be restored before a patch tool will accept it --
# confirm against the upstream commit.
# NOTE(review): the whatsnew context line reads "(:issue: `30270`)" with a space
# after the role name, which looks like a malformed Sphinx role. It is a context
# line (pre-existing in the target file), not something this patch changes --
# verify and fix separately if so.
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 9023cf2ab1b4f..e77beb2943101 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -208,6 +208,7 @@ Other enhancements - :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue: `30270`) - DataFrame constructor preserve `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`) +- :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`) Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dfda1470413b7..e8b4b292163e6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4587,6 +4587,7 @@ def drop_duplicates( subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, keep: Union[str, bool] = "first", inplace: bool = False, + ignore_index: bool = False, ) -> Optional["DataFrame"]: """ Return DataFrame with duplicate rows removed. @@ -4606,6 +4607,10 @@ def drop_duplicates( - False : Drop all duplicates. inplace : bool, default False Whether to drop duplicates in place or to return a copy. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. 
versionadded:: 1.0.0 Returns ------- @@ -4621,9 +4626,16 @@ def drop_duplicates( if inplace: (inds,) = (-duplicated)._ndarray_values.nonzero() new_data = self._data.take(inds) + + if ignore_index: + new_data.axes[1] = ibase.default_index(len(inds)) self._update_inplace(new_data) else: - return self[-duplicated] + result = self[-duplicated] + + if ignore_index: + result.index = ibase.default_index(len(result)) + return result return None diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index a7715d1f31673..29ab2e1bfd512 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -391,3 +391,36 @@ def test_drop_duplicates_inplace(): expected = orig2.drop_duplicates(["A", "B"], keep=False) result = df2 tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "origin_dict, output_dict, ignore_index, output_index", + [ + ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), + ], +) +def test_drop_duplicates_ignore_index( + origin_dict, output_dict, ignore_index, output_index +): + # GH 30114 + df = DataFrame(origin_dict) + expected = DataFrame(output_dict, index=output_index) + + # Test when inplace is False + result = df.drop_duplicates(ignore_index=ignore_index) + tm.assert_frame_equal(result, expected) + + # to verify original dataframe is not mutated + tm.assert_frame_equal(df, DataFrame(origin_dict)) + + # Test when inplace is True + copied_df = df.copy() + + copied_df.drop_duplicates(ignore_index=ignore_index, inplace=True) + tm.assert_frame_equal(copied_df, expected) + + # to verify that input is unchanged + tm.assert_frame_equal(df, DataFrame(origin_dict))