From 7e461a18d9f6928132afec6f48ce968b3e989ba6 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Mon, 3 Dec 2018 17:43:52 +0100 Subject: [PATCH 01/11] remove \n from docstring --- pandas/core/arrays/datetimes.py | 26 +++++++++++++------------- pandas/core/arrays/timedeltas.py | 16 ++++++++-------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index cfe3afcf3730a..b3df505d56d78 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -82,7 +82,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -1072,19 +1072,19 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") - year = _field_accessor('year', 'Y', "\n The year of the datetime\n") + year = _field_accessor('year', 'Y', "The year of the datetime") month = _field_accessor('month', 'M', - "\n The month as January=1, December=12 \n") - day = _field_accessor('day', 'D', "\nThe days of the datetime\n") - hour = _field_accessor('hour', 'h', "\nThe hours of the datetime\n") - minute = _field_accessor('minute', 'm', "\nThe minutes of the datetime\n") - second = _field_accessor('second', 's', "\nThe seconds of the datetime\n") + "The month as January=1, December=12") + day = _field_accessor('day', 'D', "The days of the datetime") + hour = _field_accessor('hour', 'h', "The hours of the datetime") + minute = _field_accessor('minute', 'm', "The minutes of the datetime") + second = _field_accessor('second', 's', "The seconds of the datetime") microsecond = _field_accessor('microsecond', 'us', - "\nThe microseconds of the datetime\n") + "The microseconds of the datetime") nanosecond = _field_accessor('nanosecond', 'ns', - "\nThe nanoseconds of the datetime\n") + "The nanoseconds of the datetime") weekofyear = _field_accessor('weekofyear', 'woy', - "\nThe week ordinal of the year\n") + "The week ordinal of the year") week = weekofyear _dayofweek_doc = """ The day of the week with Monday=0, Sunday=6. @@ -1129,12 +1129,12 @@ def date(self): "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0") dayofyear = _field_accessor('dayofyear', 'doy', - "\nThe ordinal day of the year\n") - quarter = _field_accessor('quarter', 'q', "\nThe quarter of the date\n") + "The ordinal day of the year") + quarter = _field_accessor('quarter', 'q', "The quarter of the date") days_in_month = _field_accessor( 'days_in_month', 'dim', - "\nThe number of days in the month\n") + "The number of days in the month") daysinmonth = days_in_month _is_month_doc = """ Indicates whether the date is the {first_or_last} day of the month. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 830283d31a929..4afc9f5483c2a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -59,7 +59,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -684,16 +684,16 @@ def to_pytimedelta(self): return tslibs.ints_to_pytimedelta(self.asi8) days = _field_accessor("days", "days", - "\nNumber of days for each element.\n") + "Number of days for each element.") seconds = _field_accessor("seconds", "seconds", - "\nNumber of seconds (>= 0 and less than 1 day) " - "for each element.\n") + "Number of seconds (>= 0 and less than 1 day) " + "for each element.") microseconds = _field_accessor("microseconds", "microseconds", - "\nNumber of microseconds (>= 0 and less " - "than 1 second) for each element.\n") + "Number of microseconds (>= 0 and less " + "than 1 second) for each element.") nanoseconds = _field_accessor("nanoseconds", "nanoseconds", - "\nNumber of nanoseconds (>= 0 and less " - "than 1 microsecond) for each element.\n") + "Number of nanoseconds (>= 0 and less " + "than 1 microsecond) for each element.") @property def components(self): From bdda8caecaab056f77648798d77783caf6c5d247 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 22 Dec 2019 16:50:15 +0100 Subject: [PATCH 02/11] Add ignore_index for drop duplicates --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/frame.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index a15d5b319fc82..219ded8c4d876 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -474,7 +474,7 @@ Other API changes Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) - :meth:`Series.str.__iter__` was deprecated and will be removed in future releases (:issue:`28277`). - +- Add ``ignore_index`` to :meth:`DataFrame.drop_duplicates` to reset index (:issue:`30114`) .. _whatsnew_1000.api.documentation: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 766437dbad8f8..53f1c6ca7d878 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4587,6 +4587,7 @@ def drop_duplicates( subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, keep: Union[str, bool] = "first", inplace: bool = False, + ignore_index: bool = False, ) -> Optional["DataFrame"]: """ Return DataFrame with duplicate rows removed. @@ -4606,6 +4607,8 @@ def drop_duplicates( - False : Drop all duplicates. inplace : bool, default False Whether to drop duplicates in place or to return a copy. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, …, n - 1. Returns ------- @@ -4621,8 +4624,13 @@ def drop_duplicates( if inplace: (inds,) = (-duplicated)._ndarray_values.nonzero() new_data = self._data.take(inds) + if ignore_index: + new_data.axes[1] = ibase.default_index(len(inds)) self._update_inplace(new_data) else: + if ignore_index: + idx = ibase.default_index(len(self[-duplicated])) + return self[-duplicated].set_index(idx) return self[-duplicated] return None From 6e76e562c19a36fb277b3f566c46a51203b81603 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 22 Dec 2019 22:44:01 +0100 Subject: [PATCH 03/11] add forgotten test and code change based on review --- pandas/core/frame.py | 3 ++- pandas/tests/frame/test_duplicates.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 53f1c6ca7d878..6f7706c771b39 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4608,7 +4608,7 @@ def drop_duplicates( inplace : bool, default False Whether to drop duplicates in place or to return a copy. ignore_index : bool, default False - If True, the resulting axis will be labeled 0, …, n - 1. + If True, the resulting axis will be labeled 0, 1, …, n - 1. Returns ------- @@ -4624,6 +4624,7 @@ def drop_duplicates( if inplace: (inds,) = (-duplicated)._ndarray_values.nonzero() new_data = self._data.take(inds) + if ignore_index: new_data.axes[1] = ibase.default_index(len(inds)) self._update_inplace(new_data) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index d2a1fc43d2046..110ee56e8c67e 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -477,3 +477,23 @@ def test_drop_duplicates_inplace(): expected = orig2.drop_duplicates(["A", "B"], keep=False) result = df2 tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "origin_dict, output_dict, ignore_index, output_index", + [ + ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), + ], +) +def test_drop_duplicates_ignore_index( + origin_dict, output_dict, ignore_index, output_index +): + # GH 30114 + df = DataFrame(origin_dict) + result = df.drop_duplicates(ignore_index=ignore_index) + + expected = DataFrame(output_dict, index=output_index) + tm.assert_frame_equal(result, expected) From c12beb6af337945729dfa623a2d76b32879cf1a9 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 23 Dec 2019 10:53:41 +0100 Subject: [PATCH 04/11] code change on WA review --- pandas/core/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6f7706c771b39..26b5fe53cfc19 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4629,10 +4629,10 @@ def drop_duplicates( new_data.axes[1] = ibase.default_index(len(inds)) self._update_inplace(new_data) else: + result = self[-duplicated] if ignore_index: - idx = ibase.default_index(len(self[-duplicated])) - return self[-duplicated].set_index(idx) - return self[-duplicated] + return result.reset_index(drop=True) + return result return None From 1b6dc51a06102d8f88cf11fac6f0268d971556a6 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 23 Dec 2019 19:47:52 +0100 Subject: [PATCH 05/11] keep consistency --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 26b5fe53cfc19..3a9358e7e1aa5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4626,7 +4626,7 @@ def drop_duplicates( new_data = self._data.take(inds) if ignore_index: - new_data.axes[1] = ibase.default_index(len(inds)) + new_data = new_data.reset_index(drop=True) self._update_inplace(new_data) else: result = self[-duplicated] From 17dbcb0f06e77de0b89532584bbc9f9125bf2902 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 24 Dec 2019 20:11:14 +0100 Subject: [PATCH 06/11] code change based on JR review --- doc/source/whatsnew/v1.0.0.rst | 2 +- pandas/core/frame.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 219ded8c4d876..1d2610646eefe 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -208,6 +208,7 @@ Other enhancements - :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue: `30270`) - DataFrame constructor preserve `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`) +- :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`) Build Changes ^^^^^^^^^^^^^ @@ -474,7 +475,6 @@ Other API changes Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) - :meth:`Series.str.__iter__` was deprecated and will be removed in future releases (:issue:`28277`). -- Add ``ignore_index`` to :meth:`DataFrame.drop_duplicates` to reset index (:issue:`30114`) .. _whatsnew_1000.api.documentation: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3a9358e7e1aa5..7abfb717cd117 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4610,6 +4610,8 @@ def drop_duplicates( ignore_index : bool, default False If True, the resulting axis will be labeled 0, 1, …, n - 1. + .. versionadded:: 1.0.0 + Returns ------- DataFrame @@ -4626,12 +4628,13 @@ def drop_duplicates( new_data = self._data.take(inds) if ignore_index: - new_data = new_data.reset_index(drop=True) + new_data.axes[1] = ibase.default_index(len(inds)) self._update_inplace(new_data) else: result = self[-duplicated] + if ignore_index: - return result.reset_index(drop=True) + result.index = ibase.default_index(sum(-duplicated)) return result return None From a173eea033d262a11faea0c1381cad0bc9a39b13 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 24 Dec 2019 20:11:37 +0100 Subject: [PATCH 07/11] add test --- pandas/tests/frame/test_duplicates.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/test_duplicates.py index 110ee56e8c67e..277916771a9c8 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/test_duplicates.py @@ -488,7 +488,7 @@ def test_drop_duplicates_inplace(): ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), ], ) -def test_drop_duplicates_ignore_index( +def test_drop_duplicates_ignore_index_inplace_false( origin_dict, output_dict, ignore_index, output_index ): # GH 30114 @@ -497,3 +497,26 @@ def test_drop_duplicates_ignore_index( expected = DataFrame(output_dict, index=output_index) tm.assert_frame_equal(result, expected) + + # to verify if original dataframe is not mutated + tm.assert_frame_equal(df, DataFrame(origin_dict)) + + +@pytest.mark.parametrize( + "origin_dict, output_dict, ignore_index, output_index", + [ + ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), + ], +) +def test_drop_duplicates_ignore_index_inplace_true( + origin_dict, output_dict, ignore_index, output_index +): + # GH 30114, to check if correct when inplace is True + df = DataFrame(origin_dict) + df.drop_duplicates(ignore_index=ignore_index, inplace=True) + + expected = DataFrame(output_dict, index=output_index) + tm.assert_frame_equal(df, expected) From 4a37e8f9d168295c19c52a228b92673b43477245 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 24 Dec 2019 20:12:44 +0100 Subject: [PATCH 08/11] restore wrong deleted code --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 1d2610646eefe..3958935d766bc 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -475,6 +475,7 @@ Other API changes Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) - :meth:`Series.str.__iter__` was deprecated and will be removed in future releases (:issue:`28277`). +- .. _whatsnew_1000.api.documentation: From 79a49e118e772c0d5bbe5bf8de2c4824f714774e Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 24 Dec 2019 20:13:18 +0100 Subject: [PATCH 09/11] remove --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 3958935d766bc..597620737f6cb 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -475,7 +475,7 @@ Other API changes Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) - :meth:`Series.str.__iter__` was deprecated and will be removed in future releases (:issue:`28277`). -- + .. _whatsnew_1000.api.documentation: From a8552d83a4724e40e86a8d8d1ee44b9300270cd9 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 26 Dec 2019 09:09:27 +0100 Subject: [PATCH 10/11] code change based on JR review --- pandas/core/frame.py | 2 +- .../frame/methods/test_drop_duplicates.py | 30 +++++++------------ 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c3982b74c008c..e8b4b292163e6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4634,7 +4634,7 @@ def drop_duplicates( result = self[-duplicated] if ignore_index: - result.index = ibase.default_index(sum(-duplicated)) + result.index = ibase.default_index(len(result)) return result return None diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index 729aa6db09a18..b4173de57d22d 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -402,35 +402,27 @@ def test_drop_duplicates_inplace(): ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), ], ) -def test_drop_duplicates_ignore_index_inplace_false( +def test_drop_duplicates_ignore_index( origin_dict, output_dict, ignore_index, output_index ): # GH 30114 df = DataFrame(origin_dict) + + # Test when inplace is False result = df.drop_duplicates(ignore_index=ignore_index) expected = DataFrame(output_dict, index=output_index) tm.assert_frame_equal(result, expected) - # to verify if original dataframe is not mutated + # to verify original dataframe is not mutated tm.assert_frame_equal(df, DataFrame(origin_dict)) - -@pytest.mark.parametrize( - "origin_dict, output_dict, ignore_index, output_index", - [ - ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), - ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), - ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), - ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), - ], -) -def test_drop_duplicates_ignore_index_inplace_true( - origin_dict, output_dict, ignore_index, output_index -): - # GH 30114, to check if correct when inplace is True - df = DataFrame(origin_dict) - df.drop_duplicates(ignore_index=ignore_index, inplace=True) + # Test when inplace is True + copied_df = df.copy() + copied_df.drop_duplicates(ignore_index=ignore_index, inplace=True) expected = DataFrame(output_dict, index=output_index) - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(copied_df, expected) + + # to verify that input is unchanged + tm.assert_frame_equal(df, DataFrame(origin_dict)) From 6eaff2ed0afc39383f78e0c3e1ac3c050c630a9e Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Thu, 26 Dec 2019 09:35:16 +0100 Subject: [PATCH 11/11] simplify code --- pandas/tests/frame/methods/test_drop_duplicates.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index b4173de57d22d..29ab2e1bfd512 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -407,11 +407,10 @@ def test_drop_duplicates_ignore_index( ): # GH 30114 df = DataFrame(origin_dict) + expected = DataFrame(output_dict, index=output_index) # Test when inplace is False result = df.drop_duplicates(ignore_index=ignore_index) - - expected = DataFrame(output_dict, index=output_index) tm.assert_frame_equal(result, expected) # to verify original dataframe is not mutated @@ -419,9 +418,8 @@ def test_drop_duplicates_ignore_index( # Test when inplace is True copied_df = df.copy() - copied_df.drop_duplicates(ignore_index=ignore_index, inplace=True) - expected = DataFrame(output_dict, index=output_index) + copied_df.drop_duplicates(ignore_index=ignore_index, inplace=True) tm.assert_frame_equal(copied_df, expected) # to verify that input is unchanged