diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py
index d6fbd0d31c389..03e654b4886cc 100644
--- a/asv_bench/benchmarks/reindex.py
+++ b/asv_bench/benchmarks/reindex.py
@@ -61,6 +61,20 @@ def time_frame_drop_duplicates(self):
         self.df.drop_duplicates(['key1', 'key2'])
 
 
+class frame_drop_duplicates_int(object):
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(1234)
+        self.N = 1000000
+        self.K = 10000
+        self.key1 = np.random.randint(0,self.K,size=self.N)
+        self.df = DataFrame({'key1': self.key1})
+
+    def time_frame_drop_duplicates_int(self):
+        self.df.drop_duplicates()
+
+
 class frame_drop_duplicates_na(object):
     goal_time = 0.2
 
@@ -381,4 +395,4 @@ def setup(self):
         self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10))
 
     def time_series_drop_duplicates_string(self):
-        self.s2.drop_duplicates()
\ No newline at end of file
+        self.s2.drop_duplicates()
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index d3d7fe1637900..33abc62b3f973 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -722,7 +722,7 @@ Performance Improvements
 - Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
 - Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
 - Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` (:issue:`10820`)
-
+- Performance improvements in ``DataFrame.drop_duplicates`` with integer dtypes (:issue:`10917`)
 - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
 - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
 - Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 3e908bf9d579b..af2959e86274f 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2962,7 +2962,16 @@ def duplicated(self, subset=None, keep='first'):
         from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
 
         def f(vals):
+            # The (labels, size) pairs are combined across columns, which
+            # needs 0 <= label < size. Raw integers may therefore replace
+            # factorized codes only when they are non-negative and bounded;
+            # size is then max + 1 and never exceeds the number of rows.
+            if com.is_integer_dtype(vals) and len(vals):
+                lo, hi = int(vals.min()), int(vals.max())
+                if 0 <= lo and hi < len(vals):
+                    return vals.astype('i8', copy=False), hi + 1
+
             labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
             return labels.astype('i8',copy=False), len(shape)
 
         if subset is None:
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
index 16143fa612c48..693b761ae7b4b 100644
--- a/pandas/tests/test_frame.py
+++ b/pandas/tests/test_frame.py
@@ -7975,6 +7975,35 @@ def test_drop_duplicates(self):
         expected = df2.drop_duplicates(['AAA', 'B'], take_last=True)
         assert_frame_equal(result, expected)
 
+        # integers
+        result = df.drop_duplicates('C')
+        expected = df.iloc[[0,2]]
+        assert_frame_equal(result, expected)
+        result = df.drop_duplicates('C',keep='last')
+        expected = df.iloc[[-2,-1]]
+        assert_frame_equal(result, expected)
+
+        df['E'] = df['C'].astype('int8')
+        result = df.drop_duplicates('E')
+        expected = df.iloc[[0,2]]
+        assert_frame_equal(result, expected)
+        result = df.drop_duplicates('E',keep='last')
+        expected = df.iloc[[-2,-1]]
+        assert_frame_equal(result, expected)
+
+        # several integer columns: distinct rows must not collide when
+        # the per-column labels are combined into one group index
+        ints = DataFrame({'x': [0, 1, 0], 'y': [2, 0, 2]})
+        result = ints.drop_duplicates()
+        expected = ints.iloc[[0, 1]]
+        assert_frame_equal(result, expected)
+
+        # negative integers cannot be used as labels directly
+        ints = DataFrame({'x': [-1, 0, -1], 'y': [0, -1, 0]})
+        result = ints.drop_duplicates()
+        expected = ints.iloc[[0, 1]]
+        assert_frame_equal(result, expected)
+
     def test_drop_duplicates_for_take_all(self):
         df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
                                 'foo', 'bar', 'qux', 'foo'],