Skip to content

Commit a00c7ea

Browse files
committed
PERF: performance improvements in drop_duplicates for integer-dtype arrays
1 parent e67a981 commit a00c7ea

File tree

4 files changed

+38
-3
lines changed

4 files changed

+38
-3
lines changed

asv_bench/benchmarks/reindex.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,19 @@ def time_frame_drop_duplicates(self):
6161
self.df.drop_duplicates(['key1', 'key2'])
6262

6363

64+
class frame_drop_duplicates_int(object):
    """Benchmark ``DataFrame.drop_duplicates`` on a single integer column."""

    # Same target run time as the sibling drop_duplicates benchmark class
    # in this file, so the timings are collected under the same settings.
    goal_time = 0.2

    def setup(self):
        """Build a one-column frame of N random integers drawn from [0, K)."""
        # Fixed seed so every run times the same data.
        np.random.seed(1234)
        self.N = 1000000
        self.K = 10000
        self.key1 = np.random.randint(0, self.K, size=self.N)
        self.df = DataFrame({'key1': self.key1})

    def time_frame_drop_duplicates_int(self):
        """Time dropping duplicate rows, considering every column."""
        self.df.drop_duplicates()
75+
76+
6477
class frame_drop_duplicates_na(object):
6578
goal_time = 0.2
6679

@@ -381,4 +394,4 @@ def setup(self):
381394
self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10))
382395

383396
def time_series_drop_duplicates_string(self):
    """Time ``Series.drop_duplicates`` on the string series ``self.s2``."""
    strings = self.s2
    strings.drop_duplicates()

doc/source/whatsnew/v0.17.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -722,7 +722,7 @@ Performance Improvements
722722
- Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
723723
- Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
724724
- Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` (:issue:`10820`)
725-
725+
- Performance improvements in ``DataFrame.drop_duplicates`` with integer dtypes (:issue:`10917`)
726726
- 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
727727
- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
728728
- Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)

pandas/core/frame.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -2962,7 +2962,13 @@ def duplicated(self, subset=None, keep='first'):
29622962
from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
29632963

29642964
def f(vals):
    """Encode one column's values as integer codes for duplicate detection.

    Parameters
    ----------
    vals : array-like
        The values of a single column.

    Returns
    -------
    tuple
        ``(labels, size)`` where ``labels`` is the code array produced by
        ``factorize`` cast to int64, and ``size`` is the number of distinct
        values ``factorize`` reported.
    """
    # Always factorize, integer columns included.  Passing raw integers
    # through as labels while reporting the number of unique values as the
    # size yields labels that can be negative or >= size (e.g. [-1, 1000]
    # has size 2); labels and size are only consistent with each other
    # when both come from the same factorization.
    # NOTE(review): the consumer of this pair lies outside this hunk and
    # presumably requires 0 <= label < size -- confirm against the caller.
    size_hint = min(len(self), _SIZE_HINT_LIMIT)
    labels, uniques = factorize(vals, size_hint=size_hint)
    return labels.astype('i8', copy=False), len(uniques)
29672973

29682974
if subset is None:

pandas/tests/test_frame.py

+16
Original file line numberDiff line numberDiff line change
@@ -7975,6 +7975,22 @@ def test_drop_duplicates(self):
79757975
expected = df2.drop_duplicates(['AAA', 'B'], take_last=True)
79767976
assert_frame_equal(result, expected)
79777977

7978+
# integers
7979+
result = df.drop_duplicates('C')
7980+
expected = df.iloc[[0,2]]
7981+
assert_frame_equal(result, expected)
7982+
result = df.drop_duplicates('C',keep='last')
7983+
expected = df.iloc[[-2,-1]]
7984+
assert_frame_equal(result, expected)
7985+
7986+
df['E'] = df['C'].astype('int8')
7987+
result = df.drop_duplicates('E')
7988+
expected = df.iloc[[0,2]]
7989+
assert_frame_equal(result, expected)
7990+
result = df.drop_duplicates('E',keep='last')
7991+
expected = df.iloc[[-2,-1]]
7992+
assert_frame_equal(result, expected)
7993+
79787994
def test_drop_duplicates_for_take_all(self):
79797995
df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
79807996
'foo', 'bar', 'qux', 'foo'],

0 commit comments

Comments
 (0)