Skip to content

Commit 4abf2b9

Browse files
committed
performance improvement in DataFrame.duplicated
1 parent bceb342 commit 4abf2b9

File tree

4 files changed

+41
-24
lines changed

4 files changed

+41
-24
lines changed

doc/source/whatsnew/v0.16.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -208,6 +208,7 @@ Performance
208208
- Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`)
209209
- Performance improvements in multi-key ``groupby`` (:issue:`9429`)
210210
- Performance improvements in ``MultiIndex.sortlevel`` (:issue:`9445`)
211+
- Performance and memory usage improvements in ``DataFrame.duplicated`` (:issue:`9398`)
211212
- Cythonized ``Period`` (:issue:`9440`)
212213

213214
Bug Fixes

pandas/core/frame.py

+23 -21
Original file line number | Diff line number | Diff line change
@@ -2734,30 +2734,32 @@ def duplicated(self, subset=None, take_last=False):
27342734
-------
27352735
duplicated : Series
27362736
"""
2737-
# kludge for #1833
2738-
def _m8_to_i8(x):
2739-
if issubclass(x.dtype.type, np.datetime64):
2740-
return x.view(np.int64)
2741-
return x
2737+
from pandas.core.groupby import get_group_index
2738+
from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
2739+
2740+
size_hint = min(len(self), _SIZE_HINT_LIMIT)
2741+
2742+
def factorize(vals):
2743+
(hash_klass, vec_klass), vals = \
2744+
algos._get_data_algo(vals, algos._hashtables)
2745+
2746+
uniques, table = vec_klass(), hash_klass(size_hint)
2747+
labels = table.get_labels(vals, uniques, 0, -1)
2748+
2749+
return labels.astype('i8', copy=False), len(uniques)
27422750

2743-
# if we are only duplicating on Categoricals this can be much faster
27442751
if subset is None:
2745-
values = list(_m8_to_i8(self.get_values().T))
2746-
else:
2747-
if np.iterable(subset) and not isinstance(subset, compat.string_types):
2748-
if isinstance(subset, tuple):
2749-
if subset in self.columns:
2750-
values = [self[subset].get_values()]
2751-
else:
2752-
values = [_m8_to_i8(self[x].get_values()) for x in subset]
2753-
else:
2754-
values = [_m8_to_i8(self[x].get_values()) for x in subset]
2755-
else:
2756-
values = [self[subset].get_values()]
2752+
subset = self.columns
2753+
elif not np.iterable(subset) or \
2754+
isinstance(subset, compat.string_types) or \
2755+
isinstance(subset, tuple) and subset in self.columns:
2756+
subset = subset,
2757+
2758+
vals = (self[col].values for col in subset)
2759+
labels, shape = map(list, zip( * map(factorize, vals)))
27572760

2758-
keys = lib.fast_zip_fillna(values)
2759-
duplicated = lib.duplicated(keys, take_last=take_last)
2760-
return Series(duplicated, index=self.index)
2761+
ids = get_group_index(labels, shape, sort=False, xnull=False)
2762+
return Series(duplicated_int64(ids, take_last), index=self.index)
27612763

27622764
#----------------------------------------------------------------------
27632765
# Sorting

pandas/lib.pyx

+3 -3
Original file line number | Diff line number | Diff line change
@@ -1278,7 +1278,7 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null):
12781278
def duplicated(ndarray[object] values, take_last=False):
12791279
cdef:
12801280
Py_ssize_t i, n
1281-
dict seen = {}
1281+
set seen = set()
12821282
object row
12831283

12841284
n = len(values)
@@ -1291,15 +1291,15 @@ def duplicated(ndarray[object] values, take_last=False):
12911291
if row in seen:
12921292
result[i] = 1
12931293
else:
1294-
seen[row] = None
1294+
seen.add(row)
12951295
result[i] = 0
12961296
else:
12971297
for i from 0 <= i < n:
12981298
row = values[i]
12991299
if row in seen:
13001300
result[i] = 1
13011301
else:
1302-
seen[row] = None
1302+
seen.add(row)
13031303
result[i] = 0
13041304

13051305
return result.view(np.bool_)

vb_suite/frame_methods.py

+14
Original file line number | Diff line number | Diff line change
@@ -506,3 +506,17 @@ def get_data(n=100000):
506506
setup,
507507
name='frame_from_records_generator_nrows',
508508
start_date=datetime(2013,10,04)) # issue-4911
509+
510+
setup = common_setup + '''
511+
n = 1 << 20
512+
513+
t = date_range('2015-01-01', freq='S', periods=n // 64)
514+
xs = np.random.randn(n // 64).round(2)
515+
516+
df = DataFrame({'a':np.random.randint(- 1 << 8, 1 << 8, n),
517+
'b':np.random.choice(t, n),
518+
'c':np.random.choice(xs, n)})
519+
'''
520+
521+
frame_duplicated = Benchmark('df.duplicated()', setup,
522+
name='frame_duplicated')

0 commit comments

Comments
 (0)