From 4abf2b9fa0a1db79d7c97cc1ce7d48c494ea2cbc Mon Sep 17 00:00:00 2001 From: behzad nouri Date: Sun, 1 Feb 2015 21:29:47 -0500 Subject: [PATCH] performance improvement in DataFrame.duplicated --- doc/source/whatsnew/v0.16.0.txt | 1 + pandas/core/frame.py | 44 +++++++++++++++++---------------- pandas/lib.pyx | 6 ++--- vb_suite/frame_methods.py | 14 +++++++++++ 4 files changed, 41 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 0468c220bcb98..e60a75102948f 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -208,6 +208,7 @@ Performance - Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`) - Performance improvements in multi-key ``groupby`` (:issue:`9429`) - Performance improvements in ``MultiIndex.sortlevel`` (:issue:`9445`) +- Performance and memory usage improvements in ``DataFrame.duplicated`` (:issue:`9398`) - Cythonized ``Period`` (:issue:`9440`) Bug Fixes diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 733de1fc202e5..e5bf8b1e73110 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2734,30 +2734,32 @@ def duplicated(self, subset=None, take_last=False): ------- duplicated : Series """ - # kludge for #1833 - def _m8_to_i8(x): - if issubclass(x.dtype.type, np.datetime64): - return x.view(np.int64) - return x + from pandas.core.groupby import get_group_index + from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT + + size_hint = min(len(self), _SIZE_HINT_LIMIT) + + def factorize(vals): + (hash_klass, vec_klass), vals = \ + algos._get_data_algo(vals, algos._hashtables) + + uniques, table = vec_klass(), hash_klass(size_hint) + labels = table.get_labels(vals, uniques, 0, -1) + + return labels.astype('i8', copy=False), len(uniques) - # if we are only duplicating on Categoricals this can be much faster if subset is None: - values = list(_m8_to_i8(self.get_values().T)) - else: - if np.iterable(subset) and not isinstance(subset, compat.string_types): - if isinstance(subset, tuple): - if subset in self.columns: - values = [self[subset].get_values()] - else: - values = [_m8_to_i8(self[x].get_values()) for x in subset] - else: - values = [_m8_to_i8(self[x].get_values()) for x in subset] - else: - values = [self[subset].get_values()] + subset = self.columns + elif not np.iterable(subset) or \ + isinstance(subset, compat.string_types) or \ + isinstance(subset, tuple) and subset in self.columns: + subset = subset, + + vals = (self[col].values for col in subset) + labels, shape = map(list, zip( * map(factorize, vals))) - keys = lib.fast_zip_fillna(values) - duplicated = lib.duplicated(keys, take_last=take_last) - return Series(duplicated, index=self.index) + ids = get_group_index(labels, shape, sort=False, xnull=False) + return Series(duplicated_int64(ids, take_last), index=self.index) #---------------------------------------------------------------------- # Sorting diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 71aeaf0895035..5ab2ee4327177 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -1278,7 +1278,7 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null): def duplicated(ndarray[object] values, take_last=False): cdef: Py_ssize_t i, n - dict seen = {} + set seen = set() object row n = len(values) @@ -1291,7 +1291,7 @@ def duplicated(ndarray[object] values, take_last=False): if row in seen: result[i] = 1 else: - seen[row] = None + seen.add(row) result[i] = 0 else: for i from 0 <= i < n: @@ 
-1299,7 +1299,7 @@ def duplicated(ndarray[object] values, take_last=False): if row in seen: result[i] = 1 else: - seen[row] = None + seen.add(row) result[i] = 0 return result.view(np.bool_) diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py index 334534ed466f2..0fac1e01ff65b 100644 --- a/vb_suite/frame_methods.py +++ b/vb_suite/frame_methods.py @@ -506,3 +506,17 @@ def get_data(n=100000): setup, name='frame_from_records_generator_nrows', start_date=datetime(2013,10,04)) # issue-4911 + +setup = common_setup + ''' +n = 1 << 20 + +t = date_range('2015-01-01', freq='S', periods=n // 64) +xs = np.random.randn(n // 64).round(2) + +df = DataFrame({'a':np.random.randint(- 1 << 8, 1 << 8, n), + 'b':np.random.choice(t, n), + 'c':np.random.choice(xs, n)}) +''' + +frame_duplicated = Benchmark('df.duplicated()', setup, + name='frame_duplicated')
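
Note (commentary, not part of the patch): below is a minimal pure-Python/NumPy sketch of the approach the new `DataFrame.duplicated` takes — factorize each selected column into integer labels, combine the per-column labels into a single int64 id per row, then flag ids that have already been seen. It deliberately uses only the public `pd.factorize` instead of the internal `_get_data_algo` / `get_group_index` / `duplicated_int64` helpers, skips the int64-overflow handling that `get_group_index` performs, and the function name `frame_duplicated_sketch` is purely illustrative.

    # Illustrative sketch only; assumes public pandas/NumPy APIs rather than the
    # internal helpers used by the patch, and ignores overflow of the combined
    # int64 label space (which get_group_index handles inside pandas).
    import numpy as np
    import pandas as pd


    def frame_duplicated_sketch(df, subset=None, take_last=False):
        if subset is None:
            subset = df.columns

        # 1) factorize each column into integer labels; shift by +1 so the
        #    NaN sentinel (-1) gets its own non-negative code
        labels, sizes = [], []
        for col in subset:
            codes, uniques = pd.factorize(df[col])
            labels.append(codes.astype('i8') + 1)
            sizes.append(len(uniques) + 1)

        # 2) combine per-column labels into one int64 id per row
        #    (mixed-radix encoding, the idea behind get_group_index)
        ids = np.zeros(len(df), dtype='i8')
        for codes, size in zip(labels, sizes):
            ids = ids * size + codes

        # 3) a row is a duplicate if its id has been seen before; iterate in
        #    reverse for take_last (mirrors the set-based loop in lib.pyx)
        seen = set()
        result = np.zeros(len(df), dtype=bool)
        order = reversed(range(len(ids))) if take_last else range(len(ids))
        for i in order:
            if ids[i] in seen:
                result[i] = True
            else:
                seen.add(ids[i])
        return pd.Series(result, index=df.index)

On a frame like the one built in the new vb_suite benchmark, this sketch should agree with `df.duplicated()`, only much slower, since its duplicate-scanning loop runs in pure Python rather than in the Cython/hashtable code the patch dispatches to.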