From 01ec863dd39f11b27ae9f0d67e6dc54105990a60 Mon Sep 17 00:00:00 2001
From: behzad nouri <behzadnouri@gmail.com>
Date: Mon, 22 Dec 2014 21:15:18 -0500
Subject: [PATCH] use labels to find duplicates in multi-index

---
 doc/source/whatsnew/v0.16.0.txt |  1 +
 pandas/core/index.py            |  9 ++++++---
 pandas/hashtable.pyx            | 24 ++++++++++++++++++++++++
 pandas/tests/test_index.py      | 10 ++++++++++
 vb_suite/index_object.py        | 11 +++++++++++
 5 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
index d572835a76218..531955c765deb 100644
--- a/doc/source/whatsnew/v0.16.0.txt
+++ b/doc/source/whatsnew/v0.16.0.txt
@@ -46,6 +46,7 @@ Performance
 .. _whatsnew_0160.performance:
 
 - Fixed a severe performance regression for ``.loc`` indexing with an array or list (:issue:`9126`).
+- Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values.
 
 Bug Fixes
 ~~~~~~~~~
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 97890299657cf..1b4a691851a8a 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -3225,14 +3225,17 @@ def _has_complex_internals(self):
 
     @cache_readonly
     def is_unique(self):
-        from pandas.hashtable import Int64HashTable
+        return not self.duplicated().any()
+
+    @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
+    def duplicated(self, take_last=False):
         from pandas.core.groupby import get_flat_ids
+        from pandas.hashtable import duplicated_int64
 
         shape = map(len, self.levels)
         ids = get_flat_ids(self.labels, shape, False)
 
-        table = Int64HashTable(min(1 << 20, len(ids)))
-        return len(table.unique(ids)) == len(self)
+        return duplicated_int64(ids, take_last)
 
     def get_value(self, series, key):
         # somewhat broken encapsulation
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
index cf9428d5862ec..26fba1a4b9615 100644
--- a/pandas/hashtable.pyx
+++ b/pandas/hashtable.pyx
@@ -1062,3 +1062,27 @@ def mode_int64(ndarray[int64_t] values):
     kh_destroy_int64(table)
 
     return modes[:j+1]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
+    cdef:
+        int ret = 0
+        Py_ssize_t i, n = len(values)
+        kh_int64_t * table = kh_init_int64()
+        ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+
+    kh_resize_int64(table, min(1 << 20, n))
+
+    if take_last:
+        for i from n > i >=0:
+            kh_put_int64(table, values[i], &ret)
+            out[i] = ret == 0
+    else:
+        for i from 0 <= i < n:
+            kh_put_int64(table, values[i], &ret)
+            out[i] = ret == 0
+
+    kh_destroy_int64(table)
+    return out
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
index 5c581b548e583..c8c46309eb016 100644
--- a/pandas/tests/test_index.py
+++ b/pandas/tests/test_index.py
@@ -3514,6 +3514,16 @@ def check(nlevels, with_nulls):
         check(8, False)
         check(8, True)
 
+        n, k = 200, 5000
+        levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
+        labels = [np.random.choice(n, k * n) for lev in levels]
+        mi = MultiIndex(levels=levels, labels=labels)
+
+        for take_last in [False, True]:
+            left = mi.duplicated(take_last=take_last)
+            right = pd.lib.duplicated(mi.values, take_last=take_last)
+            tm.assert_array_equal(left, right)
+
     def test_tolist(self):
         result = self.index.tolist()
         exp = list(self.index.values)
diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py
index a8cc0e9ec5f89..08ad96d1d0427 100644
--- a/vb_suite/index_object.py
+++ b/vb_suite/index_object.py
@@ -138,6 +138,17 @@
                            name='multiindex_with_datetime_level_sliced',
                            start_date=datetime(2014, 10, 11))
 
+# multi-index duplicated
+setup = common_setup + """
+n, k = 200, 5000
+levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)]
+labels = [np.random.choice(n, k * n) for lev in levels]
+mi = MultiIndex(levels=levels, labels=labels)
+"""
+
+multiindex_duplicated = Benchmark('mi.duplicated()', setup,
+                                  name='multiindex_duplicated')
+
 #----------------------------------------------------------------------
 # repr
