Skip to content

Commit 8f4a321

Browse files
behzadnouri authored and jreback committed
use labels to find duplicates in multi-index (GH9125)
1 parent eb77d1d commit 8f4a321

File tree

5 files changed

+54
-6
lines changed

5 files changed

+54
-6
lines changed

doc/source/whatsnew/v0.16.0.txt

+2 -3
Original file line number | Diff line number | Diff line change
@@ -45,9 +45,8 @@ Performance
4545

4646
.. _whatsnew_0160.performance:
4747

48-
49-
- Fixed a severe performance regression for ``.loc`` indexing with an array or list (:issue:9126:).
50-
48+
- Fixed a performance regression for ``.loc`` indexing with an array or list-like (:issue:`9126`).
49+
- Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values (:issue:`9125`)
5150
- Improved the speed of `nunique` by calling `unique` instead of `value_counts` (:issue:`9129`, :issue:`7771`)
5251

5352

pandas/core/index.py

+6 -3
Original file line number | Diff line number | Diff line change
@@ -3225,14 +3225,17 @@ def _has_complex_internals(self):
32253225

32263226
@cache_readonly
32273227
def is_unique(self):
3228-
from pandas.hashtable import Int64HashTable
3228+
return not self.duplicated().any()
3229+
3230+
@Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
3231+
def duplicated(self, take_last=False):
32293232
from pandas.core.groupby import get_flat_ids
3233+
from pandas.hashtable import duplicated_int64
32303234

32313235
shape = map(len, self.levels)
32323236
ids = get_flat_ids(self.labels, shape, False)
3233-
table = Int64HashTable(min(1 << 20, len(ids)))
32343237

3235-
return len(table.unique(ids)) == len(self)
3238+
return duplicated_int64(ids, take_last)
32363239

32373240
def get_value(self, series, key):
32383241
# somewhat broken encapsulation

pandas/hashtable.pyx

+24
Original file line number | Diff line number | Diff line change
@@ -1062,3 +1062,27 @@ def mode_int64(ndarray[int64_t] values):
10621062
kh_destroy_int64(table)
10631063

10641064
return modes[:j+1]
1065+
1066+
1067+
@cython.wraparound(False)
1068+
@cython.boundscheck(False)
1069+
def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
1070+
cdef:
1071+
int ret = 0
1072+
Py_ssize_t i, n = len(values)
1073+
kh_int64_t * table = kh_init_int64()
1074+
ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
1075+
1076+
kh_resize_int64(table, min(1 << 20, n))
1077+
1078+
if take_last:
1079+
for i from n > i >=0:
1080+
kh_put_int64(table, values[i], &ret)
1081+
out[i] = ret == 0
1082+
else:
1083+
for i from 0 <= i < n:
1084+
kh_put_int64(table, values[i], &ret)
1085+
out[i] = ret == 0
1086+
1087+
kh_destroy_int64(table)
1088+
return out

pandas/tests/test_index.py

+11
Original file line number | Diff line number | Diff line change
@@ -3514,6 +3514,17 @@ def check(nlevels, with_nulls):
35143514
check(8, False)
35153515
check(8, True)
35163516

3517+
# GH 9125
3518+
n, k = 200, 5000
3519+
levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
3520+
labels = [np.random.choice(n, k * n) for lev in levels]
3521+
mi = MultiIndex(levels=levels, labels=labels)
3522+
3523+
for take_last in [False, True]:
3524+
left = mi.duplicated(take_last=take_last)
3525+
right = pd.lib.duplicated(mi.values, take_last=take_last)
3526+
tm.assert_array_equal(left, right)
3527+
35173528
def test_tolist(self):
35183529
result = self.index.tolist()
35193530
exp = list(self.index.values)

vb_suite/index_object.py

+11
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,17 @@
138138
name='multiindex_with_datetime_level_sliced',
139139
start_date=datetime(2014, 10, 11))
140140

141+
# multi-index duplicated
142+
setup = common_setup + """
143+
n, k = 200, 5000
144+
levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)]
145+
labels = [np.random.choice(n, k * n) for lev in levels]
146+
mi = MultiIndex(levels=levels, labels=labels)
147+
"""
148+
149+
multiindex_duplicated = Benchmark('mi.duplicated()', setup,
150+
name='multiindex_duplicated')
151+
141152
#----------------------------------------------------------------------
142153
# repr
143154

0 commit comments

Comments
 (0)