PERF: use labels to find duplicates in multi-index #9125

Closed
wants to merge 1 commit
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.16.0.txt
@@ -46,6 +46,7 @@ Performance
.. _whatsnew_0160.performance:

- Fixed a severe performance regression for ``.loc`` indexing with an array or list (:issue:`9126`).
- Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values.

Bug Fixes
~~~~~~~~~
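For context, a minimal usage sketch of the method this whatsnew note refers to (small illustrative index; the take_last keyword spelling matches the 0.16.x-era API, with expected outputs in comments):

import pandas as pd

# a MultiIndex with one repeated entry
mi = pd.MultiIndex.from_tuples([('a', 1), ('b', 2), ('a', 1), ('c', 3)])

mi.duplicated()                  # array([False, False,  True, False]) -- first occurrence kept
mi.duplicated(take_last=True)    # array([ True, False, False, False]) -- last occurrence kept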
9 changes: 6 additions & 3 deletions pandas/core/index.py
@@ -3225,14 +3225,17 @@ def _has_complex_internals(self):

     @cache_readonly
     def is_unique(self):
-        from pandas.hashtable import Int64HashTable
+        return not self.duplicated().any()
+
+    @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
+    def duplicated(self, take_last=False):
         from pandas.core.groupby import get_flat_ids
+        from pandas.hashtable import duplicated_int64
 
         shape = map(len, self.levels)
         ids = get_flat_ids(self.labels, shape, False)
-        table = Int64HashTable(min(1 << 20, len(ids)))
 
-        return len(table.unique(ids)) == len(self)
+        return duplicated_int64(ids, take_last)
 
     def get_value(self, series, key):
         # somewhat broken encapsulation
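The change above avoids materializing the index values: each row of label codes is first collapsed into a single int64 id, and the duplicate check then runs on those ids. A rough NumPy sketch of that flattening idea (a hypothetical helper, not the actual get_flat_ids implementation, which handles details this sketch ignores):

import numpy as np

# Illustration only: treat each row of label codes as digits of a
# mixed-radix number, producing one int64 id per index entry.
def flat_ids_sketch(labels, shape):
    ids = np.zeros(len(labels[0]), dtype='int64')
    for codes, size in zip(labels, shape):
        ids = ids * size + codes
    return ids

labels = [np.array([0, 1, 0, 2]), np.array([0, 1, 0, 1])]
shape = [3, 2]
flat_ids_sketch(labels, shape)   # array([0, 3, 0, 5]) -- rows 0 and 2 map to the same id

Equal rows always collapse to the same id, so running the duplicate check on the ids is equivalent to running it on the full tuples while only touching small integers.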
24 changes: 24 additions & 0 deletions pandas/hashtable.pyx
@@ -1062,3 +1062,27 @@ def mode_int64(ndarray[int64_t] values):
    kh_destroy_int64(table)

    return modes[:j+1]


@cython.wraparound(False)
@cython.boundscheck(False)
def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
    cdef:
        int ret = 0
        Py_ssize_t i, n = len(values)
        kh_int64_t * table = kh_init_int64()
        ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')

    kh_resize_int64(table, min(1 << 20, n))

    if take_last:
        for i from n > i >= 0:
            kh_put_int64(table, values[i], &ret)
            out[i] = ret == 0
    else:
        for i from 0 <= i < n:
            kh_put_int64(table, values[i], &ret)
            out[i] = ret == 0

    kh_destroy_int64(table)
    return out
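In plain Python, the new Cython routine amounts to a single scan with a seen-set: out[i] = ret == 0 marks an entry whose id was already in the hash table. A sketch of that behaviour (not the actual khash-backed implementation):

import numpy as np

# An entry is a duplicate if its id was already seen during the scan;
# take_last=True scans from the end so the last occurrence is the one kept.
def duplicated_int64_sketch(values, take_last=False):
    out = np.empty(len(values), dtype=bool)
    seen = set()
    order = reversed(range(len(values))) if take_last else range(len(values))
    for i in order:
        out[i] = values[i] in seen
        seen.add(values[i])
    return out

ids = np.array([5, 7, 5, 9, 7], dtype='int64')
duplicated_int64_sketch(ids)                  # [False False  True False  True]
duplicated_int64_sketch(ids, take_last=True)  # [ True  True False False False]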
10 changes: 10 additions & 0 deletions pandas/tests/test_index.py
@@ -3514,6 +3514,16 @@ def check(nlevels, with_nulls):
        check(8, False)
        check(8, True)

        n, k = 200, 5000
        levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
        labels = [np.random.choice(n, k * n) for lev in levels]
        mi = MultiIndex(levels=levels, labels=labels)

        for take_last in [False, True]:
            left = mi.duplicated(take_last=take_last)
            right = pd.lib.duplicated(mi.values, take_last=take_last)
            tm.assert_array_equal(left, right)

    def test_tolist(self):
        result = self.index.tolist()
        exp = list(self.index.values)
11 changes: 11 additions & 0 deletions vb_suite/index_object.py
@@ -138,6 +138,17 @@
name='multiindex_with_datetime_level_sliced',
start_date=datetime(2014, 10, 11))

# multi-index duplicated
setup = common_setup + """
n, k = 200, 5000
levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)]
labels = [np.random.choice(n, k * n) for lev in levels]
mi = MultiIndex(levels=levels, labels=labels)
"""

multiindex_duplicated = Benchmark('mi.duplicated()', setup,
                                  name='multiindex_duplicated')

#----------------------------------------------------------------------
# repr

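To try the benchmark outside of vbench, the same setup can be timed interactively (a hypothetical IPython session; absolute timings depend on the machine):

import numpy as np
import pandas as pd
import pandas.util.testing as tm
from pandas import MultiIndex

n, k = 200, 5000
levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)]
labels = [np.random.choice(n, k * n) for lev in levels]
mi = MultiIndex(levels=levels, labels=labels)

# In IPython:
# %timeit mi.duplicated()                 # label-based path added in this PR
# %timeit pd.lib.duplicated(mi.values)    # value-based path used for comparison in the test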