From 01ec863dd39f11b27ae9f0d67e6dc54105990a60 Mon Sep 17 00:00:00 2001
From: behzad nouri <behzadnouri@gmail.com>
Date: Mon, 22 Dec 2014 21:15:18 -0500
Subject: [PATCH] use labels to find duplicates in multi-index

---
 doc/source/whatsnew/v0.16.0.txt |  1 +
 pandas/core/index.py            |  9 ++++++---
 pandas/hashtable.pyx            | 24 ++++++++++++++++++++++++
 pandas/tests/test_index.py      | 10 ++++++++++
 vb_suite/index_object.py        | 11 +++++++++++
 5 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
index d572835a76218..531955c765deb 100644
--- a/doc/source/whatsnew/v0.16.0.txt
+++ b/doc/source/whatsnew/v0.16.0.txt
@@ -46,6 +46,7 @@ Performance
 .. _whatsnew_0160.performance:
 
 - Fixed a severe performance regression for ``.loc`` indexing with an array or list (:issue:`9126`).
+- Performance improvements in ``MultiIndex.duplicated`` by working with labels instead of values.
 
 Bug Fixes
 ~~~~~~~~~
diff --git a/pandas/core/index.py b/pandas/core/index.py
index 97890299657cf..1b4a691851a8a 100644
--- a/pandas/core/index.py
+++ b/pandas/core/index.py
@@ -3225,14 +3225,17 @@ def _has_complex_internals(self):
 
     @cache_readonly
     def is_unique(self):
-        from pandas.hashtable import Int64HashTable
+        return not self.duplicated().any()
+
+    @Appender(_shared_docs['duplicated'] % _index_doc_kwargs)
+    def duplicated(self, take_last=False):
         from pandas.core.groupby import get_flat_ids
+        from pandas.hashtable import duplicated_int64
 
         shape = map(len, self.levels)
         ids = get_flat_ids(self.labels, shape, False)
 
-        table = Int64HashTable(min(1 << 20, len(ids)))
-        return len(table.unique(ids)) == len(self)
+        return duplicated_int64(ids, take_last)
 
     def get_value(self, series, key):
         # somewhat broken encapsulation
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
index cf9428d5862ec..26fba1a4b9615 100644
--- a/pandas/hashtable.pyx
+++ b/pandas/hashtable.pyx
@@ -1062,3 +1062,27 @@ def mode_int64(ndarray[int64_t] values):
     kh_destroy_int64(table)
 
     return modes[:j+1]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def duplicated_int64(ndarray[int64_t, ndim=1] values, int take_last):
+    cdef:
+        int ret = 0
+        Py_ssize_t i, n = len(values)
+        kh_int64_t * table = kh_init_int64()
+        ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
+
+    kh_resize_int64(table, min(1 << 20, n))
+
+    if take_last:
+        for i from n > i >=0:
+            kh_put_int64(table, values[i], &ret)
+            out[i] = ret == 0
+    else:
+        for i from 0 <= i < n:
+            kh_put_int64(table, values[i], &ret)
+            out[i] = ret == 0
+
+    kh_destroy_int64(table)
+    return out
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
index 5c581b548e583..c8c46309eb016 100644
--- a/pandas/tests/test_index.py
+++ b/pandas/tests/test_index.py
@@ -3514,6 +3514,16 @@ def check(nlevels, with_nulls):
         check(8, False)
         check(8, True)
 
+        n, k = 200, 5000
+        levels = [np.arange(n), tm.makeStringIndex(n), 1000 + np.arange(n)]
+        labels = [np.random.choice(n, k * n) for lev in levels]
+        mi = MultiIndex(levels=levels, labels=labels)
+
+        for take_last in [False, True]:
+            left = mi.duplicated(take_last=take_last)
+            right = pd.lib.duplicated(mi.values, take_last=take_last)
+            tm.assert_array_equal(left, right)
+
     def test_tolist(self):
         result = self.index.tolist()
         exp = list(self.index.values)
diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py
index a8cc0e9ec5f89..08ad96d1d0427 100644
--- a/vb_suite/index_object.py
+++ b/vb_suite/index_object.py
@@ -138,6 +138,17 @@
                            name='multiindex_with_datetime_level_sliced',
                            start_date=datetime(2014, 10, 11))
 
+# multi-index duplicated
+setup = common_setup + """
+n, k = 200, 5000
+levels = [np.arange(n), tm.makeStringIndex(n).values, 1000 + np.arange(n)]
+labels = [np.random.choice(n, k * n) for lev in levels]
+mi = MultiIndex(levels=levels, labels=labels)
+"""
+
+multiindex_duplicated = Benchmark('mi.duplicated()', setup,
+                                  name='multiindex_duplicated')
+
 #----------------------------------------------------------------------
 # repr
