Merge pull request #9101 from behzadnouri/mi-dups

jreback · jreback · commit f7af818daf91 · 2014-12-18T06:40:22.000-05:00
overflow bug in multi-index when checking for duplicates
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -1176,6 +1176,7 @@ Attributes
    Index.is_monotonic_increasing
    Index.is_monotonic_decreasing
    Index.is_unique
+   Index.has_duplicates
    Index.dtype
    Index.inferred_type
    Index.is_all_dates
diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
@@ -50,4 +50,4 @@ Bug Fixes
 .. _whatsnew_0160.bug_fixes:
 
 - Fixed compatibility issue in ``DatetimeIndex`` affecting architectures where ``numpy.int_`` defaults to ``numpy.int32`` (:issue:`8943`)
-
+- Bug in ``MultiIndex.has_duplicates`` where an index with many levels causes an indexer overflow (:issue:`9075`)
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -3,7 +3,7 @@
 import warnings
 import operator
 from functools import partial
-from pandas.compat import range, zip, lrange, lzip, u, reduce
+from pandas.compat import range, zip, lrange, lzip, u, reduce, filter, map
 from pandas import compat
 import numpy as np
 
@@ -600,6 +600,10 @@ def is_unique(self):
         """ return if the index has unique values """
         return self._engine.is_unique
 
+    @property
+    def has_duplicates(self):
+        return not self.is_unique  # simply the negation of is_unique
+
     def is_boolean(self):
         return self.inferred_type in ['boolean']
 
@@ -3218,22 +3222,47 @@ def _has_complex_internals(self):
         # to disable groupby tricks
         return True
 
-    @property
-    def has_duplicates(self):
-        """
-        Return True if there are no unique groups
-        """
-        # has duplicates
-        shape = [len(lev) for lev in self.levels]
-        group_index = np.zeros(len(self), dtype='i8')
-        for i in range(len(shape)):
-            stride = np.prod([x for x in shape[i + 1:]], dtype='i8')
-            group_index += self.labels[i] * stride
+    @cache_readonly
+    def is_unique(self):
+        from pandas.hashtable import Int64HashTable

-        if len(np.unique(group_index)) < len(group_index):
-            return True
+        def _get_group_index(labels, shape):
+            from pandas.core.groupby import _int64_overflow_possible, \
+                                            _compress_group_index

-        return False
+            # largest prefix of levels that cannot overflow int64
+            pred = lambda i: not _int64_overflow_possible(shape[:i])
+            nlev = next(filter(pred, range(len(shape), 0, -1)))
+
+            # compute group indices for the first `nlev` levels
+            group_index = labels[0].astype('i8', subok=False, copy=True)
+            stride = shape[0]
+
+            for i in range(1, nlev):
+                group_index += labels[i] * stride
+                stride *= shape[i]
+
+            if nlev == len(shape):  # every level was folded in; done
+                return group_index
+
+            comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
+
+            labels = [comp_ids] + labels[nlev:]  # prefix acts as one level
+            shape = [len(obs_ids)] + shape[nlev:]
+
+            return _get_group_index(labels, shape)
+
+        def _maybe_lift(lab, size):  # shift labels so nan (-1) becomes 0
+            return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
+
+        shape = map(len, self.levels)
+        labels = map(_ensure_int64, self.labels)
+
+        labels, shape = map(list, zip(*map(_maybe_lift, labels, shape)))
+        group_index = _get_group_index(labels, shape)
+
+        table = Int64HashTable(min(1 << 20, len(group_index)))
+        return len(table.unique(group_index)) == len(self)
 
     def get_value(self, series, key):
         # somewhat broken encapsulation
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
@@ -620,6 +620,9 @@ def test_duplicated_drop_duplicates(self):
                 tm.assert_index_equal(result, original)
                 self.assertFalse(result is original)
 
+                # has_duplicates
+                self.assertFalse(original.has_duplicates)
+
                 # create repeated values, 3rd and 5th values are duplicated
                 idx = original[list(range(len(original))) + [5, 3]]
                 expected = Index([False] * len(original) + [True, True])
diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py
@@ -3451,6 +3451,69 @@ def test_has_duplicates(self):
                                    [0, 1, 2, 0, 0, 1, 2]])
         self.assertTrue(index.has_duplicates)
 
+        # GH 9075
+        t = [(u'x', u'out', u'z', 5, u'y', u'in', u'z', 169),
+             (u'x', u'out', u'z', 7, u'y', u'in', u'z', 119),
+             (u'x', u'out', u'z', 9, u'y', u'in', u'z', 135),
+             (u'x', u'out', u'z', 13, u'y', u'in', u'z', 145),
+             (u'x', u'out', u'z', 14, u'y', u'in', u'z', 158),
+             (u'x', u'out', u'z', 16, u'y', u'in', u'z', 122),
+             (u'x', u'out', u'z', 17, u'y', u'in', u'z', 160),
+             (u'x', u'out', u'z', 18, u'y', u'in', u'z', 180),
+             (u'x', u'out', u'z', 20, u'y', u'in', u'z', 143),
+             (u'x', u'out', u'z', 21, u'y', u'in', u'z', 128),
+             (u'x', u'out', u'z', 22, u'y', u'in', u'z', 129),
+             (u'x', u'out', u'z', 25, u'y', u'in', u'z', 111),
+             (u'x', u'out', u'z', 28, u'y', u'in', u'z', 114),
+             (u'x', u'out', u'z', 29, u'y', u'in', u'z', 121),
+             (u'x', u'out', u'z', 31, u'y', u'in', u'z', 126),
+             (u'x', u'out', u'z', 32, u'y', u'in', u'z', 155),
+             (u'x', u'out', u'z', 33, u'y', u'in', u'z', 123),
+             (u'x', u'out', u'z', 12, u'y', u'in', u'z', 144)]
+
+        index = pd.MultiIndex.from_tuples(t)
+        self.assertFalse(index.has_duplicates)
+
+        # handle int64 overflow if possible
+        def check(nlevels, with_nulls):
+            labels = np.tile(np.arange(500), 2)
+            level = np.arange(500)
+
+            if with_nulls:  # inject some null values
+                labels[500] = -1  # nan shared by every level (set before copying)
+                labels = list(labels.copy() for i in range(nlevels))
+                for i in range(nlevels):
+                    labels[i][500 + i - nlevels // 2 ] = -1
+
+                labels += [np.array([-1, 1]).repeat(500)]
+            else:
+                labels = [labels] * nlevels + [np.arange(2).repeat(500)]
+
+            levels = [level] * nlevels + [[0, 1]]
+
+            # no dups: the last level tells the two tiled halves apart
+            index = MultiIndex(levels=levels, labels=labels)
+            self.assertFalse(index.has_duplicates)
+
+            # with a dup: repeat the first row at the end
+            if with_nulls:
+                f = lambda a: np.insert(a, 1000, a[0])
+                labels = list(map(f, labels))
+                index = MultiIndex(levels=levels, labels=labels)
+            else:
+                values = index.values.tolist()
+                index = MultiIndex.from_tuples(values + [values[0]])
+
+            self.assertTrue(index.has_duplicates)
+
+        # no overflow
+        check(4, False)
+        check(4, True)
+
+        # overflow possible
+        check(8, False)
+        check(8, True)
+
     def test_tolist(self):
         result = self.index.tolist()
         exp = list(self.index.values)