Skip to content

Commit f7af818

Browse files
committed
Merge pull request #9101 from behzadnouri/mi-dups
overflow bug in multi-index when checking for duplicates
2 parents 890734d + 99c95b4 commit f7af818

File tree

5 files changed

+112
-16
lines changed

5 files changed

+112
-16
lines changed

doc/source/api.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1176,6 +1176,7 @@ Attributes
11761176
Index.is_monotonic_increasing
11771177
Index.is_monotonic_decreasing
11781178
Index.is_unique
1179+
Index.has_duplicates
11791180
Index.dtype
11801181
Index.inferred_type
11811182
Index.is_all_dates

doc/source/whatsnew/v0.16.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,4 @@ Bug Fixes
5050
.. _whatsnew_0160.bug_fixes:
5151

5252
- Fixed compatibility issue in ``DatetimeIndex`` affecting architectures where ``numpy.int_`` defaults to ``numpy.int32`` (:issue:`8943`)
53-
53+
- Bug in ``MultiIndex.has_duplicates`` where having many levels causes an indexer overflow (:issue:`9075`)

pandas/core/index.py

+44-15
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import warnings
44
import operator
55
from functools import partial
6-
from pandas.compat import range, zip, lrange, lzip, u, reduce
6+
from pandas.compat import range, zip, lrange, lzip, u, reduce, filter, map
77
from pandas import compat
88
import numpy as np
99

@@ -600,6 +600,10 @@ def is_unique(self):
600600
""" return if the index has unique values """
601601
return self._engine.is_unique
602602

603+
@property
604+
def has_duplicates(self):
605+
return not self.is_unique
606+
603607
def is_boolean(self):
604608
return self.inferred_type in ['boolean']
605609

@@ -3218,22 +3222,47 @@ def _has_complex_internals(self):
32183222
# to disable groupby tricks
32193223
return True
32203224

3221-
@property
3222-
def has_duplicates(self):
3223-
"""
3224-
Return True if there are no unique groups
3225-
"""
3226-
# has duplicates
3227-
shape = [len(lev) for lev in self.levels]
3228-
group_index = np.zeros(len(self), dtype='i8')
3229-
for i in range(len(shape)):
3230-
stride = np.prod([x for x in shape[i + 1:]], dtype='i8')
3231-
group_index += self.labels[i] * stride
3225+
@cache_readonly
3226+
def is_unique(self):
3227+
from pandas.hashtable import Int64HashTable
32323228

3233-
if len(np.unique(group_index)) < len(group_index):
3234-
return True
3229+
def _get_group_index(labels, shape):
3230+
from pandas.core.groupby import _int64_overflow_possible, \
3231+
_compress_group_index
32353232

3236-
return False
3233+
# how many levels can be done without overflow
3234+
pred = lambda i: not _int64_overflow_possible(shape[:i])
3235+
nlev = next(filter(pred, range(len(shape), 0, -1)))
3236+
3237+
# compute group indices for the first `nlev` levels
3238+
group_index = labels[0].astype('i8', subok=False, copy=True)
3239+
stride = shape[0]
3240+
3241+
for i in range(1, nlev):
3242+
group_index += labels[i] * stride
3243+
stride *= shape[i]
3244+
3245+
if nlev == len(shape):
3246+
return group_index
3247+
3248+
comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
3249+
3250+
labels = [comp_ids] + labels[nlev:]
3251+
shape = [len(obs_ids)] + shape[nlev:]
3252+
3253+
return _get_group_index(labels, shape)
3254+
3255+
def _maybe_lift(lab, size): # promote nan values
3256+
return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
3257+
3258+
shape = map(len, self.levels)
3259+
labels = map(_ensure_int64, self.labels)
3260+
3261+
labels, shape = map(list, zip(*map(_maybe_lift, labels, shape)))
3262+
group_index = _get_group_index(labels, shape)
3263+
3264+
table = Int64HashTable(min(1 << 20, len(group_index)))
3265+
return len(table.unique(group_index)) == len(self)
32373266

32383267
def get_value(self, series, key):
32393268
# somewhat broken encapsulation

pandas/tests/test_base.py

+3
Original file line numberDiff line numberDiff line change
@@ -620,6 +620,9 @@ def test_duplicated_drop_duplicates(self):
620620
tm.assert_index_equal(result, original)
621621
self.assertFalse(result is original)
622622

623+
# has_duplicates
624+
self.assertFalse(original.has_duplicates)
625+
623626
# create repeated values, 3rd and 5th values are duplicated
624627
idx = original[list(range(len(original))) + [5, 3]]
625628
expected = Index([False] * len(original) + [True, True])

pandas/tests/test_index.py

+63
Original file line numberDiff line numberDiff line change
@@ -3451,6 +3451,69 @@ def test_has_duplicates(self):
34513451
[0, 1, 2, 0, 0, 1, 2]])
34523452
self.assertTrue(index.has_duplicates)
34533453

3454+
# GH 9075
3455+
t = [(u'x', u'out', u'z', 5, u'y', u'in', u'z', 169),
3456+
(u'x', u'out', u'z', 7, u'y', u'in', u'z', 119),
3457+
(u'x', u'out', u'z', 9, u'y', u'in', u'z', 135),
3458+
(u'x', u'out', u'z', 13, u'y', u'in', u'z', 145),
3459+
(u'x', u'out', u'z', 14, u'y', u'in', u'z', 158),
3460+
(u'x', u'out', u'z', 16, u'y', u'in', u'z', 122),
3461+
(u'x', u'out', u'z', 17, u'y', u'in', u'z', 160),
3462+
(u'x', u'out', u'z', 18, u'y', u'in', u'z', 180),
3463+
(u'x', u'out', u'z', 20, u'y', u'in', u'z', 143),
3464+
(u'x', u'out', u'z', 21, u'y', u'in', u'z', 128),
3465+
(u'x', u'out', u'z', 22, u'y', u'in', u'z', 129),
3466+
(u'x', u'out', u'z', 25, u'y', u'in', u'z', 111),
3467+
(u'x', u'out', u'z', 28, u'y', u'in', u'z', 114),
3468+
(u'x', u'out', u'z', 29, u'y', u'in', u'z', 121),
3469+
(u'x', u'out', u'z', 31, u'y', u'in', u'z', 126),
3470+
(u'x', u'out', u'z', 32, u'y', u'in', u'z', 155),
3471+
(u'x', u'out', u'z', 33, u'y', u'in', u'z', 123),
3472+
(u'x', u'out', u'z', 12, u'y', u'in', u'z', 144)]
3473+
3474+
index = pd.MultiIndex.from_tuples(t)
3475+
self.assertFalse(index.has_duplicates)
3476+
3477+
# handle int64 overflow if possible
3478+
def check(nlevels, with_nulls):
3479+
labels = np.tile(np.arange(500), 2)
3480+
level = np.arange(500)
3481+
3482+
if with_nulls: # inject some null values
3483+
labels[500] = -1 # common nan value
3484+
labels = list(labels.copy() for i in range(nlevels))
3485+
for i in range(nlevels):
3486+
labels[i][500 + i - nlevels // 2 ] = -1
3487+
3488+
labels += [np.array([-1, 1]).repeat(500)]
3489+
else:
3490+
labels = [labels] * nlevels + [np.arange(2).repeat(500)]
3491+
3492+
levels = [level] * nlevels + [[0, 1]]
3493+
3494+
# no dups
3495+
index = MultiIndex(levels=levels, labels=labels)
3496+
self.assertFalse(index.has_duplicates)
3497+
3498+
# with a dup
3499+
if with_nulls:
3500+
f = lambda a: np.insert(a, 1000, a[0])
3501+
labels = list(map(f, labels))
3502+
index = MultiIndex(levels=levels, labels=labels)
3503+
else:
3504+
values = index.values.tolist()
3505+
index = MultiIndex.from_tuples(values + [values[0]])
3506+
3507+
self.assertTrue(index.has_duplicates)
3508+
3509+
# no overflow
3510+
check(4, False)
3511+
check(4, True)
3512+
3513+
# overflow possible
3514+
check(8, False)
3515+
check(8, True)
3516+
34543517
def test_tolist(self):
34553518
result = self.index.tolist()
34563519
exp = list(self.index.values)

0 commit comments

Comments
 (0)