Skip to content

Commit 24c5b8f

Browse files
committed
BUG: MultiIndex indexing with >= 1000000 elements close #1757
1 parent ded51c8 commit 24c5b8f

File tree

3 files changed

+47
-2
lines changed

3 files changed

+47
-2
lines changed

RELEASE.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ pandas 0.8.2
8989
- Fix handling of general objects in isnull on which bool(...) fails (#1749)
9090
- Fix .ix indexing with MultiIndex ambiguity (#1678)
9191
- Fix .ix setting logic error with non-unique MultiIndex (#1750)
92+
- Basic indexing now works on MultiIndex with >= 1000000 elements, regression
93+
from earlier version of pandas (#1757)
9294

9395
pandas 0.8.1
9496
============

pandas/src/engines.pyx

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def set_value_at(ndarray arr, object loc, object val):
4646

4747

4848
# Don't populate hash tables in monotonic indexes larger than this
49-
cdef int _SIZE_CUTOFF = 1000000
49+
_SIZE_CUTOFF = 1000000
5050

5151

5252
cdef class IndexEngine:
@@ -116,7 +116,7 @@ cdef class IndexEngine:
116116
if not self.is_unique:
117117
return self._get_loc_duplicates(val)
118118
values = self._get_index_values()
119-
loc = values.searchsorted(val, side='left')
119+
loc = _bin_search(values, val) # .searchsorted(val, side='left')
120120
if util.get_value_at(values, loc) != val:
121121
raise KeyError(val)
122122
return loc
@@ -331,6 +331,32 @@ cdef class Float64Engine(IndexEngine):
331331
return _algos.backfill_float64(self._get_index_values(), other,
332332
limit=limit)
333333

334+
335+
cdef Py_ssize_t _bin_search(ndarray values, object val):
    """
    Return the leftmost insertion index of ``val`` in ``values``.

    Element-by-element binary search used in place of
    ``values.searchsorted(val, side='left')``; elements are fetched with
    ``util.get_value_at`` and compared with ``<``, ``>`` and ``==``.

    Parameters
    ----------
    values : ndarray
        Array to search. Assumed to be sorted ascending (not checked here).
    val : object
        Value to locate.

    Returns
    -------
    Py_ssize_t
        Index of the first element equal to ``val`` if one is found,
        otherwise the position where ``val`` would be inserted.
        ``len(values)`` if ``val`` is greater than the last element,
        and 0 for an empty array.
    """
    cdef:
        Py_ssize_t n = len(values)
        # mid must be initialized: for n == 1 the loop below never runs and
        # the final comparison would otherwise read an uninitialized value.
        Py_ssize_t mid = 0, lo = 0, hi = n - 1
        object pval

    # Nothing to compare against; the only valid insertion point is 0.
    if n == 0:
        return 0

    # Fast exit when val sorts after every element.
    if val > util.get_value_at(values, hi):
        return n

    while lo < hi:
        mid = (lo + hi) // 2
        pval = util.get_value_at(values, mid)
        if val < pval:
            hi = mid
        elif val > pval:
            lo = mid + 1
        else:
            # Exact hit: walk left so the first of any equal run is returned.
            while mid > 0 and val == util.get_value_at(values, mid - 1):
                mid -= 1
            return mid

    # lo == hi here; mid is either that slot or the one just before it,
    # so a single comparison decides between mid and mid + 1.
    if val <= util.get_value_at(values, mid):
        return mid
    else:
        return mid + 1
359+
334360
_pad_functions = {
335361
'object' : _algos.pad_object,
336362
'int64' : _algos.pad_int64,

pandas/tests/test_multilevel.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1467,6 +1467,23 @@ def test_nonunique_assignment_1750(self):
14671467

14681468
self.assert_((df.xs((1, 1))['C'] == '_').all())
14691469

1470+
def test_indexing_over_hashtable_size_cutoff(self):
    """Tuple lookups on a MultiIndex-ed Series work with a patched cutoff.

    Regression test for #1757. Temporarily overrides the module-level
    ``lib._SIZE_CUTOFF`` and checks that scalar indexing with full tuple
    keys returns the expected values.
    """
    n = 10000

    import pandas.lib as lib
    old_cutoff = lib._SIZE_CUTOFF
    # NOTE(review): n (10000) is below the patched cutoff (20000); confirm
    # that the engine's size comparison really routes this index through
    # the no-hash-table path this test is meant to cover.
    lib._SIZE_CUTOFF = 20000
    try:
        s = Series(np.arange(n),
                   MultiIndex.from_arrays((["a"] * n, np.arange(n))))

        # Each value equals its second-level key, so lookups are easy to
        # verify.
        self.assertEqual(s[("a", 5)], 5)
        self.assertEqual(s[("a", 6)], 6)
        self.assertEqual(s[("a", 7)], 7)
    finally:
        # Always restore the global so a failure here cannot leak the
        # patched cutoff into other tests.
        lib._SIZE_CUTOFF = old_cutoff
1486+
14701487
if __name__ == '__main__':
14711488

14721489
# unittest.main()

0 commit comments

Comments
 (0)