Skip to content

Commit 8ebd8cc

Browse files
committed
WIP: high memory usage in MultiIndex (MI)
xref pandas-dev#13904
1 parent 3bd237f commit 8ebd8cc

File tree

7 files changed

+296
-32
lines changed

7 files changed

+296
-32
lines changed

asv_bench/benchmarks/indexing.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def setup(self):
8888

8989
def time_getitem_scalar(self):
9090
self.ts[self.dt]
91-
91+
9292

9393
class DataFrameIndexing(object):
9494
goal_time = 0.2
@@ -189,6 +189,7 @@ def setup(self):
189189
self.eps_C = 5
190190
self.eps_D = 5000
191191
self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel()
192+
self.miint = MultiIndex.from_product([np.arange(1000), np.arange(1000)], names=['one','two'])
192193

193194
def time_series_xs_mi_ix(self):
194195
self.s.ix[999]
@@ -199,6 +200,10 @@ def time_frame_xs_mi_ix(self):
199200
def time_multiindex_slicers(self):
    """Benchmark a four-level ``.loc`` slice on ``self.mdt2``.

    Level X (A through D) is sliced from ``test_X - eps_X`` to
    ``test_X + eps_X``; all columns are selected.
    """
    level_windows = (
        slice(self.test_A - self.eps_A, self.test_A + self.eps_A),
        slice(self.test_B - self.eps_B, self.test_B + self.eps_B),
        slice(self.test_C - self.eps_C, self.test_C + self.eps_C),
        slice(self.test_D - self.eps_D, self.test_D + self.eps_D),
    )
    # self.idx[...] receives the same tuple of slices as the inline
    # ``a:b, c:d, ...`` subscript form would produce
    row_selector = self.idx[level_windows]
    self.mdt2.loc[row_selector, :]
201202

203+
def time_multiindex_get_indexer(self):
    """Benchmark ``self.miint.get_indexer`` for the ten keys (0, 10)..(0, 19)."""
    wanted_keys = [(0, second) for second in range(10, 20)]
    self.miint.get_indexer(np.array(wanted_keys, dtype=object))
206+
202207

203208
class PanelIndexing(object):
204209
goal_time = 0.2

pandas/hashtable.pxd

+9
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from numpy cimport ndarray
12
from khash cimport (kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t,
23
kh_str_t, uint64_t, int64_t, float64_t)
34

@@ -30,6 +31,14 @@ cdef class PyObjectHashTable(HashTable):
3031
cpdef get_item(self, object val)
3132
cpdef set_item(self, object key, Py_ssize_t val)
3233

34+
cdef class MultiIndexHashTable(HashTable):
    # C-level attributes: a khash int64 table pointer and a level count.
    cdef kh_int64_t *table
    cdef int64_t nlevels

    # Single-item lookup / insertion, with the same signatures that
    # PyObjectHashTable declares.
    cpdef get_item(self, object val)
    cpdef set_item(self, object key, Py_ssize_t val)
41+
3342
cdef class StringHashTable(HashTable):
3443
cdef kh_str_t *table
3544

pandas/index.pyx

+79-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# cython: profile=False
1+
# cython: profile=True
22

33
from numpy cimport ndarray
44

@@ -275,7 +275,6 @@ cdef class IndexEngine:
275275
if not self.is_mapping_populated:
276276

277277
values = self._get_index_values()
278-
279278
self.mapping = self._make_hash_table(len(values))
280279
self.mapping.map_locations(values)
281280

@@ -545,5 +544,83 @@ cdef inline bint _is_utc(object tz):
545544
return tz is UTC or isinstance(tz, _du_utc)
546545

547546

547+
cdef class MultiIndexEngine(IndexEngine):
    """
    Index engine for MultiIndex whose mapping is a
    ``_hash.MultiIndexHashTable``.
    """

    def __init__(self, vgetter, n):
        super(MultiIndexEngine, self).__init__(vgetter, n)

        # regardless of size, lookups must go through our hashtable
        # rather than bin searching, so the size threshold is disabled
        self.over_size_threshold = False

    def _call_monotonic(self, object mi):
        """
        Return a 3-tuple read from the MultiIndex structure:
        its ``is_lexsorted``, a constant ``False``, and its ``is_unique``.
        """
        # NOTE(review): presumably consumed by IndexEngine as
        # (monotonic increasing, monotonic decreasing, unique) -- confirm
        cdef object structure = mi._to_multiindex_structure()
        return (structure.is_lexsorted, False, structure.is_unique)

    def get_backfill_indexer(self, other, limit=None):
        """Delegate to ``algos.backfill_object`` on the index values."""
        own_values = self._get_index_values()
        return algos.backfill_object(own_values, other, limit=limit)

    def get_pad_indexer(self, other, limit=None):
        """Delegate to ``algos.pad_object`` on the index values."""
        own_values = self._get_index_values()
        return algos.pad_object(own_values, other, limit=limit)

    cpdef get_loc(self, object val):
        """
        Locate ``val`` via the hash mapping.

        A non-unique engine defers to ``_get_loc_duplicates``. A
        ``TypeError`` raised by the mapping lookup is re-raised as
        ``KeyError(val)``.
        """
        self._ensure_mapping_populated()

        if self.unique:
            self._check_type(val)
            try:
                return self.mapping.get_item(val)
            except TypeError:
                raise KeyError(val)

        return self._get_loc_duplicates(val)

    cdef _make_hash_table(self, n):
        # the mapping type used by this engine
        return _hash.MultiIndexHashTable(n)

    cdef _check_type(self, object val):
        # hash() is called for its side effect of raising on unhashable keys
        hash(val)
        # bool and float scalars are never valid keys here
        if util.is_bool_object(val) or util.is_float_object(val):
            raise KeyError(val)

    cdef _maybe_get_bool_indexer(self, object val):
        """
        Scan the index values for entries equal to ``val``.

        Returns the single matching position when there is exactly one
        match, otherwise a boolean mask over all positions. Raises
        ``KeyError(val)`` when ``val`` is not an integer object or when
        nothing matches.
        """
        cdef:
            ndarray[uint8_t, cast=True] flags
            ndarray[object] values
            int n_hits = 0
            Py_ssize_t pos, n_values
            int hit_at

        # NOTE(review): this guard rejects every non-integer key; it looks
        # inherited from an integer engine -- confirm it is intended for
        # MultiIndex keys
        if not util.is_integer_object(val):
            raise KeyError(val)

        values = self._get_index_values()
        n_values = len(values)

        # start from an all-False mask and switch on the matching slots
        # through a uint8 view of the same buffer
        result = np.zeros(n_values, dtype=bool)
        flags = result.view(np.uint8)

        for pos in range(n_values):
            if values[pos] == val:
                flags[pos] = 1
                hit_at = pos
                n_hits += 1

        if n_hits == 0:
            raise KeyError(val)
        if n_hits == 1:
            return hit_at

        return result
624+
548625
# Generated from template.
549626
include "index_class_helper.pxi"

pandas/indexes/base.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -2404,7 +2404,8 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None):
24042404
'if index and target are monotonic' % method)
24052405

24062406
side = 'left' if method == 'pad' else 'right'
2407-
target = np.asarray(target)
2407+
##### TODO? remove
2408+
#target = np.asarray(target)
24082409

24092410
# find exact matches first (this simplifies the algorithm)
24102411
indexer = self.get_indexer(target)

pandas/indexes/multi.py

+42-22
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ class MultiIndex(Index):
7373
_levels = FrozenList()
7474
_labels = FrozenList()
7575
_comparables = ['names']
76+
_engine_type = _index.MultiIndexEngine
7677
rename = Index.set_names
7778

7879
def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
@@ -114,7 +115,6 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
114115
result._verify_integrity()
115116
if _set_identity:
116117
result._reset_identity()
117-
118118
return result
119119

120120
def _verify_integrity(self, labels=None, levels=None):
@@ -613,6 +613,43 @@ def _get_level_number(self, level):
613613

614614
_tuples = None
615615

616+
def _to_multiindex_structure(self):
617+
618+
# return a structure that has an efficient representation
619+
class _MultiIndexStructure(object):
620+
def __init__(self, mi):
621+
self.nlevels = mi.nlevels
622+
self.length = len(mi)
623+
self.obj = mi
624+
625+
def get_level_values(num):
626+
# just return the actual ndarray
627+
unique = mi.levels[num] # .values
628+
labels = mi.labels[num]
629+
filled = algos.take_1d(unique.values, labels,
630+
fill_value=unique._na_value)
631+
return filled
632+
633+
self.values = [get_level_values(level)
634+
for level in range(self.nlevels)]
635+
636+
def __len__(self):
637+
return self.length
638+
639+
@property
640+
def is_lexsorted(self):
641+
return self.obj.is_lexsorted()
642+
643+
@property
644+
def is_unique(self):
645+
return self.obj.is_unique
646+
647+
return _MultiIndexStructure(self)
648+
649+
@cache_readonly
def _engine(self):
    """
    Instantiate ``_engine_type`` with a zero-argument getter that returns
    this index, plus the index length.
    """
    def _index_getter():
        return self

    return self._engine_type(_index_getter, len(self))
652+
616653
@property
617654
def values(self):
618655
if self._tuples is not None:
@@ -1472,29 +1509,23 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
14721509
method = missing.clean_reindex_fill_method(method)
14731510
target = _ensure_index(target)
14741511

1475-
target_index = target
1476-
if isinstance(target, MultiIndex):
1477-
target_index = target._tuple_index
1478-
1479-
if not is_object_dtype(target_index.dtype):
1480-
return np.ones(len(target_index)) * -1
1512+
if not isinstance(target, MultiIndex):
1513+
target = MultiIndex.from_tuples(target)
14811514

14821515
if not self.is_unique:
14831516
raise Exception('Reindexing only valid with uniquely valued Index '
14841517
'objects')
14851518

1486-
self_index = self._tuple_index
1487-
14881519
if method == 'pad' or method == 'backfill':
14891520
if tolerance is not None:
14901521
raise NotImplementedError("tolerance not implemented yet "
14911522
'for MultiIndex')
1492-
indexer = self_index._get_fill_indexer(target, method, limit)
1523+
indexer = self._get_fill_indexer(target, method, limit)
14931524
elif method == 'nearest':
14941525
raise NotImplementedError("method='nearest' not implemented yet "
14951526
'for MultiIndex; see GitHub issue 9365')
14961527
else:
1497-
indexer = self_index._engine.get_indexer(target._values)
1528+
indexer = self._engine.get_indexer(target)
14981529

14991530
return _ensure_platform_int(indexer)
15001531

@@ -1561,17 +1592,6 @@ def reindex(self, target, method=None, level=None, limit=None,
15611592

15621593
return target, indexer
15631594

1564-
@cache_readonly
1565-
def _tuple_index(self):
1566-
"""
1567-
Convert MultiIndex to an Index of tuples
1568-
1569-
Returns
1570-
-------
1571-
index : Index
1572-
"""
1573-
return Index(self._values)
1574-
15751595
def get_slice_bound(self, label, side, kind):
15761596

15771597
if not isinstance(label, tuple):

0 commit comments

Comments
 (0)