Skip to content

Commit 706ace6

Browse files
committed
WIP: high memory usage in MultiIndex (MI)
xref pandas-dev#13904
1 parent 157dcd2 commit 706ace6

File tree

12 files changed: 313 additions (+313) and 48 deletions (-48)

asv_bench/benchmarks/indexing.py

+6 -1
Original file line number | Diff line number | Diff line change
@@ -88,7 +88,7 @@ def setup(self):
8888

8989
def time_getitem_scalar(self):
9090
self.ts[self.dt]
91-
91+
9292

9393
class DataFrameIndexing(object):
9494
goal_time = 0.2
@@ -189,6 +189,7 @@ def setup(self):
189189
self.eps_C = 5
190190
self.eps_D = 5000
191191
self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel()
192+
self.miint = MultiIndex.from_product([np.arange(1000), np.arange(1000)], names=['one','two'])
192193

193194
def time_series_xs_mi_ix(self):
194195
self.s.ix[999]
@@ -199,6 +200,10 @@ def time_frame_xs_mi_ix(self):
199200
def time_multiindex_slicers(self):
200201
self.mdt2.loc[self.idx[(self.test_A - self.eps_A):(self.test_A + self.eps_A), (self.test_B - self.eps_B):(self.test_B + self.eps_B), (self.test_C - self.eps_C):(self.test_C + self.eps_C), (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :]
201202

203+
def time_multiindex_get_indexer(self):
204+
self.miint.get_indexer(np.array([(0, 10), (0, 11), (0, 12), (0, 13), (0, 14),
205+
(0, 15), (0, 16),(0, 17), (0, 18), (0, 19)], dtype=object))
206+
202207

203208
class PanelIndexing(object):
204209
goal_time = 0.2

pandas/hashtable.pxd

+8
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,14 @@ cdef class PyObjectHashTable(HashTable):
3030
cpdef get_item(self, object val)
3131
cpdef set_item(self, object key, Py_ssize_t val)
3232

33+
cdef class MultiIndexHashTable(HashTable):
34+
cdef:
35+
kh_uint64_t *table
36+
int64_t nlevels
37+
38+
cpdef get_item(self, object val)
39+
cpdef set_item(self, object key, Py_ssize_t val)
40+
3341
cdef class StringHashTable(HashTable):
3442
cdef kh_str_t *table
3543

pandas/index.pyx

+73 -2
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# cython: profile=False
1+
# cython: profile=True
22

33
from numpy cimport ndarray
44

@@ -275,7 +275,6 @@ cdef class IndexEngine:
275275
if not self.is_mapping_populated:
276276

277277
values = self._get_index_values()
278-
279278
self.mapping = self._make_hash_table(len(values))
280279
self.mapping.map_locations(values)
281280

@@ -545,5 +544,77 @@ cdef inline bint _is_utc(object tz):
545544
return tz is UTC or isinstance(tz, _du_utc)
546545

547546

547+
cdef class MultiIndexEngine(IndexEngine):
548+
549+
def __init__(self, vgetter, n):
550+
super(MultiIndexEngine, self).__init__(vgetter, n)
551+
552+
# by definition we *always* want to use our hashtable
553+
# and not bin searching
554+
self.over_size_threshold = False
555+
556+
def _call_monotonic(self, object mi):
557+
cdef:
558+
object values
559+
560+
values = mi._to_multiindex_structure()
561+
return values.is_lexsorted, False, values.is_unique
562+
563+
def get_backfill_indexer(self, other, limit=None):
564+
return algos.backfill_object(self._get_index_values(),
565+
other, limit=limit)
566+
567+
def get_pad_indexer(self, other, limit=None):
568+
return algos.pad_object(self._get_index_values(),
569+
other, limit=limit)
570+
571+
cpdef get_loc(self, object val):
572+
if is_definitely_invalid_key(val):
573+
raise TypeError("'{val}' is an invalid key".format(val=val))
574+
575+
self._ensure_mapping_populated()
576+
if not self.unique:
577+
return self._get_loc_duplicates(val)
578+
579+
try:
580+
return self.mapping.get_item(val)
581+
except TypeError:
582+
raise KeyError(val)
583+
584+
cdef _make_hash_table(self, n):
585+
return _hash.MultiIndexHashTable(n)
586+
587+
cdef _maybe_get_bool_indexer(self, object val):
588+
cdef:
589+
ndarray[uint8_t, cast=True] indexer
590+
ndarray[object] values
591+
int count = 0
592+
Py_ssize_t i, n
593+
int last_true
594+
595+
if not util.is_integer_object(val):
596+
raise KeyError(val)
597+
598+
values = self._get_index_values()
599+
n = len(values)
600+
601+
result = np.empty(n, dtype=bool)
602+
indexer = result.view(np.uint8)
603+
604+
for i in range(n):
605+
if values[i] == val:
606+
count += 1
607+
indexer[i] = 1
608+
last_true = i
609+
else:
610+
indexer[i] = 0
611+
612+
if count == 0:
613+
raise KeyError(val)
614+
if count == 1:
615+
return last_true
616+
617+
return result
618+
548619
# Generated from template.
549620
include "index_class_helper.pxi"

pandas/indexes/base.py

-1
Original file line number | Diff line number | Diff line change
@@ -2404,7 +2404,6 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None):
24042404
'if index and target are monotonic' % method)
24052405

24062406
side = 'left' if method == 'pad' else 'right'
2407-
target = np.asarray(target)
24082407

24092408
# find exact matches first (this simplifies the algorithm)
24102409
indexer = self.get_indexer(target)

pandas/indexes/multi.py

+39 -24
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,6 @@
1414
from pandas.compat.numpy import function as nv
1515
from pandas import compat
1616

17-
1817
from pandas.types.common import (_ensure_int64,
1918
_ensure_platform_int,
2019
is_object_dtype,
@@ -73,6 +72,7 @@ class MultiIndex(Index):
7372
_levels = FrozenList()
7473
_labels = FrozenList()
7574
_comparables = ['names']
75+
_engine_type = _index.MultiIndexEngine
7676
rename = Index.set_names
7777

7878
def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
@@ -114,7 +114,6 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
114114
result._verify_integrity()
115115
if _set_identity:
116116
result._reset_identity()
117-
118117
return result
119118

120119
def _verify_integrity(self, labels=None, levels=None):
@@ -613,6 +612,38 @@ def _get_level_number(self, level):
613612

614613
_tuples = None
615614

615+
def _to_multiindex_structure(self):
616+
617+
# return a structure that has an efficient representation
618+
# of the hashed values of the multi-index
619+
# as well as properties for interfacing with index.pyx
620+
# and hashtable.pyx
621+
from pandas.tools.hashing import hash_pandas_object
622+
623+
class _MultiIndexStructure(object):
624+
def __init__(self, mi):
625+
self.nlevels = mi.nlevels
626+
self.length = len(mi)
627+
self.obj = mi
628+
self.values = hash_pandas_object(mi).values
629+
630+
def __len__(self):
631+
return self.length
632+
633+
@property
634+
def is_lexsorted(self):
635+
return self.obj.is_lexsorted()
636+
637+
@property
638+
def is_unique(self):
639+
return self.obj.is_unique
640+
641+
return _MultiIndexStructure(self)
642+
643+
@cache_readonly
644+
def _engine(self):
645+
return self._engine_type(lambda: self, len(self))
646+
616647
@property
617648
def values(self):
618649
if self._tuples is not None:
@@ -846,7 +877,8 @@ def to_frame(self, index=True):
846877
from pandas import DataFrame
847878
result = DataFrame({(name or level): self.get_level_values(level)
848879
for name, level in
849-
zip(self.names, range(len(self.levels)))})
880+
zip(self.names, range(len(self.levels)))},
881+
copy=False)
850882
if index:
851883
result.index = self
852884
return result
@@ -1472,29 +1504,23 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
14721504
method = missing.clean_reindex_fill_method(method)
14731505
target = _ensure_index(target)
14741506

1475-
target_index = target
1476-
if isinstance(target, MultiIndex):
1477-
target_index = target._tuple_index
1478-
1479-
if not is_object_dtype(target_index.dtype):
1480-
return np.ones(len(target_index)) * -1
1507+
if not isinstance(target, MultiIndex):
1508+
target = MultiIndex.from_tuples(target)
14811509

14821510
if not self.is_unique:
14831511
raise Exception('Reindexing only valid with uniquely valued Index '
14841512
'objects')
14851513

1486-
self_index = self._tuple_index
1487-
14881514
if method == 'pad' or method == 'backfill':
14891515
if tolerance is not None:
14901516
raise NotImplementedError("tolerance not implemented yet "
14911517
'for MultiIndex')
1492-
indexer = self_index._get_fill_indexer(target, method, limit)
1518+
indexer = self._get_fill_indexer(target, method, limit)
14931519
elif method == 'nearest':
14941520
raise NotImplementedError("method='nearest' not implemented yet "
14951521
'for MultiIndex; see GitHub issue 9365')
14961522
else:
1497-
indexer = self_index._engine.get_indexer(target._values)
1523+
indexer = self._engine.get_indexer(target)
14981524

14991525
return _ensure_platform_int(indexer)
15001526

@@ -1561,17 +1587,6 @@ def reindex(self, target, method=None, level=None, limit=None,
15611587

15621588
return target, indexer
15631589

1564-
@cache_readonly
1565-
def _tuple_index(self):
1566-
"""
1567-
Convert MultiIndex to an Index of tuples
1568-
1569-
Returns
1570-
-------
1571-
index : Index
1572-
"""
1573-
return Index(self._values)
1574-
15751590
def get_slice_bound(self, label, side, kind):
15761591

15771592
if not isinstance(label, tuple):

pandas/io/pytables.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -3789,9 +3789,9 @@ def read(self, where=None, columns=None, **kwargs):
37893789
lp = DataFrame(c.data, index=long_index, columns=c.values)
37903790

37913791
# need a better algorithm
3792-
tuple_index = long_index._tuple_index
3792+
tuple_index = long_index.values
37933793

3794-
unique_tuples = lib.fast_unique(tuple_index.values)
3794+
unique_tuples = lib.fast_unique(tuple_index)
37953795
unique_tuples = _asarray_tuplesafe(unique_tuples)
37963796

37973797
indexer = match(unique_tuples, tuple_index)

0 commit comments

Comments
 (0)