Skip to content

Commit 605a5f4

Browse files
committed
WIP: high memory in MI
xref pandas-dev#13904
1 parent 5322245 commit 605a5f4

File tree

4 files changed

+222
-3
lines changed

4 files changed

+222
-3
lines changed

pandas/hashtable.pxd

+10
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from numpy cimport ndarray
12
from khash cimport (kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t,
23
kh_str_t, uint64_t, int64_t, float64_t)
34

@@ -30,6 +31,15 @@ cdef class PyObjectHashTable(HashTable):
3031
cpdef get_item(self, object val)
3132
cpdef set_item(self, object key, Py_ssize_t val)
3233

34+
cdef class MultiIndexHashTable(HashTable):
    # Declaration of the hash table used for MultiIndex lookups; the
    # method bodies live in pandas/src/hashtable_class_helper.pxi.in.

    # khash table keyed on int64 values
    cdef kh_int64_t *table
    # Python object attribute; starts out as None in __init__
    cdef object values
    # level count; starts out as 0 in __init__
    cdef int64_t nlevels

    cpdef get_item(self, object val)
    cpdef set_item(self, object key, Py_ssize_t val)
42+
3343
cdef class StringHashTable(HashTable):
3444
cdef kh_str_t *table
3545

pandas/index.pyx

+56-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# cython: profile=False
1+
# cython: profile=True
22

33
from numpy cimport ndarray
44

@@ -275,7 +275,6 @@ cdef class IndexEngine:
275275
if not self.is_mapping_populated:
276276

277277
values = self._get_index_values()
278-
279278
self.mapping = self._make_hash_table(len(values))
280279
self.mapping.map_locations(values)
281280

@@ -545,5 +544,60 @@ cdef inline bint _is_utc(object tz):
545544
return tz is UTC or isinstance(tz, _du_utc)
546545

547546

547+
cdef class MultiIndexEngine(IndexEngine):
    """
    IndexEngine subclass whose hash table is a ``MultiIndexHashTable``.

    Monotonicity, pad and backfill are delegated to the object-dtype
    routines in ``algos``.
    """

    def _call_monotonic(self, values):
        """Return ``algos.is_monotonic_object(values, timelike=False)``."""
        return algos.is_monotonic_object(values, timelike=False)

    def get_backfill_indexer(self, other, limit=None):
        """Backfill indexer of ``other`` against this engine's values."""
        return algos.backfill_object(self._get_index_values(),
                                     other, limit=limit)

    def get_pad_indexer(self, other, limit=None):
        """Pad (forward-fill) indexer of ``other`` against this engine's
        values."""
        return algos.pad_object(self._get_index_values(),
                                other, limit=limit)

    cdef _make_hash_table(self, n):
        """Create the hash table, passing ``n`` as its size hint."""
        return _hash.MultiIndexHashTable(n)

    cdef _check_type(self, object val):
        """
        Validate a lookup key.

        Raises TypeError (from ``hash``) for unhashable keys and KeyError
        for bool or float keys.
        """
        # hash() is called only for its side effect of rejecting
        # unhashable keys.
        hash(val)
        if util.is_bool_object(val) or util.is_float_object(val):
            raise KeyError(val)

    cdef _maybe_get_bool_indexer(self, object val):
        """
        Locate ``val`` by a linear scan over the index values.

        Returns the single matching position when exactly one element
        equals ``val``, otherwise a boolean mask with True at every match.

        Raises KeyError when ``val`` is not an integer object or when
        nothing matches.
        """
        cdef:
            ndarray[uint8_t, cast=True] indexer
            ndarray[object] values
            # Positions and counts are Py_ssize_t so they cannot be
            # truncated the way a C ``int`` would be on very large indexes.
            Py_ssize_t count = 0
            Py_ssize_t i, n
            Py_ssize_t last_true = -1

        # NOTE(review): only integer keys get past this guard; confirm this
        # is intended for an engine whose keys are presumably tuples.
        if not util.is_integer_object(val):
            raise KeyError(val)

        # NOTE(review): MultiIndexHashTable.map_locations reads ``.levels``
        # from the object it is given, while this method types the engine
        # values as ndarray[object] -- confirm what _get_index_values()
        # returns for this engine.
        values = self._get_index_values()
        n = len(values)

        # ``indexer`` is a uint8 view over the same memory as ``result``,
        # so writing 0/1 through it fills the boolean mask.
        result = np.empty(n, dtype=bool)
        indexer = result.view(np.uint8)

        for i in range(n):
            if values[i] == val:
                count += 1
                indexer[i] = 1
                last_true = i
            else:
                indexer[i] = 0

        if count == 0:
            raise KeyError(val)
        if count == 1:
            return last_true

        return result
601+
548602
# Generated from template.
549603
include "index_class_helper.pxi"

pandas/indexes/multi.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ class MultiIndex(Index):
7373
_levels = FrozenList()
7474
_labels = FrozenList()
7575
_comparables = ['names']
76+
_engine_type = _index.MultiIndexEngine
7677
rename = Index.set_names
7778

7879
def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
@@ -114,7 +115,6 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
114115
result._verify_integrity()
115116
if _set_identity:
116117
result._reset_identity()
117-
118118
return result
119119

120120
def _verify_integrity(self, labels=None, levels=None):
@@ -613,8 +613,13 @@ def _get_level_number(self, level):
613613

614614
_tuples = None
615615

616+
@cache_readonly
def _engine(self):
    """
    Build the lookup engine for this index.

    The engine class is taken from ``self._engine_type`` and is handed a
    zero-argument callable that returns this index, plus the index length.
    """
    engine_cls = self._engine_type

    def _get_self():
        return self

    size = len(self)
    return engine_cls(_get_self, size)
619+
616620
@property
617621
def values(self):
622+
print("values called")
618623
if self._tuples is not None:
619624
return self._tuples
620625

pandas/src/hashtable_class_helper.pxi.in

+150
Original file line numberDiff line numberDiff line change
@@ -843,3 +843,153 @@ cdef class PyObjectHashTable(HashTable):
843843
count += 1
844844

845845
return np.asarray(labels)
846+
847+
848+
cdef class MultiIndexHashTable(HashTable):
    """
    Hash table that stores the int64 ``hash()`` of each key instead of the
    key itself, mapping it to an integer position.

    ``map_locations`` fills the table from a MultiIndex by hashing the
    tuple of per-level values of every row.

    NOTE(review): only hashes are stored and lookups never compare the
    original keys, so two distinct keys with equal ``hash()`` are treated
    as the same entry.
    """

    def __init__(self, size_hint=1):
        self.table = kh_init_int64()
        self.values = None
        self.nlevels = 0
        kh_resize_int64(self.table, size_hint)

    def __dealloc__(self):
        # Only release C data here; Python attributes must not be touched
        # during deallocation, and Cython drops ``values`` by itself.
        if self.table is not NULL:
            kh_destroy_int64(self.table)
            self.table = NULL

    def __len__(self):
        return self.table.size

    def __contains__(self, object key):
        """Return whether the hash of ``key`` is present in the table."""
        cdef khiter_t k
        # raises TypeError early for unhashable keys
        hash(key)
        if key != key or key is None:
            key = na_sentinel
        k = kh_get_int64(self.table, hash(key))
        return k != self.table.n_buckets

    cpdef get_item(self, object val):
        """
        Look up ``val`` by its hash.

        Returns the tuple of per-level values at the stored position.
        Raises KeyError when the hash is not in the table.

        NOTE(review): this returns the stored row's values, not its
        position; confirm that is what the callers of ``get_item`` expect.
        """
        cdef:
            khiter_t k
            Py_ssize_t level
            int64_t i

        if val != val or val is None:
            val = na_sentinel
        k = kh_get_int64(self.table, hash(val))
        if k != self.table.n_buckets:
            i = self.table.vals[k]
            return tuple([self.values[level][i]
                          for level in range(self.nlevels)])
        else:
            raise KeyError(val)

    def get_iter_test(self, object key, Py_ssize_t iterations):
        """Repeat the same table lookup ``iterations`` times; returns
        nothing."""
        cdef:
            Py_ssize_t i, val
            khiter_t k

        if key != key or key is None:
            key = na_sentinel
        for i in range(iterations):
            k = kh_get_int64(self.table, hash(key))
            if k != self.table.n_buckets:
                val = self.table.vals[k]

    cpdef set_item(self, object key, Py_ssize_t val):
        raise NotImplementedError

    def map_locations(self, object mi):
        """
        Populate the table from ``mi``.

        ``mi`` must provide ``levels`` and ``get_level_values(level)``.
        For every row, the tuple of its per-level values is hashed and the
        hash is mapped to the row position. A later row with the same hash
        overwrites the position stored for an earlier one.
        """
        cdef:
            Py_ssize_t i, n, level
            int64_t val
            int ret = 0
            khiter_t k

        # keep one array of values per level; rows are rebuilt as tuples
        # from these on demand
        self.nlevels = len(mi.levels)
        self.values = [mi.get_level_values(level).values
                       for level in range(self.nlevels)]

        n = len(self.values[0])
        for i in range(n):
            val = hash(tuple([self.values[level][i]
                              for level in range(self.nlevels)]))
            k = kh_put_int64(self.table, val, &ret)
            self.table.vals[k] = i

    def lookup(self, ndarray[object] values):
        """
        Return an int64 array holding the stored position of each element
        of ``values``, or -1 where its hash is not in the table.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            int64_t val
            khiter_t k
            object key
            int64_t[:] locs = np.empty(n, dtype=np.int64)

        for i in range(n):
            key = values[i]
            # The null test must look at the key, not at its integer hash
            # (which is never NaN or None); same handling as get_item.
            if key != key or key is None:
                key = na_sentinel
            val = hash(key)

            k = kh_get_int64(self.table, val)
            if k != self.table.n_buckets:
                locs[i] = self.table.vals[k]
            else:
                locs[i] = -1

        return np.asarray(locs)

    def unique(self, ndarray[object] values):
        """
        Return the elements of ``values`` whose hash was not already in
        the table, in order of first appearance, with all NaN-like
        elements collapsed into a single ``nan``.

        The hashes of the returned elements are inserted into the table.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            int64_t val
            khiter_t k
            object key
            ObjectVector uniques = ObjectVector()
            bint seen_na = 0

        for i in range(n):
            key = values[i]
            val = hash(key)

            # test the key for NaN (not its hash) and collect the key
            # itself (not its hash)
            if not _checknan(key):
                k = kh_get_int64(self.table, val)
                if k == self.table.n_buckets:
                    kh_put_int64(self.table, val, &ret)
                    uniques.append(key)
            elif not seen_na:
                seen_na = 1
                uniques.append(nan)

        return uniques.to_array()

    def get_labels(self, ndarray[object] values, ObjectVector uniques,
                   Py_ssize_t count_prior, int64_t na_sentinel,
                   bint check_null=True):
        """
        Factorize ``values`` against the table.

        Returns an int64 array of labels. An element whose hash is already
        in the table gets the stored value; a new element is appended to
        ``uniques`` and gets the next label, counting up from
        ``count_prior``. Null elements get ``na_sentinel``.
        """
        cdef:
            Py_ssize_t i, n = len(values)
            int64_t[:] labels
            Py_ssize_t idx, count = count_prior
            int ret = 0
            int64_t val
            khiter_t k
            object key

        labels = np.empty(n, dtype=np.int64)

        for i in range(n):
            key = values[i]
            val = hash(key)

            # Null test on the key rather than on its hash. Grouping is
            # unchanged from the original expression: None is always
            # treated as null, NaN only when check_null is set.
            if (check_null and key != key) or key is None:
                labels[i] = na_sentinel
                continue

            k = kh_get_int64(self.table, val)
            if k != self.table.n_buckets:
                idx = self.table.vals[k]
                labels[i] = idx
            else:
                k = kh_put_int64(self.table, val, &ret)
                self.table.vals[k] = count
                # collect the original value, not its hash
                uniques.append(key)
                labels[i] = count
                count += 1

        return np.asarray(labels)

0 commit comments

Comments
 (0)