Skip to content

Commit 8cbee35

Browse files
toobaz authored and jorisvandenbossche committed
REF: codes-based MultiIndex engine (#19074)
1 parent 24d9509 commit 8cbee35

File tree

7 files changed

+303
-251
lines changed

7 files changed

+303
-251
lines changed

doc/source/whatsnew/v0.23.0.txt

+6-1
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,7 @@ Performance Improvements
380380
- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`)
381381
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
382382
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
383+
- Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`)
383384

384385

385386
.. _whatsnew_0230.docs:
@@ -477,7 +478,11 @@ MultiIndex
477478
- Bug in :func:`MultiIndex.get_level_values` which would return an invalid index on level of ints with missing values (:issue:`17924`)
478479
- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`)
479480
- Bug in :func:`MultiIndex.from_tuples` which would fail to take zipped tuples in Python 3 (:issue:`18434`)
480-
-
481+
- Bug in :func:`MultiIndex.get_loc` which would fail to automatically cast values between float and int (:issue:`18818`, :issue:`15994`)
482+
- Bug in :func:`MultiIndex.get_loc` which would cast boolean to integer labels (:issue:`19086`)
483+
- Bug in :func:`MultiIndex.get_loc` which would fail to locate keys containing ``NaN`` (:issue:`18485`)
484+
- Bug in :func:`MultiIndex.get_loc` which, in a large :class:`MultiIndex`, would fail when levels had different dtypes (:issue:`18520`)
485+
481486

482487
I/O
483488
^^^

pandas/_libs/hashtable.pxd

-9
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,6 @@ cdef class PyObjectHashTable(HashTable):
3131
cpdef get_item(self, object val)
3232
cpdef set_item(self, object key, Py_ssize_t val)
3333

34-
cdef class MultiIndexHashTable(HashTable):
35-
cdef:
36-
kh_uint64_t *table
37-
object mi
38-
39-
cpdef get_item(self, object val)
40-
cpdef set_item(self, object key, Py_ssize_t val)
41-
cdef inline void _check_for_collision(self, Py_ssize_t loc, object label)
42-
4334

4435
cdef class StringHashTable(HashTable):
4536
cdef kh_str_t *table

pandas/_libs/hashtable_class_helper.pxi.in

-136
Original file line numberDiff line numberDiff line change
@@ -899,139 +899,3 @@ cdef class PyObjectHashTable(HashTable):
899899
count += 1
900900

901901
return np.asarray(labels)
902-
903-
904-
cdef class MultiIndexHashTable(HashTable):
905-
906-
def __init__(self, size_hint=1):
907-
self.table = kh_init_uint64()
908-
self.mi = None
909-
kh_resize_uint64(self.table, size_hint)
910-
911-
def __dealloc__(self):
912-
if self.table is not NULL:
913-
kh_destroy_uint64(self.table)
914-
self.table = NULL
915-
916-
def __len__(self):
917-
return self.table.size
918-
919-
def sizeof(self, deep=False):
920-
""" return the size of my table in bytes """
921-
return self.table.n_buckets * (sizeof(uint64_t) + # keys
922-
sizeof(size_t) + # vals
923-
sizeof(uint32_t)) # flags
924-
925-
def _check_for_collisions(self, int64_t[:] locs, object mi):
926-
# validate that the locs map to the actual values
927-
# provided in the mi
928-
# we can only check if we *don't* have any missing values
929-
# :<
930-
cdef:
931-
ndarray[int64_t] alocs
932-
933-
alocs = np.asarray(locs)
934-
if (alocs != -1).all():
935-
936-
result = self.mi.take(locs)
937-
if isinstance(mi, tuple):
938-
from pandas import Index
939-
mi = Index([mi])
940-
if not result.equals(mi):
941-
raise AssertionError(
942-
"hash collision\nlocs:\n{}\n"
943-
"result:\n{}\nmi:\n{}".format(alocs, result, mi))
944-
945-
cdef inline void _check_for_collision(self, Py_ssize_t loc, object label):
946-
# validate that the loc maps to the actual value
947-
# version of _check_for_collisions above for single label (tuple)
948-
949-
result = self.mi[loc]
950-
951-
if not all(l == r or (is_null_datetimelike(l)
952-
and is_null_datetimelike(r))
953-
for l, r in zip(result, label)):
954-
raise AssertionError(
955-
"hash collision\nloc:\n{}\n"
956-
"result:\n{}\nmi:\n{}".format(loc, result, label))
957-
958-
def __contains__(self, object key):
959-
try:
960-
self.get_item(key)
961-
return True
962-
except (KeyError, ValueError, TypeError):
963-
return False
964-
965-
cpdef get_item(self, object key):
966-
cdef:
967-
khiter_t k
968-
uint64_t value
969-
int64_t[:] locs
970-
Py_ssize_t loc
971-
972-
value = self.mi._hashed_indexing_key(key)
973-
k = kh_get_uint64(self.table, value)
974-
if k != self.table.n_buckets:
975-
loc = self.table.vals[k]
976-
self._check_for_collision(loc, key)
977-
return loc
978-
else:
979-
raise KeyError(key)
980-
981-
cpdef set_item(self, object key, Py_ssize_t val):
982-
raise NotImplementedError
983-
984-
@cython.boundscheck(False)
985-
def map_locations(self, object mi):
986-
cdef:
987-
Py_ssize_t i, n
988-
ndarray[uint64_t] values
989-
uint64_t val
990-
int ret = 0
991-
khiter_t k
992-
993-
self.mi = mi
994-
n = len(mi)
995-
values = mi._hashed_values
996-
997-
with nogil:
998-
for i in range(n):
999-
val = values[i]
1000-
k = kh_put_uint64(self.table, val, &ret)
1001-
self.table.vals[k] = i
1002-
1003-
@cython.boundscheck(False)
1004-
def lookup(self, object mi):
1005-
# look up with a target mi
1006-
cdef:
1007-
Py_ssize_t i, n
1008-
ndarray[uint64_t] values
1009-
int ret = 0
1010-
uint64_t val
1011-
khiter_t k
1012-
int64_t[:] locs
1013-
1014-
n = len(mi)
1015-
values = mi._hashed_values
1016-
1017-
locs = np.empty(n, dtype=np.int64)
1018-
1019-
with nogil:
1020-
for i in range(n):
1021-
val = values[i]
1022-
k = kh_get_uint64(self.table, val)
1023-
if k != self.table.n_buckets:
1024-
locs[i] = self.table.vals[k]
1025-
else:
1026-
locs[i] = -1
1027-
1028-
self._check_for_collisions(locs, mi)
1029-
return np.asarray(locs)
1030-
1031-
def unique(self, object mi):
1032-
raise NotImplementedError
1033-
1034-
def get_labels(self, object mi, ObjectVector uniques,
1035-
Py_ssize_t count_prior, int64_t na_sentinel,
1036-
bint check_null=True):
1037-
raise NotImplementedError

pandas/_libs/index.pyx

+118-50
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,12 @@ from hashtable cimport HashTable
2626
from pandas._libs import algos, hashtable as _hash
2727
from pandas._libs.tslibs import period as periodlib
2828
from pandas._libs.tslib import Timestamp, Timedelta
29+
from pandas._libs.missing import checknull
2930

3031
cdef int64_t iNaT = util.get_nat()
3132

3233

33-
cdef inline is_definitely_invalid_key(object val):
34+
cdef inline bint is_definitely_invalid_key(object val):
3435
if PyTuple_Check(val):
3536
try:
3637
hash(val)
@@ -585,70 +586,137 @@ cpdef convert_scalar(ndarray arr, object value):
585586
return value
586587

587588

588-
cdef class MultiIndexObjectEngine(ObjectEngine):
589+
cdef class BaseMultiIndexCodesEngine:
589590
"""
590-
provide the same interface as the MultiIndexEngine
591-
but use the IndexEngine for computation
592-
593-
This provides good performance with smaller MI's
591+
Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
592+
represent each label in a MultiIndex as an integer, by juxtaposing the bits
593+
encoding each level, with appropriate offsets.
594+
595+
For instance: if 3 levels have respectively 3, 6 and 1 possible values,
596+
then their labels can be represented using respectively 2, 3 and 1 bits,
597+
as follows:
598+
_ _ _ _____ _ __ __ __
599+
|0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
600+
— — — ————— — —— —— ——
601+
|0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
602+
— — — ————— — —— —— ——
603+
|0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
604+
‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
605+
and the resulting unsigned integer representation will be:
606+
_ _ _ _____ _ __ __ __ __ __ __
607+
|0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
608+
‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾
609+
610+
Offsets are calculated at initialization, labels are transformed by method
611+
_codes_to_ints.
612+
613+
Keys are located by first locating each component against the respective
614+
level, then locating (the integer representation of) codes.
594615
"""
595-
def get_indexer(self, values):
596-
# convert a MI to an ndarray
597-
if hasattr(values, 'values'):
598-
values = values.values
599-
return super(MultiIndexObjectEngine, self).get_indexer(values)
616+
def __init__(self, object levels, object labels,
617+
ndarray[uint64_t, ndim=1] offsets):
618+
"""
619+
Parameters
620+
----------
621+
levels : list-like of numpy arrays
622+
Levels of the MultiIndex
623+
labels : list-like of numpy arrays of integer dtype
624+
Labels of the MultiIndex
625+
offsets : numpy array of uint64 dtype
626+
Pre-calculated offsets, one for each level of the index
627+
"""
600628

601-
cpdef get_loc(self, object val):
629+
self.levels = levels
630+
self.offsets = offsets
602631

603-
# convert a MI to an ndarray
604-
if hasattr(val, 'values'):
605-
val = val.values
606-
return super(MultiIndexObjectEngine, self).get_loc(val)
632+
# Transform labels into a single array, and add 1 so that we are working
633+
# with positive integers (-1 for NaN becomes 0):
634+
codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
635+
copy=False)
607636

637+
# Map each codes combination in the index to an integer unambiguously
638+
# (no collisions possible), based on the "offsets", which describe the
639+
# number of bits to shift labels for each level:
640+
lab_ints = self._codes_to_ints(codes)
608641

609-
cdef class MultiIndexHashEngine(ObjectEngine):
610-
"""
611-
Use a hashing based MultiIndex impl
612-
but use the IndexEngine for computation
642+
# Initialize underlying index (e.g. libindex.UInt64Engine) with
643+
# integers representing labels: we will use its get_loc and get_indexer
644+
self._base.__init__(self, lambda: lab_ints, len(lab_ints))
613645

614-
This provides good performance with larger MI's
615-
"""
646+
def _extract_level_codes(self, object target, object method=None):
647+
"""
648+
Map the requested list of (tuple) keys to their integer representations
649+
for searching in the underlying integer index.
650+
651+
Parameters
652+
----------
653+
target : list-like of keys
654+
Each key is a tuple, with a label for each level of the index.
655+
656+
Returns
657+
-------
658+
int_keys : 1-dimensional array of dtype uint64 or object
659+
Integers representing one combination each
660+
"""
616661

617-
def _call_monotonic(self, object mi):
618-
# defer these back to the mi itself
619-
return (mi.is_monotonic_increasing,
620-
mi.is_monotonic_decreasing,
621-
mi.is_unique)
662+
level_codes = [lev.get_indexer(codes) + 1 for lev, codes
663+
in zip(self.levels, zip(*target))]
664+
return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
665+
666+
def get_indexer(self, object target, object method=None,
667+
object limit=None):
668+
lab_ints = self._extract_level_codes(target)
669+
670+
# All methods (exact, backfill, pad) directly map to the respective
671+
# methods of the underlying (integers) index...
672+
if method is not None:
673+
# but underlying backfill and pad methods require index and keys
674+
# to be sorted. The index already is (checked in
675+
# Index._get_fill_indexer), sort (integer representations of) keys:
676+
order = np.argsort(lab_ints)
677+
lab_ints = lab_ints[order]
678+
indexer = (getattr(self._base, 'get_{}_indexer'.format(method))
679+
(self, lab_ints, limit=limit))
680+
indexer = indexer[order]
681+
else:
682+
indexer = self._base.get_indexer(self, lab_ints)
622683

623-
def get_backfill_indexer(self, other, limit=None):
624-
# we coerce to ndarray-of-tuples
625-
values = np.array(self._get_index_values())
626-
return algos.backfill_object(values, other, limit=limit)
684+
return indexer
627685

628-
def get_pad_indexer(self, other, limit=None):
629-
# we coerce to ndarray-of-tuples
630-
values = np.array(self._get_index_values())
631-
return algos.pad_object(values, other, limit=limit)
686+
def get_loc(self, object key):
687+
if is_definitely_invalid_key(key):
688+
raise TypeError("'{key}' is an invalid key".format(key=key))
689+
if not PyTuple_Check(key):
690+
raise KeyError(key)
691+
try:
692+
indices = [0 if checknull(v) else lev.get_loc(v) + 1
693+
for lev, v in zip(self.levels, key)]
694+
except KeyError:
695+
raise KeyError(key)
632696

633-
cpdef get_loc(self, object val):
634-
if is_definitely_invalid_key(val):
635-
raise TypeError("'{val}' is an invalid key".format(val=val))
697+
# Transform indices into single integer:
698+
lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))
636699

637-
self._ensure_mapping_populated()
638-
if not self.unique:
639-
return self._get_loc_duplicates(val)
700+
return self._base.get_loc(self, lab_int)
640701

641-
try:
642-
return self.mapping.get_item(val)
643-
except TypeError:
644-
raise KeyError(val)
702+
def get_indexer_non_unique(self, object target):
703+
# This needs to be overridden just because the default one works on
704+
# target._values, and target can be itself a MultiIndex.
645705

646-
def get_indexer(self, values):
647-
self._ensure_mapping_populated()
648-
return self.mapping.lookup(values)
706+
lab_ints = self._extract_level_codes(target)
707+
indexer = self._base.get_indexer_non_unique(self, lab_ints)
708+
709+
return indexer
710+
711+
def __contains__(self, object val):
712+
# Default __contains__ looks in the underlying mapping, which in this
713+
# case only contains integer representations.
714+
try:
715+
self.get_loc(val)
716+
return True
717+
except (KeyError, TypeError, ValueError):
718+
return False
649719

650-
cdef _make_hash_table(self, n):
651-
return _hash.MultiIndexHashTable(n)
652720

653721
# Generated from template.
654722
include "index_class_helper.pxi"

0 commit comments

Comments
 (0)