@@ -20,6 +20,7 @@ from hashtable cimport HashTable
20
20
from pandas._libs import algos, hashtable as _hash
21
21
from pandas._libs.tslibs import period as periodlib
22
22
from pandas._libs.tslib import Timestamp, Timedelta
23
+ from pandas._libs.missing import checknull
23
24
from datetime import datetime, timedelta, date
24
25
25
26
from cpython cimport PyTuple_Check, PyList_Check
@@ -583,6 +584,129 @@ cpdef convert_scalar(ndarray arr, object value):
583
584
return value
584
585
585
586
587
cdef class BaseMultiIndexCodesEngine:
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
    then their labels can be represented using respectively 2, 3 and 1 bits,
    as follows:
     _ _ _ _____ _ __ __ __
    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
    and the resulting unsigned integer representation will be:
     _ _ _ ____ _ __ __ __ __ __ __
    |0|0|0| ...|0|c0|b2|b1|b0|a1|a0|
     ‾ ‾ ‾ ‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

    Offsets are calculated at initialization, labels are transformed by method
    _codes_to_ints.

    Keys are located by first locating each component against the respective
    level, then locating (the integer representation of) codes.

    Notes
    -----
    The attribute ``_base`` and the method ``_codes_to_ints`` are used
    throughout but are not defined in this class; presumably the concrete
    subclasses named above provide them (to be confirmed against them).
    """
    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : sequence
            One entry per level; each entry must provide ``get_indexer`` and
            ``get_loc``, which are used to translate key components into
            codes.
        labels : sequence
            Per-level integer codes of the index entries, convertible to a
            2-dimensional int64 array (-1 marks a missing value).
        offsets : 1-dimensional array of dtype uint64
            Stored as ``self.offsets`` for use by ``_codes_to_ints``.
        """
        self.levels = levels
        self.offsets = offsets

        # Transform labels in a single array, and add 1 so that we are working
        # with positive integers (-1 for NaN becomes 0):
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64')

        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index (e.g. libindex.UInt64Engine) with
        # integers representing labels: we will use its get_loc and get_indexer
        self._base.__init__(self, lambda: lab_ints, len(lab_ints))

    def _extract_level_codes(self, object target, object method=None):
        """
        Map the requested list of (tuple) keys to their integer representations
        for searching in the underlying integer index.

        Parameters
        ----------
        target : iterable of keys
            Each key holds one component per level (e.g. a list of tuples);
            the components are regrouped per level with ``zip(*target)`` and
            looked up with the ``get_indexer`` of the respective level.
        method : object, default None
            Not used by this method; kept so that existing callers passing
            it keep working.

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64 or object
            Integers representing one combination each
        """
        # Shift by 1 so that components missing from a level (-1 returned by
        # get_indexer) map to 0, the same encoding used in __init__:
        level_codes = [lev.get_indexer(codes) + 1 for lev, codes
                       in zip(self.levels, zip(*target))]
        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

    def get_indexer(self, object target, object method=None,
                    object limit=None):
        """
        Compute, for each key in ``target``, its position in the index.

        Parameters
        ----------
        target : iterable of keys
            Keys to locate, as accepted by ``_extract_level_codes``.
        method : str or None, default None
            If None, ``self._base.get_indexer`` is used. Otherwise the method
            ``self._base.get_<method>_indexer`` is used (the comments below
            mention backfill and pad).
        limit : object, default None
            Forwarded to ``get_<method>_indexer``; ignored when ``method`` is
            None.

        Returns
        -------
        indexer : array
            Result of the underlying indexer, aligned with the order of the
            keys in ``target``.
        """
        lab_ints = self._extract_level_codes(target)

        # All methods (exact, backfill, pad) directly map to the respective
        # methods of the underlying (integers) index...
        if method is None:
            return self._base.get_indexer(self, lab_ints)

        # but underlying backfill and pad methods require index and keys
        # to be sorted. The index already is (checked in
        # Index._get_fill_indexer), sort (integer representations of) keys:
        order = np.argsort(lab_ints)
        fill_indexer = getattr(self._base, 'get_{}_indexer'.format(method))
        sorted_indexer = fill_indexer(self, lab_ints[order], limit=limit)

        # Restore the original order of the keys. sorted_indexer[i] is the
        # answer for the key originally at position order[i], so results must
        # be scattered back through ``order`` (the inverse permutation);
        # indexing with ``order`` again would permute them a second time.
        indexer = np.empty_like(sorted_indexer)
        indexer[order] = sorted_indexer
        return indexer

    def get_loc(self, object key):
        """
        Locate a single (tuple) key in the index.

        Parameters
        ----------
        key : tuple
            One component per level; null components (as detected by
            ``checknull``) are encoded as 0, i.e. as missing values.

        Returns
        -------
        loc : object
            Whatever ``self._base.get_loc`` returns for the integer
            representation of ``key``.

        Raises
        ------
        TypeError
            If ``is_definitely_invalid_key(key)`` is true.
        KeyError
            If ``key`` is not a tuple, or if one of its components is not
            found in the respective level.
        """
        if is_definitely_invalid_key(key):
            raise TypeError("'{key}' is an invalid key".format(key=key))
        if not PyTuple_Check(key):
            raise KeyError(key)
        try:
            indices = [0 if checknull(v) else lev.get_loc(v) + 1
                       for lev, v in zip(self.levels, key)]
        except KeyError:
            # Report the full key rather than the single missing component
            raise KeyError(key)

        # ndmin=2 because codes_to_ints expects multiple labels:
        indices = np.array(indices, ndmin=2, dtype='uint64')
        # ... and returns a (length 1, in this case) array of integers:
        lab_int = self._codes_to_ints(indices)[0]

        return self._base.get_loc(self, lab_int)

    def get_indexer_non_unique(self, object target):
        """
        Same as ``self._base.get_indexer_non_unique``, but applied to the
        integer representations of the keys in ``target``.
        """
        # This needs to be overridden just because the default one works on
        # target._values, and target can be itself a MultiIndex.

        lab_ints = self._extract_level_codes(target)
        indexer = self._base.get_indexer_non_unique(self, lab_ints)

        return indexer

    def __contains__(self, object val):
        """
        Return True if ``get_loc(val)`` succeeds, False if it raises
        KeyError, TypeError or ValueError.
        """
        # Default __contains__ looks in the underlying mapping, which in this
        # case only contains integer representations.
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False
707
+
708
+
709
+
586
710
cdef class MultiIndexObjectEngine(ObjectEngine):
587
711
"""
588
712
provide the same interface as the MultiIndexEngine
0 commit comments