@@ -26,6 +26,7 @@ from hashtable cimport HashTable
26
26
from pandas._libs import algos, hashtable as _hash
27
27
from pandas._libs.tslibs import period as periodlib
28
28
from pandas._libs.tslib import Timestamp, Timedelta
29
+ from pandas._libs.missing import checknull
29
30
30
31
# Module-level int64 constant holding whatever util.get_nat() returns
# (presumably the integer sentinel used to represent NaT -- confirm in util).
cdef int64_t iNaT = util.get_nat()
31
32
@@ -585,6 +586,141 @@ cpdef convert_scalar(ndarray arr, object value):
585
586
return value
586
587
587
588
589
cdef class BaseMultiIndexCodesEngine:
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
    then their labels can be represented using respectively 2, 3 and 1 bits,
    as follows:
     _ _ _ _____ _ __ __ __
    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
    and the resulting unsigned integer representation will be:
     _ _ _ _____ _ __ __ __ __ __ __
    |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

    Offsets are calculated at initialization, labels are transformed by method
    _codes_to_ints.

    Keys are located by first locating each component against the respective
    level, then locating (the integer representation of) codes.

    Notes
    -----
    This class does not define ``_base`` or ``_codes_to_ints`` itself; both
    are used throughout and are presumably supplied by the concrete
    subclasses named above (to be confirmed against their definitions).
    """

    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : list-like of numpy arrays
            Levels of the MultiIndex
        labels : list-like of numpy arrays of integer dtype
            Labels of the MultiIndex
        offsets : numpy array of uint64 dtype
            Pre-calculated offsets, one for each level of the index
        """

        self.levels = levels
        self.offsets = offsets

        # Transform labels in a single array, and add 1 so that we are working
        # with positive integers (-1 for NaN becomes 0):
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
                                                               copy=False)

        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index (e.g. libindex.UInt64Engine) with
        # integers representing labels: we will use its get_loc and get_indexer
        self._base.__init__(self, lambda: lab_ints, len(lab_ints))

    def _extract_level_codes(self, object target, object method=None):
        """
        Map the requested list of (tuple) keys to their integer representations
        for searching in the underlying integer index.

        Parameters
        ----------
        target : list-like of keys
            Each key is a tuple, with a label for each level of the index.
        method : object, optional
            Currently unused; kept so existing callers passing it keep working.

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64 or object
            Integers representing one combination each
        """

        # zip(*target) regroups the keys per level; each level's get_indexer
        # yields -1 for labels it does not contain, so adding 1 makes "missing"
        # become 0, consistently with the shift applied in __init__:
        level_codes = [lev.get_indexer(codes) + 1 for lev, codes
                       in zip(self.levels, zip(*target))]
        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

    def get_indexer(self, object target, object method=None,
                    object limit=None):
        """
        Compute an indexer for ``target`` against the integer representation
        of the index.

        Parameters
        ----------
        target : list-like of keys
            Each key is a tuple, with a label for each level of the index.
        method : str, optional
            When not None, the lookup is delegated to
            ``self._base.get_<method>_indexer`` (e.g. "backfill", "pad");
            when None, to ``self._base.get_indexer``.
        limit : object, optional
            Forwarded as ``limit=`` to the underlying method; only used when
            ``method`` is not None.

        Returns
        -------
        indexer
            The result of the underlying lookup, with one entry per key, in
            the same order as the keys in ``target``.
        """
        lab_ints = self._extract_level_codes(target)

        # All methods (exact, backfill, pad) directly map to the respective
        # methods of the underlying (integers) index...
        if method is not None:
            # but underlying backfill and pad methods require index and keys
            # to be sorted. The index already is (checked in
            # Index._get_fill_indexer), sort (integer representations of) keys:
            order = np.argsort(lab_ints)
            lab_ints = lab_ints[order]
            indexer = (getattr(self._base, 'get_{}_indexer'.format(method))
                       (self, lab_ints, limit=limit))
            # indexer[i] is the result for the key originally at position
            # order[i]. Restoring the caller's order therefore requires the
            # *inverse* permutation; indexing with ``order`` again would apply
            # the sort twice and misplace results whenever ``order`` is not
            # its own inverse.
            indexer = indexer[np.argsort(order)]
        else:
            indexer = self._base.get_indexer(self, lab_ints)

        return indexer

    def get_loc(self, object key):
        """
        Locate a single (tuple) key in the index.

        Parameters
        ----------
        key : tuple
            One label for each level of the index. Components for which
            ``checknull`` is true are encoded as 0, i.e. the code reserved
            for missing values.

        Returns
        -------
        loc
            Whatever ``self._base.get_loc`` returns for the integer
            representation of ``key``.

        Raises
        ------
        TypeError
            If ``is_definitely_invalid_key(key)`` is true.
        KeyError
            If ``key`` is not a tuple, or if any of its components is not
            found in the corresponding level.
        """
        if is_definitely_invalid_key(key):
            raise TypeError("'{key}' is an invalid key".format(key=key))
        if not PyTuple_Check(key):
            raise KeyError(key)
        try:
            # Shift by 1 so that codes match those built in __init__:
            indices = [0 if checknull(v) else lev.get_loc(v) + 1
                       for lev, v in zip(self.levels, key)]
        except KeyError:
            # Report the full key rather than the single failing component:
            raise KeyError(key)

        # ndmin=2 because codes_to_ints expects multiple labels:
        indices = np.array(indices, ndmin=2, dtype='uint64')
        # ... and returns a (length 1, in this case) array of integers:
        lab_int = self._codes_to_ints(indices)[0]

        return self._base.get_loc(self, lab_int)

    def get_indexer_non_unique(self, object target):
        """
        Compute a non-unique indexer for ``target``.

        Parameters
        ----------
        target : list-like of keys
            Each key is a tuple, with a label for each level of the index.

        Returns
        -------
        indexer
            Whatever ``self._base.get_indexer_non_unique`` returns for the
            integer representations of the keys.
        """
        # This needs to be overridden just because the default one works on
        # target._values, and target can be itself a MultiIndex.

        lab_ints = self._extract_level_codes(target)
        indexer = self._base.get_indexer_non_unique(self, lab_ints)

        return indexer

    def __contains__(self, object val):
        """
        Return True if ``get_loc(val)`` succeeds, and False if it raises
        KeyError, TypeError or ValueError.
        """
        # Default __contains__ looks in the underlying mapping, which in this
        # case only contains integer representations.
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False
721
+
722
+
723
+
588
724
cdef class MultiIndexObjectEngine(ObjectEngine):
589
725
"""
590
726
provide the same interface as the MultiIndexEngine
0 commit comments