@@ -20,6 +20,7 @@ from hashtable cimport HashTable
20
20
from pandas._libs import algos, hashtable as _hash
21
21
from pandas._libs.tslibs import period as periodlib
22
22
from pandas._libs.tslib import Timestamp, Timedelta
23
+ from pandas._libs.missing import checknull
23
24
from datetime import datetime, timedelta, date
24
25
25
26
from cpython cimport PyTuple_Check, PyList_Check
@@ -583,6 +584,141 @@ cpdef convert_scalar(ndarray arr, object value):
583
584
return value
584
585
585
586
587
cdef class BaseMultiIndexCodesEngine:
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
    then their labels can be represented using respectively 2, 3 and 1 bits,
    as follows:
     _ _ _ _____ _ __ __ __
    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
    and the resulting unsigned integer representation will be:
     _ _ _ _____ _ __ __ __ __ __ __
    |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

    Offsets are calculated at initialization, labels are transformed by method
    _codes_to_ints.

    Keys are located by first locating each component against the respective
    level, then locating (the integer representation of) codes.

    Notes
    -----
    This class uses ``self._base`` and ``self._codes_to_ints`` but defines
    neither; they are presumably supplied by the concrete subclasses named
    above (to be confirmed against those subclasses).
    """
    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : list-like of numpy arrays
            Levels of the MultiIndex
        labels : list-like of numpy arrays of integer dtype
            Labels of the MultiIndex
        offsets : numpy array of uint64 dtype
            Pre-calculated offsets, one for each level of the index
        """

        self.levels = levels
        self.offsets = offsets

        # Transform labels in a single array, and add 1 so that we are working
        # with positive integers (-1 for NaN becomes 0):
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
                                                               copy=False)

        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index (e.g. libindex.UInt64Engine) with
        # integers representing labels: we will use its get_loc and get_indexer
        self._base.__init__(self, lambda: lab_ints, len(lab_ints))

    def _extract_level_codes(self, object target, object method=None):
        """
        Map the requested list of (tuple) keys to their integer representations
        for searching in the underlying integer index.

        Parameters
        ----------
        target : list-like of keys
            Each key is a tuple, with a label for each level of the index.
        method : object, default None
            Not used by this method; kept so that existing callers passing it
            keep working.

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64 or object
            Integers representing one combination each
        """

        # zip(*target) regroups the keys level by level; each group is looked
        # up in its own level, and 1 is added so that "not found" (-1)
        # becomes 0, matching the shift applied to the labels in __init__:
        level_codes = [lev.get_indexer(values) + 1 for lev, values
                       in zip(self.levels, zip(*target))]
        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

    def get_indexer(self, object target, object method=None,
                    object limit=None):
        """
        Compute an indexer for ``target`` against this engine.

        Parameters
        ----------
        target : list-like of keys
            Each key is a tuple, with a label for each level of the index.
        method : str or None, default None
            If None, exact matching is delegated to the underlying engine's
            ``get_indexer``. Otherwise the underlying engine's
            ``get_{method}_indexer`` is used (the comments below mention
            "backfill" and "pad").
        limit : object, default None
            Forwarded as ``limit`` to the fill indexer; only used when
            ``method`` is not None.

        Returns
        -------
        indexer
            Result of the underlying engine, with one entry per key of
            ``target``, in the same order as ``target``.
        """
        lab_ints = self._extract_level_codes(target)

        # All methods (exact, backfill, pad) directly map to the respective
        # methods of the underlying (integers) index...
        if method is None:
            return self._base.get_indexer(self, lab_ints)

        # but underlying backfill and pad methods require index and keys
        # to be sorted. The index already is (checked in
        # Index._get_fill_indexer), sort (integer representations of) keys:
        order = np.argsort(lab_ints)
        fill_indexer = getattr(self._base, 'get_{}_indexer'.format(method))
        sorted_indexer = fill_indexer(self, lab_ints[order], limit=limit)

        # sorted_indexer[i] is the answer for the key originally at position
        # order[i]: scatter the results back to those positions. (Indexing
        # with ``order`` again would apply the sort permutation twice instead
        # of inverting it, returning results in the wrong order.)
        indexer = np.empty_like(sorted_indexer)
        indexer[order] = sorted_indexer
        return indexer

    def get_loc(self, object key):
        """
        Locate a single key in the underlying integer engine.

        Parameters
        ----------
        key : tuple
            One label for each level of the index; null components (as
            detected by ``checknull``) are encoded as 0.

        Returns
        -------
        loc
            Whatever the underlying engine's ``get_loc`` returns for the
            integer representation of ``key``.

        Raises
        ------
        TypeError
            If ``is_definitely_invalid_key(key)`` is true.
        KeyError
            If ``key`` is not a tuple, or if one of its components is not
            found in the respective level.
        """
        if is_definitely_invalid_key(key):
            raise TypeError("'{key}' is an invalid key".format(key=key))
        if not PyTuple_Check(key):
            raise KeyError(key)
        try:
            # 0 is reserved for missing values, hence the "+ 1" on the
            # position found in each level:
            indices = [0 if checknull(v) else lev.get_loc(v) + 1
                       for lev, v in zip(self.levels, key)]
        except KeyError:
            # Report the full key rather than the single component which
            # was not found:
            raise KeyError(key)

        # ndmin=2 because codes_to_ints expects multiple labels:
        indices = np.array(indices, ndmin=2, dtype='uint64')
        # ... and returns a (length 1, in this case) array of integers:
        lab_int = self._codes_to_ints(indices)[0]

        return self._base.get_loc(self, lab_int)

    def get_indexer_non_unique(self, object target):
        """
        Delegate to the underlying engine's ``get_indexer_non_unique`` after
        translating ``target`` (list-like of tuple keys) to integers.
        """
        # This needs to be overridden just because the default one works on
        # target._values, and target can be itself a MultiIndex.

        lab_ints = self._extract_level_codes(target)
        indexer = self._base.get_indexer_non_unique(self, lab_ints)

        return indexer

    def __contains__(self, object val):
        """
        Return True if ``get_loc(val)`` succeeds, False if it raises
        KeyError, TypeError or ValueError.
        """
        # Default __contains__ looks in the underlying mapping, which in this
        # case only contains integer representations.
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False
719
+
720
+
721
+
586
722
cdef class MultiIndexObjectEngine(ObjectEngine):
587
723
"""
588
724
provide the same interface as the MultiIndexEngine
0 commit comments