@@ -26,11 +26,12 @@ from hashtable cimport HashTable
26
26
from pandas._libs import algos, hashtable as _hash
27
27
from pandas._libs.tslibs import period as periodlib
28
28
from pandas._libs.tslib import Timestamp, Timedelta
29
+ from pandas._libs.missing import checknull
29
30
30
31
cdef int64_t iNaT = util.get_nat()
31
32
32
33
33
- cdef inline is_definitely_invalid_key(object val):
34
+ cdef inline bint is_definitely_invalid_key(object val):
34
35
if PyTuple_Check(val):
35
36
try :
36
37
hash (val)
@@ -585,70 +586,137 @@ cpdef convert_scalar(ndarray arr, object value):
585
586
return value
586
587
587
588
588
- cdef class MultiIndexObjectEngine(ObjectEngine) :
589
cdef class BaseMultiIndexCodesEngine:
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
    then their labels can be represented using respectively 2, 3 and 1 bits,
    as follows:
     _ _ _ _____ _ __ __ __
    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
    and the resulting unsigned integer representation will be:
     _ _ _ _____ _ __ __ __ __ __ __
    |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

    Offsets are calculated at initialization, labels are transformed by method
    _codes_to_ints.

    Keys are located by first locating each component against the respective
    level, then locating (the integer representation of) codes.

    Notes
    -----
    ``self._base`` and ``self._codes_to_ints`` are used throughout but are not
    defined in this class; presumably they are supplied by the concrete
    subclasses (to be confirmed against their definitions).
    """
    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : list-like of numpy arrays
            Levels of the MultiIndex
        labels : list-like of numpy arrays of integer dtype
            Labels of the MultiIndex
        offsets : numpy array of uint64 dtype
            Pre-calculated offsets, one for each level of the index
        """

        self.levels = levels
        self.offsets = offsets

        # Transform labels into a single array, and add 1 so that we are
        # working with positive integers (-1 for NaN becomes 0):
        codes = (np.array(labels, dtype='int64').T + 1).astype('uint64',
                                                               copy=False)

        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index (e.g. libindex.UInt64Engine) with
        # integers representing labels: we will use its get_loc and get_indexer
        self._base.__init__(self, lambda: lab_ints, len(lab_ints))

    def _extract_level_codes(self, object target, object method=None):
        """
        Map the requested list of (tuple) keys to their integer representations
        for searching in the underlying integer index.

        Parameters
        ----------
        target : list-like of keys
            Each key is a tuple, with a label for each level of the index.
        method : object, optional
            Not used by this implementation; kept so that existing callers
            passing it keep working.

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64 or object
            Integers representing one combination each
        """

        # zip(*target) regroups the keys by level; each level then resolves
        # its own components, and the result is shifted by 1 in the same way
        # as the codes stored at initialization.
        level_codes = [lev.get_indexer(codes) + 1 for lev, codes
                       in zip(self.levels, zip(*target))]
        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)

    def get_indexer(self, object target, object method=None,
                    object limit=None):
        """
        Compute the indexer of ``target`` against the integer representation
        of the index.

        Parameters
        ----------
        target : list-like of keys
            Each key is a tuple, with a label for each level of the index.
        method : str, optional
            When given, ``get_<method>_indexer`` of the underlying engine is
            used (the existing comments mention backfill and pad); when None,
            its plain ``get_indexer`` is used.
        limit : object, optional
            Forwarded as ``limit`` to ``get_<method>_indexer``; ignored when
            ``method`` is None.

        Returns
        -------
        indexer
            Result of the underlying engine, one entry per key of ``target``,
            in the same order as ``target``.
        """
        lab_ints = self._extract_level_codes(target)

        # All methods (exact, backfill, pad) directly map to the respective
        # methods of the underlying (integers) index...
        if method is None:
            return self._base.get_indexer(self, lab_ints)

        # ... but underlying backfill and pad methods require index and keys
        # to be sorted. The index already is (checked in
        # Index._get_fill_indexer), sort (integer representations of) keys:
        order = np.argsort(lab_ints)
        fill_indexer = getattr(self._base, 'get_{}_indexer'.format(method))
        sorted_indexer = fill_indexer(self, lab_ints[order], limit=limit)

        # Undo the sort. sorted_indexer[i] is the answer for the key that was
        # originally at position order[i], so it has to be *scattered* back
        # through ``order``. Indexing with ``order`` again would apply the
        # permutation twice instead of inverting it.
        indexer = np.empty_like(sorted_indexer)
        indexer[order] = sorted_indexer
        return indexer

    def get_loc(self, object key):
        """
        Locate a single (tuple) key.

        Parameters
        ----------
        key : tuple
            One label for each level of the index.

        Returns
        -------
        Whatever ``get_loc`` of the underlying engine returns for the integer
        representation of ``key``.

        Raises
        ------
        TypeError
            If ``is_definitely_invalid_key(key)`` is true.
        KeyError
            If ``key`` is not a tuple, or one of its components is not found
            in the respective level.
        """
        if is_definitely_invalid_key(key):
            raise TypeError("'{key}' is an invalid key".format(key=key))
        if not PyTuple_Check(key):
            raise KeyError(key)
        try:
            # Null components map to 0, matching the "+ 1" shift applied to
            # the stored codes (where -1 stands for NaN).
            indices = [0 if checknull(v) else lev.get_loc(v) + 1
                       for lev, v in zip(self.levels, key)]
        except KeyError:
            # Report the full key rather than the single failing component
            raise KeyError(key)

        # Transform indices into single integer:
        lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))

        return self._base.get_loc(self, lab_int)

    def get_indexer_non_unique(self, object target):
        """
        Same as ``get_indexer`` without a method, but delegating to
        ``get_indexer_non_unique`` of the underlying engine.
        """
        # This needs to be overridden just because the default one works on
        # target._values, and target can be itself a MultiIndex.

        lab_ints = self._extract_level_codes(target)
        indexer = self._base.get_indexer_non_unique(self, lab_ints)

        return indexer

    def __contains__(self, object val):
        """
        Return True if ``get_loc(val)`` succeeds, False if it raises
        KeyError, TypeError or ValueError.
        """
        # Default __contains__ looks in the underlying mapping, which in this
        # case only contains integer representations.
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False
649
719
650
- cdef _make_hash_table(self , n):
651
- return _hash.MultiIndexHashTable(n)
652
720
653
721
# Generated from template.
654
722
include " index_class_helper.pxi"
0 commit comments