|
50 | 50 | target_klass='MultiIndex or list of tuples'))
|
51 | 51 |
|
52 | 52 |
|
def is_definitely_invalid_key(val):
    """
    Tell whether ``val`` can be rejected as a lookup key without a lookup.

    Parameters
    ----------
    val : object
        Candidate key.

    Returns
    -------
    bool
        True if ``val`` is an unhashable tuple, a slice, a ``np.ndarray``,
        a list, or any object exposing a ``_data`` attribute; False
        otherwise.
    """
    # A tuple is only acceptable when all of its contents are hashable.
    if isinstance(val, tuple):
        try:
            hash(val)
        except TypeError:
            return True

    if isinstance(val, (slice, np.ndarray, list)):
        return True

    # Having a ``_data`` attribute is taken to mean the object is an NDFrame.
    return hasattr(val, '_data')
| 62 | + |
| 63 | + |
| 64 | +class BaseMultiIndexCodesEngine(object): |
| 65 | + def __init__(self, levels, labels, offsets, **kwargs): |
| 66 | + self._levels = levels |
| 67 | + self._labels = labels |
| 68 | + self._offsets = offsets |
| 69 | + |
| 70 | + # Map each combination to an integer |
| 71 | + lab_ints = self._labs_to_ints(labels) |
| 72 | + |
| 73 | + # Initialize underlying index |
| 74 | + self._base.__init__(self, lambda: lab_ints, len(lab_ints), **kwargs) |
| 75 | + |
| 76 | + def _labs_to_ints(self, labels): |
| 77 | + # Add 1 so that -1 (NaN) becomes 0 |
| 78 | + codes = (np.asarray(labels).T + 1).astype(self._type) |
| 79 | + # Shift: |
| 80 | + rot_codes = codes << self._offsets |
| 81 | + # Now sum and OR are in fact interchangeable: |
| 82 | + return np.bitwise_or.reduce(rot_codes, axis=1) |
| 83 | + |
| 84 | + def get_indexer(self, target, method=None, limit=None): |
| 85 | + level_codes = [self._levels[lev].get_indexer(codes, method=method) |
| 86 | + for lev, codes in enumerate(zip(*target))] |
| 87 | + |
| 88 | + keys_int = self._labs_to_ints(level_codes) |
| 89 | + |
| 90 | + if method is not None: |
| 91 | + # keys must be sorted - the engine already is |
| 92 | + order = np.argsort(keys_int) |
| 93 | + keys_int = keys_int[order] |
| 94 | + sup_meth = getattr(self._base, 'get_{}_indexer'.format(method)) |
| 95 | + indexer = sup_meth(self, keys_int, limit=limit) |
| 96 | + indexer = indexer[order] |
| 97 | + else: |
| 98 | + indexer = self._base.get_indexer(self, keys_int) |
| 99 | + |
| 100 | + return indexer |
| 101 | + |
| 102 | + def get_loc(self, key): |
| 103 | + if is_definitely_invalid_key(key): |
| 104 | + raise TypeError("'{key}' is an invalid key".format(key=key)) |
| 105 | + if not isinstance(key, tuple): |
| 106 | + raise KeyError(key) |
| 107 | + try: |
| 108 | + idces = [-1 if isna(val) else self._levels[lev].get_loc(val) |
| 109 | + for lev, val in enumerate(key)] |
| 110 | + except KeyError: |
| 111 | + raise KeyError(key) |
| 112 | + idces = np.array(idces, ndmin=2).T |
| 113 | + |
| 114 | + key_int = self._labs_to_ints(idces)[0] |
| 115 | + |
| 116 | + return self._base.get_loc(self, key_int) |
| 117 | + |
| 118 | + def get_indexer_non_unique(self, target): |
| 119 | + # This needs to be overridden just because the default one works on |
| 120 | + # target._values, and target can be itself a MultiIndex. |
| 121 | + |
| 122 | + level_codes = [self._levels[lev].get_indexer(codes) |
| 123 | + for lev, codes in enumerate(zip(*target))] |
| 124 | + keys_int = self._labs_to_ints(level_codes) |
| 125 | + |
| 126 | + indexer = self._base.get_indexer_non_unique(self, keys_int) |
| 127 | + |
| 128 | + return indexer |
| 129 | + |
| 130 | + def __contains__(self, val): |
| 131 | + try: |
| 132 | + self.get_loc(val) |
| 133 | + return True |
| 134 | + except: |
| 135 | + return False |
| 136 | + |
| 137 | + |
class MultiIndexUIntEngine(BaseMultiIndexCodesEngine, libindex.UInt64Engine):
    """
    Engine for a MultiIndex whose label combinations are each represented
    by a single unsigned 64-bit integer.
    """
    # dtype the codes are cast to before being combined
    _type = 'uint64'
    # engine class the integer lookups are delegated to
    _base = libindex.UInt64Engine
| 144 | + |
| 145 | + |
class MultiIndexPyIntEngine(BaseMultiIndexCodesEngine, libindex.ObjectEngine):
    """
    Fallback engine for the (extreme) cases where the possible label
    combinations do not fit in a 64-bit integer: the combinations are kept
    as Python integers inside an ObjectEngine.
    """
    # dtype the codes are cast to before being combined
    _type = 'object'
    # engine class the integer lookups are delegated to
    _base = libindex.ObjectEngine
| 154 | + |
| 155 | + |
53 | 156 | class MultiIndex(Index):
|
54 | 157 | """
|
55 | 158 | A multi-level, or hierarchical, index object for pandas objects
|
@@ -691,16 +794,15 @@ def _get_level_number(self, level):
|
691 | 794 |
|
@cache_readonly
def _engine(self):
    """
    Build the engine that maps label combinations to integers.

    Returns a ``MultiIndexUIntEngine`` when the bits required by all levels
    fit in 64 bits, and a ``MultiIndexPyIntEngine`` otherwise.
    """
    # Bits needed per level (a power of 2 dominating each level size,
    # with one extra slot for the -1 used for NaN), last level first.
    reversed_sizes = [len(level) + 1 for level in self.levels[::-1]]
    bits_per_level = np.ceil(np.log2(reversed_sizes))

    # lev_bits[i] is the number of bits used by level i and all later levels.
    lev_bits = np.cumsum(bits_per_level)[::-1]

    # Each level is shifted by the bits taken by the levels that follow it.
    offsets = np.concatenate([lev_bits[1:], [0]]).astype('uint')

    if lev_bits[0] > 64:
        # The levels would overflow a 64 bit integer - use Python integers:
        engine_class = MultiIndexPyIntEngine
    else:
        engine_class = MultiIndexUIntEngine
    return engine_class(self.levels, self.labels, offsets)
704 | 806 |
|
705 | 807 | @property
|
706 | 808 | def values(self):
|
@@ -1889,7 +1991,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
|
1889 | 1991 | if tolerance is not None:
|
1890 | 1992 | raise NotImplementedError("tolerance not implemented yet "
|
1891 | 1993 | 'for MultiIndex')
|
1892 |
| - indexer = self._get_fill_indexer(target, method, limit) |
| 1994 | + indexer = self._engine.get_indexer(target, method, limit) |
1893 | 1995 | elif method == 'nearest':
|
1894 | 1996 | raise NotImplementedError("method='nearest' not implemented yet "
|
1895 | 1997 | 'for MultiIndex; see GitHub issue 9365')
|
|
0 commit comments