Try moving BaseMultiIndexCodesEngine to Python

GianlucaFicarelli · GianlucaFicarelli · commit 28282c0cbb19 · 2024-04-29T18:09:14.000+02:00
diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi
@@ -8,7 +8,7 @@ from pandas import (
 )
 from pandas.core.arrays import ExtensionArray
 
-multiindex_nulls_shift: int
+def is_definitely_invalid_key(val: object) -> bool: ...
 
 class IndexEngine:
     over_size_threshold: bool
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -44,11 +44,8 @@ from pandas._libs.missing cimport (
 
 from decimal import InvalidOperation
 
-# Defines shift of MultiIndex codes to avoid negative codes (missing values)
-multiindex_nulls_shift = 2
 
-
-cdef bint is_definitely_invalid_key(object val):
+def is_definitely_invalid_key(val):
     try:
         hash(val)
     except TypeError:
@@ -671,172 +668,6 @@ cdef class PeriodEngine(Int64Engine):
         return algos.is_monotonic(values, timelike=True)
 
 
-cdef class BaseMultiIndexCodesEngine:
-    """
-    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
-    represent each label in a MultiIndex as an integer, by juxtaposing the bits
-    encoding each level, with appropriate offsets.
-
-    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
-    then their labels can be represented using respectively 2, 3 and 1 bits,
-    as follows:
-     _ _ _ _____ _ __ __ __
-    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
-     — — — ————— — —— —— ——
-    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
-     — — — ————— — —— —— ——
-    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
-     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
-    and the resulting unsigned integer representation will be:
-     _ _ _ _____ _ __ __ __ __ __ __
-    |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
-     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾
-
-    Offsets are calculated at initialization, labels are transformed by method
-    _codes_to_ints.
-
-    Keys are located by first locating each component against the respective
-    level, then locating (the integer representation of) codes.
-    """
-    def __init__(self, object levels, object labels, ndarray offsets):
-        """
-        Parameters
-        ----------
-        levels : list-like of numpy arrays
-            Levels of the MultiIndex.
-        labels : list-like of numpy arrays of integer dtype
-            Labels of the MultiIndex.
-        offsets : numpy array of int dtype
-            Pre-calculated offsets, one for each level of the index.
-        """
-        self.levels = levels
-        self.offsets = offsets
-
-        # Transform labels in a single array, and add 2 so that we are working
-        # with positive integers (-1 for NaN becomes 1). This enables us to
-        # differentiate between values that are missing in other and matching
-        # NaNs. We will set values that are not found to 0 later:
-        codes = np.array(labels).T
-        codes += multiindex_nulls_shift  # inplace sum optimisation
-
-        self.level_has_nans = [-1 in lab for lab in labels]
-
-        # Map each codes combination in the index to an integer unambiguously
-        # (no collisions possible), based on the "offsets", which describe the
-        # number of bits to switch labels for each level:
-        lab_ints = self._codes_to_ints(codes)
-
-        # Initialize underlying index (e.g. libindex.UInt64Engine) with
-        # integers representing labels: we will use its get_loc and get_indexer
-        self._base.__init__(self, lab_ints)
-
-    def _codes_to_ints(self, ndarray codes) -> np.ndarray:
-        """
-        Transform combination(s) of uint in one uint or Python integer (each), in a
-        strictly monotonic way (i.e. respecting the lexicographic order of integer
-        combinations).
-
-        Parameters
-        ----------
-        codes : 1- or 2-dimensional array of dtype uint
-            Combinations of integers (one per row)
-
-        Returns
-        -------
-        scalar or 1-dimensional array, of dtype _codes_dtype
-            Integer(s) representing one combination (each).
-        """
-        # To avoid overflows, first make sure we are working with the right dtype:
-        codes = codes.astype(self._codes_dtype, copy=False)
-
-        # Shift the representation of each level by the pre-calculated number of bits:
-        codes <<= self.offsets  # inplace shift optimisation
-
-        # Now sum and OR are in fact interchangeable. This is a simple
-        # composition of the (disjunct) significant bits of each level (i.e.
-        # each column in "codes") in a single positive integer (per row):
-        if codes.ndim == 1:
-            # Single key
-            return np.bitwise_or.reduce(codes)
-
-        # Multiple keys
-        return np.bitwise_or.reduce(codes, axis=1)
-
-    def _extract_level_codes(self, target) -> np.ndarray:
-        """
-        Map the requested list of (tuple) keys to their integer representations
-        for searching in the underlying integer index.
-
-        Parameters
-        ----------
-        target : MultiIndex
-
-        Returns
-        ------
-        int_keys : 1-dimensional array of dtype uint64 or object
-            Integers representing one combination each
-        """
-        level_codes = list(target._recode_for_new_levels(self.levels))
-        for i, codes in enumerate(level_codes):
-            if self.levels[i].hasnans:
-                na_index = self.levels[i].isna().nonzero()[0][0]
-                codes[target.codes[i] == -1] = na_index
-            codes += 1
-            codes[codes > 0] += 1
-            if self.level_has_nans[i]:
-                codes[target.codes[i] == -1] += 1
-        return self._codes_to_ints(np.array(level_codes, dtype=self._codes_dtype).T)
-
-    def get_indexer(self, target: np.ndarray) -> np.ndarray:
-        """
-        Returns an array giving the positions of each value of `target` in
-        `self.values`, where -1 represents a value in `target` which does not
-        appear in `self.values`
-
-        Parameters
-        ----------
-        target : np.ndarray
-
-        Returns
-        -------
-        np.ndarray[intp_t, ndim=1] of the indexer of `target` into
-        `self.values`
-        """
-        return self._base.get_indexer(self, target)
-
-    def get_loc(self, object key):
-        if is_definitely_invalid_key(key):
-            raise TypeError(f"'{key}' is an invalid key")
-        if not isinstance(key, tuple):
-            raise KeyError(key)
-        try:
-            indices = [1 if checknull(v) else lev.get_loc(v) + multiindex_nulls_shift
-                       for lev, v in zip(self.levels, key)]
-        except KeyError:
-            raise KeyError(key)
-
-        # Transform indices into single integer:
-        lab_int = self._codes_to_ints(np.array(indices, dtype=self._codes_dtype))
-
-        return self._base.get_loc(self, lab_int)
-
-    def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray:
-        indexer = self._base.get_indexer_non_unique(self, target)
-
-        return indexer
-
-    def __contains__(self, val: object) -> bool:
-        # We assume before we get here:
-        #  - val is hashable
-        # Default __contains__ looks in the underlying mapping, which in this
-        # case only contains integer representations.
-        try:
-            self.get_loc(val)
-            return True
-        except (KeyError, TypeError, ValueError):
-            return False
-
-
 # Generated from template.
 include "index_class_helper.pxi"
 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@ from pandas import (`
`8`	`8`	`)`
`9`	`9`	`from pandas.core.arrays import ExtensionArray`
`10`	`10`
`11`		`-multiindex_nulls_shift: int`
	`11`	`+def is_definitely_invalid_key(val: object) -> bool: ...`
`12`	`12`
`13`	`13`	`class IndexEngine:`
`14`	`14`	`over_size_threshold: bool`