Skip to content

Commit 28282c0

Browse files
Try moving BaseMultiIndexCodesEngine to python
1 parent 20f3bc0 commit 28282c0

File tree

3 files changed

+179
-177
lines changed

3 files changed

+179
-177
lines changed

pandas/_libs/index.pyi

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ from pandas import (
88
)
99
from pandas.core.arrays import ExtensionArray
1010

11-
multiindex_nulls_shift: int
11+
def is_definitely_invalid_key(val: object) -> bool: ...
1212

1313
class IndexEngine:
1414
over_size_threshold: bool

pandas/_libs/index.pyx

+1-170
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,8 @@ from pandas._libs.missing cimport (
4444

4545
from decimal import InvalidOperation
4646

47-
# Defines shift of MultiIndex codes to avoid negative codes (missing values)
48-
multiindex_nulls_shift = 2
4947

50-
51-
cdef bint is_definitely_invalid_key(object val):
48+
def is_definitely_invalid_key(val):
5249
try:
5350
hash(val)
5451
except TypeError:
@@ -671,172 +668,6 @@ cdef class PeriodEngine(Int64Engine):
671668
return algos.is_monotonic(values, timelike=True)
672669

673670

674-
cdef class BaseMultiIndexCodesEngine:
675-
"""
676-
Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
677-
represent each label in a MultiIndex as an integer, by juxtaposing the bits
678-
encoding each level, with appropriate offsets.
679-
680-
For instance: if 3 levels have respectively 3, 6 and 1 possible values,
681-
then their labels can be represented using respectively 2, 3 and 1 bits,
682-
as follows:
683-
_ _ _ _____ _ __ __ __
684-
|0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
685-
— — — ————— — —— —— ——
686-
|0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
687-
— — — ————— — —— —— ——
688-
|0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
689-
‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
690-
and the resulting unsigned integer representation will be:
691-
_ _ _ _____ _ __ __ __ __ __ __
692-
|0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
693-
‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾
694-
695-
Offsets are calculated at initialization, labels are transformed by method
696-
_codes_to_ints.
697-
698-
Keys are located by first locating each component against the respective
699-
level, then locating (the integer representation of) codes.
700-
"""
701-
def __init__(self, object levels, object labels, ndarray offsets):
702-
"""
703-
Parameters
704-
----------
705-
levels : list-like of numpy arrays
706-
Levels of the MultiIndex.
707-
labels : list-like of numpy arrays of integer dtype
708-
Labels of the MultiIndex.
709-
offsets : numpy array of int dtype
710-
Pre-calculated offsets, one for each level of the index.
711-
"""
712-
self.levels = levels
713-
self.offsets = offsets
714-
715-
# Transform labels into a single array, and add 2 so that we are working
716-
# with positive integers (-1 for NaN becomes 1). This enables us to
717-
# differentiate between values that are missing in other and matching
718-
# NaNs. We will set values that are not found to 0 later:
719-
codes = np.array(labels).T
720-
codes += multiindex_nulls_shift # inplace sum optimisation
721-
722-
self.level_has_nans = [-1 in lab for lab in labels]
723-
724-
# Map each codes combination in the index to an integer unambiguously
725-
# (no collisions possible), based on the "offsets", which describe the
726-
# number of bits to shift labels for each level:
727-
lab_ints = self._codes_to_ints(codes)
728-
729-
# Initialize underlying index (e.g. libindex.UInt64Engine) with
730-
# integers representing labels: we will use its get_loc and get_indexer
731-
self._base.__init__(self, lab_ints)
732-
733-
def _codes_to_ints(self, ndarray codes) -> np.ndarray:
734-
"""
735-
Transform combination(s) of uint into one uint or Python integer (each), in a
736-
strictly monotonic way (i.e. respecting the lexicographic order of integer
737-
combinations).
738-
739-
Parameters
740-
----------
741-
codes : 1- or 2-dimensional array of dtype uint
742-
Combinations of integers (one per row)
743-
744-
Returns
745-
-------
746-
scalar or 1-dimensional array, of dtype _codes_dtype
747-
Integer(s) representing one combination (each).
748-
"""
749-
# To avoid overflows, first make sure we are working with the right dtype:
750-
codes = codes.astype(self._codes_dtype, copy=False)
751-
752-
# Shift the representation of each level by the pre-calculated number of bits:
753-
codes <<= self.offsets # inplace shift optimisation
754-
755-
# Now sum and OR are in fact interchangeable. This is a simple
756-
# composition of the (disjoint) significant bits of each level (i.e.
757-
# each column in "codes") in a single positive integer (per row):
758-
if codes.ndim == 1:
759-
# Single key
760-
return np.bitwise_or.reduce(codes)
761-
762-
# Multiple keys
763-
return np.bitwise_or.reduce(codes, axis=1)
764-
765-
def _extract_level_codes(self, target) -> np.ndarray:
766-
"""
767-
Map the requested list of (tuple) keys to their integer representations
768-
for searching in the underlying integer index.
769-
770-
Parameters
771-
----------
772-
target : MultiIndex
773-
774-
Returns
775-
-------
776-
int_keys : 1-dimensional array of dtype uint64 or object
777-
Integers representing one combination each
778-
"""
779-
level_codes = list(target._recode_for_new_levels(self.levels))
780-
for i, codes in enumerate(level_codes):
781-
if self.levels[i].hasnans:
782-
na_index = self.levels[i].isna().nonzero()[0][0]
783-
codes[target.codes[i] == -1] = na_index
784-
codes += 1
785-
codes[codes > 0] += 1
786-
if self.level_has_nans[i]:
787-
codes[target.codes[i] == -1] += 1
788-
return self._codes_to_ints(np.array(level_codes, dtype=self._codes_dtype).T)
789-
790-
def get_indexer(self, target: np.ndarray) -> np.ndarray:
791-
"""
792-
Returns an array giving the positions of each value of `target` in
793-
`self.values`, where -1 represents a value in `target` which does not
794-
appear in `self.values`
795-
796-
Parameters
797-
----------
798-
target : np.ndarray
799-
800-
Returns
801-
-------
802-
np.ndarray[intp_t, ndim=1] of the indexer of `target` into
803-
`self.values`
804-
"""
805-
return self._base.get_indexer(self, target)
806-
807-
def get_loc(self, object key):
808-
if is_definitely_invalid_key(key):
809-
raise TypeError(f"'{key}' is an invalid key")
810-
if not isinstance(key, tuple):
811-
raise KeyError(key)
812-
try:
813-
indices = [1 if checknull(v) else lev.get_loc(v) + multiindex_nulls_shift
814-
for lev, v in zip(self.levels, key)]
815-
except KeyError:
816-
raise KeyError(key)
817-
818-
# Transform indices into single integer:
819-
lab_int = self._codes_to_ints(np.array(indices, dtype=self._codes_dtype))
820-
821-
return self._base.get_loc(self, lab_int)
822-
823-
def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray:
824-
indexer = self._base.get_indexer_non_unique(self, target)
825-
826-
return indexer
827-
828-
def __contains__(self, val: object) -> bool:
829-
# We assume before we get here:
830-
# - val is hashable
831-
# Default __contains__ looks in the underlying mapping, which in this
832-
# case only contains integer representations.
833-
try:
834-
self.get_loc(val)
835-
return True
836-
except (KeyError, TypeError, ValueError):
837-
return False
838-
839-
840671
# Generated from template.
841672
include "index_class_helper.pxi"
842673

0 commit comments

Comments
 (0)