PERF: MultiIndex._engine use smaller dtypes (#58411)

GianlucaFicarelli · web-flow · commit 086b047242e8 · 2024-04-30T11:25:26.000-07:00
* PERF: MultiIndex._engine use smaller dtypes

* Move offsets downcasting to MultiIndex._engine

* Remove unused import uint64_t
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -336,6 +336,7 @@ Performance improvements
 - Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
 - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
 - Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`)
+- Performance improvement in :meth:`MultiIndex._engine` to use smaller dtypes if possible (:issue:`58411`)
 - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
 - Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`)
 - Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)
diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi
@@ -74,13 +74,13 @@ class MaskedBoolEngine(MaskedUInt8Engine): ...
 
 class BaseMultiIndexCodesEngine:
     levels: list[np.ndarray]
-    offsets: np.ndarray  # ndarray[uint64_t, ndim=1]
+    offsets: np.ndarray  # np.ndarray[..., ndim=1]
 
     def __init__(
         self,
         levels: list[Index],  # all entries hashable
         labels: list[np.ndarray],  # all entries integer-dtyped
-        offsets: np.ndarray,  # np.ndarray[np.uint64, ndim=1]
+        offsets: np.ndarray,  # np.ndarray[..., ndim=1]
     ) -> None: ...
     def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
     def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -9,7 +9,6 @@ from numpy cimport (
     intp_t,
     ndarray,
     uint8_t,
-    uint64_t,
 )
 
 cnp.import_array()
@@ -699,16 +698,15 @@ cdef class BaseMultiIndexCodesEngine:
     Keys are located by first locating each component against the respective
     level, then locating (the integer representation of) codes.
     """
-    def __init__(self, object levels, object labels,
-                 ndarray[uint64_t, ndim=1] offsets):
+    def __init__(self, object levels, object labels, ndarray offsets):
         """
         Parameters
         ----------
         levels : list-like of numpy arrays
             Levels of the MultiIndex.
         labels : list-like of numpy arrays of integer dtype
             Labels of the MultiIndex.
-        offsets : numpy array of uint64 dtype
+        offsets : numpy array of int dtype
             Pre-calculated offsets, one for each level of the index.
         """
         self.levels = levels
@@ -718,8 +716,9 @@ cdef class BaseMultiIndexCodesEngine:
         # with positive integers (-1 for NaN becomes 1). This enables us to
         # differentiate between values that are missing in other and matching
         # NaNs. We will set values that are not found to 0 later:
-        labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift
-        codes = labels_arr.astype("uint64", copy=False)
+        codes = np.array(labels).T
+        codes += multiindex_nulls_shift  # inplace sum optimisation
+
         self.level_has_nans = [-1 in lab for lab in labels]
 
         # Map each codes combination in the index to an integer unambiguously
@@ -731,8 +730,37 @@ cdef class BaseMultiIndexCodesEngine:
         # integers representing labels: we will use its get_loc and get_indexer
         self._base.__init__(self, lab_ints)
 
-    def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
-        raise NotImplementedError("Implemented by subclass")  # pragma: no cover
+    def _codes_to_ints(self, ndarray codes) -> np.ndarray:
+        """
+        Transform combination(s) of uint in one uint or Python integer (each), in a
+        strictly monotonic way (i.e. respecting the lexicographic order of integer
+        combinations).
+
+        Parameters
+        ----------
+        codes : 1- or 2-dimensional array of dtype uint
+            Combinations of integers (one per row)
+
+        Returns
+        -------
+        scalar or 1-dimensional array, of dtype _codes_dtype
+            Integer(s) representing one combination (each).
+        """
+        # To avoid overflows, first make sure we are working with the right dtype:
+        codes = codes.astype(self._codes_dtype, copy=False)
+
+        # Shift the representation of each level by the pre-calculated number of bits:
+        codes <<= self.offsets  # inplace shift optimisation
+
+        # Now sum and OR are in fact interchangeable. This is a simple
+        # composition of the (disjunct) significant bits of each level (i.e.
+        # each column in "codes") in a single positive integer (per row):
+        if codes.ndim == 1:
+            # Single key
+            return np.bitwise_or.reduce(codes)
+
+        # Multiple keys
+        return np.bitwise_or.reduce(codes, axis=1)
 
     def _extract_level_codes(self, target) -> np.ndarray:
         """
@@ -757,7 +785,7 @@ cdef class BaseMultiIndexCodesEngine:
             codes[codes > 0] += 1
             if self.level_has_nans[i]:
                 codes[target.codes[i] == -1] += 1
-        return self._codes_to_ints(np.array(level_codes, dtype="uint64").T)
+        return self._codes_to_ints(np.array(level_codes, dtype=self._codes_dtype).T)
 
     def get_indexer(self, target: np.ndarray) -> np.ndarray:
         """
@@ -788,7 +816,7 @@ cdef class BaseMultiIndexCodesEngine:
             raise KeyError(key)
 
         # Transform indices into single integer:
-        lab_int = self._codes_to_ints(np.array(indices, dtype="uint64"))
+        lab_int = self._codes_to_ints(np.array(indices, dtype=self._codes_dtype))
 
         return self._base.get_loc(self, lab_int)
 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -123,84 +123,56 @@
 )
 
 
-class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine):
-    """
-    This class manages a MultiIndex by mapping label combinations to positive
-    integers.
+class MultiIndexUInt64Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine):
+    """Manages a MultiIndex by mapping label combinations to positive integers.
+
+    The number of possible label combinations must not overflow the 64 bits integers.
     """
 
     _base = libindex.UInt64Engine
+    _codes_dtype = "uint64"
 
-    def _codes_to_ints(self, codes):
-        """
-        Transform combination(s) of uint64 in one uint64 (each), in a strictly
-        monotonic way (i.e. respecting the lexicographic order of integer
-        combinations): see BaseMultiIndexCodesEngine documentation.
 
-        Parameters
-        ----------
-        codes : 1- or 2-dimensional array of dtype uint64
-            Combinations of integers (one per row)
+class MultiIndexUInt32Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt32Engine):
+    """Manages a MultiIndex by mapping label combinations to positive integers.
 
-        Returns
-        -------
-        scalar or 1-dimensional array, of dtype uint64
-            Integer(s) representing one combination (each).
-        """
-        # Shift the representation of each level by the pre-calculated number
-        # of bits:
-        codes <<= self.offsets
+    The number of possible label combinations must not overflow the 32 bits integers.
+    """
 
-        # Now sum and OR are in fact interchangeable. This is a simple
-        # composition of the (disjunct) significant bits of each level (i.e.
-        # each column in "codes") in a single positive integer:
-        if codes.ndim == 1:
-            # Single key
-            return np.bitwise_or.reduce(codes)
+    _base = libindex.UInt32Engine
+    _codes_dtype = "uint32"
 
-        # Multiple keys
-        return np.bitwise_or.reduce(codes, axis=1)
 
+class MultiIndexUInt16Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt16Engine):
+    """Manages a MultiIndex by mapping label combinations to positive integers.
 
-class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine):
-    """
-    This class manages those (extreme) cases in which the number of possible
-    label combinations overflows the 64 bits integers, and uses an ObjectEngine
-    containing Python integers.
+    The number of possible label combinations must not overflow the 16 bits integers.
     """
 
-    _base = libindex.ObjectEngine
+    _base = libindex.UInt16Engine
+    _codes_dtype = "uint16"
 
-    def _codes_to_ints(self, codes):
-        """
-        Transform combination(s) of uint64 in one Python integer (each), in a
-        strictly monotonic way (i.e. respecting the lexicographic order of
-        integer combinations): see BaseMultiIndexCodesEngine documentation.
 
-        Parameters
-        ----------
-        codes : 1- or 2-dimensional array of dtype uint64
-            Combinations of integers (one per row)
+class MultiIndexUInt8Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt8Engine):
+    """Manages a MultiIndex by mapping label combinations to positive integers.
 
-        Returns
-        -------
-        int, or 1-dimensional array of dtype object
-            Integer(s) representing one combination (each).
-        """
-        # Shift the representation of each level by the pre-calculated number
-        # of bits. Since this can overflow uint64, first make sure we are
-        # working with Python integers:
-        codes = codes.astype("object") << self.offsets
+    The number of possible label combinations must not overflow the 8 bits integers.
+    """
 
-        # Now sum and OR are in fact interchangeable. This is a simple
-        # composition of the (disjunct) significant bits of each level (i.e.
-        # each column in "codes") in a single positive integer (per row):
-        if codes.ndim == 1:
-            # Single key
-            return np.bitwise_or.reduce(codes)
+    _base = libindex.UInt8Engine
+    _codes_dtype = "uint8"
 
-        # Multiple keys
-        return np.bitwise_or.reduce(codes, axis=1)
+
+class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine):
+    """Manages a MultiIndex by mapping label combinations to positive integers.
+
+    This class manages those (extreme) cases in which the number of possible
+    label combinations overflows the 64 bits integers, and uses an ObjectEngine
+    containing Python integers.
+    """
+
+    _base = libindex.ObjectEngine
+    _codes_dtype = "object"
 
 
 def names_compat(meth: F) -> F:
@@ -1229,13 +1201,25 @@ def _engine(self):
         # equivalent to sorting lexicographically the codes themselves. Notice
         # that each level needs to be shifted by the number of bits needed to
         # represent the _previous_ ones:
-        offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64")
+        offsets = np.concatenate([lev_bits[1:], [0]])
+        # Downcast the type if possible, to prevent upcasting when shifting codes:
+        offsets = offsets.astype(np.min_scalar_type(int(offsets[0])))
 
         # Check the total number of bits needed for our representation:
         if lev_bits[0] > 64:
             # The levels would overflow a 64 bit uint - use Python integers:
             return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
-        return MultiIndexUIntEngine(self.levels, self.codes, offsets)
+        if lev_bits[0] > 32:
+            # The levels would overflow a 32 bit uint - use uint64
+            return MultiIndexUInt64Engine(self.levels, self.codes, offsets)
+        if lev_bits[0] > 16:
+            # The levels would overflow a 16 bit uint - use uint8
+            return MultiIndexUInt32Engine(self.levels, self.codes, offsets)
+        if lev_bits[0] > 8:
+            # The levels would overflow a 8 bit uint - use uint16
+            return MultiIndexUInt16Engine(self.levels, self.codes, offsets)
+        # The levels fit in an 8 bit uint - use uint8
+        return MultiIndexUInt8Engine(self.levels, self.codes, offsets)
 
     # Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return
     # type "Type[MultiIndex]" in supertype "Index"
diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py
@@ -919,30 +919,41 @@ def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_id
     assert result == expected
 
 
-def test_pyint_engine():
+@pytest.mark.parametrize(
+    "N, expected_dtype",
+    [
+        (1, "uint8"),  # 2*4*N = 8
+        (2, "uint16"),  # 2*4*N = 16
+        (4, "uint32"),  # 2*4*N = 32
+        (8, "uint64"),  # 2*4*N = 64
+        (10, "object"),  # 2*4*N = 80
+    ],
+)
+def test_pyint_engine(N, expected_dtype):
     # GH#18519 : when combinations of codes cannot be represented in 64
     # bits, the index underlying the MultiIndex engine works with Python
     # integers, rather than uint64.
-    N = 5
     keys = [
         tuple(arr)
         for arr in [
-            [0] * 10 * N,
-            [1] * 10 * N,
-            [2] * 10 * N,
-            [np.nan] * N + [2] * 9 * N,
-            [0] * N + [2] * 9 * N,
-            [np.nan] * N + [2] * 8 * N + [0] * N,
+            [0] * 4 * N,
+            [1] * 4 * N,
+            [np.nan] * N + [0] * 3 * N,
+            [0] * N + [1] * 3 * N,
+            [np.nan] * N + [1] * 2 * N + [0] * N,
         ]
     ]
-    # Each level contains 4 elements (including NaN), so it is represented
-    # in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a
-    # 64 bit engine and truncating the first levels, the fourth and fifth
-    # keys would collide; if truncating the last levels, the fifth and
-    # sixth; if rotating bits rather than shifting, the third and fifth.
+    # Each level contains 3 elements (NaN, 0, 1), and it's represented
+    # in 2 bits to store 4 possible values (0=notfound, 1=NaN, 2=0, 3=1), for
+    # a total of 2*N*4 = 80 > 64 bits where N=10 and the number of levels is N*4.
+    # If we were using a 64 bit engine and truncating the first levels, the
+    # fourth and fifth keys would collide; if truncating the last levels, the
+    # fifth and sixth; if rotating bits rather than shifting, the third and fifth.
+
+    index = MultiIndex.from_tuples(keys)
+    assert index._engine.values.dtype == expected_dtype
 
     for idx, key_value in enumerate(keys):
-        index = MultiIndex.from_tuples(keys)
         assert index.get_loc(key_value) == idx
 
         expected = np.arange(idx + 1, dtype=np.intp)
@@ -952,7 +963,7 @@ def test_pyint_engine():
     # With missing key:
     idces = range(len(keys))
     expected = np.array([-1] + list(idces), dtype=np.intp)
-    missing = tuple([0, 1] * 5 * N)
+    missing = tuple([0, 1, 0, 1] * N)
     result = index.get_indexer([missing] + [keys[i] for i in idces])
     tm.assert_numpy_array_equal(result, expected)