Skip to content

Commit 086b047

Browse files
PERF: MultiIndex._engine use smaller dtypes (#58411)
* PERF: MultiIndex._engine use smaller dtypes * Move offsets downcasting to MultiIndex._engine * Remove unused import uint64_t
1 parent 0f9adf8 commit 086b047

File tree

5 files changed

+114
-90
lines changed

5 files changed

+114
-90
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,7 @@ Performance improvements
336336
- Performance improvement in :meth:`Index.join` by propagating cached attributes in cases where the result matches one of the inputs (:issue:`57023`)
337337
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
338338
- Performance improvement in :meth:`Index.to_frame` returning a :class:`RangeIndex` columns of a :class:`Index` when possible. (:issue:`58018`)
339+
- Performance improvement in :meth:`MultiIndex._engine` to use smaller dtypes if possible (:issue:`58411`)
339340
- Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
340341
- Performance improvement in :meth:`MultiIndex.memory_usage` to ignore the index engine when it isn't already cached. (:issue:`58385`)
341342
- Performance improvement in :meth:`RangeIndex.__getitem__` with a boolean mask or integers returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57588`)

pandas/_libs/index.pyi

+2-2
Original file line numberDiff line numberDiff line change
@@ -74,13 +74,13 @@ class MaskedBoolEngine(MaskedUInt8Engine): ...
7474

7575
class BaseMultiIndexCodesEngine:
7676
levels: list[np.ndarray]
77-
offsets: np.ndarray # ndarray[uint64_t, ndim=1]
77+
offsets: np.ndarray # np.ndarray[..., ndim=1]
7878

7979
def __init__(
8080
self,
8181
levels: list[Index], # all entries hashable
8282
labels: list[np.ndarray], # all entries integer-dtyped
83-
offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1]
83+
offsets: np.ndarray, # np.ndarray[..., ndim=1]
8484
) -> None: ...
8585
def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
8686
def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...

pandas/_libs/index.pyx

+38-10
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ from numpy cimport (
99
intp_t,
1010
ndarray,
1111
uint8_t,
12-
uint64_t,
1312
)
1413

1514
cnp.import_array()
@@ -699,16 +698,15 @@ cdef class BaseMultiIndexCodesEngine:
699698
Keys are located by first locating each component against the respective
700699
level, then locating (the integer representation of) codes.
701700
"""
702-
def __init__(self, object levels, object labels,
703-
ndarray[uint64_t, ndim=1] offsets):
701+
def __init__(self, object levels, object labels, ndarray offsets):
704702
"""
705703
Parameters
706704
----------
707705
levels : list-like of numpy arrays
708706
Levels of the MultiIndex.
709707
labels : list-like of numpy arrays of integer dtype
710708
Labels of the MultiIndex.
711-
offsets : numpy array of uint64 dtype
709+
offsets : numpy array of int dtype
712710
Pre-calculated offsets, one for each level of the index.
713711
"""
714712
self.levels = levels
@@ -718,8 +716,9 @@ cdef class BaseMultiIndexCodesEngine:
718716
# with positive integers (-1 for NaN becomes 1). This enables us to
719717
# differentiate between values that are missing in other and matching
720718
# NaNs. We will set values that are not found to 0 later:
721-
labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift
722-
codes = labels_arr.astype("uint64", copy=False)
719+
codes = np.array(labels).T
720+
codes += multiindex_nulls_shift # inplace sum optimisation
721+
723722
self.level_has_nans = [-1 in lab for lab in labels]
724723

725724
# Map each codes combination in the index to an integer unambiguously
@@ -731,8 +730,37 @@ cdef class BaseMultiIndexCodesEngine:
731730
# integers representing labels: we will use its get_loc and get_indexer
732731
self._base.__init__(self, lab_ints)
733732

734-
def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
735-
raise NotImplementedError("Implemented by subclass") # pragma: no cover
733+
def _codes_to_ints(self, ndarray codes) -> np.ndarray:
734+
"""
735+
Transform combination(s) of uint into one uint or Python integer (each), in a
736+
strictly monotonic way (i.e. respecting the lexicographic order of integer
737+
combinations).
738+
739+
Parameters
740+
----------
741+
codes : 1- or 2-dimensional array of dtype uint
742+
Combinations of integers (one per row)
743+
744+
Returns
745+
-------
746+
scalar or 1-dimensional array, of dtype _codes_dtype
747+
Integer(s) representing one combination (each).
748+
"""
749+
# To avoid overflows, first make sure we are working with the right dtype:
750+
codes = codes.astype(self._codes_dtype, copy=False)
751+
752+
# Shift the representation of each level by the pre-calculated number of bits:
753+
codes <<= self.offsets # inplace shift optimisation
754+
755+
# Now sum and OR are in fact interchangeable. This is a simple
756+
# composition of the (disjunct) significant bits of each level (i.e.
757+
# each column in "codes") in a single positive integer (per row):
758+
if codes.ndim == 1:
759+
# Single key
760+
return np.bitwise_or.reduce(codes)
761+
762+
# Multiple keys
763+
return np.bitwise_or.reduce(codes, axis=1)
736764

737765
def _extract_level_codes(self, target) -> np.ndarray:
738766
"""
@@ -757,7 +785,7 @@ cdef class BaseMultiIndexCodesEngine:
757785
codes[codes > 0] += 1
758786
if self.level_has_nans[i]:
759787
codes[target.codes[i] == -1] += 1
760-
return self._codes_to_ints(np.array(level_codes, dtype="uint64").T)
788+
return self._codes_to_ints(np.array(level_codes, dtype=self._codes_dtype).T)
761789

762790
def get_indexer(self, target: np.ndarray) -> np.ndarray:
763791
"""
@@ -788,7 +816,7 @@ cdef class BaseMultiIndexCodesEngine:
788816
raise KeyError(key)
789817

790818
# Transform indices into single integer:
791-
lab_int = self._codes_to_ints(np.array(indices, dtype="uint64"))
819+
lab_int = self._codes_to_ints(np.array(indices, dtype=self._codes_dtype))
792820

793821
return self._base.get_loc(self, lab_int)
794822

pandas/core/indexes/multi.py

+47-63
Original file line numberDiff line numberDiff line change
@@ -123,84 +123,56 @@
123123
)
124124

125125

126-
class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine):
127-
"""
128-
This class manages a MultiIndex by mapping label combinations to positive
129-
integers.
126+
class MultiIndexUInt64Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine):
127+
"""Manages a MultiIndex by mapping label combinations to positive integers.
128+
129+
The number of possible label combinations must not overflow a 64-bit integer.
130130
"""
131131

132132
_base = libindex.UInt64Engine
133+
_codes_dtype = "uint64"
133134

134-
def _codes_to_ints(self, codes):
135-
"""
136-
Transform combination(s) of uint64 in one uint64 (each), in a strictly
137-
monotonic way (i.e. respecting the lexicographic order of integer
138-
combinations): see BaseMultiIndexCodesEngine documentation.
139135

140-
Parameters
141-
----------
142-
codes : 1- or 2-dimensional array of dtype uint64
143-
Combinations of integers (one per row)
136+
class MultiIndexUInt32Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt32Engine):
137+
"""Manages a MultiIndex by mapping label combinations to positive integers.
144138
145-
Returns
146-
-------
147-
scalar or 1-dimensional array, of dtype uint64
148-
Integer(s) representing one combination (each).
149-
"""
150-
# Shift the representation of each level by the pre-calculated number
151-
# of bits:
152-
codes <<= self.offsets
139+
The number of possible label combinations must not overflow a 32-bit integer.
140+
"""
153141

154-
# Now sum and OR are in fact interchangeable. This is a simple
155-
# composition of the (disjunct) significant bits of each level (i.e.
156-
# each column in "codes") in a single positive integer:
157-
if codes.ndim == 1:
158-
# Single key
159-
return np.bitwise_or.reduce(codes)
142+
_base = libindex.UInt32Engine
143+
_codes_dtype = "uint32"
160144

161-
# Multiple keys
162-
return np.bitwise_or.reduce(codes, axis=1)
163145

146+
class MultiIndexUInt16Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt16Engine):
147+
"""Manages a MultiIndex by mapping label combinations to positive integers.
164148
165-
class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine):
166-
"""
167-
This class manages those (extreme) cases in which the number of possible
168-
label combinations overflows the 64 bits integers, and uses an ObjectEngine
169-
containing Python integers.
149+
The number of possible label combinations must not overflow a 16-bit integer.
170150
"""
171151

172-
_base = libindex.ObjectEngine
152+
_base = libindex.UInt16Engine
153+
_codes_dtype = "uint16"
173154

174-
def _codes_to_ints(self, codes):
175-
"""
176-
Transform combination(s) of uint64 in one Python integer (each), in a
177-
strictly monotonic way (i.e. respecting the lexicographic order of
178-
integer combinations): see BaseMultiIndexCodesEngine documentation.
179155

180-
Parameters
181-
----------
182-
codes : 1- or 2-dimensional array of dtype uint64
183-
Combinations of integers (one per row)
156+
class MultiIndexUInt8Engine(libindex.BaseMultiIndexCodesEngine, libindex.UInt8Engine):
157+
"""Manages a MultiIndex by mapping label combinations to positive integers.
184158
185-
Returns
186-
-------
187-
int, or 1-dimensional array of dtype object
188-
Integer(s) representing one combination (each).
189-
"""
190-
# Shift the representation of each level by the pre-calculated number
191-
# of bits. Since this can overflow uint64, first make sure we are
192-
# working with Python integers:
193-
codes = codes.astype("object") << self.offsets
159+
The number of possible label combinations must not overflow an 8-bit integer.
160+
"""
194161

195-
# Now sum and OR are in fact interchangeable. This is a simple
196-
# composition of the (disjunct) significant bits of each level (i.e.
197-
# each column in "codes") in a single positive integer (per row):
198-
if codes.ndim == 1:
199-
# Single key
200-
return np.bitwise_or.reduce(codes)
162+
_base = libindex.UInt8Engine
163+
_codes_dtype = "uint8"
201164

202-
# Multiple keys
203-
return np.bitwise_or.reduce(codes, axis=1)
165+
166+
class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine):
167+
"""Manages a MultiIndex by mapping label combinations to positive integers.
168+
169+
This class manages those (extreme) cases in which the number of possible
170+
label combinations overflows the 64 bits integers, and uses an ObjectEngine
171+
containing Python integers.
172+
"""
173+
174+
_base = libindex.ObjectEngine
175+
_codes_dtype = "object"
204176

205177

206178
def names_compat(meth: F) -> F:
@@ -1229,13 +1201,25 @@ def _engine(self):
12291201
# equivalent to sorting lexicographically the codes themselves. Notice
12301202
# that each level needs to be shifted by the number of bits needed to
12311203
# represent the _previous_ ones:
1232-
offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64")
1204+
offsets = np.concatenate([lev_bits[1:], [0]])
1205+
# Downcast the type if possible, to prevent upcasting when shifting codes:
1206+
offsets = offsets.astype(np.min_scalar_type(int(offsets[0])))
12331207

12341208
# Check the total number of bits needed for our representation:
12351209
if lev_bits[0] > 64:
12361210
# The levels would overflow a 64 bit uint - use Python integers:
12371211
return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
1238-
return MultiIndexUIntEngine(self.levels, self.codes, offsets)
1212+
if lev_bits[0] > 32:
1213+
# The levels would overflow a 32 bit uint - use uint64
1214+
return MultiIndexUInt64Engine(self.levels, self.codes, offsets)
1215+
if lev_bits[0] > 16:
1216+
# The levels would overflow a 16 bit uint - use uint32
1217+
return MultiIndexUInt32Engine(self.levels, self.codes, offsets)
1218+
if lev_bits[0] > 8:
1219+
# The levels would overflow a 8 bit uint - use uint16
1220+
return MultiIndexUInt16Engine(self.levels, self.codes, offsets)
1221+
# The levels fit in an 8 bit uint - use uint8
1222+
return MultiIndexUInt8Engine(self.levels, self.codes, offsets)
12391223

12401224
# Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return
12411225
# type "Type[MultiIndex]" in supertype "Index"

pandas/tests/indexes/multi/test_indexing.py

+26-15
Original file line numberDiff line numberDiff line change
@@ -919,30 +919,41 @@ def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_id
919919
assert result == expected
920920

921921

922-
def test_pyint_engine():
922+
@pytest.mark.parametrize(
923+
"N, expected_dtype",
924+
[
925+
(1, "uint8"), # 2*4*N = 8
926+
(2, "uint16"), # 2*4*N = 16
927+
(4, "uint32"), # 2*4*N = 32
928+
(8, "uint64"), # 2*4*N = 64
929+
(10, "object"), # 2*4*N = 80
930+
],
931+
)
932+
def test_pyint_engine(N, expected_dtype):
923933
# GH#18519 : when combinations of codes cannot be represented in 64
924934
# bits, the index underlying the MultiIndex engine works with Python
925935
# integers, rather than uint64.
926-
N = 5
927936
keys = [
928937
tuple(arr)
929938
for arr in [
930-
[0] * 10 * N,
931-
[1] * 10 * N,
932-
[2] * 10 * N,
933-
[np.nan] * N + [2] * 9 * N,
934-
[0] * N + [2] * 9 * N,
935-
[np.nan] * N + [2] * 8 * N + [0] * N,
939+
[0] * 4 * N,
940+
[1] * 4 * N,
941+
[np.nan] * N + [0] * 3 * N,
942+
[0] * N + [1] * 3 * N,
943+
[np.nan] * N + [1] * 2 * N + [0] * N,
936944
]
937945
]
938-
# Each level contains 4 elements (including NaN), so it is represented
939-
# in 2 bits, for a total of 2*N*10 = 100 > 64 bits. If we were using a
940-
# 64 bit engine and truncating the first levels, the fourth and fifth
941-
# keys would collide; if truncating the last levels, the fifth and
942-
# sixth; if rotating bits rather than shifting, the third and fifth.
946+
# Each level contains 3 elements (NaN, 0, 1), and it's represented
947+
# in 2 bits to store 4 possible values (0=notfound, 1=NaN, 2=0, 3=1), for
948+
# a total of 2*N*4 = 80 > 64 bits where N=10 and the number of levels is N*4.
949+
# If we were using a 64 bit engine and truncating the first levels, the
950+
# fourth and fifth keys would collide; if truncating the last levels, the
951+
# fifth and sixth; if rotating bits rather than shifting, the third and fifth.
952+
953+
index = MultiIndex.from_tuples(keys)
954+
assert index._engine.values.dtype == expected_dtype
943955

944956
for idx, key_value in enumerate(keys):
945-
index = MultiIndex.from_tuples(keys)
946957
assert index.get_loc(key_value) == idx
947958

948959
expected = np.arange(idx + 1, dtype=np.intp)
@@ -952,7 +963,7 @@ def test_pyint_engine():
952963
# With missing key:
953964
idces = range(len(keys))
954965
expected = np.array([-1] + list(idces), dtype=np.intp)
955-
missing = tuple([0, 1] * 5 * N)
966+
missing = tuple([0, 1, 0, 1] * N)
956967
result = index.get_indexer([missing] + [keys[i] for i in idces])
957968
tm.assert_numpy_array_equal(result, expected)
958969

0 commit comments

Comments
 (0)