Avoid zipping and unzipping level values to improve the speed of MultiIndex get_indexer

qiuwei · qiuwei · commit 5dba92819e5d · 2021-04-27T15:42:21.000+08:00
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -584,7 +584,7 @@ cdef class BaseMultiIndexCodesEngine:
     def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
         raise NotImplementedError("Implemented by subclass")
 
-    def _extract_level_codes(self, ndarray[object] target) -> np.ndarray:
+    def _extract_level_codes(self, list target) -> np.ndarray:
         """
         Map the requested list of (tuple) keys to their integer representations
         for searching in the underlying integer index.
@@ -599,11 +599,12 @@ cdef class BaseMultiIndexCodesEngine:
         int_keys : 1-dimensional array of dtype uint64 or object
             Integers representing one combination each
         """
-        level_codes = [lev.get_indexer(codes) + 1 for lev, codes
-                       in zip(self.levels, zip(*target))]
+        n_levels = len(self.levels)
+        level_codes = [self.levels[i].get_indexer(target[i]) + 1
+                       for i in range(n_levels)]
         return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
 
-    def get_indexer(self, ndarray[object] target) -> np.ndarray:
+    def get_indexer(self, list target) -> np.ndarray:
         """
         Returns an array giving the positions of each value of `target` in
         `self.values`, where -1 represents a value in `target` which does not
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -58,11 +58,7 @@
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import ExtensionDtype
-from pandas.core.dtypes.generic import (
-    ABCDataFrame,
-    ABCDatetimeIndex,
-    ABCTimedeltaIndex,
-)
+from pandas.core.dtypes.generic import ABCDataFrame
 from pandas.core.dtypes.missing import (
     array_equivalent,
     isna,
@@ -696,24 +692,25 @@ def from_frame(cls, df: DataFrame, sortorder=None, names=None) -> MultiIndex:
     @cache_readonly
     def _values(self) -> np.ndarray:
         # We override here, since our parent uses _data, which we don't use.
+        arr = lib.fast_zip(self._list_values)
+        return arr
+
+    @cache_readonly
+    def _list_values(self) -> list:
         values = []
 
         for i in range(self.nlevels):
             vals = self._get_level_values(i)
             if is_categorical_dtype(vals.dtype):
                 vals = cast("CategoricalIndex", vals)
                 vals = vals._data._internal_get_values()
-            if isinstance(vals.dtype, ExtensionDtype) or isinstance(
-                vals, (ABCDatetimeIndex, ABCTimedeltaIndex)
-            ):
+            if isinstance(vals.dtype, ExtensionDtype) or hasattr(vals, "_box_values"):
                 vals = vals.astype(object)
             # error: Incompatible types in assignment (expression has type "ndarray",
             # variable has type "Index")
             vals = np.array(vals, copy=False)  # type: ignore[assignment]
             values.append(vals)
-
-        arr = lib.fast_zip(values)
-        return arr
+        return values
 
     @property
     def values(self) -> np.ndarray:
@@ -2709,7 +2706,7 @@ def _get_indexer(
                 "for MultiIndex; see GitHub issue 9365"
             )
         else:
-            indexer = self._engine.get_indexer(target._values)
+            indexer = self._engine.get_indexer(target._list_values)
 
         # Note: we only get here (in extant tests at least) with
         #  target.nlevels == self.nlevels