PERF: avoid repeating checks in interpolation (pandas-dev#42963)

jbrockmendel · feefladder · commit 63cc5205b198 · 2021-09-07T07:54:12.000+02:00
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
@@ -215,7 +215,7 @@ def interpolate_array_2d(
     **kwargs,
 ):
     """
-    Wrapper to dispatch to either interpolate_2d or interpolate_2d_with_fill.
+    Wrapper to dispatch to either interpolate_2d or _interpolate_2d_with_fill.
     """
     try:
         m = clean_fill_method(method)
@@ -237,7 +237,7 @@ def interpolate_array_2d(
     else:
         assert index is not None  # for mypy
 
-        interp_values = interpolate_2d_with_fill(
+        interp_values = _interpolate_2d_with_fill(
             data=data,
             index=index,
             axis=axis,
@@ -251,7 +251,7 @@ def interpolate_array_2d(
     return interp_values
 
 
-def interpolate_2d_with_fill(
+def _interpolate_2d_with_fill(
     data: np.ndarray,  # floating dtype
     index: Index,
     axis: int,
@@ -263,11 +263,11 @@ def interpolate_2d_with_fill(
     **kwargs,
 ) -> np.ndarray:
     """
-    Column-wise application of interpolate_1d.
+    Column-wise application of _interpolate_1d.
 
     Notes
     -----
-    The signature does differs from interpolate_1d because it only
+    The signature differs from _interpolate_1d because it only
     includes what is needed for Block.interpolate.
     """
     # validate the interp method
@@ -276,13 +276,44 @@ def interpolate_2d_with_fill(
     if is_valid_na_for_dtype(fill_value, data.dtype):
         fill_value = na_value_for_dtype(data.dtype, compat=False)
 
+    if method == "time":
+        if not needs_i8_conversion(index.dtype):
+            raise ValueError(
+                "time-weighted interpolation only works "
+                "on Series or DataFrames with a "
+                "DatetimeIndex"
+            )
+        method = "values"
+
+    valid_limit_directions = ["forward", "backward", "both"]
+    limit_direction = limit_direction.lower()
+    if limit_direction not in valid_limit_directions:
+        raise ValueError(
+            "Invalid limit_direction: expecting one of "
+            f"{valid_limit_directions}, got '{limit_direction}'."
+        )
+
+    if limit_area is not None:
+        valid_limit_areas = ["inside", "outside"]
+        limit_area = limit_area.lower()
+        if limit_area not in valid_limit_areas:
+            raise ValueError(
+                f"Invalid limit_area: expecting one of {valid_limit_areas}, got "
+                f"{limit_area}."
+            )
+
+    # default limit is unlimited GH #16282
+    limit = algos.validate_limit(nobs=None, limit=limit)
+
+    indices = _index_to_interp_indices(index, method)
+
     def func(yvalues: np.ndarray) -> np.ndarray:
         # process 1-d slices in the axis direction, returning it
 
         # should the axis argument be handled below in apply_along_axis?
-        # i.e. not an arg to interpolate_1d
-        return interpolate_1d(
-            xvalues=index,
+        # i.e. not an arg to _interpolate_1d
+        return _interpolate_1d(
+            indices=indices,
             yvalues=yvalues,
             method=method,
             limit=limit,
@@ -297,8 +328,30 @@ def func(yvalues: np.ndarray) -> np.ndarray:
     return np.apply_along_axis(func, axis, data)
 
 
-def interpolate_1d(
-    xvalues: Index,
+def _index_to_interp_indices(index: Index, method: str) -> np.ndarray:
+    """
+    Convert Index to ndarray of indices to pass to NumPy/SciPy.
+    """
+    xarr = index._values
+    if needs_i8_conversion(xarr.dtype):
+        # GH#1646: view as int64 (e.g. for dt64tz) so NumPy/SciPy get numbers
+        xarr = xarr.view("i8")
+
+    if method == "linear":
+        inds = xarr
+        inds = cast(np.ndarray, inds)
+    else:
+        inds = np.asarray(xarr)
+
+        if method in ("values", "index"):
+            if inds.dtype == np.object_:
+                inds = lib.maybe_convert_objects(inds)
+
+    return inds
+
+
+def _interpolate_1d(
+    indices: np.ndarray,
     yvalues: np.ndarray,
     method: str | None = "linear",
     limit: int | None = None,
@@ -311,51 +364,23 @@ def interpolate_1d(
 ):
     """
     Logic for the 1-d interpolation.  The result should be 1-d, inputs
-    xvalues and yvalues will each be 1-d arrays of the same length.
+    indices and yvalues will each be 1-d arrays of the same length.
 
     Bounds_error is currently hardcoded to False since non-scipy ones don't
     take it as an argument.
     """
+
     invalid = isna(yvalues)
     valid = ~invalid
 
     if not valid.any():
-        result = np.empty(xvalues.shape, dtype=np.float64)
+        result = np.empty(indices.shape, dtype=np.float64)
         result.fill(np.nan)
         return result
 
     if valid.all():
         return yvalues
 
-    if method == "time":
-        if not needs_i8_conversion(xvalues.dtype):
-            raise ValueError(
-                "time-weighted interpolation only works "
-                "on Series or DataFrames with a "
-                "DatetimeIndex"
-            )
-        method = "values"
-
-    valid_limit_directions = ["forward", "backward", "both"]
-    limit_direction = limit_direction.lower()
-    if limit_direction not in valid_limit_directions:
-        raise ValueError(
-            "Invalid limit_direction: expecting one of "
-            f"{valid_limit_directions}, got '{limit_direction}'."
-        )
-
-    if limit_area is not None:
-        valid_limit_areas = ["inside", "outside"]
-        limit_area = limit_area.lower()
-        if limit_area not in valid_limit_areas:
-            raise ValueError(
-                f"Invalid limit_area: expecting one of {valid_limit_areas}, got "
-                f"{limit_area}."
-            )
-
-    # default limit is unlimited GH #16282
-    limit = algos.validate_limit(nobs=None, limit=limit)
-
     # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
     all_nans = set(np.flatnonzero(invalid))
 
@@ -369,8 +394,6 @@ def interpolate_1d(
         last_valid_index = len(yvalues)
     end_nans = set(range(1 + last_valid_index, len(valid)))
 
-    mid_nans = all_nans - start_nans - end_nans
-
     # Like the sets above, preserve_nans contains indices of invalid values,
     # but in this case, it is the final set of indices that need to be
     # preserved as NaN after the interpolation.
@@ -396,44 +419,26 @@ def interpolate_1d(
         preserve_nans |= start_nans | end_nans
     elif limit_area == "outside":
         # preserve NaNs on the inside
+        mid_nans = all_nans - start_nans - end_nans
         preserve_nans |= mid_nans
 
     # sort preserve_nans and convert to list
     preserve_nans = sorted(preserve_nans)
 
     result = yvalues.copy()
 
-    # xarr to pass to NumPy/SciPy
-    xarr = xvalues._values
-    if needs_i8_conversion(xarr.dtype):
-        # GH#1646 for dt64tz
-        xarr = xarr.view("i8")
-
-    if method == "linear":
-        inds = xarr
-    else:
-        inds = np.asarray(xarr)
-
-        if method in ("values", "index"):
-            if inds.dtype == np.object_:
-                inds = lib.maybe_convert_objects(inds)
-
     if method in NP_METHODS:
         # np.interp requires sorted X values, #21037
 
-        # error: Argument 1 to "argsort" has incompatible type "Union[ExtensionArray,
-        # Any]"; expected "Union[Union[int, float, complex, str, bytes, generic],
-        # Sequence[Union[int, float, complex, str, bytes, generic]],
-        # Sequence[Sequence[Any]], _SupportsArray]"
-        indexer = np.argsort(inds[valid])  # type: ignore[arg-type]
+        indexer = np.argsort(indices[valid])
         result[invalid] = np.interp(
-            inds[invalid], inds[valid][indexer], yvalues[valid][indexer]
+            indices[invalid], indices[valid][indexer], yvalues[valid][indexer]
         )
     else:
         result[invalid] = _interpolate_scipy_wrapper(
-            inds[valid],
+            indices[valid],
             yvalues[valid],
-            inds[invalid],
+            indices[invalid],
             method=method,
             fill_value=fill_value,
             bounds_error=bounds_error,