pandas-dev · simonjayhawkins · Jun 12, 2020 · Jun 12, 2020 · Jun 12, 2020 · Jun 14, 2020
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
@@ -2,7 +2,7 @@
 Routines for filling missing data.
 """
 
-from typing import Any, List, Optional, Set, Union
+from typing import Any, Optional
 
 import numpy as np
 
@@ -230,41 +230,26 @@ def interpolate_1d(
     # default limit is unlimited GH #16282
     limit = algos._validate_limit(nobs=None, limit=limit)
 
-    # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
-    all_nans = set(np.flatnonzero(invalid))
-    start_nans = set(range(find_valid_index(yvalues, "first")))
-    end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))
-    mid_nans = all_nans - start_nans - end_nans
-
-    # Like the sets above, preserve_nans contains indices of invalid values,
-    # but in this case, it is the final set of indices that need to be
-    # preserved as NaN after the interpolation.
-
-    # For example if limit_direction='forward' then preserve_nans will
-    # contain indices of NaNs at the beginning of the series, and NaNs that
-    # are more than'limit' away from the prior non-NaN.
-
-    # set preserve_nans based on direction using _interp_limit
-    preserve_nans: Union[List, Set]
     if limit_direction == "forward":
-        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
+        nans_to_interpolate = _interp_limit(invalid, limit, 0)
     elif limit_direction == "backward":
-        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
+        nans_to_interpolate = _interp_limit(invalid, 0, limit)
     else:
         # both directions... just use _interp_limit
-        preserve_nans = set(_interp_limit(invalid, limit, limit))
+        nans_to_interpolate = _interp_limit(invalid, limit, limit)
 
     # if limit_area is set, add either mid or outside indices
     # to preserve_nans GH #16284
-    if limit_area == "inside":
-        # preserve NaNs on the outside
-        preserve_nans |= start_nans | end_nans
-    elif limit_area == "outside":
-        # preserve NaNs on the inside
-        preserve_nans |= mid_nans
-
-    # sort preserve_nans and covert to list
-    preserve_nans = sorted(preserve_nans)
+    if limit_area:
+        first = find_valid_index(yvalues, "first")
+        last = find_valid_index(yvalues, "last")
+        if limit_area == "inside":
+            # preserve NaNs on the outside
+            nans_to_interpolate[:first] = False
+            nans_to_interpolate[last + 1 :] = False
+        else:
+            # preserve NaNs on the inside
+            nans_to_interpolate[first : last + 1] = False
 
     yvalues = getattr(yvalues, "values", yvalues)
     result = yvalues.copy()
@@ -288,22 +273,21 @@ def interpolate_1d(
     if method in NP_METHODS:
         # np.interp requires sorted X values, #21037
         indexer = np.argsort(inds[valid])
-        result[invalid] = np.interp(
-            inds[invalid], inds[valid][indexer], yvalues[valid][indexer]
+        result[nans_to_interpolate] = np.interp(
+            inds[nans_to_interpolate], inds[valid][indexer], yvalues[valid][indexer]
         )
     else:
-        result[invalid] = _interpolate_scipy_wrapper(
+        result[nans_to_interpolate] = _interpolate_scipy_wrapper(
             inds[valid],
             yvalues[valid],
-            inds[invalid],
+            inds[nans_to_interpolate],
             method=method,
             fill_value=fill_value,
             bounds_error=bounds_error,
             order=order,
             **kwargs,
         )
 
-    result[preserve_nans] = np.nan
     return result
 
 
@@ -666,10 +650,11 @@ def clean_reindex_fill_method(method):
     return clean_fill_method(method, allow_nearest=True)
 
 
-def _interp_limit(invalid, fw_limit, bw_limit):
+def _interp_limit(
+    invalid: np.ndarray, fw_limit: Optional[int], bw_limit: Optional[int]
+) -> np.ndarray:
     """
-    Get indexers of values that won't be filled
-    because they exceed the limits.
+    Update mask to exclude elements not within limits
 
     Parameters
     ----------
@@ -681,68 +666,121 @@ def _interp_limit(invalid, fw_limit, bw_limit):
 
     Returns
     -------
-    set of indexers
+    boolean ndarray
 
     Notes
     -----
-    This is equivalent to the more readable, but slower
+    There follows a description of the implementation used for creating a mask
+    for forward interpolation with a limit. To create a backwards fill, we first
+    reverse the array and use the same algorithm.
+    To fill in both directions we combine the masks from both forward and backwards
+    fills.
 
-    .. code-block:: python
+    Say we start with the following array
 
-        def _interp_limit(invalid, fw_limit, bw_limit):
-            for x in np.where(invalid)[0]:
-                if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
-                    yield x
-    """
-    # handle forward first; the backward direction is the same except
-    # 1. operate on the reversed array
-    # 2. subtract the returned indices from N - 1
-    N = len(invalid)
-    f_idx = set()
-    b_idx = set()
-
-    def inner(invalid, limit):
-        limit = min(limit, N)
-        windowed = _rolling_window(invalid, limit + 1).all(1)
-        idx = set(np.where(windowed)[0] + limit) | set(
-            np.where((~invalid[: limit + 1]).cumsum() == 0)[0]
-        )
-        return idx
+    array([nan, nan,  1.,  3., nan, nan, nan, 11., nan, nan])
 
-    if fw_limit is not None:
+    create (or get from masked arrays) a boolean array of missing values
 
-        if fw_limit == 0:
-            f_idx = set(np.where(invalid)[0])
-        else:
-            f_idx = inner(invalid, fw_limit)
+    >>> arr = pd.core.missing.isna(arr)
+    >>> arr
+    array([ True,  True, False, False,  True,  True,  True, False,  True,
+            True])
 
-    if bw_limit is not None:
+    we convert the boolean array to integer array for counting the streaks
 
-        if bw_limit == 0:
-            # then we don't even need to care about backwards
-            # just use forwards
-            return f_idx
-        else:
-            b_idx = list(inner(invalid[::-1], bw_limit))
-            b_idx = set(N - 1 - np.asarray(b_idx))
-            if fw_limit == 0:
-                return b_idx
+    >>> arr = arr.astype(int)
+    >>> arr
+    array([1, 1, 0, 0, 1, 1, 1, 0, 1, 1])
 
-    return f_idx & b_idx
+    cumsum will get us off to a good start, we store this as we will need this later
 
+    >>> cumsum = arr.cumsum()
+    >>> cumsum
+    array([1, 2, 2, 2, 3, 4, 5, 5, 6, 7], dtype=int32)
 
-def _rolling_window(a, window):
-    """
-    [True, True, False, True, False], 2 ->
+    multiplying this accumulation with the original array of ones to get non-zero
+    values where we originally had ones
 
-    [
-        [True,  True],
-        [True, False],
-        [False, True],
-        [True, False],
-    ]
+    >>> arr = cumsum * arr
+    >>> arr
+    array([1, 2, 0, 0, 3, 4, 5, 0, 6, 7])
+
+    the previous result is close to what we want, but we want to restart
+    each streak at one. start by using the diff method to subtract the previous
+    value for each element
+
+    >>> arr = np.diff(arr, prepend=0)
+    >>> arr
+    array([ 1,  1, -2,  0,  3,  1,  1, -5,  6,  1])
+
+    a negative value now represents the end of a streak of missing values
+    so let's first select just the negative values
+
+    >>> arr = np.where(arr < 0, arr, 0)
+    >>> arr
+    array([ 0,  0, -2,  0,  0,  0,  0, -5,  0,  0])
+
+    we will need to propagate the negative values
+
+    >>> arr = np.minimum.accumulate(arr)
+    >>> arr
+    array([ 0,  0, -2, -2, -2, -2, -2, -5, -5, -5], dtype=int32)
+
+    and then subtract the excess accumulation
+
+    >>> arr = arr + cumsum
+    >>> arr
+    array([1, 2, 0, 0, 1, 2, 3, 0, 1, 2], dtype=int32)
+
+    remember that positive values represent missing values and zeros represent
+    valid values. We have an array with some missing values at the start. For a
+    forward fill algorithm, we want to update the mask to leave these missing
+    values unchanged.
+
+    >>> arr[: arr.argmin()] = 0
+    >>> arr
+    array([0, 0, 0, 0, 1, 2, 3, 0, 1, 2], dtype=int32)
+
+    we will now select only values within a set limit, say 2
+
+    >>> arr = np.where(arr > 2, 0, arr)
+    >>> arr
+    array([0, 0, 0, 0, 1, 2, 0, 0, 1, 2], dtype=int32)
+
+    and finally convert back into a boolean mask
+
+    >>> arr.astype(bool)
+    array([False, False, False, False,  True,  True, False, False,  True,
+            True])
     """
-    # https://stackoverflow.com/a/6811241
-    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
-    strides = a.strides + (a.strides[-1],)
-    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
+
+    def inner(arr, limit):
+        arr = arr.astype(int)
+        arr[: arr.argmin()] = 0
+        if limit:
+            cumsum = arr.cumsum()
+            arr = cumsum * arr
+            arr = np.diff(arr)
+            arr = np.pad(arr, (1, 0))
+            arr = np.where(arr < 0, arr, 0)
+            arr = np.minimum.accumulate(arr)
+            arr = arr + cumsum
+            arr = np.where(arr > limit, 0, arr)
+        return arr.astype(bool)
+
+    if fw_limit == 0:
+        f_idx = invalid
+    else:
+        f_idx = inner(invalid, fw_limit)
+
+    if bw_limit == 0:
+        # then we don't even need to care about backwards
+        # just use forwards
+        return f_idx
+    else:
+        b_idx = inner(invalid[::-1], bw_limit)[::-1]
+        if fw_limit == 0:
+            return b_idx
+
+    return f_idx | b_idx