From f0485a1ce0c6537c70778ee3d599bab6cb56b6de Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 12 Jun 2020 08:30:50 +0100 Subject: [PATCH 1/5] PERF: remove use of Python sets for interpolate --- pandas/core/missing.py | 146 +++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 94 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index d8671616f944e..d0be1c07af873 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -2,7 +2,7 @@ Routines for filling missing data. """ -from typing import Any, List, Optional, Set, Union +from typing import Any, Optional import numpy as np @@ -225,41 +225,25 @@ def interpolate_1d( # default limit is unlimited GH #16282 limit = algos._validate_limit(nobs=None, limit=limit) - # These are sets of index pointers to invalid values... i.e. {0, 1, etc... - all_nans = set(np.flatnonzero(invalid)) - start_nans = set(range(find_valid_index(yvalues, "first"))) - end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) - mid_nans = all_nans - start_nans - end_nans - - # Like the sets above, preserve_nans contains indices of invalid values, - # but in this case, it is the final set of indices that need to be - # preserved as NaN after the interpolation. - - # For example if limit_direction='forward' then preserve_nans will - # contain indices of NaNs at the beginning of the series, and NaNs that - # are more than'limit' away from the prior non-NaN. 
- - # set preserve_nans based on direction using _interp_limit - preserve_nans: Union[List, Set] + first = find_valid_index(yvalues, "first") + last = find_valid_index(yvalues, "last") if limit_direction == "forward": - preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) + nans_to_interpolate = _interp_limit(invalid, limit, 0, first, last) elif limit_direction == "backward": - preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) + nans_to_interpolate = _interp_limit(invalid, 0, limit, first, last) else: # both directions... just use _interp_limit - preserve_nans = set(_interp_limit(invalid, limit, limit)) + nans_to_interpolate = _interp_limit(invalid, limit, limit, first, last) # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 if limit_area == "inside": # preserve NaNs on the outside - preserve_nans |= start_nans | end_nans + nans_to_interpolate[:first] = False + nans_to_interpolate[last + 1 :] = False elif limit_area == "outside": # preserve NaNs on the inside - preserve_nans |= mid_nans - - # sort preserve_nans and covert to list - preserve_nans = sorted(preserve_nans) + nans_to_interpolate[first : last + 1] = False xvalues = getattr(xvalues, "values", xvalues) yvalues = getattr(yvalues, "values", yvalues) @@ -277,10 +261,9 @@ def interpolate_1d( inds = xvalues # np.interp requires sorted X values, #21037 indexer = np.argsort(inds[valid]) - result[invalid] = np.interp( - inds[invalid], inds[valid][indexer], yvalues[valid][indexer] + result[nans_to_interpolate] = np.interp( + inds[nans_to_interpolate], inds[valid][indexer], yvalues[valid][indexer] ) - result[preserve_nans] = np.nan return result sp_methods = [ @@ -305,17 +288,16 @@ def interpolate_1d( # hack for DatetimeIndex, #1646 if issubclass(inds.dtype.type, np.datetime64): inds = inds.view(np.int64) - result[invalid] = _interpolate_scipy_wrapper( + result[nans_to_interpolate] = _interpolate_scipy_wrapper( inds[valid], yvalues[valid], - 
inds[invalid], + inds[nans_to_interpolate], method=method, fill_value=fill_value, bounds_error=bounds_error, order=order, **kwargs, ) - result[preserve_nans] = np.nan return result @@ -678,10 +660,15 @@ def clean_reindex_fill_method(method): return clean_fill_method(method, allow_nearest=True) -def _interp_limit(invalid, fw_limit, bw_limit): +def _interp_limit( + invalid: np.ndarray, + fw_limit: Optional[int], + bw_limit: Optional[int], + first: int, + last: int, +) -> np.ndarray: """ - Get indexers of values that won't be filled - because they exceed the limits. + Update mask to exclude elements not within limits Parameters ---------- @@ -690,71 +677,42 @@ def _interp_limit(invalid, fw_limit, bw_limit): forward limit to index bw_limit : int or None backward limit to index + first: int + first valid index + last: int + last valid index Returns ------- - set of indexers - - Notes - ----- - This is equivalent to the more readable, but slower - - .. code-block:: python - - def _interp_limit(invalid, fw_limit, bw_limit): - for x in np.where(invalid)[0]: - if invalid[max(0, x - fw_limit):x + bw_limit + 1].all(): - yield x + boolean ndarray """ - # handle forward first; the backward direction is the same except - # 1. operate on the reversed array - # 2. 
subtract the returned indices from N - 1 - N = len(invalid) - f_idx = set() - b_idx = set() - - def inner(invalid, limit): - limit = min(limit, N) - windowed = _rolling_window(invalid, limit + 1).all(1) - idx = set(np.where(windowed)[0] + limit) | set( - np.where((~invalid[: limit + 1]).cumsum() == 0)[0] - ) - return idx - if fw_limit is not None: + def inner(arr, limit): + if limit is None: + return arr.copy() + arr = arr.astype(int) + cumsum = arr.cumsum() + arr = cumsum * arr + arr = np.diff(arr, prepend=0) + arr = np.where(arr < 0, arr, 0) + arr = np.minimum.accumulate(arr) + arr = arr + cumsum + return np.where(arr > limit, 0, arr).astype(bool) + + if fw_limit == 0: + f_idx = invalid + else: + f_idx = inner(invalid, fw_limit) + f_idx[:first] = False + if bw_limit == 0: + # then we don't even need to care about backwards + # just use forwards + return f_idx + else: + b_idx = inner(invalid[::-1], bw_limit)[::-1] + b_idx[last + 1 :] = False if fw_limit == 0: - f_idx = set(np.where(invalid)[0]) - else: - f_idx = inner(invalid, fw_limit) - - if bw_limit is not None: - - if bw_limit == 0: - # then we don't even need to care about backwards - # just use forwards - return f_idx - else: - b_idx = list(inner(invalid[::-1], bw_limit)) - b_idx = set(N - 1 - np.asarray(b_idx)) - if fw_limit == 0: - return b_idx - - return f_idx & b_idx + return b_idx - -def _rolling_window(a, window): - """ - [True, True, False, True, False], 2 -> - - [ - [True, True], - [True, False], - [False, True], - [True, False], - ] - """ - # https://stackoverflow.com/a/6811241 - shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) - strides = a.strides + (a.strides[-1],) - return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) + return f_idx | b_idx From e707c3db4fc5e544d712d2795cb9ddfb60dcec98 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 12 Jun 2020 16:01:50 +0100 Subject: [PATCH 2/5] add implementation notes --- pandas/core/missing.py | 77 
++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index d0be1c07af873..99453b75167e4 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -685,6 +685,83 @@ def _interp_limit( Returns ------- boolean ndarray + + Notes + ----- + There follows a description of the implementation used for creating a mask + for forward interpolation with a limit. To create a backwards fill, we first + reverse the array and use the same algorithm. + To fill in both directions we combine the masks from both forward and backwards + fills. + + Say we start with the following array + + array([nan, nan, 1., 3., nan, nan, nan, 11., nan, nan]) + + create (or get from masked arrays) a boolean array of missing values + + >>> arr = pd.core.missing.isna(arr) + >>> arr + array([ True, True, False, False, True, True, True, False, True, + True]) + + we convert the boolean array to integer array for counting the streaks + + >>> arr = arr.astype(int) + >>> arr + array([1, 1, 0, 0, 1, 1, 1, 0, 1, 1]) + + cumsum will get us off to a good start, we store this as we will need this later + + >>> cumsum = arr.cumsum() + >>> cumsum + array([1, 2, 2, 2, 3, 4, 5, 5, 6, 7], dtype=int32) + + multiplying this accumulation with the original array of ones to get non-zero + values where we originally had ones + + >>> arr = cumsum * arr + >>> arr + array([1, 2, 0, 0, 3, 4, 5, 0, 6, 7]) + + the previous result is close to what we want, but we want to restart + each streak at one. 
start by using the diff method to subtract the previous + value for each element + + >>> arr = np.diff(arr, prepend=0) + >>> arr + array([ 1, 1, -2, 0, 3, 1, 1, -5, 6, 1]) + + a negative value now represents the end of a streak of missing values + so let's first select just the negative values + + >>> arr = np.where(arr < 0, arr, 0) + >>> arr + array([ 0, 0, -2, 0, 0, 0, 0, -5, 0, 0]) + + we will need to propagate the negative values + + >>> arr = np.minimum.accumulate(arr) + >>> arr + array([ 0, 0, -2, -2, -2, -2, -2, -5, -5, -5], dtype=int32) + + and then subtract the excess accumulation + + >>> arr = arr + cumsum + >>> arr + array([1, 2, 0, 0, 1, 2, 3, 0, 1, 2], dtype=int32) + + we will now select only values within a set limit, say 2 + + >>> arr = np.where(arr > 2, 0, arr) + >>> arr + array([1, 2, 0, 0, 1, 2, 0, 0, 1, 2], dtype=int32) + + and finally convert back into a boolean mask + + >>> arr.astype(bool) + array([ True, True, False, False, True, True, False, False, True, + True]) """ def inner(arr, limit): From 1702ef50b63086b01e5b7aa4254d631d71b91113 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 12 Jun 2020 17:01:11 +0100 Subject: [PATCH 3/5] avoid passing first and last to _interp_limit --- pandas/core/missing.py | 69 +++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 99453b75167e4..a8ac8d008a8c8 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -225,25 +225,26 @@ def interpolate_1d( # default limit is unlimited GH #16282 limit = algos._validate_limit(nobs=None, limit=limit) - first = find_valid_index(yvalues, "first") - last = find_valid_index(yvalues, "last") if limit_direction == "forward": - nans_to_interpolate = _interp_limit(invalid, limit, 0, first, last) + nans_to_interpolate = _interp_limit(invalid, limit, 0) elif limit_direction == "backward": - nans_to_interpolate = _interp_limit(invalid, 0, limit, first, last) 
+ nans_to_interpolate = _interp_limit(invalid, 0, limit) else: # both directions... just use _interp_limit - nans_to_interpolate = _interp_limit(invalid, limit, limit, first, last) + nans_to_interpolate = _interp_limit(invalid, limit, limit) # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 - if limit_area == "inside": - # preserve NaNs on the outside - nans_to_interpolate[:first] = False - nans_to_interpolate[last + 1 :] = False - elif limit_area == "outside": - # preserve NaNs on the inside - nans_to_interpolate[first : last + 1] = False + if limit_area: + first = find_valid_index(yvalues, "first") + last = find_valid_index(yvalues, "last") + if limit_area == "inside": + # preserve NaNs on the outside + nans_to_interpolate[:first] = False + nans_to_interpolate[last + 1 :] = False + else: + # preserve NaNs on the inside + nans_to_interpolate[first : last + 1] = False xvalues = getattr(xvalues, "values", xvalues) yvalues = getattr(yvalues, "values", yvalues) @@ -661,11 +662,7 @@ def clean_reindex_fill_method(method): def _interp_limit( - invalid: np.ndarray, - fw_limit: Optional[int], - bw_limit: Optional[int], - first: int, - last: int, + invalid: np.ndarray, fw_limit: Optional[int], bw_limit: Optional[int] ) -> np.ndarray: """ Update mask to exclude elements not within limits @@ -677,10 +674,6 @@ def _interp_limit( forward limit to index bw_limit : int or None backward limit to index - first: int - first valid index - last: int - last valid index Returns ------- @@ -751,36 +744,45 @@ def _interp_limit( >>> arr array([1, 2, 0, 0, 1, 2, 3, 0, 1, 2], dtype=int32) + remember that positive values represent missing values and zeros represent + valid values. We have an array with some missing values at the start. For a + forward fill algorithm, we want to update the mask to leave these missing + values unchanged. 
+ + >>> arr[: arr.argmin()] = 0 + >>> arr + array([0, 0, 0, 0, 1, 2, 3, 0, 1, 2], dtype=int32) + we will now select only values within a set limit, say 2 >>> arr = np.where(arr > 2, 0, arr) >>> arr - array([1, 2, 0, 0, 1, 2, 0, 0, 1, 2], dtype=int32) + array([0, 0, 0, 0, 1, 2, 0, 0, 1, 2], dtype=int32) and finally convert back into a boolean mask >>> arr.astype(bool) - array([ True, True, False, False, True, True, False, False, True, + array([ False, False, False, False, True, True, False, False, True, True]) """ def inner(arr, limit): - if limit is None: - return arr.copy() arr = arr.astype(int) - cumsum = arr.cumsum() - arr = cumsum * arr - arr = np.diff(arr, prepend=0) - arr = np.where(arr < 0, arr, 0) - arr = np.minimum.accumulate(arr) - arr = arr + cumsum - return np.where(arr > limit, 0, arr).astype(bool) + arr[: arr.argmin()] = 0 + if limit: + cumsum = arr.cumsum() + arr = cumsum * arr + arr = np.diff(arr, prepend=0) + arr = np.where(arr < 0, arr, 0) + arr = np.minimum.accumulate(arr) + arr = arr + cumsum + arr = np.where(arr > limit, 0, arr) + return arr.astype(bool) if fw_limit == 0: f_idx = invalid else: f_idx = inner(invalid, fw_limit) - f_idx[:first] = False if bw_limit == 0: # then we don't even need to care about backwards @@ -788,7 +790,6 @@ def inner(arr, limit): return f_idx else: b_idx = inner(invalid[::-1], bw_limit)[::-1] - b_idx[last + 1 :] = False if fw_limit == 0: return b_idx From 0bcccb7957a3e14c1bd443671e1992085fcfc601 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 14 Jun 2020 19:32:22 +0100 Subject: [PATCH 4/5] update for older numpy --- pandas/core/missing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 0ad9b8464c927..3ddc4172accfd 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -761,7 +761,8 @@ def inner(arr, limit): if limit: cumsum = arr.cumsum() arr = cumsum * arr - arr = np.diff(arr, prepend=0) + arr = np.diff(arr) + arr = 
np.pad(arr, (1, 0)) arr = np.where(arr < 0, arr, 0) arr = np.minimum.accumulate(arr) arr = arr + cumsum From 04fc8cb2fae0588ea9dc30ca18f5b541de489f5b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 14 Jun 2020 20:08:55 +0100 Subject: [PATCH 5/5] older numpy --- pandas/core/missing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 3ddc4172accfd..f08812cf3d65b 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -762,7 +762,7 @@ def inner(arr, limit): cumsum = arr.cumsum() arr = cumsum * arr arr = np.diff(arr) - arr = np.pad(arr, (1, 0)) + arr = np.pad(arr, (1, 0), mode="constant") arr = np.where(arr < 0, arr, 0) arr = np.minimum.accumulate(arr) arr = arr + cumsum