
Commit 256a0c7

BUG: Fix window aggregations to skip over unused elements (pandas-dev GH-45647)
1 parent 763de5d commit 256a0c7

File tree: 3 files changed (+161 -37 lines)
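The change targets the low-level rolling kernels, which receive precomputed `start`/`end` bound arrays rather than a window size. When consecutive windows do not overlap, the kernels now reset their accumulators and rebuild the window instead of stepping the remove/add loops across every unused element in between. The sketch below is not part of the commit; it only mirrors the call pattern the new tests use, assuming the kernel signatures shown in the diff.

```python
# Illustrative only: call one of the patched kernels directly with two
# disjoint windows, the situation this commit optimizes.
import numpy as np
import pandas._libs.window.aggregations as window_aggregations

values = np.arange(20, dtype=np.float64)
start = np.array([0, 15], dtype=np.int64)  # second window starts well past
end = np.array([5, 20], dtype=np.int64)    # the first window's end
result = window_aggregations.roll_sum(values, start, end, 0)  # minp=0
print(result)  # [10., 85.]; with the fix, elements between the windows are skipped
```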

pandas/_libs/window/aggregations.pyx (+55 -36)
@@ -122,9 +122,9 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
              ndarray[int64_t] end, int64_t minp) -> np.ndarray:
     cdef:
         Py_ssize_t i, j
-        float64_t sum_x = 0, compensation_add = 0, compensation_remove = 0
+        float64_t sum_x, compensation_add, compensation_remove
         int64_t s, e
-        int64_t nobs = 0, N = len(values)
+        int64_t nobs = 0, N = len(start)
         ndarray[float64_t] output
         bint is_monotonic_increasing_bounds

@@ -139,10 +139,12 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
             s = start[i]
             e = end[i]

-            if i == 0 or not is_monotonic_increasing_bounds:
+            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

                 # setup

+                sum_x = compensation_add = compensation_remove = 0
+                nobs = 0
                 for j in range(s, e):
                     add_sum(values[j], &nobs, &sum_x, &compensation_add)

@@ -226,9 +228,9 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
 def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
               ndarray[int64_t] end, int64_t minp) -> np.ndarray:
     cdef:
-        float64_t val, compensation_add = 0, compensation_remove = 0, sum_x = 0
+        float64_t val, compensation_add, compensation_remove, sum_x
         int64_t s, e
-        Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values)
+        Py_ssize_t nobs, i, j, neg_ct, N = len(start)
         ndarray[float64_t] output
         bint is_monotonic_increasing_bounds

@@ -243,8 +245,10 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
             s = start[i]
             e = end[i]

-            if i == 0 or not is_monotonic_increasing_bounds:
+            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

+                compensation_add = compensation_remove = sum_x = 0
+                nobs = neg_ct = 0
                 # setup
                 for j in range(s, e):
                     val = values[j]
@@ -349,11 +353,11 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
     Numerically stable implementation using Welford's method.
     """
     cdef:
-        float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, compensation_add = 0,
-        float64_t compensation_remove = 0,
+        float64_t mean_x, ssqdm_x, nobs, compensation_add,
+        float64_t compensation_remove,
         float64_t val, prev, delta, mean_x_old
         int64_t s, e
-        Py_ssize_t i, j, N = len(values)
+        Py_ssize_t i, j, N = len(start)
         ndarray[float64_t] output
         bint is_monotonic_increasing_bounds

@@ -372,8 +376,9 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,

             # Over the first window, observations can only be added
             # never removed
-            if i == 0 or not is_monotonic_increasing_bounds:
+            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

+                mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0
                 for j in range(s, e):
                     add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)

@@ -500,11 +505,11 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
     cdef:
         Py_ssize_t i, j
         float64_t val, prev, min_val, mean_val, sum_val = 0
-        float64_t compensation_xxx_add = 0, compensation_xxx_remove = 0
-        float64_t compensation_xx_add = 0, compensation_xx_remove = 0
-        float64_t compensation_x_add = 0, compensation_x_remove = 0
-        float64_t x = 0, xx = 0, xxx = 0
-        int64_t nobs = 0, N = len(values), nobs_mean = 0
+        float64_t compensation_xxx_add, compensation_xxx_remove
+        float64_t compensation_xx_add, compensation_xx_remove
+        float64_t compensation_x_add, compensation_x_remove
+        float64_t x, xx, xxx
+        int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0
         int64_t s, e
         ndarray[float64_t] output, mean_array, values_copy
         bint is_monotonic_increasing_bounds
@@ -518,7 +523,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
     values_copy = np.copy(values)

     with nogil:
-        for i in range(0, N):
+        for i in range(0, V):
             val = values_copy[i]
             if notnan(val):
                 nobs_mean += 1
@@ -527,7 +532,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
         # Other cases would lead to imprecision for smallest values
         if min_val - mean_val > -1e5:
             mean_val = round(mean_val)
-            for i in range(0, N):
+            for i in range(0, V):
                 values_copy[i] = values_copy[i] - mean_val

         for i in range(0, N):
@@ -537,8 +542,13 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,

             # Over the first window, observations can only be added
             # never removed
-            if i == 0 or not is_monotonic_increasing_bounds:
+            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

+                compensation_xxx_add = compensation_xxx_remove = 0
+                compensation_xx_add = compensation_xx_remove = 0
+                compensation_x_add = compensation_x_remove = 0
+                x = xx = xxx = 0
+                nobs = 0
                 for j in range(s, e):
                     val = values_copy[j]
                     add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
@@ -682,12 +692,12 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
     cdef:
         Py_ssize_t i, j
         float64_t val, prev, mean_val, min_val, sum_val = 0
-        float64_t compensation_xxxx_add = 0, compensation_xxxx_remove = 0
-        float64_t compensation_xxx_remove = 0, compensation_xxx_add = 0
-        float64_t compensation_xx_remove = 0, compensation_xx_add = 0
-        float64_t compensation_x_remove = 0, compensation_x_add = 0
-        float64_t x = 0, xx = 0, xxx = 0, xxxx = 0
-        int64_t nobs = 0, s, e, N = len(values), nobs_mean = 0
+        float64_t compensation_xxxx_add, compensation_xxxx_remove
+        float64_t compensation_xxx_remove, compensation_xxx_add
+        float64_t compensation_xx_remove, compensation_xx_add
+        float64_t compensation_x_remove, compensation_x_add
+        float64_t x, xx, xxx, xxxx
+        int64_t nobs, s, e, N = len(start), V = len(values), nobs_mean = 0
         ndarray[float64_t] output, values_copy
         bint is_monotonic_increasing_bounds

@@ -700,7 +710,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
     min_val = np.nanmin(values)

     with nogil:
-        for i in range(0, N):
+        for i in range(0, V):
             val = values_copy[i]
             if notnan(val):
                 nobs_mean += 1
@@ -709,7 +719,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
         # Other cases would lead to imprecision for smallest values
         if min_val - mean_val > -1e4:
             mean_val = round(mean_val)
-            for i in range(0, N):
+            for i in range(0, V):
                 values_copy[i] = values_copy[i] - mean_val

         for i in range(0, N):
@@ -719,8 +729,14 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,

             # Over the first window, observations can only be added
             # never removed
-            if i == 0 or not is_monotonic_increasing_bounds:
+            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

+                compensation_xxxx_add = compensation_xxxx_remove = 0
+                compensation_xxx_remove = compensation_xxx_add = 0
+                compensation_xx_remove = compensation_xx_add = 0
+                compensation_x_remove = compensation_x_add = 0
+                x = xx = xxx = xxxx = 0
+                nobs = 0
                 for j in range(s, e):
                     add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
                              &compensation_x_add, &compensation_xx_add,
@@ -764,7 +780,7 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
         Py_ssize_t i, j
         bint err = False, is_monotonic_increasing_bounds
         int midpoint, ret = 0
-        int64_t nobs = 0, N = len(values), s, e, win
+        int64_t nobs, N = len(start), s, e, win
         float64_t val, res, prev
         skiplist_t *sl
         ndarray[float64_t] output
@@ -791,8 +807,11 @@ def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
             s = start[i]
             e = end[i]

-            if i == 0 or not is_monotonic_increasing_bounds:
+            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

+                skiplist_destroy(sl)
+                sl = skiplist_init(<int>win)
+                nobs = 0
                 # setup
                 for j in range(s, e):
                     val = values[j]
@@ -948,7 +967,7 @@ cdef _roll_min_max(ndarray[numeric_t] values,
     cdef:
         numeric_t ai
         int64_t curr_win_size, start
-        Py_ssize_t i, k, nobs = 0, N = len(values)
+        Py_ssize_t i, k, nobs = 0, N = len(starti)
         deque Q[int64_t]  # min/max always the front
         deque W[int64_t]  # track the whole window for nobs compute
         ndarray[float64_t, ndim=1] output
@@ -1031,7 +1050,7 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
     O(N log(window)) implementation using skip list
     """
     cdef:
-        Py_ssize_t i, j, s, e, N = len(values), idx
+        Py_ssize_t i, j, s, e, N = len(start), idx
         int ret = 0
         int64_t nobs = 0, win
         float64_t val, prev, midpoint, idx_with_fraction
@@ -1068,8 +1087,8 @@ def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
             s = start[i]
             e = end[i]

-            if i == 0 or not is_monotonic_increasing_bounds:
-                if not is_monotonic_increasing_bounds:
+            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
+                if not is_monotonic_increasing_bounds or s >= end[i - 1]:
                     nobs = 0
                     skiplist_destroy(skiplist)
                     skiplist = skiplist_init(<int>win)
@@ -1160,7 +1179,7 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
     derived from roll_quantile
     """
     cdef:
-        Py_ssize_t i, j, s, e, N = len(values), idx
+        Py_ssize_t i, j, s, e, N = len(start), idx
         float64_t rank_min = 0, rank = 0
         int64_t nobs = 0, win
         float64_t val
@@ -1193,8 +1212,8 @@ def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
             s = start[i]
            e = end[i]

-            if i == 0 or not is_monotonic_increasing_bounds:
-                if not is_monotonic_increasing_bounds:
+            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
+                if not is_monotonic_increasing_bounds or s >= end[i - 1]:
                     nobs = 0
                     skiplist_destroy(skiplist)
                     skiplist = skiplist_init(<int>win)
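Every kernel gets the same treatment: accumulators that used to be zeroed once at declaration are re-zeroed whenever a window has to be rebuilt, and the rebuild is triggered not only for the first window or non-monotonic bounds but also when the new window starts at or past the previous window's end. Below is a pure-Python paraphrase of that control flow, not the Cython source; it ignores NaN handling, Kahan compensation, and the monotonic-bounds flag.

```python
import numpy as np

def roll_sum_sketch(values, start, end, minp):
    # Paraphrase of the patched control flow; not the real Cython kernel.
    N = len(start)                       # number of windows, not len(values)
    output = np.empty(N, dtype=np.float64)
    nobs, sum_x = 0, 0.0
    for i in range(N):
        s, e = start[i], end[i]
        # Rebuild for the first window or when it is disjoint from the previous
        # one (the real kernels also rebuild when bounds are not monotonic).
        if i == 0 or s >= end[i - 1]:
            nobs, sum_x = 0, 0.0
            for j in range(s, e):
                sum_x += values[j]
                nobs += 1
        else:
            for j in range(start[i - 1], s):   # drop elements that left
                sum_x -= values[j]
                nobs -= 1
            for j in range(end[i - 1], e):     # add elements that entered
                sum_x += values[j]
                nobs += 1
        output[i] = sum_x if nobs >= minp else np.nan
    return output
```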

pandas/tests/window/conftest.py (+61 -0)
@@ -2,10 +2,12 @@
     datetime,
     timedelta,
 )
+from functools import partial

 import numpy as np
 import pytest

+import pandas._libs.window.aggregations as window_aggregations
 import pandas.util._test_decorators as td

 from pandas import (
@@ -259,3 +261,62 @@ def frame():
         index=bdate_range(datetime(2009, 1, 1), periods=100),
         columns=np.arange(10),
     )
+
+
+def _named_func(name_and_func):
+    name, func = name_and_func
+    if not hasattr(func, "func"):
+        func = partial(func)
+    func.__name__ = name
+    return func
+
+
+@pytest.fixture(
+    params=[
+        _named_func(x)
+        for x in [
+            ("roll_sum", window_aggregations.roll_sum),
+            ("roll_mean", window_aggregations.roll_mean),
+        ]
+        + [
+            (f"roll_var({ddof})", partial(window_aggregations.roll_var, ddof=ddof))
+            for ddof in [0, 1]
+        ]
+        + [
+            ("roll_skew", window_aggregations.roll_skew),
+            ("roll_kurt", window_aggregations.roll_kurt),
+            ("roll_median_c", window_aggregations.roll_median_c),
+            ("roll_max", window_aggregations.roll_max),
+            ("roll_min", window_aggregations.roll_min),
+        ]
+        + [
+            (
+                f"roll_quantile({quantile},{interpolation})",
+                partial(
+                    window_aggregations.roll_quantile,
+                    quantile=quantile,
+                    interpolation=interpolation,
+                ),
+            )
+            for quantile in [0.25, 0.5, 0.75]
+            for interpolation in window_aggregations.interpolation_types
+        ]
+        + [
+            (
+                f"roll_rank({percentile},{method},{ascending})",
+                partial(
+                    window_aggregations.roll_rank,
+                    percentile=percentile,
+                    method=method,
+                    ascending=ascending,
+                ),
+            )
+            for percentile in [True, False]
+            for method in window_aggregations.rolling_rank_tiebreakers.keys()
+            for ascending in [True, False]
+        ]
+    ]
+)
+def rolling_aggregation(request):
+    """Make a named rolling aggregation function as fixture."""
+    return request.param
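The fixture gives each parametrized kernel a readable pytest id. The `partial` wrapper appears to exist because `__name__` generally cannot be assigned on the compiled Cython functions, whereas a `functools.partial` object accepts new attributes; a small, hedged illustration of that assumption (not part of the commit):

```python
from functools import partial
import pandas._libs.window.aggregations as window_aggregations

func = window_aggregations.roll_sum
try:
    func.__name__ = "roll_sum"      # may fail on a compiled function
except (AttributeError, TypeError):
    func = partial(func)            # partial objects take new attributes
    func.__name__ = "roll_sum"
print(func.__name__)
```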

pandas/tests/window/test_rolling.py (+45 -1)
@@ -2,6 +2,7 @@
     datetime,
     timedelta,
 )
+import time

 import numpy as np
 import pytest
@@ -1391,7 +1392,7 @@ def test_rolling_corr_timedelta_index(index, window):
     # GH: 31286
     x = Series([1, 2, 3, 4, 5], index=index)
     y = x.copy()
-    x[0:2] = 0.0
+    x.iloc[0:2] = 0.0
     result = x.rolling(window).corr(y)
     expected = Series([np.nan, np.nan, 1, 1, 1], index=index)
     tm.assert_almost_equal(result, expected)
@@ -1734,3 +1735,46 @@ def test_rolling_std_neg_sqrt():

     b = a.ewm(span=3).std()
     assert np.isfinite(b[2:]).all()
+
+
+def test_rolling_aggregation_boundary_consistency(rolling_aggregation):
+    minp, step, width, size, selection = 0, 1, 3, 11, [2, 7]
+    s = Series(np.arange(1, 1 + size, dtype=np.float64))
+    end = np.arange(width, size, step, dtype=np.int64)
+    start = end - width
+    selarr = np.array(selection, dtype=np.int32)
+    result = Series(rolling_aggregation(s.values, start[selarr], end[selarr], minp))
+    expected = Series(rolling_aggregation(s.values, start, end, minp)[selarr])
+    tm.assert_equal(expected, result)
+
+
+def test_rolling_aggregation_timing_with_unused_elements(rolling_aggregation):
+    minp, width = 0, 5  # width at least 4 for kurt
+
+    def time_roll_agg(extra_size):
+        t0 = time.time()
+        size = 2 * width + 2 + extra_size
+        s = Series(np.full(size, np.nan, dtype=np.float64))
+        start = np.array([1, size - width - 2], dtype=np.int64)
+        end = np.array([1 + width, size - 2], dtype=np.int64)
+        loc = np.array(
+            [j for i in range(len(start)) for j in range(start[i], end[i])],
+            dtype=np.int32,
+        )
+        for j in loc:
+            s.iloc[j] = 1 + j
+        result = Series(rolling_aggregation(s.values, start, end, minp))
+        compact_s = np.array(s.iloc[loc], dtype=np.float64)
+        compact_start = np.arange(0, len(start) * width, width, dtype=np.int64)
+        compact_end = compact_start + width
+        expected = Series(
+            rolling_aggregation(compact_s, compact_start, compact_end, minp)
+        )
+        assert np.isfinite(expected.values).all(), "Not all expected values are finite"
+        tm.assert_equal(expected, result)
+        t1 = time.time()
+        return t1 - t0
+
+    for i in range(3):  # all but last are warmup
+        t = [time_roll_agg(extra_size) for extra_size in [2, 2000]]
+    assert abs((t[0] - t[1]) / t[0]) < 2, "Unused elements significantly affect timing"
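Together the two tests pin down the fix from both sides: the first checks that evaluating only a subset of the windows returns the same values as evaluating all of them, and the second checks that run time does not grow with the number of unused elements sitting between two windows. A stand-alone sketch of that timing property, assuming only the kernel signatures shown in the diff above:

```python
import time
import numpy as np
import pandas._libs.window.aggregations as window_aggregations

def timed_roll_sum(gap):
    # Two fixed-size windows separated by `gap` unused elements.
    values = np.ones(10 + gap, dtype=np.float64)
    start = np.array([0, 5 + gap], dtype=np.int64)
    end = np.array([5, 10 + gap], dtype=np.int64)
    t0 = time.time()
    window_aggregations.roll_sum(values, start, end, 0)
    return time.time() - t0

# With the fix, both calls should take roughly the same time, because the
# second window is rebuilt from scratch instead of slid across the gap.
print(timed_roll_sum(10), timed_roll_sum(1_000_000))
```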
