ENH: Use Kahan summation and Welfords method to calculate rolling var and std (#37055)

phofl · web-flow · commit 15d818b79ac9 · 2020-10-12T18:53:38.000-04:00
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -192,6 +192,7 @@ Other enhancements
 - Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
 - Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
 - Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`).
+- :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`)
 
 .. _whatsnew_120.api_breaking.python:
 
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -311,37 +311,46 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
 
 
 cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
-                         float64_t *ssqdm_x) nogil:
+                         float64_t *ssqdm_x, float64_t *compensation) nogil:
     """ add a value from the var calc """
     cdef:
-        float64_t delta
+        float64_t delta, prev_mean, y, t
 
     # `isnan` instead of equality as fix for GH-21813, msvc 2017 bug
     if isnan(val):
         return
 
     nobs[0] = nobs[0] + 1
-    # a part of Welford's method for the online variance-calculation
+    # Welford's method for the online variance-calculation
+    # using Kahan summation
     # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-    delta = val - mean_x[0]
+    prev_mean = mean_x[0] - compensation[0]
+    y = val - compensation[0]
+    t = y - mean_x[0]
+    compensation[0] = t + mean_x[0] - y
+    delta = t
     mean_x[0] = mean_x[0] + delta / nobs[0]
-    ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0]
+    ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0])
 
 
 cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x,
-                            float64_t *ssqdm_x) nogil:
+                            float64_t *ssqdm_x, float64_t *compensation) nogil:
     """ remove a value from the var calc """
     cdef:
-        float64_t delta
-
+        float64_t delta, prev_mean, y, t
     if notnan(val):
         nobs[0] = nobs[0] - 1
         if nobs[0]:
-            # a part of Welford's method for the online variance-calculation
+            # Welford's method for the online variance-calculation
+            # using Kahan summation
             # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
-            delta = val - mean_x[0]
+            prev_mean = mean_x[0] - compensation[0]
+            y = val - compensation[0]
+            t = y - mean_x[0]
+            compensation[0] = t + mean_x[0] - y
+            delta = t
             mean_x[0] = mean_x[0] - delta / nobs[0]
-            ssqdm_x[0] = ssqdm_x[0] - ((nobs[0] + 1) * delta ** 2) / nobs[0]
+            ssqdm_x[0] = ssqdm_x[0] - (val - prev_mean) * (val - mean_x[0])
         else:
             mean_x[0] = 0
             ssqdm_x[0] = 0
@@ -353,7 +362,8 @@ def roll_var(ndarray[float64_t] values, ndarray[int64_t] start,
     Numerically stable implementation using Welford's method.
     """
     cdef:
-        float64_t mean_x = 0, ssqdm_x = 0, nobs = 0,
+        float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, compensation_add = 0,
+        float64_t compensation_remove = 0,
         float64_t val, prev, delta, mean_x_old
         int64_t s, e
         Py_ssize_t i, j, N = len(values)
@@ -375,26 +385,28 @@ def roll_var(ndarray[float64_t] values, ndarray[int64_t] start,
             if i == 0 or not is_monotonic_bounds:
 
                 for j in range(s, e):
-                    add_var(values[j], &nobs, &mean_x, &ssqdm_x)
+                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
 
             else:
 
                 # After the first window, observations can both be added
                 # and removed
 
-                # calculate adds
-                for j in range(end[i - 1], e):
-                    add_var(values[j], &nobs, &mean_x, &ssqdm_x)
-
                 # calculate deletes
                 for j in range(start[i - 1], s):
-                    remove_var(values[j], &nobs, &mean_x, &ssqdm_x)
+                    remove_var(values[j], &nobs, &mean_x, &ssqdm_x,
+                               &compensation_remove)
+
+                # calculate adds
+                for j in range(end[i - 1], e):
+                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
 
             output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
 
             if not is_monotonic_bounds:
                 for j in range(s, e):
-                    remove_var(values[j], &nobs, &mean_x, &ssqdm_x)
+                    remove_var(values[j], &nobs, &mean_x, &ssqdm_x,
+                               &compensation_remove)
 
     return output
 
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -879,3 +879,20 @@ def test_rolling_sem(constructor):
         result = pd.Series(result[0].values)
     expected = pd.Series([np.nan] + [0.707107] * 2)
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("func", "third_value", "values"),
+    [
+        ("var", 1, [5e33, 0, 0.5, 0.5, 2, 0]),
+        ("std", 1, [7.071068e16, 0, 0.7071068, 0.7071068, 1.414214, 0]),
+        ("var", 2, [5e33, 0.5, 0, 0.5, 2, 0]),
+        ("std", 2, [7.071068e16, 0.7071068, 0, 0.7071068, 1.414214, 0]),
+    ],
+)
+def test_rolling_var_numerical_issues(func, third_value, values):
+    # GH: 37051
+    ds = pd.Series([99999999999999999, 1, third_value, 2, 3, 1, 1])
+    result = getattr(ds.rolling(2), func)()
+    expected = pd.Series([np.nan] + values)
+    tm.assert_series_equal(result, expected)