[BUG]: Implement Kahan summation for rolling().mean() to avoid numerical issues (pandas-dev#36348)

phofl · Kevin D Smith · commit e173701d9c08 · 2020-11-02T08:51:46.000-06:00
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -118,6 +118,7 @@ Other enhancements
 - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`)
 - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`)
 - `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
+- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
 
 .. _whatsnew_120.api_breaking.python:
 
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -161,27 +161,42 @@ cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogi
     return result
 
 
-cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil:
-    """ add a value from the sum calc """
+cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
+                         float64_t *compensation) nogil:
+    """ add a value from the sum calc using Kahan summation """
+
+    cdef:
+        float64_t y, t
 
     # Not NaN
     if notnan(val):
         nobs[0] = nobs[0] + 1
-        sum_x[0] = sum_x[0] + val
+        y = val - compensation[0]
+        t = sum_x[0] + y
+        compensation[0] = t - sum_x[0] - y
+        sum_x[0] = t
 
 
-cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil:
-    """ remove a value from the sum calc """
+cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
+                            float64_t *compensation) nogil:
+    """ remove a value from the sum calc using Kahan summation """
+
+    cdef:
+        float64_t y, t
 
+    # Not NaN
     if notnan(val):
         nobs[0] = nobs[0] - 1
-        sum_x[0] = sum_x[0] - val
+        y = - val - compensation[0]
+        t = sum_x[0] + y
+        compensation[0] = t - sum_x[0] - y
+        sum_x[0] = t
 
 
 def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start,
                       ndarray[int64_t] end, int64_t minp):
     cdef:
-        float64_t sum_x = 0
+        float64_t sum_x = 0, compensation_add = 0, compensation_remove = 0
         int64_t s, e
         int64_t nobs = 0, i, j, N = len(values)
         ndarray[float64_t] output
@@ -201,31 +216,31 @@ def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start,
                 # setup
 
                 for j in range(s, e):
-                    add_sum(values[j], &nobs, &sum_x)
+                    add_sum(values[j], &nobs, &sum_x, &compensation_add)
 
             else:
 
                 # calculate deletes
                 for j in range(start[i - 1], s):
-                    remove_sum(values[j], &nobs, &sum_x)
+                    remove_sum(values[j], &nobs, &sum_x, &compensation_remove)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
-                    add_sum(values[j], &nobs, &sum_x)
+                    add_sum(values[j], &nobs, &sum_x, &compensation_add)
 
             output[i] = calc_sum(minp, nobs, sum_x)
 
             if not is_monotonic_bounds:
                 for j in range(s, e):
-                    remove_sum(values[j], &nobs, &sum_x)
+                    remove_sum(values[j], &nobs, &sum_x, &compensation_remove)
 
     return output
 
 
 def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
                    ndarray[int64_t] end, int64_t minp, int64_t win):
     cdef:
-        float64_t val, prev_x, sum_x = 0
+        float64_t val, prev_x, sum_x = 0, compensation_add = 0, compensation_remove = 0
         int64_t range_endpoint
         int64_t nobs = 0, i, N = len(values)
         ndarray[float64_t] output
@@ -237,16 +252,16 @@ def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
     with nogil:
 
         for i in range(0, range_endpoint):
-            add_sum(values[i], &nobs, &sum_x)
+            add_sum(values[i], &nobs, &sum_x, &compensation_add)
             output[i] = NaN
 
         for i in range(range_endpoint, N):
             val = values[i]
-            add_sum(val, &nobs, &sum_x)
+            add_sum(val, &nobs, &sum_x, &compensation_add)
 
             if i > win - 1:
                 prev_x = values[i - win]
-                remove_sum(prev_x, &nobs, &sum_x)
+                remove_sum(prev_x, &nobs, &sum_x, &compensation_remove)
 
             output[i] = calc_sum(minp, nobs, sum_x)
 
@@ -277,32 +292,42 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,
 
 
 cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
-                          Py_ssize_t *neg_ct) nogil:
-    """ add a value from the mean calc """
+                          Py_ssize_t *neg_ct, float64_t *compensation) nogil:
+    """ add a value from the mean calc using Kahan summation """
+    cdef:
+        float64_t y, t
 
     # Not NaN
     if notnan(val):
         nobs[0] = nobs[0] + 1
-        sum_x[0] = sum_x[0] + val
+        y = val - compensation[0]
+        t = sum_x[0] + y
+        compensation[0] = t - sum_x[0] - y
+        sum_x[0] = t
         if signbit(val):
             neg_ct[0] = neg_ct[0] + 1
 
 
 cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
-                             Py_ssize_t *neg_ct) nogil:
-    """ remove a value from the mean calc """
+                             Py_ssize_t *neg_ct, float64_t *compensation) nogil:
+    """ remove a value from the mean calc using Kahan summation """
+    cdef:
+        float64_t y, t
 
     if notnan(val):
         nobs[0] = nobs[0] - 1
-        sum_x[0] = sum_x[0] - val
+        y = - val - compensation[0]
+        t = sum_x[0] + y
+        compensation[0] = t - sum_x[0] - y
+        sum_x[0] = t
         if signbit(val):
             neg_ct[0] = neg_ct[0] - 1
 
 
 def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
                     ndarray[int64_t] end, int64_t minp, int64_t win):
     cdef:
-        float64_t val, prev_x, sum_x = 0
+        float64_t val, prev_x, sum_x = 0, compensation_add = 0, compensation_remove = 0
         Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values)
         ndarray[float64_t] output
 
@@ -311,16 +336,16 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
     with nogil:
         for i in range(minp - 1):
             val = values[i]
-            add_mean(val, &nobs, &sum_x, &neg_ct)
+            add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
             output[i] = NaN
 
         for i in range(minp - 1, N):
             val = values[i]
-            add_mean(val, &nobs, &sum_x, &neg_ct)
+            add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
 
             if i > win - 1:
                 prev_x = values[i - win]
-                remove_mean(prev_x, &nobs, &sum_x, &neg_ct)
+                remove_mean(prev_x, &nobs, &sum_x, &neg_ct, &compensation_remove)
 
             output[i] = calc_mean(minp, nobs, neg_ct, sum_x)
 
@@ -330,7 +355,7 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
 def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start,
                        ndarray[int64_t] end, int64_t minp):
     cdef:
-        float64_t val, sum_x = 0
+        float64_t val, compensation_add = 0, compensation_remove = 0, sum_x = 0
         int64_t s, e
         Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values)
         ndarray[float64_t] output
@@ -350,26 +375,26 @@ def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start,
                 # setup
                 for j in range(s, e):
                     val = values[j]
-                    add_mean(val, &nobs, &sum_x, &neg_ct)
+                    add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
 
             else:
 
                 # calculate deletes
                 for j in range(start[i - 1], s):
                     val = values[j]
-                    remove_mean(val, &nobs, &sum_x, &neg_ct)
+                    remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
                     val = values[j]
-                    add_mean(val, &nobs, &sum_x, &neg_ct)
+                    add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
 
             output[i] = calc_mean(minp, nobs, neg_ct, sum_x)
 
             if not is_monotonic_bounds:
                 for j in range(s, e):
                     val = values[j]
-                    remove_mean(val, &nobs, &sum_x, &neg_ct)
+                    remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove)
     return output
 
 # ----------------------------------------------------------------------
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -696,3 +696,78 @@ def scaled_sum(*args):
     expected = DataFrame(data={"X": [0.0, 0.5, 1.0, 1.5, 2.0]}, index=_index)
     result = df.groupby(**grouping).rolling(1).apply(scaled_sum, raw=raw, args=(2,))
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("add", [0.0, 2.0])
+def test_rolling_numerical_accuracy_kahan_mean(add):
+    # GH: 36031 implementing kahan summation
+    df = pd.DataFrame(
+        {"A": [3002399751580331.0 + add, -0.0, -0.0]},
+        index=[
+            pd.Timestamp("19700101 09:00:00"),
+            pd.Timestamp("19700101 09:00:03"),
+            pd.Timestamp("19700101 09:00:06"),
+        ],
+    )
+    result = (
+        df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean()
+    )
+    dates = pd.date_range("19700101 09:00:00", periods=7, freq="S")
+    expected = pd.DataFrame(
+        {
+            "A": [
+                np.nan,
+                np.nan,
+                np.nan,
+                3002399751580330.5,
+                2001599834386887.25,
+                1000799917193443.625,
+                0.0,
+            ]
+        },
+        index=dates,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_rolling_numerical_accuracy_kahan_sum():
+    # GH: 13254
+    df = pd.DataFrame([2.186, -1.647, 0.0, 0.0, 0.0, 0.0], columns=["x"])
+    result = df["x"].rolling(3).sum()
+    expected = pd.Series([np.nan, np.nan, 0.539, -1.647, 0.0, 0.0], name="x")
+    tm.assert_series_equal(result, expected)
+
+
+def test_rolling_numerical_accuracy_jump():
+    # GH: 32761
+    index = pd.date_range(start="2020-01-01", end="2020-01-02", freq="60s").append(
+        pd.DatetimeIndex(["2020-01-03"])
+    )
+    data = np.random.rand(len(index))
+
+    df = pd.DataFrame({"data": data}, index=index)
+    result = df.rolling("60s").mean()
+    tm.assert_frame_equal(result, df[["data"]])
+
+
+def test_rolling_numerical_accuracy_small_values():
+    # GH: 10319
+    s = Series(
+        data=[0.00012456, 0.0003, -0.0, -0.0],
+        index=date_range("1999-02-03", "1999-02-06"),
+    )
+    result = s.rolling(1).mean()
+    tm.assert_series_equal(result, s)
+
+
+def test_rolling_numerical_too_large_numbers():
+    # GH: 11645
+    dates = pd.date_range("2015-01-01", periods=10, freq="D")
+    ds = pd.Series(data=range(10), index=dates, dtype=np.float64)
+    ds[2] = -9e33
+    result = ds.rolling(5).mean()
+    expected = pd.Series(
+        [np.nan, np.nan, np.nan, np.nan, -1.8e33, -1.8e33, -1.8e33, 0.0, 6.0, 7.0],
+        index=dates,
+    )
+    tm.assert_series_equal(result, expected)