ENH: Improve numerical stability for window functions skew and kurt (#37557)

phofl · web-flow · commit 8dd2d95fe020 · 2020-11-09T16:20:05.000-05:00
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -232,6 +232,7 @@ Other enhancements
 - :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`)
 - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`)
 - Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`)
+- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`)
 
 .. _whatsnew_120.api_breaking.python:
 
diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx
@@ -2,6 +2,7 @@
 
 import cython
 
+from libc.math cimport round
 from libcpp.deque cimport deque
 
 import numpy as np
@@ -458,51 +459,92 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs,
 
 cdef inline void add_skew(float64_t val, int64_t *nobs,
                           float64_t *x, float64_t *xx,
-                          float64_t *xxx) nogil:
+                          float64_t *xxx,
+                          float64_t *compensation_x,
+                          float64_t *compensation_xx,
+                          float64_t *compensation_xxx) nogil:
     """ add a value from the skew calc """
+    cdef:
+        float64_t y, t
 
     # Not NaN
     if notnan(val):
         nobs[0] = nobs[0] + 1
 
-        # seriously don't ask me why this is faster
-        x[0] = x[0] + val
-        xx[0] = xx[0] + val * val
-        xxx[0] = xxx[0] + val * val * val
+        y = val - compensation_x[0]
+        t = x[0] + y
+        compensation_x[0] = t - x[0] - y
+        x[0] = t
+        y = val * val - compensation_xx[0]
+        t = xx[0] + y
+        compensation_xx[0] = t - xx[0] - y
+        xx[0] = t
+        y = val * val * val - compensation_xxx[0]
+        t = xxx[0] + y
+        compensation_xxx[0] = t - xxx[0] - y
+        xxx[0] = t
 
 
 cdef inline void remove_skew(float64_t val, int64_t *nobs,
                              float64_t *x, float64_t *xx,
-                             float64_t *xxx) nogil:
+                             float64_t *xxx,
+                             float64_t *compensation_x,
+                             float64_t *compensation_xx,
+                             float64_t *compensation_xxx) nogil:
     """ remove a value from the skew calc """
+    cdef:
+        float64_t y, t
 
     # Not NaN
     if notnan(val):
         nobs[0] = nobs[0] - 1
 
-        # seriously don't ask me why this is faster
-        x[0] = x[0] - val
-        xx[0] = xx[0] - val * val
-        xxx[0] = xxx[0] - val * val * val
+        y = - val - compensation_x[0]
+        t = x[0] + y
+        compensation_x[0] = t - x[0] - y
+        x[0] = t
+        y = - val * val - compensation_xx[0]
+        t = xx[0] + y
+        compensation_xx[0] = t - xx[0] - y
+        xx[0] = t
+        y = - val * val * val - compensation_xxx[0]
+        t = xxx[0] + y
+        compensation_xxx[0] = t - xxx[0] - y
+        xxx[0] = t
 
 
 def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
               ndarray[int64_t] end, int64_t minp):
     cdef:
-        float64_t val, prev
+        float64_t val, prev, min_val, mean_val, sum_val = 0
+        float64_t compensation_xxx_add = 0, compensation_xxx_remove = 0
+        float64_t compensation_xx_add = 0, compensation_xx_remove = 0
+        float64_t compensation_x_add = 0, compensation_x_remove = 0
         float64_t x = 0, xx = 0, xxx = 0
-        int64_t nobs = 0, i, j, N = len(values)
+        int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0
         int64_t s, e
-        ndarray[float64_t] output
+        ndarray[float64_t] output, mean_array
         bint is_monotonic_increasing_bounds
 
     minp = max(minp, 3)
     is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
         start, end
     )
     output = np.empty(N, dtype=float)
+    min_val = np.nanmin(values)
 
     with nogil:
+        for i in range(0, N):
+            val = values[i]
+            if notnan(val):
+                nobs_mean += 1
+                sum_val += val
+        mean_val = sum_val / nobs_mean
+        # Other cases would lead to imprecision for smallest values
+        if min_val - mean_val > -1e5:
+            mean_val = round(mean_val)
+            for i in range(0, N):
+                values[i] = values[i] - mean_val
 
         for i in range(0, N):
 
@@ -515,22 +557,24 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
 
                 for j in range(s, e):
                     val = values[j]
-                    add_skew(val, &nobs, &x, &xx, &xxx)
+                    add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
+                             &compensation_xx_add, &compensation_xxx_add)
 
             else:
 
                 # After the first window, observations can both be added
                 # and removed
+                # calculate deletes
+                for j in range(start[i - 1], s):
+                    val = values[j]
+                    remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove,
+                                &compensation_xx_remove, &compensation_xxx_remove)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
                     val = values[j]
-                    add_skew(val, &nobs, &x, &xx, &xxx)
-
-                # calculate deletes
-                for j in range(start[i - 1], s):
-                    val = values[j]
-                    remove_skew(val, &nobs, &x, &xx, &xxx)
+                    add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
+                             &compensation_xx_add, &compensation_xxx_add)
 
             output[i] = calc_skew(minp, nobs, x, xx, xxx)
 
@@ -585,42 +629,80 @@ cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs,
 
 cdef inline void add_kurt(float64_t val, int64_t *nobs,
                           float64_t *x, float64_t *xx,
-                          float64_t *xxx, float64_t *xxxx) nogil:
+                          float64_t *xxx, float64_t *xxxx,
+                          float64_t *compensation_x,
+                          float64_t *compensation_xx,
+                          float64_t *compensation_xxx,
+                          float64_t *compensation_xxxx) nogil:
     """ add a value from the kurotic calc """
+    cdef:
+        float64_t y, t
 
     # Not NaN
     if notnan(val):
         nobs[0] = nobs[0] + 1
 
-        # seriously don't ask me why this is faster
-        x[0] = x[0] + val
-        xx[0] = xx[0] + val * val
-        xxx[0] = xxx[0] + val * val * val
-        xxxx[0] = xxxx[0] + val * val * val * val
+        y = val - compensation_x[0]
+        t = x[0] + y
+        compensation_x[0] = t - x[0] - y
+        x[0] = t
+        y = val * val - compensation_xx[0]
+        t = xx[0] + y
+        compensation_xx[0] = t - xx[0] - y
+        xx[0] = t
+        y = val * val * val - compensation_xxx[0]
+        t = xxx[0] + y
+        compensation_xxx[0] = t - xxx[0] - y
+        xxx[0] = t
+        y = val * val * val * val - compensation_xxxx[0]
+        t = xxxx[0] + y
+        compensation_xxxx[0] = t - xxxx[0] - y
+        xxxx[0] = t
 
 
 cdef inline void remove_kurt(float64_t val, int64_t *nobs,
                              float64_t *x, float64_t *xx,
-                             float64_t *xxx, float64_t *xxxx) nogil:
+                             float64_t *xxx, float64_t *xxxx,
+                             float64_t *compensation_x,
+                             float64_t *compensation_xx,
+                             float64_t *compensation_xxx,
+                             float64_t *compensation_xxxx) nogil:
     """ remove a value from the kurotic calc """
+    cdef:
+        float64_t y, t
 
     # Not NaN
     if notnan(val):
         nobs[0] = nobs[0] - 1
 
-        # seriously don't ask me why this is faster
-        x[0] = x[0] - val
-        xx[0] = xx[0] - val * val
-        xxx[0] = xxx[0] - val * val * val
-        xxxx[0] = xxxx[0] - val * val * val * val
+        y = - val - compensation_x[0]
+        t = x[0] + y
+        compensation_x[0] = t - x[0] - y
+        x[0] = t
+        y = - val * val - compensation_xx[0]
+        t = xx[0] + y
+        compensation_xx[0] = t - xx[0] - y
+        xx[0] = t
+        y = - val * val * val - compensation_xxx[0]
+        t = xxx[0] + y
+        compensation_xxx[0] = t - xxx[0] - y
+        xxx[0] = t
+        y = - val * val * val * val - compensation_xxxx[0]
+        t = xxxx[0] + y
+        compensation_xxxx[0] = t - xxxx[0] - y
+        xxxx[0] = t
 
 
 def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
               ndarray[int64_t] end, int64_t minp):
     cdef:
-        float64_t val, prev
+        float64_t val, prev, mean_val, min_val, sum_val = 0
+        float64_t compensation_xxxx_add = 0, compensation_xxxx_remove = 0
+        float64_t compensation_xxx_remove = 0, compensation_xxx_add = 0
+        float64_t compensation_xx_remove = 0, compensation_xx_add = 0
+        float64_t compensation_x_remove = 0, compensation_x_add = 0
         float64_t x = 0, xx = 0, xxx = 0, xxxx = 0
-        int64_t nobs = 0, i, j, s, e, N = len(values)
+        int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0
         ndarray[float64_t] output
         bint is_monotonic_increasing_bounds
 
@@ -629,8 +711,20 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
         start, end
     )
     output = np.empty(N, dtype=float)
+    min_val = np.nanmin(values)
 
     with nogil:
+        for i in range(0, N):
+            val = values[i]
+            if notnan(val):
+                nobs_mean += 1
+                sum_val += val
+        mean_val = sum_val / nobs_mean
+        # Other cases would lead to imprecision for smallest values
+        if min_val - mean_val > -1e4:
+            mean_val = round(mean_val)
+            for i in range(0, N):
+                values[i] = values[i] - mean_val
 
         for i in range(0, N):
 
@@ -642,20 +736,25 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
             if i == 0 or not is_monotonic_increasing_bounds:
 
                 for j in range(s, e):
-                    add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
+                    add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
+                             &compensation_x_add, &compensation_xx_add,
+                             &compensation_xxx_add, &compensation_xxxx_add)
 
             else:
 
                 # After the first window, observations can both be added
                 # and removed
+                # calculate deletes
+                for j in range(start[i - 1], s):
+                    remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
+                                &compensation_x_remove, &compensation_xx_remove,
+                                &compensation_xxx_remove, &compensation_xxxx_remove)
 
                 # calculate adds
                 for j in range(end[i - 1], e):
-                    add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
-
-                # calculate deletes
-                for j in range(start[i - 1], s):
-                    remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx)
+                    add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx,
+                             &compensation_x_add, &compensation_xx_add,
+                             &compensation_xxx_add, &compensation_xxxx_add)
 
             output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx)
 
diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py
@@ -255,3 +255,13 @@ def test_expanding_sem(constructor):
         result = Series(result[0].values)
     expected = Series([np.nan] + [0.707107] * 2)
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["skew", "kurt"])
+def test_expanding_skew_kurt_numerical_stability(method):
+    # GH: 6929
+    s = Series(np.random.rand(10))
+    expected = getattr(s.expanding(3), method)()
+    s = s + 5000
+    result = getattr(s.expanding(3), method)()
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
@@ -1102,3 +1102,28 @@ def test_groupby_rolling_nan_included():
         ),
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["skew", "kurt"])
+def test_rolling_skew_kurt_numerical_stability(method):
+    # GH: 6929
+    s = Series(np.random.rand(10))
+    expected = getattr(s.rolling(3), method)()
+    s = s + 50000
+    result = getattr(s.rolling(3), method)()
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("method", "values"),
+    [
+        ("skew", [2.0, 0.854563, 0.0, 1.999984]),
+        ("kurt", [4.0, -1.289256, -1.2, 3.999946]),
+    ],
+)
+def test_rolling_skew_kurt_large_value_range(method, values):
+    # GH: 37557
+    s = Series([3000000, 1, 1, 2, 3, 4, 999])
+    result = getattr(s.rolling(4), method)()
+    expected = Series([np.nan] * 3 + values)
+    tm.assert_series_equal(result, expected)