diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d4d28dde52d58..fab4a8dce8ea8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -232,6 +232,7 @@ Other enhancements - :meth:`testing.assert_index_equal` now has a ``check_order`` parameter that allows indexes to be checked in an order-insensitive manner (:issue:`37478`) - :func:`read_csv` supports memory-mapping for compressed files (:issue:`37621`) - Improve error reporting for :meth:`DataFrame.merge()` when invalid merge column definitions were given (:issue:`16228`) +- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`) .. _whatsnew_120.api_breaking.python: diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 3556085bb300b..4de7a5860c465 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -2,6 +2,7 @@ import cython +from libc.math cimport round from libcpp.deque cimport deque import numpy as np @@ -458,42 +459,71 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs, cdef inline void add_skew(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, - float64_t *xxx) nogil: + float64_t *xxx, + float64_t *compensation_x, + float64_t *compensation_xx, + float64_t *compensation_xxx) nogil: """ add a value from the skew calc """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] + 1 - # seriously don't ask me why this is faster - x[0] = x[0] + val - xx[0] = xx[0] + val * val - xxx[0] = xxx[0] + val * val * val + y = val - compensation_x[0] + t = x[0] + y + compensation_x[0] = t - x[0] - y + x[0] = t + y = val * val - compensation_xx[0] + t = xx[0] + y + compensation_xx[0] = t - xx[0] - y + xx[0] = t + y = val * val * val - compensation_xxx[0] + t = xxx[0] + y + compensation_xxx[0] = t - xxx[0] - y + xxx[0] = t cdef inline void remove_skew(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, - float64_t *xxx) nogil: + float64_t *xxx, + float64_t *compensation_x, + float64_t *compensation_xx, + float64_t *compensation_xxx) nogil: """ remove a value from the skew calc """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] - 1 - # seriously don't ask me why this is faster - x[0] = x[0] - val - xx[0] = xx[0] - val * val - xxx[0] = xxx[0] - val * val * val + y = - val - compensation_x[0] + t = x[0] + y + compensation_x[0] = t - x[0] - y + x[0] = t + y = - val * val - compensation_xx[0] + t = xx[0] + y + compensation_xx[0] = t - xx[0] - y + xx[0] = t + y = - val * val * val - compensation_xxx[0] + t = xxx[0] + y + compensation_xxx[0] = t - xxx[0] - y + xxx[0] = t def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev + float64_t val, prev, min_val, mean_val, sum_val = 0 + float64_t compensation_xxx_add = 0, compensation_xxx_remove = 0 + float64_t compensation_xx_add = 0, compensation_xx_remove = 0 + float64_t compensation_x_add = 0, compensation_x_remove = 0 float64_t x = 0, xx = 0, xxx = 0 - int64_t nobs = 0, i, j, N = len(values) + int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 int64_t s, e - ndarray[float64_t] output + ndarray[float64_t] output, mean_array bint is_monotonic_increasing_bounds minp = max(minp, 3) @@ -501,8 +531,20 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, start, end ) output = np.empty(N, dtype=float) + min_val = np.nanmin(values) with nogil: + for i in range(0, N): + val = values[i] + if notnan(val): + nobs_mean += 1 + sum_val += val + mean_val = sum_val / nobs_mean + # Other cases would lead to imprecision for smallest values + if min_val - mean_val > -1e5: + mean_val = round(mean_val) + for i in range(0, N): + values[i] = values[i] - mean_val for i in range(0, N): @@ -515,22 +557,24 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, for j in range(s, e): val = values[j] - add_skew(val, &nobs, &x, &xx, &xxx) + add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, + &compensation_xx_add, &compensation_xxx_add) else: # After the first window, observations can both be added # and removed + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove, + &compensation_xx_remove, &compensation_xxx_remove) # calculate adds for j in range(end[i - 1], e): val = values[j] - add_skew(val, &nobs, &x, &xx, &xxx) - - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - remove_skew(val, &nobs, &x, &xx, &xxx) + add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, + &compensation_xx_add, &compensation_xxx_add) output[i] = calc_skew(minp, nobs, x, xx, xxx) @@ -585,42 +629,80 @@ cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs, cdef inline void add_kurt(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, - float64_t *xxx, float64_t *xxxx) nogil: + float64_t *xxx, float64_t *xxxx, + float64_t *compensation_x, + float64_t *compensation_xx, + float64_t *compensation_xxx, + float64_t *compensation_xxxx) nogil: """ add a value from the kurotic calc """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] + 1 - # seriously don't ask me why this is faster - x[0] = x[0] + val - xx[0] = xx[0] + val * val - xxx[0] = xxx[0] + val * val * val - xxxx[0] = xxxx[0] + val * val * val * val + y = val - compensation_x[0] + t = x[0] + y + compensation_x[0] = t - x[0] - y + x[0] = t + y = val * val - compensation_xx[0] + t = xx[0] + y + compensation_xx[0] = t - xx[0] - y + xx[0] = t + y = val * val * val - compensation_xxx[0] + t = xxx[0] + y + compensation_xxx[0] = t - xxx[0] - y + xxx[0] = t + y = val * val * val * val - compensation_xxxx[0] + t = xxxx[0] + y + compensation_xxxx[0] = t - xxxx[0] - y + xxxx[0] = t cdef inline void remove_kurt(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, - float64_t *xxx, float64_t *xxxx) nogil: + float64_t *xxx, float64_t *xxxx, + float64_t *compensation_x, + float64_t *compensation_xx, + float64_t *compensation_xxx, + float64_t *compensation_xxxx) nogil: """ remove a value from the kurotic calc """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] - 1 - # seriously don't ask me why this is faster - x[0] = x[0] - val - xx[0] = xx[0] - val * val - xxx[0] = xxx[0] - val * val * val - xxxx[0] = xxxx[0] - val * val * val * val + y = - val - compensation_x[0] + t = x[0] + y + compensation_x[0] = t - x[0] - y + x[0] = t + y = - val * val - compensation_xx[0] + t = xx[0] + y + compensation_xx[0] = t - xx[0] - y + xx[0] = t + y = - val * val * val - compensation_xxx[0] + t = xxx[0] + y + compensation_xxx[0] = t - xxx[0] - y + xxx[0] = t + y = - val * val * val * val - compensation_xxxx[0] + t = xxxx[0] + y + compensation_xxxx[0] = t - xxxx[0] - y + xxxx[0] = t def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev + float64_t val, prev, mean_val, min_val, sum_val = 0 + float64_t compensation_xxxx_add = 0, compensation_xxxx_remove = 0 + float64_t compensation_xxx_remove = 0, compensation_xxx_add = 0 + float64_t compensation_xx_remove = 0, compensation_xx_add = 0 + float64_t compensation_x_remove = 0, compensation_x_add = 0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, s, e, N = len(values) + int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -629,8 +711,20 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, start, end ) output = np.empty(N, dtype=float) + min_val = np.nanmin(values) with nogil: + for i in range(0, N): + val = values[i] + if notnan(val): + nobs_mean += 1 + sum_val += val + mean_val = sum_val / nobs_mean + # Other cases would lead to imprecision for smallest values + if min_val - mean_val > -1e4: + mean_val = round(mean_val) + for i in range(0, N): + values[i] = values[i] - mean_val for i in range(0, N): @@ -642,20 +736,25 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, if i == 0 or not is_monotonic_increasing_bounds: for j in range(s, e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + &compensation_x_add, &compensation_xx_add, + &compensation_xxx_add, &compensation_xxxx_add) else: # After the first window, observations can both be added # and removed + # calculate deletes + for j in range(start[i - 1], s): + remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + &compensation_x_remove, &compensation_xx_remove, + &compensation_xxx_remove, &compensation_xxxx_remove) # calculate adds for j in range(end[i - 1], e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) - - # calculate deletes - for j in range(start[i - 1], s): - remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + &compensation_x_add, &compensation_xx_add, + &compensation_xxx_add, &compensation_xxxx_add) output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 21c7477918d02..ace6848a58c9c 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -255,3 +255,13 @@ def test_expanding_sem(constructor): result = Series(result[0].values) expected = Series([np.nan] + [0.707107] * 2) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["skew", "kurt"]) +def test_expanding_skew_kurt_numerical_stability(method): + # GH: 6929 + s = Series(np.random.rand(10)) + expected = getattr(s.expanding(3), method)() + s = s + 5000 + result = getattr(s.expanding(3), method)() + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 02bcfab8d3388..6cad93f2d77ba 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1102,3 +1102,28 @@ def test_groupby_rolling_nan_included(): ), ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("method", ["skew", "kurt"]) +def test_rolling_skew_kurt_numerical_stability(method): + # GH: 6929 + s = Series(np.random.rand(10)) + expected = getattr(s.rolling(3), method)() + s = s + 50000 + result = getattr(s.rolling(3), method)() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + ("method", "values"), + [ + ("skew", [2.0, 0.854563, 0.0, 1.999984]), + ("kurt", [4.0, -1.289256, -1.2, 3.999946]), + ], +) +def test_rolling_skew_kurt_large_value_range(method, values): + # GH: 37557 + s = Series([3000000, 1, 1, 2, 3, 4, 999]) + result = getattr(s.rolling(4), method)() + expected = Series([np.nan] * 3 + values) + tm.assert_series_equal(result, expected)