From d4b1a9bec386130f065335e898128cdb563dd4fc Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 1 Nov 2020 14:49:15 +0100 Subject: [PATCH 01/15] Improve numerical stability for window functions skew and kurt --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/window/aggregations.pyx | 23 +++++++++++++++++++---- pandas/tests/window/test_expanding.py | 10 ++++++++++ pandas/tests/window/test_rolling.py | 10 ++++++++++ 4 files changed, 40 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 7c850ffedfcab..4fa229f78b1e9 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -228,6 +228,7 @@ Other enhancements - :class:`Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`) - :class:`DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`) - :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`) +- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` (:issue:`6929`) .. _whatsnew_120.api_breaking.python: diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index b2dbf7802e6f0..747e4b4e4507e 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -490,9 +490,9 @@ cdef inline void remove_skew(float64_t val, int64_t *nobs, def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev + float64_t val, prev, sum_val = 0.0 float64_t x = 0, xx = 0, xxx = 0 - int64_t nobs = 0, i, j, N = len(values) + int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 int64_t s, e ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -504,7 +504,14 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, output = np.empty(N, dtype=float) with nogil: + for i in range(0, N): + val = values[i] + if notnan(val): + sum_val += val + nobs_mean += 1 + values = values - sum_val / nobs_mean + with nogil: for i in range(0, N): s = start[i] @@ -619,9 +626,9 @@ cdef inline void remove_kurt(float64_t val, int64_t *nobs, def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev + float64_t val, prev, sum_val = 0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, s, e, N = len(values) + int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -631,6 +638,14 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ) output = np.empty(N, dtype=float) + with nogil: + for i in range(0, N): + val = values[i] + if notnan(val): + sum_val += val + nobs_mean += 1 + values = values - sum_val / nobs_mean + with nogil: for i in range(0, N): diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 183d2814920e4..1af1446ee7b53 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -255,3 +255,13 @@ def test_expanding_sem(constructor): result = pd.Series(result[0].values) expected = pd.Series([np.nan] + [0.707107] * 2) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("method", ["skew", "kurt"]) +def test_expanding_skew_kurt_numerical_stability(method): + # GH: 6929 + s = pd.Series(np.random.rand(10)) + expected = getattr(s.expanding(3), method)() + s = s + 5000 + result = getattr(s.expanding(3), method)() + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 2c8439aae75e5..a856f0adc216b 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1087,3 +1087,13 @@ def test_rolling_corr_timedelta_index(index, window): result = x.rolling(window).corr(y) expected = Series([np.nan, np.nan, 1, 1, 1], index=index) tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize("method", ["skew", "kurt"]) +def test_rolling_skew_kurt_numerical_stability(method): + # GH: 6929 + s = pd.Series(np.random.rand(10)) + expected = getattr(s.rolling(3), method)() + s = s + 5000 + result = getattr(s.rolling(3), method)() + tm.assert_series_equal(result, expected) From c63453986e83a632ee4520fb7e78957c8e822555 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 1 Nov 2020 14:51:08 +0100 Subject: [PATCH 02/15] Fix typing --- pandas/_libs/window/aggregations.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 747e4b4e4507e..a35da7c574af5 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -626,7 +626,7 @@ cdef inline void remove_kurt(float64_t val, int64_t *nobs, def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev, sum_val = 0 + float64_t val, prev, sum_val = 0.0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 ndarray[float64_t] output From bf88a63ba003d56791a4dd87e53a7f3997a390c7 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 1 Nov 2020 15:21:27 +0100 Subject: [PATCH 03/15] Fix pattern --- pandas/tests/window/test_rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index a856f0adc216b..a2320153393e1 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1092,7 +1092,7 @@ def test_rolling_corr_timedelta_index(index, window): @pytest.mark.parametrize("method", ["skew", "kurt"]) def test_rolling_skew_kurt_numerical_stability(method): # GH: 6929 - s = pd.Series(np.random.rand(10)) + s = Series(np.random.rand(10)) expected = getattr(s.rolling(3), method)() s = s + 5000 result = getattr(s.rolling(3), method)() From e57f97cddbc87bef830a441897a0f93cb8057f78 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 1 Nov 2020 15:31:06 +0100 Subject: [PATCH 04/15] Improve numerical stability for mean calculation --- pandas/_libs/window/aggregations.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index a35da7c574af5..5fe1c855e24de 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -490,7 +490,7 @@ cdef inline void remove_skew(float64_t val, int64_t *nobs, def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev, sum_val = 0.0 + float64_t val, prev, mean = 0.0 float64_t x = 0, xx = 0, xxx = 0 int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 int64_t s, e @@ -507,9 +507,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, for i in range(0, N): val = values[i] if notnan(val): - sum_val += val nobs_mean += 1 - values = values - sum_val / nobs_mean + mean = mean + 1 / nobs_mean * (val - mean) + values = values - mean with nogil: for i in range(0, N): @@ -626,7 +626,7 @@ cdef inline void remove_kurt(float64_t val, int64_t *nobs, def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev, sum_val = 0.0 + float64_t val, prev, mean = 0.0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 ndarray[float64_t] output @@ -642,9 +642,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, for i in range(0, N): val = values[i] if notnan(val): - sum_val += val nobs_mean += 1 - values = values - sum_val / nobs_mean + mean = mean + 1 / nobs_mean * (val - mean) + values = values - mean with nogil: From a71f645d4f6ddcf0d23df946a2d5373162ad672d Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 1 Nov 2020 16:48:44 +0100 Subject: [PATCH 05/15] Change dtype assignment --- pandas/_libs/window/aggregations.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 5fe1c855e24de..e4cf50c820b33 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -490,9 +490,9 @@ cdef inline void remove_skew(float64_t val, int64_t *nobs, def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev, mean = 0.0 + float64_t val, prev, mean = 0, nobs_mean = 0 float64_t x = 0, xx = 0, xxx = 0 - int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 + int64_t nobs = 0, i, j, N = len(values) int64_t s, e ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -626,9 +626,9 @@ cdef inline void remove_kurt(float64_t val, int64_t *nobs, def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev, mean = 0.0 + float64_t val, prev, mean = 0, nobs_mean = 0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 + int64_t nobs = 0, i, j, s, e, N = len(values) ndarray[float64_t] output bint is_monotonic_increasing_bounds From a1874d88c8b054e1c1bf930d32fee391c233bed1 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 1 Nov 2020 16:56:50 +0100 Subject: [PATCH 06/15] Improve performance --- pandas/_libs/window/aggregations.pyx | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index e4cf50c820b33..a746ed8d7b978 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -490,9 +490,9 @@ cdef inline void remove_skew(float64_t val, int64_t *nobs, def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev, mean = 0, nobs_mean = 0 + float64_t val, prev, sum_val = 0 float64_t x = 0, xx = 0, xxx = 0 - int64_t nobs = 0, i, j, N = len(values) + int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 int64_t s, e ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -508,8 +508,8 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, val = values[i] if notnan(val): nobs_mean += 1 - mean = mean + 1 / nobs_mean * (val - mean) - values = values - mean + sum_val += val + values = values - round(sum_val / nobs_mean) with nogil: for i in range(0, N): @@ -626,9 +626,9 @@ cdef inline void remove_kurt(float64_t val, int64_t *nobs, def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev, mean = 0, nobs_mean = 0 + float64_t val, prev, sum_val = 0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, s, e, N = len(values) + int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -643,8 +643,8 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, val = values[i] if notnan(val): nobs_mean += 1 - mean = mean + 1 / nobs_mean * (val - mean) - values = values - mean + sum_val += val + values = values - round(sum_val / nobs_mean) with nogil: From eb359253efeb14bc3182a01c3a1678e97f9498d5 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 1 Nov 2020 21:09:33 +0100 Subject: [PATCH 07/15] Implement kahan summation --- pandas/_libs/window/aggregations.pyx | 163 +++++++++++++++++++-------- pandas/tests/window/test_rolling.py | 12 +- 2 files changed, 127 insertions(+), 48 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index a746ed8d7b978..46b1dd4046036 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -459,38 +459,67 @@ cdef inline float64_t calc_skew(int64_t minp, int64_t nobs, cdef inline void add_skew(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, - float64_t *xxx) nogil: + float64_t *xxx, + float64_t *compensation_x, + float64_t *compensation_xx, + float64_t *compensation_xxx) nogil: """ add a value from the skew calc """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] + 1 - # seriously don't ask me why this is faster - x[0] = x[0] + val - xx[0] = xx[0] + val * val - xxx[0] = xxx[0] + val * val * val + y = val - compensation_x[0] + t = x[0] + y + compensation_x[0] = t - x[0] - y + x[0] = t + y = val * val - compensation_xx[0] + t = xx[0] + y + compensation_xx[0] = t - xx[0] - y + xx[0] = t + y = val * val * val - compensation_xxx[0] + t = xxx[0] + y + compensation_xxx[0] = t - xxx[0] - y + xxx[0] = t cdef inline void remove_skew(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, - float64_t *xxx) nogil: + float64_t *xxx, + float64_t *compensation_x, + float64_t *compensation_xx, + float64_t *compensation_xxx) nogil: """ remove a value from the skew calc """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] - 1 - # seriously don't ask me why this is faster - x[0] = x[0] - val - xx[0] = xx[0] - val * val - xxx[0] = xxx[0] - val * val * val + y = - val - compensation_x[0] + t = x[0] + y + compensation_x[0] = t - x[0] - y + x[0] = t + y = - val * val - compensation_xx[0] + t = xx[0] + y + compensation_xx[0] = t - xx[0] - y + xx[0] = t + y = - val * val * val - compensation_xxx[0] + t = xxx[0] + y + compensation_xxx[0] = t - xxx[0] - y + xxx[0] = t def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev, sum_val = 0 + float64_t val, prev, min_val, mean_val, sum_val = 0 + float64_t compensation_xxx_add = 0, compensation_xxx_remove = 0 + float64_t compensation_xx_add = 0, compensation_xx_remove = 0 + float64_t compensation_x_add = 0, compensation_x_remove = 0 float64_t x = 0, xx = 0, xxx = 0 int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 int64_t s, e @@ -502,6 +531,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, start, end ) output = np.empty(N, dtype=float) + min_val = np.nanmin(values) with nogil: for i in range(0, N): @@ -509,7 +539,10 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, if notnan(val): nobs_mean += 1 sum_val += val - values = values - round(sum_val / nobs_mean) + mean_val = sum_val / nobs_mean + # Other cases would lead to imprecision for smallest values + if min_val - mean_val > -1e5: + values = values - round(sum_val / nobs_mean) with nogil: for i in range(0, N): @@ -523,22 +556,24 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, for j in range(s, e): val = values[j] - add_skew(val, &nobs, &x, &xx, &xxx) + add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, + &compensation_xx_add, &compensation_xxx_add) else: # After the first window, observations can both be added # and removed + # calculate deletes + for j in range(start[i - 1], s): + val = values[j] + remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove, + &compensation_xx_remove, &compensation_xxx_remove) # calculate adds for j in range(end[i - 1], e): val = values[j] - add_skew(val, &nobs, &x, &xx, &xxx) - - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - remove_skew(val, &nobs, &x, &xx, &xxx) + add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add, + &compensation_xx_add, &compensation_xxx_add) output[i] = calc_skew(minp, nobs, x, xx, xxx) @@ -593,42 +628,79 @@ cdef inline float64_t calc_kurt(int64_t minp, int64_t nobs, cdef inline void add_kurt(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, - float64_t *xxx, float64_t *xxxx) nogil: + float64_t *xxx, float64_t *xxxx, + float64_t *compensation_x, + float64_t *compensation_xx, + float64_t *compensation_xxx, + float64_t *compensation_xxxx) nogil: """ add a value from the kurotic calc """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] + 1 - # seriously don't ask me why this is faster - x[0] = x[0] + val - xx[0] = xx[0] + val * val - xxx[0] = xxx[0] + val * val * val - xxxx[0] = xxxx[0] + val * val * val * val + y = val - compensation_x[0] + t = x[0] + y + compensation_x[0] = t - x[0] - y + x[0] = t + y = val * val - compensation_xx[0] + t = xx[0] + y + compensation_xx[0] = t - xx[0] - y + xx[0] = t + y = val * val * val - compensation_xxx[0] + t = xxx[0] + y + compensation_xxx[0] = t - xxx[0] - y + xxx[0] = t + y = val * val * val * val - compensation_xxxx[0] + t = xxxx[0] + y + compensation_xxxx[0] = t - xxxx[0] - y + xxxx[0] = t cdef inline void remove_kurt(float64_t val, int64_t *nobs, float64_t *x, float64_t *xx, - float64_t *xxx, float64_t *xxxx) nogil: + float64_t *xxx, float64_t *xxxx, + float64_t *compensation_x, + float64_t *compensation_xx, + float64_t *compensation_xxx, + float64_t *compensation_xxxx) nogil: """ remove a value from the kurotic calc """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] - 1 - # seriously don't ask me why this is faster - x[0] = x[0] - val - xx[0] = xx[0] - val * val - xxx[0] = xxx[0] - val * val * val - xxxx[0] = xxxx[0] - val * val * val * val + y = - val - compensation_x[0] + t = x[0] + y + compensation_x[0] = t - x[0] - y + x[0] = t + y = - val * val - compensation_xx[0] + t = xx[0] + y + compensation_xx[0] = t - xx[0] - y + xx[0] = t + y = - val * val * val - compensation_xxx[0] + t = xxx[0] + y + compensation_xxx[0] = t - xxx[0] - y + xxx[0] = t + y = - val * val * val * val - compensation_xxxx[0] + t = xxxx[0] + y + compensation_xxxx[0] = t - xxxx[0] - y + xxxx[0] = t def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev, sum_val = 0 + float64_t val, prev, compensation_xxxx_add = 0, compensation_xxxx_remove = 0 + float64_t compensation_xxx_remove = 0, compensation_xxx_add = 0 + float64_t compensation_xx_remove = 0, compensation_xx_add = 0 + float64_t compensation_x_remove = 0, compensation_x_add = 0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 + int64_t nobs = 0, i, j, s, e, N = len(values) ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -638,14 +710,6 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ) output = np.empty(N, dtype=float) - with nogil: - for i in range(0, N): - val = values[i] - if notnan(val): - nobs_mean += 1 - sum_val += val - values = values - round(sum_val / nobs_mean) - with nogil: for i in range(0, N): @@ -658,20 +722,25 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, if i == 0 or not is_monotonic_increasing_bounds: for j in range(s, e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + &compensation_x_add, &compensation_xx_add, + &compensation_xxx_add, &compensation_xxxx_add) else: # After the first window, observations can both be added # and removed + # calculate deletes + for j in range(start[i - 1], s): + remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + &compensation_x_remove, &compensation_xx_remove, + &compensation_xxx_remove, &compensation_xxxx_remove) # calculate adds for j in range(end[i - 1], e): - add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) - - # calculate deletes - for j in range(start[i - 1], s): - remove_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx) + add_kurt(values[j], &nobs, &x, &xx, &xxx, &xxxx, + &compensation_x_add, &compensation_xx_add, + &compensation_xxx_add, &compensation_xxxx_add) output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index a2320153393e1..937f8edc8cb67 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1094,6 +1094,16 @@ def test_rolling_skew_kurt_numerical_stability(method): # GH: 6929 s = Series(np.random.rand(10)) expected = getattr(s.rolling(3), method)() - s = s + 5000 + s = s + 50000 result = getattr(s.rolling(3), method)() tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize(("method", "values"), [("skew", [2.0, 0.854563, 0.0, 1.999984]), + ("kurt", [4.0, -1.289256, -1.2, 3.999946])]) +def test_rolling_skew_kurt_large_value_range(method, values): + # GH: 37557 + s = pd.Series([3000000, 1, 1, 2, 3, 4, 999]) + result = getattr(s.rolling(4), method)() + expected = pd.Series([np.nan] * 3 + values) + tm.assert_series_equal(result, expected) From 480ba26ca9d13adf9f57b8206f350c110917f690 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 1 Nov 2020 21:10:04 +0100 Subject: [PATCH 08/15] Change whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 15ca1a615bc10..eee5eec8ef225 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -228,7 +228,7 @@ Other enhancements - :class:`Rolling` now supports the ``closed`` argument for fixed windows (:issue:`34315`) - :class:`DatetimeIndex` and :class:`Series` with ``datetime64`` or ``datetime64tz`` dtypes now support ``std`` (:issue:`37436`) - :class:`Window` now supports all Scipy window types in ``win_type`` with flexible keyword argument support (:issue:`34556`) -- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` (:issue:`6929`) +- Improve numerical stability for :meth:`Rolling.skew()`, :meth:`Rolling.kurt()`, :meth:`Expanding.skew()` and :meth:`Expanding.kurt()` through implementation of Kahan summation (:issue:`6929`) .. _whatsnew_120.api_breaking.python: From c58be86a7d3fae5c0cb3c8c8fe70588fe311d431 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 1 Nov 2020 21:11:05 +0100 Subject: [PATCH 09/15] RUn black --- pandas/tests/window/test_rolling.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 937f8edc8cb67..188d21ce1479c 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1099,8 +1099,13 @@ def test_rolling_skew_kurt_numerical_stability(method): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize(("method", "values"), [("skew", [2.0, 0.854563, 0.0, 1.999984]), - ("kurt", [4.0, -1.289256, -1.2, 3.999946])]) +@pytest.mark.parametrize( + ("method", "values"), + [ + ("skew", [2.0, 0.854563, 0.0, 1.999984]), + ("kurt", [4.0, -1.289256, -1.2, 3.999946]), + ], +) def test_rolling_skew_kurt_large_value_range(method, values): # GH: 37557 s = pd.Series([3000000, 1, 1, 2, 3, 4, 999]) From fa460b1dc4c940260567646601fbc0387d0898ee Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 1 Nov 2020 21:29:32 +0100 Subject: [PATCH 10/15] Fix pattern --- pandas/tests/window/test_rolling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 188d21ce1479c..01628d2e016e4 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1108,7 +1108,7 @@ def test_rolling_skew_kurt_numerical_stability(method): ) def test_rolling_skew_kurt_large_value_range(method, values): # GH: 37557 - s = pd.Series([3000000, 1, 1, 2, 3, 4, 999]) + s = Series([3000000, 1, 1, 2, 3, 4, 999]) result = getattr(s.rolling(4), method)() - expected = pd.Series([np.nan] * 3 + values) + expected = Series([np.nan] * 3 + values) tm.assert_series_equal(result, expected) From ce0a9a246bc6230fc7798860214cea21bfe8e5d3 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 1 Nov 2020 21:34:23 +0100 Subject: [PATCH 11/15] Fix failing test --- pandas/_libs/window/aggregations.pyx | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 46b1dd4046036..8bb0098924bb0 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -695,12 +695,13 @@ cdef inline void remove_kurt(float64_t val, int64_t *nobs, def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, prev, compensation_xxxx_add = 0, compensation_xxxx_remove = 0 + float64_t val, prev, mean_val, min_val, sum_val = 0 + float64_t compensation_xxxx_add = 0, compensation_xxxx_remove = 0 float64_t compensation_xxx_remove = 0, compensation_xxx_add = 0 float64_t compensation_xx_remove = 0, compensation_xx_add = 0 float64_t compensation_x_remove = 0, compensation_x_add = 0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, s, e, N = len(values) + int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -709,6 +710,18 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, start, end ) output = np.empty(N, dtype=float) + min_val = np.nanmin(values) + + with nogil: + for i in range(0, N): + val = values[i] + if notnan(val): + nobs_mean += 1 + sum_val += val + mean_val = sum_val / nobs_mean + # Other cases would lead to imprecision for smallest values + if min_val - mean_val > -1e4: + values = values - round(sum_val / nobs_mean) with nogil: From e643a45682425e9c2ecac60762f992694eda1a3d Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 1 Nov 2020 21:35:15 +0100 Subject: [PATCH 12/15] Remove unnecessary division --- pandas/_libs/window/aggregations.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 8bb0098924bb0..c3d9b0b8ef160 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -542,7 +542,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, mean_val = sum_val / nobs_mean # Other cases would lead to imprecision for smallest values if min_val - mean_val > -1e5: - values = values - round(sum_val / nobs_mean) + values = values - round(mean_val) with nogil: for i in range(0, N): @@ -721,7 +721,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, mean_val = sum_val / nobs_mean # Other cases would lead to imprecision for smallest values if min_val - mean_val > -1e4: - values = values - round(sum_val / nobs_mean) + values = values - round(mean_val) with nogil: From bfedac0bdbc124304b9af1a2f0d0ef6b71f2f07f Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 8 Nov 2020 12:04:03 +0100 Subject: [PATCH 13/15] Hold no gil permanently --- pandas/_libs/window/aggregations.pyx | 29 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index aa6ec61cc8bd4..bf9b95220e02e 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -520,9 +520,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, float64_t compensation_xx_add = 0, compensation_xx_remove = 0 float64_t compensation_x_add = 0, compensation_x_remove = 0 float64_t x = 0, xx = 0, xxx = 0 - int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 + int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0, mean_int = 0 int64_t s, e - ndarray[float64_t] output + ndarray[float64_t] output, mean_array bint is_monotonic_increasing_bounds minp = max(minp, 3) @@ -538,12 +538,13 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, if notnan(val): nobs_mean += 1 sum_val += val - mean_val = sum_val / nobs_mean - # Other cases would lead to imprecision for smallest values - if min_val - mean_val > -1e5: - values = values - round(mean_val) + mean_val = sum_val / nobs_mean + # Other cases would lead to imprecision for smallest values + if min_val - mean_val > -1e5: + mean_int = mean_val + for i in range(0, N): + values[i] = values[i] - mean_int - with nogil: for i in range(0, N): s = start[i] @@ -700,7 +701,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, float64_t compensation_xx_remove = 0, compensation_xx_add = 0 float64_t compensation_x_remove = 0, compensation_x_add = 0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 + int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0, mean_int = 0 ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -717,12 +718,12 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, if notnan(val): nobs_mean += 1 sum_val += val - mean_val = sum_val / nobs_mean - # Other cases would lead to imprecision for smallest values - if min_val - mean_val > -1e4: - values = values - round(mean_val) - - with nogil: + mean_val = sum_val / nobs_mean + # Other cases would lead to imprecision for smallest values + if min_val - mean_val > -1e4: + mean_int = mean_val + for i in range(0, N): + values[i] = values[i] - mean_int for i in range(0, N): From 852f3ca89380de5f2f487a84e16674a6a48f488b Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 9 Nov 2020 00:33:21 +0100 Subject: [PATCH 14/15] Use round from math --- pandas/_libs/window/aggregations.pyx | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index bf9b95220e02e..4de7a5860c465 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -2,6 +2,7 @@ import cython +from libc.math cimport round from libcpp.deque cimport deque import numpy as np @@ -520,7 +521,7 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, float64_t compensation_xx_add = 0, compensation_xx_remove = 0 float64_t compensation_x_add = 0, compensation_x_remove = 0 float64_t x = 0, xx = 0, xxx = 0 - int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0, mean_int = 0 + int64_t nobs = 0, i, j, N = len(values), nobs_mean = 0 int64_t s, e ndarray[float64_t] output, mean_array bint is_monotonic_increasing_bounds @@ -541,9 +542,9 @@ def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start, mean_val = sum_val / nobs_mean # Other cases would lead to imprecision for smallest values if min_val - mean_val > -1e5: - mean_int = mean_val + mean_val = round(mean_val) for i in range(0, N): - values[i] = values[i] - mean_int + values[i] = values[i] - mean_val for i in range(0, N): @@ -701,7 +702,7 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, float64_t compensation_xx_remove = 0, compensation_xx_add = 0 float64_t compensation_x_remove = 0, compensation_x_add = 0 float64_t x = 0, xx = 0, xxx = 0, xxxx = 0 - int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0, mean_int = 0 + int64_t nobs = 0, i, j, s, e, N = len(values), nobs_mean = 0 ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -721,9 +722,9 @@ def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start, mean_val = sum_val / nobs_mean # Other cases would lead to imprecision for smallest values if min_val - mean_val > -1e4: - mean_int = mean_val + mean_val = round(mean_val) for i in range(0, N): - values[i] = values[i] - mean_int + values[i] = values[i] - mean_val for i in range(0, N): From 4688648178aaa9e53d82de898972786d0c5ffe69 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 9 Nov 2020 00:41:17 +0100 Subject: [PATCH 15/15] Fix pattern --- pandas/tests/window/test_expanding.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 1af1446ee7b53..7b3e1cd357c63 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -252,15 +252,15 @@ def test_expanding_sem(constructor): obj = getattr(pd, constructor)([0, 1, 2]) result = obj.expanding().sem() if isinstance(result, DataFrame): - result = pd.Series(result[0].values) - expected = pd.Series([np.nan] + [0.707107] * 2) + result = Series(result[0].values) + expected = Series([np.nan] + [0.707107] * 2) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("method", ["skew", "kurt"]) def test_expanding_skew_kurt_numerical_stability(method): # GH: 6929 - s = pd.Series(np.random.rand(10)) + s = Series(np.random.rand(10)) expected = getattr(s.expanding(3), method)() s = s + 5000 result = getattr(s.expanding(3), method)()