Skip to content

Commit 15d818b

Browse files
authored
ENH: Use Kahan summation and Welfords method to calculate rolling var and std (#37055)
1 parent ca27cca commit 15d818b

File tree

3 files changed

+49
-19
lines changed

3 files changed

+49
-19
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,7 @@ Other enhancements
192192
- Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`)
193193
- Where possible :meth:`RangeIndex.difference` and :meth:`RangeIndex.symmetric_difference` will return :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`36564`)
194194
- Added :meth:`Rolling.sem()` and :meth:`Expanding.sem()` to compute the standard error of mean (:issue:`26476`).
195+
- :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welfords Method to avoid numerical issues (:issue:`37051`)
195196

196197
.. _whatsnew_120.api_breaking.python:
197198

pandas/_libs/window/aggregations.pyx

+31-19
Original file line numberDiff line numberDiff line change
@@ -311,37 +311,46 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
311311

312312

313313
cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
314-
float64_t *ssqdm_x) nogil:
314+
float64_t *ssqdm_x, float64_t *compensation) nogil:
315315
""" add a value from the var calc """
316316
cdef:
317-
float64_t delta
317+
float64_t delta, prev_mean, y, t
318318

319319
# `isnan` instead of equality as fix for GH-21813, msvc 2017 bug
320320
if isnan(val):
321321
return
322322

323323
nobs[0] = nobs[0] + 1
324-
# a part of Welford's method for the online variance-calculation
324+
# Welford's method for the online variance-calculation
325+
# using Kahan summation
325326
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
326-
delta = val - mean_x[0]
327+
prev_mean = mean_x[0] - compensation[0]
328+
y = val - compensation[0]
329+
t = y - mean_x[0]
330+
compensation[0] = t + mean_x[0] - y
331+
delta = t
327332
mean_x[0] = mean_x[0] + delta / nobs[0]
328-
ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0]
333+
ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0])
329334

330335

331336
cdef inline void remove_var(float64_t val, float64_t *nobs, float64_t *mean_x,
332-
float64_t *ssqdm_x) nogil:
337+
float64_t *ssqdm_x, float64_t *compensation) nogil:
333338
""" remove a value from the var calc """
334339
cdef:
335-
float64_t delta
336-
340+
float64_t delta, prev_mean, y, t
337341
if notnan(val):
338342
nobs[0] = nobs[0] - 1
339343
if nobs[0]:
340-
# a part of Welford's method for the online variance-calculation
344+
# Welford's method for the online variance-calculation
345+
# using Kahan summation
341346
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
342-
delta = val - mean_x[0]
347+
prev_mean = mean_x[0] - compensation[0]
348+
y = val - compensation[0]
349+
t = y - mean_x[0]
350+
compensation[0] = t + mean_x[0] - y
351+
delta = t
343352
mean_x[0] = mean_x[0] - delta / nobs[0]
344-
ssqdm_x[0] = ssqdm_x[0] - ((nobs[0] + 1) * delta ** 2) / nobs[0]
353+
ssqdm_x[0] = ssqdm_x[0] - (val - prev_mean) * (val - mean_x[0])
345354
else:
346355
mean_x[0] = 0
347356
ssqdm_x[0] = 0
@@ -353,7 +362,8 @@ def roll_var(ndarray[float64_t] values, ndarray[int64_t] start,
353362
Numerically stable implementation using Welford's method.
354363
"""
355364
cdef:
356-
float64_t mean_x = 0, ssqdm_x = 0, nobs = 0,
365+
float64_t mean_x = 0, ssqdm_x = 0, nobs = 0, compensation_add = 0,
366+
float64_t compensation_remove = 0,
357367
float64_t val, prev, delta, mean_x_old
358368
int64_t s, e
359369
Py_ssize_t i, j, N = len(values)
@@ -375,26 +385,28 @@ def roll_var(ndarray[float64_t] values, ndarray[int64_t] start,
375385
if i == 0 or not is_monotonic_bounds:
376386

377387
for j in range(s, e):
378-
add_var(values[j], &nobs, &mean_x, &ssqdm_x)
388+
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
379389

380390
else:
381391

382392
# After the first window, observations can both be added
383393
# and removed
384394

385-
# calculate adds
386-
for j in range(end[i - 1], e):
387-
add_var(values[j], &nobs, &mean_x, &ssqdm_x)
388-
389395
# calculate deletes
390396
for j in range(start[i - 1], s):
391-
remove_var(values[j], &nobs, &mean_x, &ssqdm_x)
397+
remove_var(values[j], &nobs, &mean_x, &ssqdm_x,
398+
&compensation_remove)
399+
400+
# calculate adds
401+
for j in range(end[i - 1], e):
402+
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
392403

393404
output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
394405

395406
if not is_monotonic_bounds:
396407
for j in range(s, e):
397-
remove_var(values[j], &nobs, &mean_x, &ssqdm_x)
408+
remove_var(values[j], &nobs, &mean_x, &ssqdm_x,
409+
&compensation_remove)
398410

399411
return output
400412

pandas/tests/window/test_rolling.py

+17
Original file line numberDiff line numberDiff line change
@@ -879,3 +879,20 @@ def test_rolling_sem(constructor):
879879
result = pd.Series(result[0].values)
880880
expected = pd.Series([np.nan] + [0.707107] * 2)
881881
tm.assert_series_equal(result, expected)
882+
883+
884+
@pytest.mark.parametrize(
885+
("func", "third_value", "values"),
886+
[
887+
("var", 1, [5e33, 0, 0.5, 0.5, 2, 0]),
888+
("std", 1, [7.071068e16, 0, 0.7071068, 0.7071068, 1.414214, 0]),
889+
("var", 2, [5e33, 0.5, 0, 0.5, 2, 0]),
890+
("std", 2, [7.071068e16, 0.7071068, 0, 0.7071068, 1.414214, 0]),
891+
],
892+
)
893+
def test_rolling_var_numerical_issues(func, third_value, values):
894+
# GH: 37051
895+
ds = pd.Series([99999999999999999, 1, third_value, 2, 3, 1, 1])
896+
result = getattr(ds.rolling(2), func)()
897+
expected = pd.Series([np.nan] + values)
898+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)