Skip to content

Commit e173701

Browse files
phoflKevin D Smith
authored and
Kevin D Smith
committed
[BUG]: Implement Kahan summation for rolling().mean() to avoid numerical issues (pandas-dev#36348)
1 parent dc8d42c commit e173701

File tree

3 files changed

+131
-30
lines changed

3 files changed

+131
-30
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ Other enhancements
118118
- :class:`Index` with object dtype supports division and multiplication (:issue:`34160`)
119119
- :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`)
120120
- `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
121+
- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)
121122

122123
.. _whatsnew_120.api_breaking.python:
123124

pandas/_libs/window/aggregations.pyx

+55-30
Original file line numberDiff line numberDiff line change
@@ -161,27 +161,42 @@ cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogi
161161
return result
162162

163163

164-
cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil:
165-
""" add a value from the sum calc """
164+
cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
165+
float64_t *compensation) nogil:
166+
""" add a value from the sum calc using Kahan summation """
167+
168+
cdef:
169+
float64_t y, t
166170

167171
# Not NaN
168172
if notnan(val):
169173
nobs[0] = nobs[0] + 1
170-
sum_x[0] = sum_x[0] + val
174+
y = val - compensation[0]
175+
t = sum_x[0] + y
176+
compensation[0] = t - sum_x[0] - y
177+
sum_x[0] = t
171178

172179

173-
cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil:
174-
""" remove a value from the sum calc """
180+
cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
181+
float64_t *compensation) nogil:
182+
""" remove a value from the sum calc using Kahan summation """
183+
184+
cdef:
185+
float64_t y, t
175186

187+
# Not NaN
176188
if notnan(val):
177189
nobs[0] = nobs[0] - 1
178-
sum_x[0] = sum_x[0] - val
190+
y = - val - compensation[0]
191+
t = sum_x[0] + y
192+
compensation[0] = t - sum_x[0] - y
193+
sum_x[0] = t
179194

180195

181196
def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start,
182197
ndarray[int64_t] end, int64_t minp):
183198
cdef:
184-
float64_t sum_x = 0
199+
float64_t sum_x = 0, compensation_add = 0, compensation_remove = 0
185200
int64_t s, e
186201
int64_t nobs = 0, i, j, N = len(values)
187202
ndarray[float64_t] output
@@ -201,31 +216,31 @@ def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start,
201216
# setup
202217

203218
for j in range(s, e):
204-
add_sum(values[j], &nobs, &sum_x)
219+
add_sum(values[j], &nobs, &sum_x, &compensation_add)
205220

206221
else:
207222

208223
# calculate deletes
209224
for j in range(start[i - 1], s):
210-
remove_sum(values[j], &nobs, &sum_x)
225+
remove_sum(values[j], &nobs, &sum_x, &compensation_remove)
211226

212227
# calculate adds
213228
for j in range(end[i - 1], e):
214-
add_sum(values[j], &nobs, &sum_x)
229+
add_sum(values[j], &nobs, &sum_x, &compensation_add)
215230

216231
output[i] = calc_sum(minp, nobs, sum_x)
217232

218233
if not is_monotonic_bounds:
219234
for j in range(s, e):
220-
remove_sum(values[j], &nobs, &sum_x)
235+
remove_sum(values[j], &nobs, &sum_x, &compensation_remove)
221236

222237
return output
223238

224239

225240
def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
226241
ndarray[int64_t] end, int64_t minp, int64_t win):
227242
cdef:
228-
float64_t val, prev_x, sum_x = 0
243+
float64_t val, prev_x, sum_x = 0, compensation_add = 0, compensation_remove = 0
229244
int64_t range_endpoint
230245
int64_t nobs = 0, i, N = len(values)
231246
ndarray[float64_t] output
@@ -237,16 +252,16 @@ def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
237252
with nogil:
238253

239254
for i in range(0, range_endpoint):
240-
add_sum(values[i], &nobs, &sum_x)
255+
add_sum(values[i], &nobs, &sum_x, &compensation_add)
241256
output[i] = NaN
242257

243258
for i in range(range_endpoint, N):
244259
val = values[i]
245-
add_sum(val, &nobs, &sum_x)
260+
add_sum(val, &nobs, &sum_x, &compensation_add)
246261

247262
if i > win - 1:
248263
prev_x = values[i - win]
249-
remove_sum(prev_x, &nobs, &sum_x)
264+
remove_sum(prev_x, &nobs, &sum_x, &compensation_remove)
250265

251266
output[i] = calc_sum(minp, nobs, sum_x)
252267

@@ -277,32 +292,42 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,
277292

278293

279294
cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
280-
Py_ssize_t *neg_ct) nogil:
281-
""" add a value from the mean calc """
295+
Py_ssize_t *neg_ct, float64_t *compensation) nogil:
296+
""" add a value from the mean calc using Kahan summation """
297+
cdef:
298+
float64_t y, t
282299

283300
# Not NaN
284301
if notnan(val):
285302
nobs[0] = nobs[0] + 1
286-
sum_x[0] = sum_x[0] + val
303+
y = val - compensation[0]
304+
t = sum_x[0] + y
305+
compensation[0] = t - sum_x[0] - y
306+
sum_x[0] = t
287307
if signbit(val):
288308
neg_ct[0] = neg_ct[0] + 1
289309

290310

291311
cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
292-
Py_ssize_t *neg_ct) nogil:
293-
""" remove a value from the mean calc """
312+
Py_ssize_t *neg_ct, float64_t *compensation) nogil:
313+
""" remove a value from the mean calc using Kahan summation """
314+
cdef:
315+
float64_t y, t
294316

295317
if notnan(val):
296318
nobs[0] = nobs[0] - 1
297-
sum_x[0] = sum_x[0] - val
319+
y = - val - compensation[0]
320+
t = sum_x[0] + y
321+
compensation[0] = t - sum_x[0] - y
322+
sum_x[0] = t
298323
if signbit(val):
299324
neg_ct[0] = neg_ct[0] - 1
300325

301326

302327
def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
303328
ndarray[int64_t] end, int64_t minp, int64_t win):
304329
cdef:
305-
float64_t val, prev_x, sum_x = 0
330+
float64_t val, prev_x, sum_x = 0, compensation_add = 0, compensation_remove = 0
306331
Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values)
307332
ndarray[float64_t] output
308333

@@ -311,16 +336,16 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
311336
with nogil:
312337
for i in range(minp - 1):
313338
val = values[i]
314-
add_mean(val, &nobs, &sum_x, &neg_ct)
339+
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
315340
output[i] = NaN
316341

317342
for i in range(minp - 1, N):
318343
val = values[i]
319-
add_mean(val, &nobs, &sum_x, &neg_ct)
344+
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
320345

321346
if i > win - 1:
322347
prev_x = values[i - win]
323-
remove_mean(prev_x, &nobs, &sum_x, &neg_ct)
348+
remove_mean(prev_x, &nobs, &sum_x, &neg_ct, &compensation_remove)
324349

325350
output[i] = calc_mean(minp, nobs, neg_ct, sum_x)
326351

@@ -330,7 +355,7 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
330355
def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start,
331356
ndarray[int64_t] end, int64_t minp):
332357
cdef:
333-
float64_t val, sum_x = 0
358+
float64_t val, compensation_add = 0, compensation_remove = 0, sum_x = 0
334359
int64_t s, e
335360
Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values)
336361
ndarray[float64_t] output
@@ -350,26 +375,26 @@ def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start,
350375
# setup
351376
for j in range(s, e):
352377
val = values[j]
353-
add_mean(val, &nobs, &sum_x, &neg_ct)
378+
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
354379

355380
else:
356381

357382
# calculate deletes
358383
for j in range(start[i - 1], s):
359384
val = values[j]
360-
remove_mean(val, &nobs, &sum_x, &neg_ct)
385+
remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove)
361386

362387
# calculate adds
363388
for j in range(end[i - 1], e):
364389
val = values[j]
365-
add_mean(val, &nobs, &sum_x, &neg_ct)
390+
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
366391

367392
output[i] = calc_mean(minp, nobs, neg_ct, sum_x)
368393

369394
if not is_monotonic_bounds:
370395
for j in range(s, e):
371396
val = values[j]
372-
remove_mean(val, &nobs, &sum_x, &neg_ct)
397+
remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove)
373398
return output
374399

375400
# ----------------------------------------------------------------------

pandas/tests/window/test_rolling.py

+75
Original file line numberDiff line numberDiff line change
@@ -696,3 +696,78 @@ def scaled_sum(*args):
696696
expected = DataFrame(data={"X": [0.0, 0.5, 1.0, 1.5, 2.0]}, index=_index)
697697
result = df.groupby(**grouping).rolling(1).apply(scaled_sum, raw=raw, args=(2,))
698698
tm.assert_frame_equal(result, expected)
699+
700+
701+
@pytest.mark.parametrize("add", [0.0, 2.0])
702+
def test_rolling_numerical_accuracy_kahan_mean(add):
703+
# GH: 36031 implementing kahan summation
704+
df = pd.DataFrame(
705+
{"A": [3002399751580331.0 + add, -0.0, -0.0]},
706+
index=[
707+
pd.Timestamp("19700101 09:00:00"),
708+
pd.Timestamp("19700101 09:00:03"),
709+
pd.Timestamp("19700101 09:00:06"),
710+
],
711+
)
712+
result = (
713+
df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean()
714+
)
715+
dates = pd.date_range("19700101 09:00:00", periods=7, freq="S")
716+
expected = pd.DataFrame(
717+
{
718+
"A": [
719+
np.nan,
720+
np.nan,
721+
np.nan,
722+
3002399751580330.5,
723+
2001599834386887.25,
724+
1000799917193443.625,
725+
0.0,
726+
]
727+
},
728+
index=dates,
729+
)
730+
tm.assert_frame_equal(result, expected)
731+
732+
733+
def test_rolling_numerical_accuracy_kahan_sum():
734+
# GH: 13254
735+
df = pd.DataFrame([2.186, -1.647, 0.0, 0.0, 0.0, 0.0], columns=["x"])
736+
result = df["x"].rolling(3).sum()
737+
expected = pd.Series([np.nan, np.nan, 0.539, -1.647, 0.0, 0.0], name="x")
738+
tm.assert_series_equal(result, expected)
739+
740+
741+
def test_rolling_numerical_accuracy_jump():
742+
# GH: 32761
743+
index = pd.date_range(start="2020-01-01", end="2020-01-02", freq="60s").append(
744+
pd.DatetimeIndex(["2020-01-03"])
745+
)
746+
data = np.random.rand(len(index))
747+
748+
df = pd.DataFrame({"data": data}, index=index)
749+
result = df.rolling("60s").mean()
750+
tm.assert_frame_equal(result, df[["data"]])
751+
752+
753+
def test_rolling_numerical_accuracy_small_values():
754+
# GH: 10319
755+
s = Series(
756+
data=[0.00012456, 0.0003, -0.0, -0.0],
757+
index=date_range("1999-02-03", "1999-02-06"),
758+
)
759+
result = s.rolling(1).mean()
760+
tm.assert_series_equal(result, s)
761+
762+
763+
def test_rolling_numerical_too_large_numbers():
764+
# GH: 11645
765+
dates = pd.date_range("2015-01-01", periods=10, freq="D")
766+
ds = pd.Series(data=range(10), index=dates, dtype=np.float64)
767+
ds[2] = -9e33
768+
result = ds.rolling(5).mean()
769+
expected = pd.Series(
770+
[np.nan, np.nan, np.nan, np.nan, -1.8e33, -1.8e33, -1.8e33, 0.0, 6.0, 7.0],
771+
index=dates,
772+
)
773+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)