Skip to content

[BUG]: Implement Kahan summation for rolling().mean() to avoid numerical issues #36348

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Sep 15, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ Other enhancements
- :class:`Index` with object dtype supports division and multiplication (:issue:`34160`)
- :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`)
- `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
- :meth:`Series.rolling().mean()` and :meth:`DataFrame.rolling().mean()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`36031`)

.. _whatsnew_120.api_breaking.python:

Expand Down
40 changes: 25 additions & 15 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -277,32 +277,42 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,


cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
Py_ssize_t *neg_ct) nogil:
""" add a value from the mean calc """
Py_ssize_t *neg_ct, float64_t *c) nogil:
""" add a value from the mean calc using Kahan summation """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] + 1
sum_x[0] = sum_x[0] + val
y = val - c[0]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a comment that using kahan summation

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I already added a comment in the function header. Is this sufficient?

t = sum_x[0] + y
c[0] = t - sum_x[0] - y
sum_x[0] = t
if signbit(val):
neg_ct[0] = neg_ct[0] + 1


cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
Py_ssize_t *neg_ct) nogil:
""" remove a value from the mean calc """
Py_ssize_t *neg_ct, float64_t *c) nogil:
""" remove a value from the mean calc using Kahan summation """
cdef:
float64_t y, t

if notnan(val):
nobs[0] = nobs[0] - 1
sum_x[0] = sum_x[0] - val
y = - val - c[0]
t = sum_x[0] + y
c[0] = t - sum_x[0] - y
sum_x[0] = t
if signbit(val):
neg_ct[0] = neg_ct[0] - 1


def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp, int64_t win):
cdef:
float64_t val, prev_x, sum_x = 0
float64_t val, prev_x, sum_x = 0, c_add = 0, c_remove = 0
Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values)
ndarray[float64_t] output

Expand All @@ -311,16 +321,16 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
with nogil:
for i in range(minp - 1):
val = values[i]
add_mean(val, &nobs, &sum_x, &neg_ct)
add_mean(val, &nobs, &sum_x, &neg_ct, &c_add)
output[i] = NaN

for i in range(minp - 1, N):
val = values[i]
add_mean(val, &nobs, &sum_x, &neg_ct)
add_mean(val, &nobs, &sum_x, &neg_ct, &c_add)

if i > win - 1:
prev_x = values[i - win]
remove_mean(prev_x, &nobs, &sum_x, &neg_ct)
remove_mean(prev_x, &nobs, &sum_x, &neg_ct, &c_remove)

output[i] = calc_mean(minp, nobs, neg_ct, sum_x)

Expand All @@ -330,7 +340,7 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp):
cdef:
float64_t val, sum_x = 0
float64_t val, c_add = 0, c_remove = 0, sum_x = 0
int64_t s, e
Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values)
ndarray[float64_t] output
Expand All @@ -350,26 +360,26 @@ def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start,
# setup
for j in range(s, e):
val = values[j]
add_mean(val, &nobs, &sum_x, &neg_ct)
add_mean(val, &nobs, &sum_x, &neg_ct, &c_add)

else:

# calculate deletes
for j in range(start[i - 1], s):
val = values[j]
remove_mean(val, &nobs, &sum_x, &neg_ct)
remove_mean(val, &nobs, &sum_x, &neg_ct, &c_remove)

# calculate adds
for j in range(end[i - 1], e):
val = values[j]
add_mean(val, &nobs, &sum_x, &neg_ct)
add_mean(val, &nobs, &sum_x, &neg_ct, &c_add)

output[i] = calc_mean(minp, nobs, neg_ct, sum_x)

if not is_monotonic_bounds:
for j in range(s, e):
val = values[j]
remove_mean(val, &nobs, &sum_x, &neg_ct)
remove_mean(val, &nobs, &sum_x, &neg_ct, &c_remove)
return output

# ----------------------------------------------------------------------
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,3 +696,21 @@ def scaled_sum(*args):
expected = DataFrame(data={"X": [0.0, 0.5, 1.0, 1.5, 2.0]}, index=_index)
result = df.groupby(**grouping).rolling(1).apply(scaled_sum, raw=raw, args=(2,))
tm.assert_frame_equal(result, expected)


def test_rolling_numerical_accuracy_kahan():
# GH: 36031 implementing kahan summation
df = pd.DataFrame(
{
"A": [3002399751580331.0, -0.0, -0.0]
}, # First value is a single digit longer.
index=[
pd.Timestamp("19700101 09:00:00"),
pd.Timestamp("19700101 09:00:03"),
pd.Timestamp("19700101 09:00:06"),
],
)
result = (
df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean()
)
assert result.values[-1] == 0.0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we compare the entire DataFrame result for this test as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done