Skip to content

[BUG]: Implement Kahan summation for rolling().mean() to avoid numerical issues #36348

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Sep 15, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ Other enhancements
- :class:`Index` with object dtype supports division and multiplication (:issue:`34160`)
- :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`)
- `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`)
- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`)

.. _whatsnew_120.api_breaking.python:

Expand Down
85 changes: 55 additions & 30 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -161,27 +161,42 @@ cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogi
return result


cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil:
""" add a value from the sum calc """
cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
float64_t *compensation) nogil:
""" add a value from the sum calc using Kahan summation """

cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] + 1
sum_x[0] = sum_x[0] + val
y = val - compensation[0]
t = sum_x[0] + y
compensation[0] = t - sum_x[0] - y
sum_x[0] = t


cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil:
""" remove a value from the sum calc """
cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
float64_t *compensation) nogil:
""" remove a value from the sum calc using Kahan summation """

cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] - 1
sum_x[0] = sum_x[0] - val
y = - val - compensation[0]
t = sum_x[0] + y
compensation[0] = t - sum_x[0] - y
sum_x[0] = t


def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp):
cdef:
float64_t sum_x = 0
float64_t sum_x = 0, compensation_add = 0, compensation_remove = 0
int64_t s, e
int64_t nobs = 0, i, j, N = len(values)
ndarray[float64_t] output
Expand All @@ -201,31 +216,31 @@ def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start,
# setup

for j in range(s, e):
add_sum(values[j], &nobs, &sum_x)
add_sum(values[j], &nobs, &sum_x, &compensation_add)

else:

# calculate deletes
for j in range(start[i - 1], s):
remove_sum(values[j], &nobs, &sum_x)
remove_sum(values[j], &nobs, &sum_x, &compensation_remove)

# calculate adds
for j in range(end[i - 1], e):
add_sum(values[j], &nobs, &sum_x)
add_sum(values[j], &nobs, &sum_x, &compensation_add)

output[i] = calc_sum(minp, nobs, sum_x)

if not is_monotonic_bounds:
for j in range(s, e):
remove_sum(values[j], &nobs, &sum_x)
remove_sum(values[j], &nobs, &sum_x, &compensation_remove)

return output


def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp, int64_t win):
cdef:
float64_t val, prev_x, sum_x = 0
float64_t val, prev_x, sum_x = 0, compensation_add = 0, compensation_remove = 0
int64_t range_endpoint
int64_t nobs = 0, i, N = len(values)
ndarray[float64_t] output
Expand All @@ -237,16 +252,16 @@ def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
with nogil:

for i in range(0, range_endpoint):
add_sum(values[i], &nobs, &sum_x)
add_sum(values[i], &nobs, &sum_x, &compensation_add)
output[i] = NaN

for i in range(range_endpoint, N):
val = values[i]
add_sum(val, &nobs, &sum_x)
add_sum(val, &nobs, &sum_x, &compensation_add)

if i > win - 1:
prev_x = values[i - win]
remove_sum(prev_x, &nobs, &sum_x)
remove_sum(prev_x, &nobs, &sum_x, &compensation_remove)

output[i] = calc_sum(minp, nobs, sum_x)

Expand Down Expand Up @@ -277,32 +292,42 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,


cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
Py_ssize_t *neg_ct) nogil:
""" add a value from the mean calc """
Py_ssize_t *neg_ct, float64_t *compensation) nogil:
""" add a value from the mean calc using Kahan summation """
cdef:
float64_t y, t

# Not NaN
if notnan(val):
nobs[0] = nobs[0] + 1
sum_x[0] = sum_x[0] + val
y = val - compensation[0]
t = sum_x[0] + y
compensation[0] = t - sum_x[0] - y
sum_x[0] = t
if signbit(val):
neg_ct[0] = neg_ct[0] + 1


cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
Py_ssize_t *neg_ct) nogil:
""" remove a value from the mean calc """
Py_ssize_t *neg_ct, float64_t *compensation) nogil:
""" remove a value from the mean calc using Kahan summation """
cdef:
float64_t y, t

if notnan(val):
nobs[0] = nobs[0] - 1
sum_x[0] = sum_x[0] - val
y = - val - compensation[0]
t = sum_x[0] + y
compensation[0] = t - sum_x[0] - y
sum_x[0] = t
if signbit(val):
neg_ct[0] = neg_ct[0] - 1


def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp, int64_t win):
cdef:
float64_t val, prev_x, sum_x = 0
float64_t val, prev_x, sum_x = 0, compensation_add = 0, compensation_remove = 0
Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values)
ndarray[float64_t] output

Expand All @@ -311,16 +336,16 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
with nogil:
for i in range(minp - 1):
val = values[i]
add_mean(val, &nobs, &sum_x, &neg_ct)
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
output[i] = NaN

for i in range(minp - 1, N):
val = values[i]
add_mean(val, &nobs, &sum_x, &neg_ct)
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)

if i > win - 1:
prev_x = values[i - win]
remove_mean(prev_x, &nobs, &sum_x, &neg_ct)
remove_mean(prev_x, &nobs, &sum_x, &neg_ct, &compensation_remove)

output[i] = calc_mean(minp, nobs, neg_ct, sum_x)

Expand All @@ -330,7 +355,7 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start,
def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp):
cdef:
float64_t val, sum_x = 0
float64_t val, compensation_add = 0, compensation_remove = 0, sum_x = 0
int64_t s, e
Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values)
ndarray[float64_t] output
Expand All @@ -350,26 +375,26 @@ def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start,
# setup
for j in range(s, e):
val = values[j]
add_mean(val, &nobs, &sum_x, &neg_ct)
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)

else:

# calculate deletes
for j in range(start[i - 1], s):
val = values[j]
remove_mean(val, &nobs, &sum_x, &neg_ct)
remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove)

# calculate adds
for j in range(end[i - 1], e):
val = values[j]
add_mean(val, &nobs, &sum_x, &neg_ct)
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)

output[i] = calc_mean(minp, nobs, neg_ct, sum_x)

if not is_monotonic_bounds:
for j in range(s, e):
val = values[j]
remove_mean(val, &nobs, &sum_x, &neg_ct)
remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove)
return output

# ----------------------------------------------------------------------
Expand Down
75 changes: 75 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,3 +696,78 @@ def scaled_sum(*args):
expected = DataFrame(data={"X": [0.0, 0.5, 1.0, 1.5, 2.0]}, index=_index)
result = df.groupby(**grouping).rolling(1).apply(scaled_sum, raw=raw, args=(2,))
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("add", [0.0, 2.0])
def test_rolling_numerical_accuracy_kahan_mean(add):
# GH: 36031 implementing kahan summation
df = pd.DataFrame(
{"A": [3002399751580331.0 + add, -0.0, -0.0]},
index=[
pd.Timestamp("19700101 09:00:00"),
pd.Timestamp("19700101 09:00:03"),
pd.Timestamp("19700101 09:00:06"),
],
)
result = (
df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean()
)
dates = pd.date_range("19700101 09:00:00", periods=7, freq="S")
expected = pd.DataFrame(
{
"A": [
np.nan,
np.nan,
np.nan,
3002399751580330.5,
2001599834386887.25,
1000799917193443.625,
0.0,
]
},
index=dates,
)
tm.assert_frame_equal(result, expected)


def test_rolling_numerical_accuracy_kahan_sum():
# GH: 13254
df = pd.DataFrame([2.186, -1.647, 0.0, 0.0, 0.0, 0.0], columns=["x"])
result = df["x"].rolling(3).sum()
expected = pd.Series([np.nan, np.nan, 0.539, -1.647, 0.0, 0.0], name="x")
tm.assert_series_equal(result, expected)


def test_rolling_numerical_accuracy_jump():
# GH: 32761
index = pd.date_range(start="2020-01-01", end="2020-01-02", freq="60s").append(
pd.DatetimeIndex(["2020-01-03"])
)
data = np.random.rand(len(index))

df = pd.DataFrame({"data": data}, index=index)
result = df.rolling("60s").mean()
tm.assert_frame_equal(result, df[["data"]])


def test_rolling_numerical_accuracy_small_values():
# GH: 10319
s = Series(
data=[0.00012456, 0.0003, -0.0, -0.0],
index=date_range("1999-02-03", "1999-02-06"),
)
result = s.rolling(1).mean()
tm.assert_series_equal(result, s)


def test_rolling_numerical_too_large_numbers():
# GH: 11645
dates = pd.date_range("2015-01-01", periods=10, freq="D")
ds = pd.Series(data=range(10), index=dates, dtype=np.float64)
ds[2] = -9e33
result = ds.rolling(5).mean()
expected = pd.Series(
[np.nan, np.nan, np.nan, np.nan, -1.8e33, -1.8e33, -1.8e33, 0.0, 6.0, 7.0],
index=dates,
)
tm.assert_series_equal(result, expected)