Skip to content

BUG: Fix weighted rolling variance implementation #55153

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ Bug fixes
- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
- Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`)
- Bug in :meth:`pandas.core.window.Window.var` and :meth:`pandas.core.window.Window.std` where implementation of the algorithm was incorrect (:issue:`53273` and :issue:`54333`)

Categorical
^^^^^^^^^^^
Expand Down
111 changes: 23 additions & 88 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1558,7 +1558,7 @@ cdef float64_t calc_weighted_var(float64_t t,
if nobs == 1:
result = 0
else:
result = t * win_n / ((win_n - ddof) * sum_w)
result = t * nobs / ((nobs - ddof) * sum_w)
if result < 0:
result = 0
else:
Expand Down Expand Up @@ -1599,68 +1599,21 @@ cdef void add_weighted_var(float64_t val,
cdef:
float64_t temp, q, r

if val != val:
return

nobs[0] = nobs[0] + 1

q = val - mean[0]
temp = sum_w[0] + w
r = q * w / temp

mean[0] = mean[0] + r
t[0] = t[0] + r * sum_w[0] * q
sum_w[0] = temp


cdef void remove_weighted_var(float64_t val,
float64_t w,
float64_t *t,
float64_t *sum_w,
float64_t *mean,
float64_t *nobs) noexcept nogil:
"""
Update weighted mean, sum of weights and sum of weighted squared
differences to remove value and weight pair from weighted variance
calculation using West's method.

Paper: https://dl.acm.org/citation.cfm?id=359153

Parameters
----------
val: float64_t
window values
w: float64_t
window weights
t: float64_t
sum of weighted squared differences
sum_w: float64_t
sum of weights
mean: float64_t
weighted mean
nobs: float64_t
number of observations
"""

cdef:
float64_t temp, q, r

if val == val:
nobs[0] = nobs[0] - 1

if nobs[0]:
q = val - mean[0]
temp = sum_w[0] - w
r = q * w / temp
if nobs[0] == 1:
sum_w[0] = w
mean[0] = val
t[0] = 0

mean[0] = mean[0] - r
t[0] = t[0] - r * sum_w[0] * q
sum_w[0] = temp
else:
q = val - mean[0]
temp = sum_w[0] + w
r = q * w / temp

else:
t[0] = 0
sum_w[0] = 0
mean[0] = 0
mean[0] = mean[0] + r
t[0] = t[0] + r * sum_w[0] * q
sum_w[0] = temp


def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights,
Expand Down Expand Up @@ -1690,44 +1643,26 @@ def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights,
"""

cdef:
float64_t t = 0, sum_w = 0, mean = 0, nobs = 0
float64_t val, pre_val, w, pre_w
Py_ssize_t i, n, win_n
float64_t t, sum_w, mean, nobs
float64_t val, w
Py_ssize_t in_i, win_i, n, win_n
float64_t[:] output

n = len(values)
win_n = len(weights)
output = np.empty(n, dtype=np.float64)

with nogil:

for i in range(min(win_n, n)):
add_weighted_var(values[i], weights[i], &t,
&sum_w, &mean, &nobs)

output[i] = calc_weighted_var(t, sum_w, win_n,
ddof, nobs, minp)

for i in range(win_n, n):
val = values[i]
pre_val = values[i - win_n]
output = np.empty(n, dtype=np.float64)

w = weights[i % win_n]
pre_w = weights[(i - win_n) % win_n]

if val == val:
if pre_val == pre_val:
remove_weighted_var(pre_val, pre_w, &t,
&sum_w, &mean, &nobs)
for in_i in range(n):
nobs = 0
for win_i in range(max(0, win_n - in_i - 1), win_n):
w = weights[win_i]
val = values[win_i - win_n + in_i + 1]

if val == val and w == w:
add_weighted_var(val, w, &t, &sum_w, &mean, &nobs)

elif pre_val == pre_val:
remove_weighted_var(pre_val, pre_w, &t,
&sum_w, &mean, &nobs)

output[i] = calc_weighted_var(t, sum_w, win_n,
ddof, nobs, minp)
output[in_i] = calc_weighted_var(t, sum_w, win_n, ddof, nobs, minp)

return output

Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/window/test_win_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,23 @@ def test_cmov_window_special_linear_range(win_types_special, step):
tm.assert_series_equal(xp, rs)


@pytest.mark.parametrize("size", [10, 100, 1000])
@pytest.mark.parametrize("scale", [0, 0.1, 0.01, 0.001, 0.0001])
def test_weighted_std_through_mean(win_types, step, scale, size):
# GH 53273
pytest.importorskip("scipy")
window = 5
s = Series(np.random.default_rng(0).normal(loc=1, scale=scale, size=size))

squared_mean = (s**2).rolling(window, win_type=win_types).mean()
mean = s.rolling(window, win_type=win_types).mean()
expected = (squared_mean - mean**2).pow(0.5)

result = s.rolling(window, win_type=win_types).std(ddof=0)

tm.assert_series_equal(expected, result)


def test_weighted_var_big_window_no_segfault(win_types, center):
# GitHub Issue #46772
pytest.importorskip("scipy")
Expand Down