diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 0fc4afc95a2ce..dde7f0e2cf828 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -222,6 +222,7 @@ Bug fixes - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) - Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) - Bug in :meth:`DataFrame.apply` where passing ``raw=True`` ignored ``args`` passed to the applied function (:issue:`55009`) +- Bug in :meth:`pandas.core.window.Window.var` and :meth:`pandas.core.window.Window.std` where implementation of the algorithm was incorrect (:issue:`53273` and :issue:`54333`) Categorical ^^^^^^^^^^^ diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 6365c030b695b..6617945fee768 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1558,7 +1558,7 @@ cdef float64_t calc_weighted_var(float64_t t, if nobs == 1: result = 0 else: - result = t * win_n / ((win_n - ddof) * sum_w) + result = t * nobs / ((nobs - ddof) * sum_w) if result < 0: result = 0 else: @@ -1599,68 +1599,21 @@ cdef void add_weighted_var(float64_t val, cdef: float64_t temp, q, r - if val != val: - return - nobs[0] = nobs[0] + 1 - q = val - mean[0] - temp = sum_w[0] + w - r = q * w / temp - - mean[0] = mean[0] + r - t[0] = t[0] + r * sum_w[0] * q - sum_w[0] = temp - - -cdef void remove_weighted_var(float64_t val, - float64_t w, - float64_t *t, - float64_t *sum_w, - float64_t *mean, - float64_t *nobs) noexcept nogil: - """ - Update weighted mean, sum of weights and sum of weighted squared - differences to remove value and weight pair from weighted variance - calculation using West's method. - - Paper: https://dl.acm.org/citation.cfm?id=359153 - - Parameters - ---------- - val: float64_t - window values - w: float64_t - window weights - t: float64_t - sum of weighted squared differences - sum_w: float64_t - sum of weights - mean: float64_t - weighted mean - nobs: float64_t - number of observations - """ - - cdef: - float64_t temp, q, r - - if val == val: - nobs[0] = nobs[0] - 1 - - if nobs[0]: - q = val - mean[0] - temp = sum_w[0] - w - r = q * w / temp + if nobs[0] == 1: + sum_w[0] = w + mean[0] = val + t[0] = 0 - mean[0] = mean[0] - r - t[0] = t[0] - r * sum_w[0] * q - sum_w[0] = temp + else: + q = val - mean[0] + temp = sum_w[0] + w + r = q * w / temp - else: - t[0] = 0 - sum_w[0] = 0 - mean[0] = 0 + mean[0] = mean[0] + r + t[0] = t[0] + r * sum_w[0] * q + sum_w[0] = temp def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights, @@ -1690,44 +1643,26 @@ def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights, """ cdef: - float64_t t = 0, sum_w = 0, mean = 0, nobs = 0 - float64_t val, pre_val, w, pre_w - Py_ssize_t i, n, win_n + float64_t t, sum_w, mean, nobs + float64_t val, w + Py_ssize_t in_i, win_i, n, win_n float64_t[:] output n = len(values) win_n = len(weights) - output = np.empty(n, dtype=np.float64) - - with nogil: - - for i in range(min(win_n, n)): - add_weighted_var(values[i], weights[i], &t, - &sum_w, &mean, &nobs) - output[i] = calc_weighted_var(t, sum_w, win_n, - ddof, nobs, minp) - - for i in range(win_n, n): - val = values[i] - pre_val = values[i - win_n] + output = np.empty(n, dtype=np.float64) - w = weights[i % win_n] - pre_w = weights[(i - win_n) % win_n] - - if val == val: - if pre_val == pre_val: - remove_weighted_var(pre_val, pre_w, &t, - &sum_w, &mean, &nobs) + for in_i in range(n): + nobs = 0 + for win_i in range(max(0, win_n - in_i - 1), win_n): + w = weights[win_i] + val = values[win_i - win_n + in_i + 1] + if val == val and w == w: add_weighted_var(val, w, &t, &sum_w, &mean, &nobs) - elif pre_val == pre_val: - remove_weighted_var(pre_val, pre_w, &t, - &sum_w, &mean, &nobs) - - output[i] = calc_weighted_var(t, sum_w, win_n, - ddof, nobs, minp) + output[in_i] = calc_weighted_var(t, sum_w, win_n, ddof, nobs, minp) return output diff --git a/pandas/tests/window/test_win_type.py b/pandas/tests/window/test_win_type.py index 5052019ddb726..d325db0650c68 100644 --- a/pandas/tests/window/test_win_type.py +++ b/pandas/tests/window/test_win_type.py @@ -661,6 +661,23 @@ def test_cmov_window_special_linear_range(win_types_special, step): tm.assert_series_equal(xp, rs) +@pytest.mark.parametrize("size", [10, 100, 1000]) +@pytest.mark.parametrize("scale", [0, 0.1, 0.01, 0.001, 0.0001]) +def test_weighted_std_through_mean(win_types, step, scale, size): + # GH 53273 + pytest.importorskip("scipy") + window = 5 + s = Series(np.random.default_rng(0).normal(loc=1, scale=scale, size=size)) + + squared_mean = (s**2).rolling(window, win_type=win_types).mean() + mean = s.rolling(window, win_type=win_types).mean() + expected = (squared_mean - mean**2).pow(0.5) + + result = s.rolling(window, win_type=win_types).std(ddof=0) + + tm.assert_series_equal(expected, result) + + def test_weighted_var_big_window_no_segfault(win_types, center): # GitHub Issue #46772 pytest.importorskip("scipy")