diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index e4879a6c41515..be986e2684cb6 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -607,6 +607,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.max` with empty groups and ``uint64`` dtype incorrectly raising ``RuntimeError`` (:issue:`46408`) - Bug in :meth:`.GroupBy.apply` would fail when ``func`` was a string and args or kwargs were supplied (:issue:`46479`) - Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`) +- Bug in :meth:`Rolling.sum` and :meth:`Rolling.mean` would give incorrect result with window of same values (:issue:`42064`, :issue:`46431`) - Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`) - Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`) - Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 659a6c33a040b..fdc39178238c9 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -69,14 +69,19 @@ cdef bint is_monotonic_increasing_start_end_bounds( # Rolling sum -cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogil: +cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x, + int64_t num_consecutive_same_value, float64_t prev_value + ) nogil: cdef: float64_t result if nobs == 0 == minp: result = 0 elif nobs >= minp: - result = sum_x + if num_consecutive_same_value >= nobs: + result = prev_value * nobs + else: + result = sum_x else: result = NaN @@ -84,7 +89,8 @@ cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogi cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x, - float64_t *compensation) nogil: + float64_t *compensation, int64_t *num_consecutive_same_value, + float64_t *prev_value) nogil: """ add a value from the sum calc using Kahan summation """ cdef: @@ -98,6 +104,14 @@ cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x, compensation[0] = t - sum_x[0] - y sum_x[0] = t + # GH#42064, record num of same values to remove floating point artifacts + if val == prev_value[0]: + num_consecutive_same_value[0] += 1 + else: + # reset to 1 (include current value itself) + num_consecutive_same_value[0] = 1 + prev_value[0] = val + cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x, float64_t *compensation) nogil: @@ -119,8 +133,8 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp) -> np.ndarray: cdef: Py_ssize_t i, j - float64_t sum_x, compensation_add, compensation_remove - int64_t s, e + float64_t sum_x, compensation_add, compensation_remove, prev_value + int64_t s, e, num_consecutive_same_value int64_t nobs = 0, N = len(start) ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -139,11 +153,13 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start, if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: # setup - + prev_value = values[s] + num_consecutive_same_value = 0 sum_x = compensation_add = compensation_remove = 0 nobs = 0 for j in range(s, e): - add_sum(values[j], &nobs, &sum_x, &compensation_add) + add_sum(values[j], &nobs, &sum_x, &compensation_add, + &num_consecutive_same_value, &prev_value) else: @@ -153,9 +169,10 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start, # calculate adds for j in range(end[i - 1], e): - add_sum(values[j], &nobs, &sum_x, &compensation_add) + add_sum(values[j], &nobs, &sum_x, &compensation_add, + &num_consecutive_same_value, &prev_value) - output[i] = calc_sum(minp, nobs, sum_x) + output[i] = calc_sum(minp, nobs, sum_x, num_consecutive_same_value, prev_value) if not is_monotonic_increasing_bounds: nobs = 0 @@ -169,14 +186,17 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start, # Rolling mean -cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, - Py_ssize_t neg_ct, float64_t sum_x) nogil: +cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, Py_ssize_t neg_ct, + float64_t sum_x, int64_t num_consecutive_same_value, + float64_t prev_value) nogil: cdef: float64_t result if nobs >= minp and nobs > 0: result = sum_x / nobs - if neg_ct == 0 and result < 0: + if num_consecutive_same_value >= nobs: + result = prev_value + elif neg_ct == 0 and result < 0: # all positive result = 0 elif neg_ct == nobs and result > 0: @@ -190,7 +210,8 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, - Py_ssize_t *neg_ct, float64_t *compensation) nogil: + Py_ssize_t *neg_ct, float64_t *compensation, + int64_t *num_consecutive_same_value, float64_t *prev_value) nogil: """ add a value from the mean calc using Kahan summation """ cdef: float64_t y, t @@ -205,6 +226,14 @@ cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, if signbit(val): neg_ct[0] = neg_ct[0] + 1 + # GH#42064, record num of same values to remove floating point artifacts + if val == prev_value[0]: + num_consecutive_same_value[0] += 1 + else: + # reset to 1 (include current value itself) + num_consecutive_same_value[0] = 1 + prev_value[0] = val + cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, Py_ssize_t *neg_ct, float64_t *compensation) nogil: @@ -225,8 +254,8 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, def roll_mean(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp) -> np.ndarray: cdef: - float64_t val, compensation_add, compensation_remove, sum_x - int64_t s, e + float64_t val, compensation_add, compensation_remove, sum_x, prev_value + int64_t s, e, num_consecutive_same_value Py_ssize_t nobs, i, j, neg_ct, N = len(start) ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -244,12 +273,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start, if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + # setup compensation_add = compensation_remove = sum_x = 0 nobs = neg_ct = 0 - # setup + prev_value = values[s] + num_consecutive_same_value = 0 for j in range(s, e): val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add, + &num_consecutive_same_value, &prev_value) else: @@ -261,9 +293,10 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start, # calculate adds for j in range(end[i - 1], e): val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add, + &num_consecutive_same_value, &prev_value) - output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + output[i] = calc_mean(minp, nobs, neg_ct, sum_x, num_consecutive_same_value, prev_value) if not is_monotonic_increasing_bounds: nobs = 0 diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py index 8f67dd9b51c06..725989e093441 100644 --- a/pandas/core/_numba/kernels/mean_.py +++ b/pandas/core/_numba/kernels/mean_.py @@ -16,8 +16,14 @@ @numba.jit(nopython=True, nogil=True, parallel=False) def add_mean( - val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float -) -> tuple[int, float, int, float]: + val: float, + nobs: int, + sum_x: float, + neg_ct: int, + compensation: float, + num_consecutive_same_value: int, + prev_value: float, +) -> tuple[int, float, int, float, int, float]: if not np.isnan(val): nobs += 1 y = val - compensation @@ -26,7 +32,14 @@ def add_mean( sum_x = t if val < 0: neg_ct += 1 - return nobs, sum_x, neg_ct, compensation + + if val == prev_value: + num_consecutive_same_value += 1 + else: + num_consecutive_same_value = 1 + prev_value = val + + return nobs, sum_x, neg_ct, compensation, num_consecutive_same_value, prev_value @numba.jit(nopython=True, nogil=True, parallel=False) @@ -68,10 +81,26 @@ def sliding_mean( s = start[i] e = end[i] if i == 0 or not is_monotonic_increasing_bounds: + prev_value = values[s] + num_consecutive_same_value = 0 + for j in range(s, e): val = values[j] - nobs, sum_x, neg_ct, compensation_add = add_mean( - val, nobs, sum_x, neg_ct, compensation_add + ( + nobs, + sum_x, + neg_ct, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_mean( + val, + nobs, + sum_x, + neg_ct, + compensation_add, + num_consecutive_same_value, + prev_value, ) else: for j in range(start[i - 1], s): @@ -82,13 +111,28 @@ def sliding_mean( for j in range(end[i - 1], e): val = values[j] - nobs, sum_x, neg_ct, compensation_add = add_mean( - val, nobs, sum_x, neg_ct, compensation_add + ( + nobs, + sum_x, + neg_ct, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_mean( + val, + nobs, + sum_x, + neg_ct, + compensation_add, + num_consecutive_same_value, + prev_value, ) if nobs >= min_periods and nobs > 0: result = sum_x / nobs - if neg_ct == 0 and result < 0: + if num_consecutive_same_value >= nobs: + result = prev_value + elif neg_ct == 0 and result < 0: result = 0 elif neg_ct == nobs and result > 0: result = 0 diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index b67452e4071d9..056897189fe67 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -16,15 +16,27 @@ @numba.jit(nopython=True, nogil=True, parallel=False) def add_sum( - val: float, nobs: int, sum_x: float, compensation: float -) -> tuple[int, float, float]: + val: float, + nobs: int, + sum_x: float, + compensation: float, + num_consecutive_same_value: int, + prev_value: float, +) -> tuple[int, float, float, int, float]: if not np.isnan(val): nobs += 1 y = val - compensation t = sum_x + y compensation = t - sum_x - y sum_x = t - return nobs, sum_x, compensation + + if val == prev_value: + num_consecutive_same_value += 1 + else: + num_consecutive_same_value = 1 + prev_value = val + + return nobs, sum_x, compensation, num_consecutive_same_value, prev_value @numba.jit(nopython=True, nogil=True, parallel=False) @@ -63,10 +75,24 @@ def sliding_sum( s = start[i] e = end[i] if i == 0 or not is_monotonic_increasing_bounds: + prev_value = values[s] + num_consecutive_same_value = 0 + for j in range(s, e): val = values[j] - nobs, sum_x, compensation_add = add_sum( - val, nobs, sum_x, compensation_add + ( + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_sum( + val, + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, ) else: for j in range(start[i - 1], s): @@ -77,14 +103,28 @@ def sliding_sum( for j in range(end[i - 1], e): val = values[j] - nobs, sum_x, compensation_add = add_sum( - val, nobs, sum_x, compensation_add + ( + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_sum( + val, + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, ) if nobs == 0 == min_periods: result = 0.0 elif nobs >= min_periods: - result = sum_x + if num_consecutive_same_value >= nobs: + result = prev_value * nobs + else: + result = sum_x else: result = np.nan diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 5f7709c62a1e2..9c46a7194c9c5 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1849,3 +1849,14 @@ def test_rolling_var_same_value_count_logic(values, window, min_periods, expecte result_std = sr.rolling(window, min_periods=min_periods).std() tm.assert_series_equal(result_std, np.sqrt(expected)) tm.assert_series_equal(expected == 0, result_std == 0) + + +def test_rolling_mean_sum_floating_artifacts(): + # GH 42064. + + sr = Series([1 / 3, 4, 0, 0, 0, 0, 0]) + r = sr.rolling(3) + result = r.mean() + assert (result[-3:] == 0).all() + result = r.sum() + assert (result[-3:] == 0).all()