From dfe5fba578db1ab137e0ade0659e40f54953c377 Mon Sep 17 00:00:00 2001 From: auderson Date: Sat, 9 Apr 2022 14:25:18 +0800 Subject: [PATCH 1/5] add value count for roll mean & sum --- pandas/_libs/window/aggregations.pyx | 71 ++++++++++++++++++++-------- pandas/core/_numba/kernels/mean_.py | 60 +++++++++++++++++++---- pandas/core/_numba/kernels/sum_.py | 56 ++++++++++++++++++---- pandas/tests/window/test_rolling.py | 11 +++++ 4 files changed, 163 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 8191084da924f..554d7356c097e 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -70,14 +70,19 @@ cdef bint is_monotonic_increasing_start_end_bounds( # Rolling sum -cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogil: +cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x, + int64_t num_consecutive_same_value, float64_t prev_value + ) nogil: cdef: float64_t result if nobs == 0 == minp: result = 0 elif nobs >= minp: - result = sum_x + if num_consecutive_same_value >= nobs: + result = prev_value * nobs + else: + result = sum_x else: result = NaN @@ -85,7 +90,8 @@ cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogi cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x, - float64_t *compensation) nogil: + float64_t *compensation, int64_t *num_consecutive_same_value, + float64_t *prev_value) nogil: """ add a value from the sum calc using Kahan summation """ cdef: @@ -99,6 +105,14 @@ cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x, compensation[0] = t - sum_x[0] - y sum_x[0] = t + # GH#42064, record num of same values to remove floating point artifacts + if val == prev_value[0]: + num_consecutive_same_value[0] += 1 + else: + # reset to 1 (include current value itself) + num_consecutive_same_value[0] = 1 + prev_value[0] = val + cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x, float64_t *compensation) nogil: @@ -120,8 +134,8 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp) -> np.ndarray: cdef: Py_ssize_t i, j - float64_t sum_x, compensation_add, compensation_remove - int64_t s, e + float64_t sum_x, compensation_add, compensation_remove, prev_value + int64_t s, e, num_consecutive_same_value int64_t nobs = 0, N = len(start) ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -140,11 +154,13 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start, if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: # setup - + prev_value = values[s] + num_consecutive_same_value = 0 sum_x = compensation_add = compensation_remove = 0 nobs = 0 for j in range(s, e): - add_sum(values[j], &nobs, &sum_x, &compensation_add) + add_sum(values[j], &nobs, &sum_x, &compensation_add, + &num_consecutive_same_value, &prev_value) else: @@ -154,9 +170,10 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start, # calculate adds for j in range(end[i - 1], e): - add_sum(values[j], &nobs, &sum_x, &compensation_add) + add_sum(values[j], &nobs, &sum_x, &compensation_add, + &num_consecutive_same_value, &prev_value) - output[i] = calc_sum(minp, nobs, sum_x) + output[i] = calc_sum(minp, nobs, sum_x, num_consecutive_same_value, prev_value) if not is_monotonic_increasing_bounds: nobs = 0 @@ -170,14 +187,17 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start, # Rolling mean -cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, - Py_ssize_t neg_ct, float64_t sum_x) nogil: +cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, Py_ssize_t neg_ct, + float64_t sum_x, int64_t num_consecutive_same_value, + float64_t prev_value) nogil: cdef: float64_t result if nobs >= minp and nobs > 0: result = sum_x / nobs - if neg_ct == 0 and result < 0: + if num_consecutive_same_value >= nobs: + result = prev_value + elif neg_ct == 0 and result < 0: # all positive result = 0 elif neg_ct == nobs and result > 0: @@ -191,7 +211,8 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, - Py_ssize_t *neg_ct, float64_t *compensation) nogil: + Py_ssize_t *neg_ct, float64_t *compensation, + int64_t *num_consecutive_same_value, float64_t *prev_value) nogil: """ add a value from the mean calc using Kahan summation """ cdef: float64_t y, t @@ -206,6 +227,14 @@ cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, if signbit(val): neg_ct[0] = neg_ct[0] + 1 + # GH#42064, record num of same values to remove floating point artifacts + if val == prev_value[0]: + num_consecutive_same_value[0] += 1 + else: + # reset to 1 (include current value itself) + num_consecutive_same_value[0] = 1 + prev_value[0] = val + cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, Py_ssize_t *neg_ct, float64_t *compensation) nogil: @@ -226,8 +255,8 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, def roll_mean(const float64_t[:] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp) -> np.ndarray: cdef: - float64_t val, compensation_add, compensation_remove, sum_x - int64_t s, e + float64_t val, compensation_add, compensation_remove, sum_x, prev_value + int64_t s, e, num_consecutive_same_value Py_ssize_t nobs, i, j, neg_ct, N = len(start) ndarray[float64_t] output bint is_monotonic_increasing_bounds @@ -245,12 +274,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start, if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]: + # setup compensation_add = compensation_remove = sum_x = 0 nobs = neg_ct = 0 - # setup + prev_value = values[s] + num_consecutive_same_value = 0 for j in range(s, e): val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add, + &num_consecutive_same_value, &prev_value) else: @@ -262,9 +294,10 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start, # calculate adds for j in range(end[i - 1], e): val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add, + &num_consecutive_same_value, &prev_value) - output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + output[i] = calc_mean(minp, nobs, neg_ct, sum_x, num_consecutive_same_value, prev_value) if not is_monotonic_increasing_bounds: nobs = 0 diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py index 8f67dd9b51c06..242c492411556 100644 --- a/pandas/core/_numba/kernels/mean_.py +++ b/pandas/core/_numba/kernels/mean_.py @@ -16,8 +16,14 @@ @numba.jit(nopython=True, nogil=True, parallel=False) def add_mean( - val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float -) -> tuple[int, float, int, float]: + val: float, + nobs: int, + sum_x: float, + neg_ct: int, + compensation: float, + num_consecutive_same_value: int, + prev_value: float, +) -> tuple[int, float, int, float, float, float]: if not np.isnan(val): nobs += 1 y = val - compensation @@ -26,7 +32,14 @@ def add_mean( sum_x = t if val < 0: neg_ct += 1 - return nobs, sum_x, neg_ct, compensation + + if val == prev_value: + num_consecutive_same_value += 1 + else: + num_consecutive_same_value = 1 + prev_value = val + + return nobs, sum_x, neg_ct, compensation, num_consecutive_same_value, prev_value @numba.jit(nopython=True, nogil=True, parallel=False) @@ -68,10 +81,26 @@ def sliding_mean( s = start[i] e = end[i] if i == 0 or not is_monotonic_increasing_bounds: + prev_value = values[s] + num_consecutive_same_value = 0 + for j in range(s, e): val = values[j] - nobs, sum_x, neg_ct, compensation_add = add_mean( - val, nobs, sum_x, neg_ct, compensation_add + ( + nobs, + sum_x, + neg_ct, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_mean( + val, + nobs, + sum_x, + neg_ct, + compensation_add, + num_consecutive_same_value, + prev_value, ) else: for j in range(start[i - 1], s): @@ -82,13 +111,28 @@ def sliding_mean( for j in range(end[i - 1], e): val = values[j] - nobs, sum_x, neg_ct, compensation_add = add_mean( - val, nobs, sum_x, neg_ct, compensation_add + ( + nobs, + sum_x, + neg_ct, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_mean( + val, + nobs, + sum_x, + neg_ct, + compensation_add, + num_consecutive_same_value, + prev_value, ) if nobs >= min_periods and nobs > 0: result = sum_x / nobs - if neg_ct == 0 and result < 0: + if num_consecutive_same_value >= nobs: + result = prev_value + elif neg_ct == 0 and result < 0: result = 0 elif neg_ct == nobs and result > 0: result = 0 diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index c2e81b4990ba9..252dc8d5d2fa5 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -16,15 +16,27 @@ @numba.jit(nopython=True, nogil=True, parallel=False) def add_sum( - val: float, nobs: int, sum_x: float, compensation: float -) -> tuple[int, float, float]: + val: float, + nobs: int, + sum_x: float, + compensation: float, + num_consecutive_same_value: int, + prev_value: float, +) -> tuple[int, float, float, float, float]: if not np.isnan(val): nobs += 1 y = val - compensation t = sum_x + y compensation = t - sum_x - y sum_x = t - return nobs, sum_x, compensation + + if val == prev_value: + num_consecutive_same_value += 1 + else: + num_consecutive_same_value = 1 + prev_value = val + + return nobs, sum_x, compensation, num_consecutive_same_value, prev_value @numba.jit(nopython=True, nogil=True, parallel=False) @@ -63,10 +75,24 @@ def sliding_sum( s = start[i] e = end[i] if i == 0 or not is_monotonic_increasing_bounds: + prev_value = values[s] + num_consecutive_same_value = 0 + for j in range(s, e): val = values[j] - nobs, sum_x, compensation_add = add_sum( - val, nobs, sum_x, compensation_add + ( + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_sum( + val, + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, ) else: for j in range(start[i - 1], s): @@ -77,14 +103,28 @@ def sliding_sum( for j in range(end[i - 1], e): val = values[j] - nobs, sum_x, compensation_add = add_sum( - val, nobs, sum_x, compensation_add + ( + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_sum( + val, + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, ) if nobs == 0 == nobs: result = 0.0 elif nobs >= min_periods: - result = sum_x + if num_consecutive_same_value >= nobs: + result = prev_value * nobs + else: + result = sum_x else: result = np.nan diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 89c90836ae957..07244f2991caf 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1769,3 +1769,14 @@ def test_step_not_integer_raises(): def test_step_not_positive_raises(): with pytest.raises(ValueError, match="step must be >= 0"): DataFrame(range(2)).rolling(1, step=-1) + + +def test_rolling_mean_sum_floating_artifacts(): + # GH 42064. + + sr = Series([1 / 3, 4, 0, 0, 0, 0, 0]) + r = sr.rolling(3) + result = r.mean() + assert (result[-3:] == 0).all() + result = r.sum() + assert (result[-3:] == 0).all() \ No newline at end of file From a20e07034dc67e8040b1b07bc64c28f085b8c22a Mon Sep 17 00:00:00 2001 From: auderson Date: Sat, 9 Apr 2022 14:57:04 +0800 Subject: [PATCH 2/5] add value count for roll mean & sum --- pandas/tests/window/test_rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 07244f2991caf..7cc6e18df1e40 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1779,4 +1779,4 @@ def test_rolling_mean_sum_floating_artifacts(): result = r.mean() assert (result[-3:] == 0).all() result = r.sum() - assert (result[-3:] == 0).all() \ No newline at end of file + assert (result[-3:] == 0).all() From 97b975d536ccd087db78bb1af0f4dc03b178e227 Mon Sep 17 00:00:00 2001 From: auderson Date: Sat, 9 Apr 2022 15:25:43 +0800 Subject: [PATCH 3/5] correct typing --- pandas/core/_numba/kernels/mean_.py | 2 +- pandas/core/_numba/kernels/sum_.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py index 242c492411556..725989e093441 100644 --- a/pandas/core/_numba/kernels/mean_.py +++ b/pandas/core/_numba/kernels/mean_.py @@ -23,7 +23,7 @@ def add_mean( compensation: float, num_consecutive_same_value: int, prev_value: float, -) -> tuple[int, float, int, float, float, float]: +) -> tuple[int, float, int, float, int, float]: if not np.isnan(val): nobs += 1 y = val - compensation diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index 252dc8d5d2fa5..527f29ea830a5 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -22,7 +22,7 @@ def add_sum( compensation: float, num_consecutive_same_value: int, prev_value: float, -) -> tuple[int, float, float, float, float]: +) -> tuple[int, float, float, int, float]: if not np.isnan(val): nobs += 1 y = val - compensation From 586b7922b20aa4cc64a5fa8e437d1bd0c4a9b1ba Mon Sep 17 00:00:00 2001 From: auderson Date: Tue, 12 Apr 2022 09:45:13 +0800 Subject: [PATCH 4/5] update doc --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 358d9447b131d..accd55361ec58 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -599,7 +599,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.cummax` with ``int64`` dtype with leading value being the smallest possible int64 (:issue:`46382`) - Bug in :meth:`GroupBy.max` with empty groups and ``uint64`` dtype incorrectly raising ``RuntimeError`` (:issue:`46408`) - Bug in :meth:`.GroupBy.apply` would fail when ``func`` was a string and args or kwargs were supplied (:issue:`46479`) -- +- Bug in :meth:`Rolling.sum` and :meth:`Rolling.mean` would give incorrect result with window of same values (:issue:`42064`, :issue:`46431`) Reshaping ^^^^^^^^^ From 40b7b64e3640a768ac5d271e262f6c0169366a9c Mon Sep 17 00:00:00 2001 From: auderson Date: Tue, 26 Apr 2022 15:55:18 +0800 Subject: [PATCH 5/5] update doc --- doc/source/whatsnew/v1.5.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 505b8211d1f62..be986e2684cb6 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -606,8 +606,8 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.cummax` with ``int64`` dtype with leading value being the smallest possible int64 (:issue:`46382`) - Bug in :meth:`GroupBy.max` with empty groups and ``uint64`` dtype incorrectly raising ``RuntimeError`` (:issue:`46408`) - Bug in :meth:`.GroupBy.apply` would fail when ``func`` was a string and args or kwargs were supplied (:issue:`46479`) -- Bug in :meth:`Rolling.sum` and :meth:`Rolling.mean` would give incorrect result with window of same values (:issue:`42064`, :issue:`46431`) - Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`) +- Bug in :meth:`Rolling.sum` and :meth:`Rolling.mean` would give incorrect result with window of same values (:issue:`42064`, :issue:`46431`) - Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`) - Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`) - Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`)