Skip to content

ENH: Same val counts for roll sum mean #46715

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Apr 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,7 @@ Groupby/resample/rolling
- Bug in :meth:`GroupBy.max` with empty groups and ``uint64`` dtype incorrectly raising ``RuntimeError`` (:issue:`46408`)
- Bug in :meth:`.GroupBy.apply` would fail when ``func`` was a string and args or kwargs were supplied (:issue:`46479`)
- Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`)
- Bug in :meth:`Rolling.sum` and :meth:`Rolling.mean` would give incorrect result with window of same values (:issue:`42064`, :issue:`46431`)
- Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`)
- Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`)
- Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`)
Expand Down
71 changes: 52 additions & 19 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -69,22 +69,28 @@ cdef bint is_monotonic_increasing_start_end_bounds(
# Rolling sum


cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogil:
cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x,
int64_t num_consecutive_same_value, float64_t prev_value
) nogil:
cdef:
float64_t result

if nobs == 0 == minp:
result = 0
elif nobs >= minp:
result = sum_x
if num_consecutive_same_value >= nobs:
result = prev_value * nobs
else:
result = sum_x
else:
result = NaN

return result


cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
float64_t *compensation) nogil:
float64_t *compensation, int64_t *num_consecutive_same_value,
float64_t *prev_value) nogil:
""" add a value from the sum calc using Kahan summation """

cdef:
Expand All @@ -98,6 +104,14 @@ cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
compensation[0] = t - sum_x[0] - y
sum_x[0] = t

# GH#42064, record num of same values to remove floating point artifacts
if val == prev_value[0]:
num_consecutive_same_value[0] += 1
else:
# reset to 1 (include current value itself)
num_consecutive_same_value[0] = 1
prev_value[0] = val


cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
float64_t *compensation) nogil:
Expand All @@ -119,8 +133,8 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
cdef:
Py_ssize_t i, j
float64_t sum_x, compensation_add, compensation_remove
int64_t s, e
float64_t sum_x, compensation_add, compensation_remove, prev_value
int64_t s, e, num_consecutive_same_value
int64_t nobs = 0, N = len(start)
ndarray[float64_t] output
bint is_monotonic_increasing_bounds
Expand All @@ -139,11 +153,13 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

# setup

prev_value = values[s]
num_consecutive_same_value = 0
sum_x = compensation_add = compensation_remove = 0
nobs = 0
for j in range(s, e):
add_sum(values[j], &nobs, &sum_x, &compensation_add)
add_sum(values[j], &nobs, &sum_x, &compensation_add,
&num_consecutive_same_value, &prev_value)

else:

Expand All @@ -153,9 +169,10 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,

# calculate adds
for j in range(end[i - 1], e):
add_sum(values[j], &nobs, &sum_x, &compensation_add)
add_sum(values[j], &nobs, &sum_x, &compensation_add,
&num_consecutive_same_value, &prev_value)

output[i] = calc_sum(minp, nobs, sum_x)
output[i] = calc_sum(minp, nobs, sum_x, num_consecutive_same_value, prev_value)

if not is_monotonic_increasing_bounds:
nobs = 0
Expand All @@ -169,14 +186,17 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
# Rolling mean


cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,
Py_ssize_t neg_ct, float64_t sum_x) nogil:
cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, Py_ssize_t neg_ct,
float64_t sum_x, int64_t num_consecutive_same_value,
float64_t prev_value) nogil:
cdef:
float64_t result

if nobs >= minp and nobs > 0:
result = sum_x / <float64_t>nobs
if neg_ct == 0 and result < 0:
if num_consecutive_same_value >= nobs:
result = prev_value
elif neg_ct == 0 and result < 0:
# all positive
result = 0
elif neg_ct == nobs and result > 0:
Expand All @@ -190,7 +210,8 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,


cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
Py_ssize_t *neg_ct, float64_t *compensation) nogil:
Py_ssize_t *neg_ct, float64_t *compensation,
int64_t *num_consecutive_same_value, float64_t *prev_value) nogil:
""" add a value from the mean calc using Kahan summation """
cdef:
float64_t y, t
Expand All @@ -205,6 +226,14 @@ cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
if signbit(val):
neg_ct[0] = neg_ct[0] + 1

# GH#42064, record num of same values to remove floating point artifacts
if val == prev_value[0]:
num_consecutive_same_value[0] += 1
else:
# reset to 1 (include current value itself)
num_consecutive_same_value[0] = 1
prev_value[0] = val


cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
Py_ssize_t *neg_ct, float64_t *compensation) nogil:
Expand All @@ -225,8 +254,8 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
cdef:
float64_t val, compensation_add, compensation_remove, sum_x
int64_t s, e
float64_t val, compensation_add, compensation_remove, sum_x, prev_value
int64_t s, e, num_consecutive_same_value
Py_ssize_t nobs, i, j, neg_ct, N = len(start)
ndarray[float64_t] output
bint is_monotonic_increasing_bounds
Expand All @@ -244,12 +273,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,

if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

# setup
compensation_add = compensation_remove = sum_x = 0
nobs = neg_ct = 0
# setup
prev_value = values[s]
num_consecutive_same_value = 0
for j in range(s, e):
val = values[j]
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add,
&num_consecutive_same_value, &prev_value)

else:

Expand All @@ -261,9 +293,10 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
# calculate adds
for j in range(end[i - 1], e):
val = values[j]
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add,
&num_consecutive_same_value, &prev_value)

output[i] = calc_mean(minp, nobs, neg_ct, sum_x)
output[i] = calc_mean(minp, nobs, neg_ct, sum_x, num_consecutive_same_value, prev_value)

if not is_monotonic_increasing_bounds:
nobs = 0
Expand Down
60 changes: 52 additions & 8 deletions pandas/core/_numba/kernels/mean_.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,14 @@

@numba.jit(nopython=True, nogil=True, parallel=False)
def add_mean(
val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float
) -> tuple[int, float, int, float]:
val: float,
nobs: int,
sum_x: float,
neg_ct: int,
compensation: float,
num_consecutive_same_value: int,
prev_value: float,
) -> tuple[int, float, int, float, int, float]:
if not np.isnan(val):
nobs += 1
y = val - compensation
Expand All @@ -26,7 +32,14 @@ def add_mean(
sum_x = t
if val < 0:
neg_ct += 1
return nobs, sum_x, neg_ct, compensation

if val == prev_value:
num_consecutive_same_value += 1
else:
num_consecutive_same_value = 1
prev_value = val

return nobs, sum_x, neg_ct, compensation, num_consecutive_same_value, prev_value


@numba.jit(nopython=True, nogil=True, parallel=False)
Expand Down Expand Up @@ -68,10 +81,26 @@ def sliding_mean(
s = start[i]
e = end[i]
if i == 0 or not is_monotonic_increasing_bounds:
prev_value = values[s]
num_consecutive_same_value = 0

for j in range(s, e):
val = values[j]
nobs, sum_x, neg_ct, compensation_add = add_mean(
val, nobs, sum_x, neg_ct, compensation_add
(
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_mean(
val,
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value,
)
else:
for j in range(start[i - 1], s):
Expand All @@ -82,13 +111,28 @@ def sliding_mean(

for j in range(end[i - 1], e):
val = values[j]
nobs, sum_x, neg_ct, compensation_add = add_mean(
val, nobs, sum_x, neg_ct, compensation_add
(
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_mean(
val,
nobs,
sum_x,
neg_ct,
compensation_add,
num_consecutive_same_value,
prev_value,
)

if nobs >= min_periods and nobs > 0:
result = sum_x / nobs
if neg_ct == 0 and result < 0:
if num_consecutive_same_value >= nobs:
result = prev_value
elif neg_ct == 0 and result < 0:
result = 0
elif neg_ct == nobs and result > 0:
result = 0
Expand Down
56 changes: 48 additions & 8 deletions pandas/core/_numba/kernels/sum_.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,27 @@

@numba.jit(nopython=True, nogil=True, parallel=False)
def add_sum(
val: float, nobs: int, sum_x: float, compensation: float
) -> tuple[int, float, float]:
val: float,
nobs: int,
sum_x: float,
compensation: float,
num_consecutive_same_value: int,
prev_value: float,
) -> tuple[int, float, float, int, float]:
if not np.isnan(val):
nobs += 1
y = val - compensation
t = sum_x + y
compensation = t - sum_x - y
sum_x = t
return nobs, sum_x, compensation

if val == prev_value:
num_consecutive_same_value += 1
else:
num_consecutive_same_value = 1
prev_value = val

return nobs, sum_x, compensation, num_consecutive_same_value, prev_value


@numba.jit(nopython=True, nogil=True, parallel=False)
Expand Down Expand Up @@ -63,10 +75,24 @@ def sliding_sum(
s = start[i]
e = end[i]
if i == 0 or not is_monotonic_increasing_bounds:
prev_value = values[s]
num_consecutive_same_value = 0

for j in range(s, e):
val = values[j]
nobs, sum_x, compensation_add = add_sum(
val, nobs, sum_x, compensation_add
(
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_sum(
val,
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)
else:
for j in range(start[i - 1], s):
Expand All @@ -77,14 +103,28 @@ def sliding_sum(

for j in range(end[i - 1], e):
val = values[j]
nobs, sum_x, compensation_add = add_sum(
val, nobs, sum_x, compensation_add
(
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
) = add_sum(
val,
nobs,
sum_x,
compensation_add,
num_consecutive_same_value,
prev_value,
)

if nobs == 0 == min_periods:
result = 0.0
elif nobs >= min_periods:
result = sum_x
if num_consecutive_same_value >= nobs:
result = prev_value * nobs
else:
result = sum_x
else:
result = np.nan

Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1849,3 +1849,14 @@ def test_rolling_var_same_value_count_logic(values, window, min_periods, expecte
result_std = sr.rolling(window, min_periods=min_periods).std()
tm.assert_series_equal(result_std, np.sqrt(expected))
tm.assert_series_equal(expected == 0, result_std == 0)


def test_rolling_mean_sum_floating_artifacts():
# GH 42064.

sr = Series([1 / 3, 4, 0, 0, 0, 0, 0])
r = sr.rolling(3)
result = r.mean()
assert (result[-3:] == 0).all()
result = r.sum()
assert (result[-3:] == 0).all()