Skip to content

Commit dbd11ef

Browse files
auderson authored and yehoshuadimarsky committed
ENH: Same val counts for roll sum mean (pandas-dev#46715)
1 parent 4dbcfb4 commit dbd11ef

File tree

5 files changed

+164
-35
lines changed

5 files changed

+164
-35
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,7 @@ Groupby/resample/rolling
630630
- Bug in :meth:`GroupBy.max` with empty groups and ``uint64`` dtype incorrectly raising ``RuntimeError`` (:issue:`46408`)
631631
- Bug in :meth:`.GroupBy.apply` would fail when ``func`` was a string and args or kwargs were supplied (:issue:`46479`)
632632
- Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`)
633+
- Bug in :meth:`Rolling.sum` and :meth:`Rolling.mean` would give incorrect result with window of same values (:issue:`42064`, :issue:`46431`)
633634
- Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`)
634635
- Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`)
635636
- Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`)

pandas/_libs/window/aggregations.pyx

+52-19
Original file line numberDiff line numberDiff line change
@@ -69,22 +69,28 @@ cdef bint is_monotonic_increasing_start_end_bounds(
6969
# Rolling sum
7070

7171

72-
cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogil:
72+
cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x,
73+
int64_t num_consecutive_same_value, float64_t prev_value
74+
) nogil:
7375
cdef:
7476
float64_t result
7577

7678
if nobs == 0 == minp:
7779
result = 0
7880
elif nobs >= minp:
79-
result = sum_x
81+
if num_consecutive_same_value >= nobs:
82+
result = prev_value * nobs
83+
else:
84+
result = sum_x
8085
else:
8186
result = NaN
8287

8388
return result
8489

8590

8691
cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
87-
float64_t *compensation) nogil:
92+
float64_t *compensation, int64_t *num_consecutive_same_value,
93+
float64_t *prev_value) nogil:
8894
""" add a value from the sum calc using Kahan summation """
8995

9096
cdef:
@@ -98,6 +104,14 @@ cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
98104
compensation[0] = t - sum_x[0] - y
99105
sum_x[0] = t
100106

107+
# GH#42064, record num of same values to remove floating point artifacts
108+
if val == prev_value[0]:
109+
num_consecutive_same_value[0] += 1
110+
else:
111+
# reset to 1 (include current value itself)
112+
num_consecutive_same_value[0] = 1
113+
prev_value[0] = val
114+
101115

102116
cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
103117
float64_t *compensation) nogil:
@@ -119,8 +133,8 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
119133
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
120134
cdef:
121135
Py_ssize_t i, j
122-
float64_t sum_x, compensation_add, compensation_remove
123-
int64_t s, e
136+
float64_t sum_x, compensation_add, compensation_remove, prev_value
137+
int64_t s, e, num_consecutive_same_value
124138
int64_t nobs = 0, N = len(start)
125139
ndarray[float64_t] output
126140
bint is_monotonic_increasing_bounds
@@ -139,11 +153,13 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
139153
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
140154

141155
# setup
142-
156+
prev_value = values[s]
157+
num_consecutive_same_value = 0
143158
sum_x = compensation_add = compensation_remove = 0
144159
nobs = 0
145160
for j in range(s, e):
146-
add_sum(values[j], &nobs, &sum_x, &compensation_add)
161+
add_sum(values[j], &nobs, &sum_x, &compensation_add,
162+
&num_consecutive_same_value, &prev_value)
147163

148164
else:
149165

@@ -153,9 +169,10 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
153169

154170
# calculate adds
155171
for j in range(end[i - 1], e):
156-
add_sum(values[j], &nobs, &sum_x, &compensation_add)
172+
add_sum(values[j], &nobs, &sum_x, &compensation_add,
173+
&num_consecutive_same_value, &prev_value)
157174

158-
output[i] = calc_sum(minp, nobs, sum_x)
175+
output[i] = calc_sum(minp, nobs, sum_x, num_consecutive_same_value, prev_value)
159176

160177
if not is_monotonic_increasing_bounds:
161178
nobs = 0
@@ -169,14 +186,17 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
169186
# Rolling mean
170187

171188

172-
cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,
173-
Py_ssize_t neg_ct, float64_t sum_x) nogil:
189+
cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, Py_ssize_t neg_ct,
190+
float64_t sum_x, int64_t num_consecutive_same_value,
191+
float64_t prev_value) nogil:
174192
cdef:
175193
float64_t result
176194

177195
if nobs >= minp and nobs > 0:
178196
result = sum_x / <float64_t>nobs
179-
if neg_ct == 0 and result < 0:
197+
if num_consecutive_same_value >= nobs:
198+
result = prev_value
199+
elif neg_ct == 0 and result < 0:
180200
# all positive
181201
result = 0
182202
elif neg_ct == nobs and result > 0:
@@ -190,7 +210,8 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,
190210

191211

192212
cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
193-
Py_ssize_t *neg_ct, float64_t *compensation) nogil:
213+
Py_ssize_t *neg_ct, float64_t *compensation,
214+
int64_t *num_consecutive_same_value, float64_t *prev_value) nogil:
194215
""" add a value from the mean calc using Kahan summation """
195216
cdef:
196217
float64_t y, t
@@ -205,6 +226,14 @@ cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
205226
if signbit(val):
206227
neg_ct[0] = neg_ct[0] + 1
207228

229+
# GH#42064, record num of same values to remove floating point artifacts
230+
if val == prev_value[0]:
231+
num_consecutive_same_value[0] += 1
232+
else:
233+
# reset to 1 (include current value itself)
234+
num_consecutive_same_value[0] = 1
235+
prev_value[0] = val
236+
208237

209238
cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
210239
Py_ssize_t *neg_ct, float64_t *compensation) nogil:
@@ -225,8 +254,8 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
225254
def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
226255
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
227256
cdef:
228-
float64_t val, compensation_add, compensation_remove, sum_x
229-
int64_t s, e
257+
float64_t val, compensation_add, compensation_remove, sum_x, prev_value
258+
int64_t s, e, num_consecutive_same_value
230259
Py_ssize_t nobs, i, j, neg_ct, N = len(start)
231260
ndarray[float64_t] output
232261
bint is_monotonic_increasing_bounds
@@ -244,12 +273,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
244273

245274
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
246275

276+
# setup
247277
compensation_add = compensation_remove = sum_x = 0
248278
nobs = neg_ct = 0
249-
# setup
279+
prev_value = values[s]
280+
num_consecutive_same_value = 0
250281
for j in range(s, e):
251282
val = values[j]
252-
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
283+
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add,
284+
&num_consecutive_same_value, &prev_value)
253285

254286
else:
255287

@@ -261,9 +293,10 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
261293
# calculate adds
262294
for j in range(end[i - 1], e):
263295
val = values[j]
264-
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
296+
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add,
297+
&num_consecutive_same_value, &prev_value)
265298

266-
output[i] = calc_mean(minp, nobs, neg_ct, sum_x)
299+
output[i] = calc_mean(minp, nobs, neg_ct, sum_x, num_consecutive_same_value, prev_value)
267300

268301
if not is_monotonic_increasing_bounds:
269302
nobs = 0

pandas/core/_numba/kernels/mean_.py

+52-8
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,14 @@
1616

1717
@numba.jit(nopython=True, nogil=True, parallel=False)
1818
def add_mean(
19-
val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float
20-
) -> tuple[int, float, int, float]:
19+
val: float,
20+
nobs: int,
21+
sum_x: float,
22+
neg_ct: int,
23+
compensation: float,
24+
num_consecutive_same_value: int,
25+
prev_value: float,
26+
) -> tuple[int, float, int, float, int, float]:
2127
if not np.isnan(val):
2228
nobs += 1
2329
y = val - compensation
@@ -26,7 +32,14 @@ def add_mean(
2632
sum_x = t
2733
if val < 0:
2834
neg_ct += 1
29-
return nobs, sum_x, neg_ct, compensation
35+
36+
if val == prev_value:
37+
num_consecutive_same_value += 1
38+
else:
39+
num_consecutive_same_value = 1
40+
prev_value = val
41+
42+
return nobs, sum_x, neg_ct, compensation, num_consecutive_same_value, prev_value
3043

3144

3245
@numba.jit(nopython=True, nogil=True, parallel=False)
@@ -68,10 +81,26 @@ def sliding_mean(
6881
s = start[i]
6982
e = end[i]
7083
if i == 0 or not is_monotonic_increasing_bounds:
84+
prev_value = values[s]
85+
num_consecutive_same_value = 0
86+
7187
for j in range(s, e):
7288
val = values[j]
73-
nobs, sum_x, neg_ct, compensation_add = add_mean(
74-
val, nobs, sum_x, neg_ct, compensation_add
89+
(
90+
nobs,
91+
sum_x,
92+
neg_ct,
93+
compensation_add,
94+
num_consecutive_same_value,
95+
prev_value,
96+
) = add_mean(
97+
val,
98+
nobs,
99+
sum_x,
100+
neg_ct,
101+
compensation_add,
102+
num_consecutive_same_value,
103+
prev_value,
75104
)
76105
else:
77106
for j in range(start[i - 1], s):
@@ -82,13 +111,28 @@ def sliding_mean(
82111

83112
for j in range(end[i - 1], e):
84113
val = values[j]
85-
nobs, sum_x, neg_ct, compensation_add = add_mean(
86-
val, nobs, sum_x, neg_ct, compensation_add
114+
(
115+
nobs,
116+
sum_x,
117+
neg_ct,
118+
compensation_add,
119+
num_consecutive_same_value,
120+
prev_value,
121+
) = add_mean(
122+
val,
123+
nobs,
124+
sum_x,
125+
neg_ct,
126+
compensation_add,
127+
num_consecutive_same_value,
128+
prev_value,
87129
)
88130

89131
if nobs >= min_periods and nobs > 0:
90132
result = sum_x / nobs
91-
if neg_ct == 0 and result < 0:
133+
if num_consecutive_same_value >= nobs:
134+
result = prev_value
135+
elif neg_ct == 0 and result < 0:
92136
result = 0
93137
elif neg_ct == nobs and result > 0:
94138
result = 0

pandas/core/_numba/kernels/sum_.py

+48-8
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,27 @@
1616

1717
@numba.jit(nopython=True, nogil=True, parallel=False)
1818
def add_sum(
19-
val: float, nobs: int, sum_x: float, compensation: float
20-
) -> tuple[int, float, float]:
19+
val: float,
20+
nobs: int,
21+
sum_x: float,
22+
compensation: float,
23+
num_consecutive_same_value: int,
24+
prev_value: float,
25+
) -> tuple[int, float, float, int, float]:
2126
if not np.isnan(val):
2227
nobs += 1
2328
y = val - compensation
2429
t = sum_x + y
2530
compensation = t - sum_x - y
2631
sum_x = t
27-
return nobs, sum_x, compensation
32+
33+
if val == prev_value:
34+
num_consecutive_same_value += 1
35+
else:
36+
num_consecutive_same_value = 1
37+
prev_value = val
38+
39+
return nobs, sum_x, compensation, num_consecutive_same_value, prev_value
2840

2941

3042
@numba.jit(nopython=True, nogil=True, parallel=False)
@@ -63,10 +75,24 @@ def sliding_sum(
6375
s = start[i]
6476
e = end[i]
6577
if i == 0 or not is_monotonic_increasing_bounds:
78+
prev_value = values[s]
79+
num_consecutive_same_value = 0
80+
6681
for j in range(s, e):
6782
val = values[j]
68-
nobs, sum_x, compensation_add = add_sum(
69-
val, nobs, sum_x, compensation_add
83+
(
84+
nobs,
85+
sum_x,
86+
compensation_add,
87+
num_consecutive_same_value,
88+
prev_value,
89+
) = add_sum(
90+
val,
91+
nobs,
92+
sum_x,
93+
compensation_add,
94+
num_consecutive_same_value,
95+
prev_value,
7096
)
7197
else:
7298
for j in range(start[i - 1], s):
@@ -77,14 +103,28 @@ def sliding_sum(
77103

78104
for j in range(end[i - 1], e):
79105
val = values[j]
80-
nobs, sum_x, compensation_add = add_sum(
81-
val, nobs, sum_x, compensation_add
106+
(
107+
nobs,
108+
sum_x,
109+
compensation_add,
110+
num_consecutive_same_value,
111+
prev_value,
112+
) = add_sum(
113+
val,
114+
nobs,
115+
sum_x,
116+
compensation_add,
117+
num_consecutive_same_value,
118+
prev_value,
82119
)
83120

84121
if nobs == 0 == min_periods:
85122
result = 0.0
86123
elif nobs >= min_periods:
87-
result = sum_x
124+
if num_consecutive_same_value >= nobs:
125+
result = prev_value * nobs
126+
else:
127+
result = sum_x
88128
else:
89129
result = np.nan
90130

pandas/tests/window/test_rolling.py

+11
Original file line numberDiff line numberDiff line change
@@ -1849,3 +1849,14 @@ def test_rolling_var_same_value_count_logic(values, window, min_periods, expecte
18491849
result_std = sr.rolling(window, min_periods=min_periods).std()
18501850
tm.assert_series_equal(result_std, np.sqrt(expected))
18511851
tm.assert_series_equal(expected == 0, result_std == 0)
1852+
1853+
1854+
def test_rolling_mean_sum_floating_artifacts():
1855+
# GH 42064.
1856+
1857+
sr = Series([1 / 3, 4, 0, 0, 0, 0, 0])
1858+
r = sr.rolling(3)
1859+
result = r.mean()
1860+
assert (result[-3:] == 0).all()
1861+
result = r.sum()
1862+
assert (result[-3:] == 0).all()

0 commit comments

Comments
 (0)