Skip to content

Commit 2efc912

Browse files
committed
Add consecutive-same-value count for rolling mean & sum
1 parent 21c1730 commit 2efc912

File tree

5 files changed: +166 -36 lines changed

5 files changed

+166
-36
lines changed

pandas/_libs/window/aggregations.pyx

+52 -19
Original file line number | Diff line number | Diff line change
@@ -70,22 +70,28 @@ cdef bint is_monotonic_increasing_start_end_bounds(
7070
# Rolling sum
7171

7272

73-
cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogil:
73+
cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x,
74+
int64_t num_consecutive_same_value, float64_t prev_value
75+
) nogil:
7476
cdef:
7577
float64_t result
7678

7779
if nobs == 0 == minp:
7880
result = 0
7981
elif nobs >= minp:
80-
result = sum_x
82+
if num_consecutive_same_value >= nobs:
83+
result = prev_value * nobs
84+
else:
85+
result = sum_x
8186
else:
8287
result = NaN
8388

8489
return result
8590

8691

8792
cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
88-
float64_t *compensation) nogil:
93+
float64_t *compensation, int64_t *num_consecutive_same_value,
94+
float64_t *prev_value) nogil:
8995
""" add a value from the sum calc using Kahan summation """
9096

9197
cdef:
@@ -99,6 +105,14 @@ cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
99105
compensation[0] = t - sum_x[0] - y
100106
sum_x[0] = t
101107

108+
# GH#42064, record num of same values to remove floating point artifacts
109+
if val == prev_value[0]:
110+
num_consecutive_same_value[0] += 1
111+
else:
112+
# reset to 1 (include current value itself)
113+
num_consecutive_same_value[0] = 1
114+
prev_value[0] = val
115+
102116

103117
cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
104118
float64_t *compensation) nogil:
@@ -120,8 +134,8 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
120134
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
121135
cdef:
122136
Py_ssize_t i, j
123-
float64_t sum_x, compensation_add, compensation_remove
124-
int64_t s, e
137+
float64_t sum_x, compensation_add, compensation_remove, prev_value
138+
int64_t s, e, num_consecutive_same_value
125139
int64_t nobs = 0, N = len(start)
126140
ndarray[float64_t] output
127141
bint is_monotonic_increasing_bounds
@@ -140,11 +154,13 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
140154
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
141155

142156
# setup
143-
157+
prev_value = values[s]
158+
num_consecutive_same_value = 0
144159
sum_x = compensation_add = compensation_remove = 0
145160
nobs = 0
146161
for j in range(s, e):
147-
add_sum(values[j], &nobs, &sum_x, &compensation_add)
162+
add_sum(values[j], &nobs, &sum_x, &compensation_add,
163+
&num_consecutive_same_value, &prev_value)
148164

149165
else:
150166

@@ -154,9 +170,10 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
154170

155171
# calculate adds
156172
for j in range(end[i - 1], e):
157-
add_sum(values[j], &nobs, &sum_x, &compensation_add)
173+
add_sum(values[j], &nobs, &sum_x, &compensation_add,
174+
&num_consecutive_same_value, &prev_value)
158175

159-
output[i] = calc_sum(minp, nobs, sum_x)
176+
output[i] = calc_sum(minp, nobs, sum_x, num_consecutive_same_value, prev_value)
160177

161178
if not is_monotonic_increasing_bounds:
162179
nobs = 0
@@ -170,14 +187,17 @@ def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
170187
# Rolling mean
171188

172189

173-
cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,
174-
Py_ssize_t neg_ct, float64_t sum_x) nogil:
190+
cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, Py_ssize_t neg_ct,
191+
float64_t sum_x, int64_t num_consecutive_same_value,
192+
float64_t prev_value) nogil:
175193
cdef:
176194
float64_t result
177195

178196
if nobs >= minp and nobs > 0:
179197
result = sum_x / <float64_t>nobs
180-
if neg_ct == 0 and result < 0:
198+
if num_consecutive_same_value >= nobs:
199+
result = prev_value
200+
elif neg_ct == 0 and result < 0:
181201
# all positive
182202
result = 0
183203
elif neg_ct == nobs and result > 0:
@@ -191,7 +211,8 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs,
191211

192212

193213
cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
194-
Py_ssize_t *neg_ct, float64_t *compensation) nogil:
214+
Py_ssize_t *neg_ct, float64_t *compensation,
215+
int64_t *num_consecutive_same_value, float64_t *prev_value) nogil:
195216
""" add a value from the mean calc using Kahan summation """
196217
cdef:
197218
float64_t y, t
@@ -206,6 +227,14 @@ cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
206227
if signbit(val):
207228
neg_ct[0] = neg_ct[0] + 1
208229

230+
# GH#42064, record num of same values to remove floating point artifacts
231+
if val == prev_value[0]:
232+
num_consecutive_same_value[0] += 1
233+
else:
234+
# reset to 1 (include current value itself)
235+
num_consecutive_same_value[0] = 1
236+
prev_value[0] = val
237+
209238

210239
cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
211240
Py_ssize_t *neg_ct, float64_t *compensation) nogil:
@@ -226,8 +255,8 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
226255
def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
227256
ndarray[int64_t] end, int64_t minp) -> np.ndarray:
228257
cdef:
229-
float64_t val, compensation_add, compensation_remove, sum_x
230-
int64_t s, e
258+
float64_t val, compensation_add, compensation_remove, sum_x, prev_value
259+
int64_t s, e, num_consecutive_same_value
231260
Py_ssize_t nobs, i, j, neg_ct, N = len(start)
232261
ndarray[float64_t] output
233262
bint is_monotonic_increasing_bounds
@@ -245,12 +274,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
245274

246275
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
247276

277+
# setup
248278
compensation_add = compensation_remove = sum_x = 0
249279
nobs = neg_ct = 0
250-
# setup
280+
prev_value = values[s]
281+
num_consecutive_same_value = 0
251282
for j in range(s, e):
252283
val = values[j]
253-
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
284+
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add,
285+
&num_consecutive_same_value, &prev_value)
254286

255287
else:
256288

@@ -262,9 +294,10 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
262294
# calculate adds
263295
for j in range(end[i - 1], e):
264296
val = values[j]
265-
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add)
297+
add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add,
298+
&num_consecutive_same_value, &prev_value)
266299

267-
output[i] = calc_mean(minp, nobs, neg_ct, sum_x)
300+
output[i] = calc_mean(minp, nobs, neg_ct, sum_x, num_consecutive_same_value, prev_value)
268301

269302
if not is_monotonic_increasing_bounds:
270303
nobs = 0

pandas/core/_numba/kernels/mean_.py

+52 -8
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,14 @@
1616

1717
@numba.jit(nopython=True, nogil=True, parallel=False)
1818
def add_mean(
19-
val: float, nobs: int, sum_x: float, neg_ct: int, compensation: float
20-
) -> tuple[int, float, int, float]:
19+
val: float,
20+
nobs: int,
21+
sum_x: float,
22+
neg_ct: int,
23+
compensation: float,
24+
num_consecutive_same_value: int,
25+
prev_value: float,
26+
) -> tuple[int, float, int, float, float, float]:
2127
if not np.isnan(val):
2228
nobs += 1
2329
y = val - compensation
@@ -26,7 +32,14 @@ def add_mean(
2632
sum_x = t
2733
if val < 0:
2834
neg_ct += 1
29-
return nobs, sum_x, neg_ct, compensation
35+
36+
if val == prev_value:
37+
num_consecutive_same_value += 1
38+
else:
39+
num_consecutive_same_value = 1
40+
prev_value = val
41+
42+
return nobs, sum_x, neg_ct, compensation, num_consecutive_same_value, prev_value
3043

3144

3245
@numba.jit(nopython=True, nogil=True, parallel=False)
@@ -68,10 +81,26 @@ def sliding_mean(
6881
s = start[i]
6982
e = end[i]
7083
if i == 0 or not is_monotonic_increasing_bounds:
84+
prev_value = values[s]
85+
num_consecutive_same_value = 0
86+
7187
for j in range(s, e):
7288
val = values[j]
73-
nobs, sum_x, neg_ct, compensation_add = add_mean(
74-
val, nobs, sum_x, neg_ct, compensation_add
89+
(
90+
nobs,
91+
sum_x,
92+
neg_ct,
93+
compensation_add,
94+
num_consecutive_same_value,
95+
prev_value,
96+
) = add_mean(
97+
val,
98+
nobs,
99+
sum_x,
100+
neg_ct,
101+
compensation_add,
102+
num_consecutive_same_value,
103+
prev_value,
75104
)
76105
else:
77106
for j in range(start[i - 1], s):
@@ -82,13 +111,28 @@ def sliding_mean(
82111

83112
for j in range(end[i - 1], e):
84113
val = values[j]
85-
nobs, sum_x, neg_ct, compensation_add = add_mean(
86-
val, nobs, sum_x, neg_ct, compensation_add
114+
(
115+
nobs,
116+
sum_x,
117+
neg_ct,
118+
compensation_add,
119+
num_consecutive_same_value,
120+
prev_value,
121+
) = add_mean(
122+
val,
123+
nobs,
124+
sum_x,
125+
neg_ct,
126+
compensation_add,
127+
num_consecutive_same_value,
128+
prev_value,
87129
)
88130

89131
if nobs >= min_periods and nobs > 0:
90132
result = sum_x / nobs
91-
if neg_ct == 0 and result < 0:
133+
if num_consecutive_same_value >= nobs:
134+
result = prev_value
135+
elif neg_ct == 0 and result < 0:
92136
result = 0
93137
elif neg_ct == nobs and result > 0:
94138
result = 0

pandas/core/_numba/kernels/sum_.py

+48 -8
Original file line number | Diff line number | Diff line change
@@ -16,15 +16,27 @@
1616

1717
@numba.jit(nopython=True, nogil=True, parallel=False)
1818
def add_sum(
19-
val: float, nobs: int, sum_x: float, compensation: float
20-
) -> tuple[int, float, float]:
19+
val: float,
20+
nobs: int,
21+
sum_x: float,
22+
compensation: float,
23+
num_consecutive_same_value: int,
24+
prev_value: float,
25+
) -> tuple[int, float, float, float, float]:
2126
if not np.isnan(val):
2227
nobs += 1
2328
y = val - compensation
2429
t = sum_x + y
2530
compensation = t - sum_x - y
2631
sum_x = t
27-
return nobs, sum_x, compensation
32+
33+
if val == prev_value:
34+
num_consecutive_same_value += 1
35+
else:
36+
num_consecutive_same_value = 1
37+
prev_value = val
38+
39+
return nobs, sum_x, compensation, num_consecutive_same_value, prev_value
2840

2941

3042
@numba.jit(nopython=True, nogil=True, parallel=False)
@@ -63,10 +75,24 @@ def sliding_sum(
6375
s = start[i]
6476
e = end[i]
6577
if i == 0 or not is_monotonic_increasing_bounds:
78+
prev_value = values[s]
79+
num_consecutive_same_value = 0
80+
6681
for j in range(s, e):
6782
val = values[j]
68-
nobs, sum_x, compensation_add = add_sum(
69-
val, nobs, sum_x, compensation_add
83+
(
84+
nobs,
85+
sum_x,
86+
compensation_add,
87+
num_consecutive_same_value,
88+
prev_value,
89+
) = add_sum(
90+
val,
91+
nobs,
92+
sum_x,
93+
compensation_add,
94+
num_consecutive_same_value,
95+
prev_value,
7096
)
7197
else:
7298
for j in range(start[i - 1], s):
@@ -77,14 +103,28 @@ def sliding_sum(
77103

78104
for j in range(end[i - 1], e):
79105
val = values[j]
80-
nobs, sum_x, compensation_add = add_sum(
81-
val, nobs, sum_x, compensation_add
106+
(
107+
nobs,
108+
sum_x,
109+
compensation_add,
110+
num_consecutive_same_value,
111+
prev_value,
112+
) = add_sum(
113+
val,
114+
nobs,
115+
sum_x,
116+
compensation_add,
117+
num_consecutive_same_value,
118+
prev_value,
82119
)
83120

84121
if nobs == 0 == min_periods:
85122
result = 0.0
86123
elif nobs >= min_periods:
87-
result = sum_x
124+
if num_consecutive_same_value >= nobs:
125+
result = prev_value * nobs
126+
else:
127+
result = sum_x
88128
else:
89129
result = np.nan
90130

pandas/tests/window/test_numba.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,7 @@ def f(x, *args):
9292
Series(range(5), name="foo"),
9393
Series([20, 10, 10, np.inf, 1, 1, 2, 3]),
9494
Series([20, 10, 10, np.nan, 10, 1, 2, 3]),
95+
Series([1 / 3, 4, 0, 0, 0, 0, 0]),
9596
],
9697
)
9798
def test_numba_vs_cython_rolling_methods(
@@ -113,7 +114,8 @@ def test_numba_vs_cython_rolling_methods(
113114
engine="numba", engine_kwargs=engine_kwargs, **kwargs
114115
)
115116
expected = getattr(roll, method)(engine="cython", **kwargs)
116-
tm.assert_equal(result, expected)
117+
# check_exact=True to make sure no floating artifacts
118+
tm.assert_equal(result, expected, check_exact=True)
117119

118120
@pytest.mark.parametrize(
119121
"data", [DataFrame(np.eye(5)), Series(range(5), name="foo")]

pandas/tests/window/test_rolling.py

+11
Original file line number | Diff line number | Diff line change
@@ -1885,3 +1885,14 @@ def test_rolling_var_same_value_count_logic(values, window, min_periods, expecte
18851885
result_std = sr.rolling(window, min_periods=min_periods).std()
18861886
tm.assert_series_equal(result_std, np.sqrt(expected))
18871887
tm.assert_series_equal(expected == 0, result_std == 0)
1888+
1889+
1890+
def test_rolling_mean_sum_floating_artifacts():
1891+
# GH 42064.
1892+
1893+
sr = Series([1 / 3, 4, 0, 0, 0, 0, 0])
1894+
r = sr.rolling(3)
1895+
result = r.mean()
1896+
assert (result[-3:] == 0).all()
1897+
result = r.sum()
1898+
assert (result[-3:] == 0).all()

0 commit comments

Comments
 (0)