Skip to content

Commit a8968bf

Browse files
audersonaudersonjreback
authored
ENH: use same value counts for roll_var to remove floating point artifacts (#46431)
* fix merge * bug fix: nobs == 0 == min_periods * use Py_ssize_t in place of int64_t * use tm.assert_series_equal * add more checks in roll numba methods * add more checks in roll numba methods * cancel check exact * test 32bit: undo Py_ssize_t * add explanation for test_rolling_var_same_value_count_logic * comments updated * add equal-zero test for test_rolling_var_numerical_issues & test_rolling_var_floating_artifact_precision * update docs * add more tests * update whats new * update whats new * update doc Co-authored-by: auderson <[email protected]> Co-authored-by: Jeff Reback <[email protected]>
1 parent b6f21f3 commit a8968bf

File tree

7 files changed

+188
-43
lines changed

7 files changed

+188
-43
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,7 @@ Groupby/resample/rolling
603603
- Bug in :meth:`GroupBy.max` with empty groups and ``uint64`` dtype incorrectly raising ``RuntimeError`` (:issue:`46408`)
604604
- Bug in :meth:`.GroupBy.apply` would fail when ``func`` was a string and args or kwargs were supplied (:issue:`46479`)
605605
- Bug in :meth:`SeriesGroupBy.apply` would incorrectly name its result when there was a unique group (:issue:`46369`)
606+
- Bug in :meth:`Rolling.var` and :meth:`Rolling.std` would give non-zero result with window of same values (:issue:`42064`)
606607
- Bug in :meth:`.Rolling.var` would segfault calculating weighted variance when window size was larger than data size (:issue:`46760`)
607608
- Bug in :meth:`Grouper.__repr__` where ``dropna`` was not included. Now it is (:issue:`46754`)
608609

pandas/_libs/window/aggregations.pyx

+24-10
Original file line numberDiff line numberDiff line change
@@ -278,15 +278,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
278278

279279

280280
cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
281-
float64_t ssqdm_x) nogil:
281+
float64_t ssqdm_x, int64_t num_consecutive_same_value) nogil:
282282
cdef:
283283
float64_t result
284284

285285
# Variance is unchanged if no observation is added or removed
286286
if (nobs >= minp) and (nobs > ddof):
287287

288-
# pathological case
289-
if nobs == 1:
288+
# pathological case & repeatedly same values case
289+
if nobs == 1 or num_consecutive_same_value >= nobs:
290290
result = 0
291291
else:
292292
result = ssqdm_x / (nobs - <float64_t>ddof)
@@ -297,7 +297,8 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
297297

298298

299299
cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
300-
float64_t *ssqdm_x, float64_t *compensation) nogil:
300+
float64_t *ssqdm_x, float64_t *compensation,
301+
int64_t *num_consecutive_same_value, float64_t *prev_value) nogil:
301302
""" add a value from the var calc """
302303
cdef:
303304
float64_t delta, prev_mean, y, t
@@ -307,6 +308,15 @@ cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
307308
return
308309

309310
nobs[0] = nobs[0] + 1
311+
312+
# GH#42064, record num of same values to remove floating point artifacts
313+
if val == prev_value[0]:
314+
num_consecutive_same_value[0] += 1
315+
else:
316+
# reset to 1 (include current value itself)
317+
num_consecutive_same_value[0] = 1
318+
prev_value[0] = val
319+
310320
# Welford's method for the online variance-calculation
311321
# using Kahan summation
312322
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
@@ -352,9 +362,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
352362
"""
353363
cdef:
354364
float64_t mean_x, ssqdm_x, nobs, compensation_add,
355-
float64_t compensation_remove,
356-
float64_t val, prev, delta, mean_x_old
357-
int64_t s, e
365+
float64_t compensation_remove, prev_value
366+
int64_t s, e, num_consecutive_same_value
358367
Py_ssize_t i, j, N = len(start)
359368
ndarray[float64_t] output
360369
bint is_monotonic_increasing_bounds
@@ -376,9 +385,13 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
376385
# never removed
377386
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
378387

388+
prev_value = values[s]
389+
num_consecutive_same_value = 0
390+
379391
mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0
380392
for j in range(s, e):
381-
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
393+
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
394+
&num_consecutive_same_value, &prev_value)
382395

383396
else:
384397

@@ -392,9 +405,10 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
392405

393406
# calculate adds
394407
for j in range(end[i - 1], e):
395-
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
408+
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
409+
&num_consecutive_same_value, &prev_value)
396410

397-
output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
411+
output[i] = calc_var(minp, ddof, nobs, ssqdm_x, num_consecutive_same_value)
398412

399413
if not is_monotonic_increasing_bounds:
400414
nobs = 0.0

pandas/core/_numba/kernels/sum_.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def sliding_sum(
8181
val, nobs, sum_x, compensation_add
8282
)
8383

84-
if nobs == 0 == nobs:
84+
if nobs == 0 == min_periods:
8585
result = 0.0
8686
elif nobs >= min_periods:
8787
result = sum_x

pandas/core/_numba/kernels/var_.py

+51-8
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,22 @@
1616

1717
@numba.jit(nopython=True, nogil=True, parallel=False)
1818
def add_var(
19-
val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float
20-
) -> tuple[int, float, float, float]:
19+
val: float,
20+
nobs: int,
21+
mean_x: float,
22+
ssqdm_x: float,
23+
compensation: float,
24+
num_consecutive_same_value: int,
25+
prev_value: float,
26+
) -> tuple[int, float, float, float, int, float]:
2127
if not np.isnan(val):
28+
29+
if val == prev_value:
30+
num_consecutive_same_value += 1
31+
else:
32+
num_consecutive_same_value = 1
33+
prev_value = val
34+
2235
nobs += 1
2336
prev_mean = mean_x - compensation
2437
y = val - compensation
@@ -30,7 +43,7 @@ def add_var(
3043
else:
3144
mean_x = 0
3245
ssqdm_x += (val - prev_mean) * (val - mean_x)
33-
return nobs, mean_x, ssqdm_x, compensation
46+
return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value
3447

3548

3649
@numba.jit(nopython=True, nogil=True, parallel=False)
@@ -79,10 +92,27 @@ def sliding_var(
7992
s = start[i]
8093
e = end[i]
8194
if i == 0 or not is_monotonic_increasing_bounds:
95+
96+
prev_value = values[s]
97+
num_consecutive_same_value = 0
98+
8299
for j in range(s, e):
83100
val = values[j]
84-
nobs, mean_x, ssqdm_x, compensation_add = add_var(
85-
val, nobs, mean_x, ssqdm_x, compensation_add
101+
(
102+
nobs,
103+
mean_x,
104+
ssqdm_x,
105+
compensation_add,
106+
num_consecutive_same_value,
107+
prev_value,
108+
) = add_var(
109+
val,
110+
nobs,
111+
mean_x,
112+
ssqdm_x,
113+
compensation_add,
114+
num_consecutive_same_value,
115+
prev_value,
86116
)
87117
else:
88118
for j in range(start[i - 1], s):
@@ -93,12 +123,25 @@ def sliding_var(
93123

94124
for j in range(end[i - 1], e):
95125
val = values[j]
96-
nobs, mean_x, ssqdm_x, compensation_add = add_var(
97-
val, nobs, mean_x, ssqdm_x, compensation_add
126+
(
127+
nobs,
128+
mean_x,
129+
ssqdm_x,
130+
compensation_add,
131+
num_consecutive_same_value,
132+
prev_value,
133+
) = add_var(
134+
val,
135+
nobs,
136+
mean_x,
137+
ssqdm_x,
138+
compensation_add,
139+
num_consecutive_same_value,
140+
prev_value,
98141
)
99142

100143
if nobs >= min_periods and nobs > ddof:
101-
if nobs == 1:
144+
if nobs == 1 or num_consecutive_same_value >= nobs:
102145
result = 0.0
103146
else:
104147
result = ssqdm_x / (nobs - ddof)

pandas/core/window/rolling.py

+16-22
Original file line numberDiff line numberDiff line change
@@ -2150,24 +2150,21 @@ def median(
21502150
The default ``ddof`` of 1 used in :meth:`Series.std` is different
21512151
than the default ``ddof`` of 0 in :func:`numpy.std`.
21522152
2153-
A minimum of one period is required for the rolling calculation.
2154-
2155-
The implementation is susceptible to floating point imprecision as
2156-
shown in the example below.\n
2153+
A minimum of one period is required for the rolling calculation.\n
21572154
"""
21582155
).replace("\n", "", 1),
21592156
create_section_header("Examples"),
21602157
dedent(
21612158
"""
21622159
>>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
21632160
>>> s.rolling(3).std()
2164-
0 NaN
2165-
1 NaN
2166-
2 5.773503e-01
2167-
3 1.000000e+00
2168-
4 1.000000e+00
2169-
5 1.154701e+00
2170-
6 2.580957e-08
2161+
0 NaN
2162+
1 NaN
2163+
2 0.577350
2164+
3 1.000000
2165+
4 1.000000
2166+
5 1.154701
2167+
6 0.000000
21712168
dtype: float64
21722169
"""
21732170
).replace("\n", "", 1),
@@ -2212,24 +2209,21 @@ def std(
22122209
The default ``ddof`` of 1 used in :meth:`Series.var` is different
22132210
than the default ``ddof`` of 0 in :func:`numpy.var`.
22142211
2215-
A minimum of one period is required for the rolling calculation.
2216-
2217-
The implementation is susceptible to floating point imprecision as
2218-
shown in the example below.\n
2212+
A minimum of one period is required for the rolling calculation.\n
22192213
"""
22202214
).replace("\n", "", 1),
22212215
create_section_header("Examples"),
22222216
dedent(
22232217
"""
22242218
>>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
22252219
>>> s.rolling(3).var()
2226-
0 NaN
2227-
1 NaN
2228-
2 3.333333e-01
2229-
3 1.000000e+00
2230-
4 1.000000e+00
2231-
5 1.333333e+00
2232-
6 6.661338e-16
2220+
0 NaN
2221+
1 NaN
2222+
2 0.333333
2223+
3 1.000000
2224+
4 1.000000
2225+
5 1.333333
2226+
6 0.000000
22332227
dtype: float64
22342228
"""
22352229
).replace("\n", "", 1),

pandas/tests/window/test_numba.py

+15-2
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,20 @@ def f(x, *args):
7979
tm.assert_series_equal(result, expected)
8080

8181
@pytest.mark.parametrize(
82-
"data", [DataFrame(np.eye(5)), Series(range(5), name="foo")]
82+
"data",
83+
[
84+
DataFrame(np.eye(5)),
85+
DataFrame(
86+
[
87+
[5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3],
88+
[5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3],
89+
[np.nan, np.nan, 5, 6, 7, 5, 5, 5, 5, 5],
90+
]
91+
).T,
92+
Series(range(5), name="foo"),
93+
Series([20, 10, 10, np.inf, 1, 1, 2, 3]),
94+
Series([20, 10, 10, np.nan, 10, 1, 2, 3]),
95+
],
8396
)
8497
def test_numba_vs_cython_rolling_methods(
8598
self,
@@ -95,7 +108,7 @@ def test_numba_vs_cython_rolling_methods(
95108

96109
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
97110

98-
roll = data.rolling(2, step=step)
111+
roll = data.rolling(3, step=step)
99112
result = getattr(roll, method)(
100113
engine="numba", engine_kwargs=engine_kwargs, **kwargs
101114
)

pandas/tests/window/test_rolling.py

+80
Original file line numberDiff line numberDiff line change
@@ -1203,6 +1203,9 @@ def test_rolling_var_numerical_issues(func, third_value, values):
12031203
result = getattr(ds.rolling(2), func)()
12041204
expected = Series([np.nan] + values)
12051205
tm.assert_series_equal(result, expected)
1206+
# GH 42064
1207+
# new `roll_var` will output 0.0 correctly
1208+
tm.assert_series_equal(result == 0, expected == 0)
12061209

12071210

12081211
def test_timeoffset_as_window_parameter_for_corr():
@@ -1509,6 +1512,9 @@ def test_rolling_var_floating_artifact_precision():
15091512
result = s.rolling(3).var()
15101513
expected = Series([np.nan, np.nan, 4 / 3, 0])
15111514
tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15)
1515+
# GH 42064
1516+
# new `roll_var` will output 0.0 correctly
1517+
tm.assert_series_equal(result == 0, expected == 0)
15121518

15131519

15141520
def test_rolling_std_small_values():
@@ -1769,3 +1775,77 @@ def test_step_not_integer_raises():
17691775
def test_step_not_positive_raises():
17701776
with pytest.raises(ValueError, match="step must be >= 0"):
17711777
DataFrame(range(2)).rolling(1, step=-1)
1778+
1779+
1780+
@pytest.mark.parametrize(
1781+
["values", "window", "min_periods", "expected"],
1782+
[
1783+
[
1784+
[20, 10, 10, np.inf, 1, 1, 2, 3],
1785+
3,
1786+
1,
1787+
[np.nan, 50, 100 / 3, 0, 40.5, 0, 1 / 3, 1],
1788+
],
1789+
[
1790+
[20, 10, 10, np.nan, 10, 1, 2, 3],
1791+
3,
1792+
1,
1793+
[np.nan, 50, 100 / 3, 0, 0, 40.5, 73 / 3, 1],
1794+
],
1795+
[
1796+
[np.nan, 5, 6, 7, 5, 5, 5],
1797+
3,
1798+
3,
1799+
[np.nan] * 3 + [1, 1, 4 / 3, 0],
1800+
],
1801+
[
1802+
[5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3],
1803+
3,
1804+
3,
1805+
[np.nan] * 2 + [4 / 3, 0] + [np.nan] * 4 + [1 / 3, 0],
1806+
],
1807+
[
1808+
[5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3],
1809+
3,
1810+
3,
1811+
[np.nan] * 2 + [4 / 3, 0] + [np.nan] * 4 + [16 / 3, 0],
1812+
],
1813+
[
1814+
[5, 7] * 4,
1815+
3,
1816+
3,
1817+
[np.nan] * 2 + [4 / 3] * 6,
1818+
],
1819+
[
1820+
[5, 7, 5, np.nan, 7, 5, 7],
1821+
3,
1822+
2,
1823+
[np.nan, 2, 4 / 3] + [2] * 3 + [4 / 3],
1824+
],
1825+
],
1826+
)
1827+
def test_rolling_var_same_value_count_logic(values, window, min_periods, expected):
1828+
# GH 42064.
1829+
1830+
expected = Series(expected)
1831+
sr = Series(values)
1832+
1833+
# With new algo implemented, result will be set to .0 in rolling var
1834+
# if sufficient amount of consecutively same values are found.
1835+
result_var = sr.rolling(window, min_periods=min_periods).var()
1836+
1837+
# use `assert_series_equal` twice to check for equality,
1838+
# because `check_exact=True` will fail in 32-bit tests due to
1839+
# precision loss.
1840+
1841+
# 1. result should be close to correct value
1842+
# non-zero values can still differ slightly from "truth"
1843+
# as the result of online algorithm
1844+
tm.assert_series_equal(result_var, expected)
1845+
# 2. zeros should be exactly the same since the new algo takes effect here
1846+
tm.assert_series_equal(expected == 0, result_var == 0)
1847+
1848+
# std should also pass as it's just a sqrt of var
1849+
result_std = sr.rolling(window, min_periods=min_periods).std()
1850+
tm.assert_series_equal(result_std, np.sqrt(expected))
1851+
tm.assert_series_equal(expected == 0, result_std == 0)

0 commit comments

Comments
 (0)