Skip to content

Commit ac1cec4

Browse files
committed
fix merge
1 parent c68c626 commit ac1cec4

File tree

5 files changed

+192
-40
lines changed

5 files changed

+192
-40
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ Other enhancements
4646
- Improved error message in :class:`~pandas.core.window.Rolling` when ``window`` is a frequency and ``NaT`` is in the rolling axis (:issue:`46087`)
4747
- :class:`Series` and :class:`DataFrame` with ``IntegerDtype`` now supports bitwise operations (:issue:`34463`)
4848
- Add ``milliseconds`` field support for :class:`~pandas.DateOffset` (:issue:`43371`)
49+
- :meth:`DataFrame.rolling.var`, :meth:`DataFrame.rolling.std`, :meth:`Series.rolling.var`, :meth:`Series.rolling.std` now generate correct result 0 for window containing same values (:issue:`42064`)
4950
-
5051

5152
.. ---------------------------------------------------------------------------

pandas/_libs/window/aggregations.pyx

+24-10
Original file line numberDiff line numberDiff line change
@@ -278,15 +278,15 @@ def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
278278

279279

280280
cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
281-
float64_t ssqdm_x) nogil:
281+
float64_t ssqdm_x, int64_t num_consecutive_same_value) nogil:
282282
cdef:
283283
float64_t result
284284

285285
# Variance is unchanged if no observation is added or removed
286286
if (nobs >= minp) and (nobs > ddof):
287287

288-
# pathological case
289-
if nobs == 1:
288+
# pathological case & repeatedly same values case
289+
if nobs == 1 or num_consecutive_same_value >= nobs:
290290
result = 0
291291
else:
292292
result = ssqdm_x / (nobs - <float64_t>ddof)
@@ -297,7 +297,8 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
297297

298298

299299
cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
300-
float64_t *ssqdm_x, float64_t *compensation) nogil:
300+
float64_t *ssqdm_x, float64_t *compensation,
301+
int64_t *num_consecutive_same_value, float64_t *prev_value) nogil:
301302
""" add a value from the var calc """
302303
cdef:
303304
float64_t delta, prev_mean, y, t
@@ -307,6 +308,15 @@ cdef inline void add_var(float64_t val, float64_t *nobs, float64_t *mean_x,
307308
return
308309

309310
nobs[0] = nobs[0] + 1
311+
312+
# GH#42064, record num of same values to remove floating point artifacts
313+
if val == prev_value[0]:
314+
num_consecutive_same_value[0] += 1
315+
else:
316+
# reset to 1 (include current value itself)
317+
num_consecutive_same_value[0] = 1
318+
prev_value[0] = val
319+
310320
# Welford's method for the online variance-calculation
311321
# using Kahan summation
312322
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
@@ -352,9 +362,8 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
352362
"""
353363
cdef:
354364
float64_t mean_x, ssqdm_x, nobs, compensation_add,
355-
float64_t compensation_remove,
356-
float64_t val, prev, delta, mean_x_old
357-
int64_t s, e
365+
float64_t compensation_remove, prev_value
366+
int64_t s, e, num_consecutive_same_value
358367
Py_ssize_t i, j, N = len(start)
359368
ndarray[float64_t] output
360369
bint is_monotonic_increasing_bounds
@@ -376,9 +385,13 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
376385
# never removed
377386
if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
378387

388+
prev_value = values[s]
389+
num_consecutive_same_value = 0
390+
379391
mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0
380392
for j in range(s, e):
381-
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
393+
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
394+
&num_consecutive_same_value, &prev_value)
382395

383396
else:
384397

@@ -392,9 +405,10 @@ def roll_var(const float64_t[:] values, ndarray[int64_t] start,
392405

393406
# calculate adds
394407
for j in range(end[i - 1], e):
395-
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add)
408+
add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
409+
&num_consecutive_same_value, &prev_value)
396410

397-
output[i] = calc_var(minp, ddof, nobs, ssqdm_x)
411+
output[i] = calc_var(minp, ddof, nobs, ssqdm_x, num_consecutive_same_value)
398412

399413
if not is_monotonic_increasing_bounds:
400414
nobs = 0.0

pandas/core/_numba/kernels/var_.py

+51-8
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,22 @@
1616

1717
@numba.jit(nopython=True, nogil=True, parallel=False)
1818
def add_var(
19-
val: float, nobs: int, mean_x: float, ssqdm_x: float, compensation: float
20-
) -> tuple[int, float, float, float]:
19+
val: float,
20+
nobs: int,
21+
mean_x: float,
22+
ssqdm_x: float,
23+
compensation: float,
24+
num_consecutive_same_value: int,
25+
prev_value: float,
26+
) -> tuple[int, float, float, float, int, float]:
2127
if not np.isnan(val):
28+
29+
if val == prev_value:
30+
num_consecutive_same_value += 1
31+
else:
32+
num_consecutive_same_value = 1
33+
prev_value = val
34+
2235
nobs += 1
2336
prev_mean = mean_x - compensation
2437
y = val - compensation
@@ -30,7 +43,7 @@ def add_var(
3043
else:
3144
mean_x = 0
3245
ssqdm_x += (val - prev_mean) * (val - mean_x)
33-
return nobs, mean_x, ssqdm_x, compensation
46+
return nobs, mean_x, ssqdm_x, compensation, num_consecutive_same_value, prev_value
3447

3548

3649
@numba.jit(nopython=True, nogil=True, parallel=False)
@@ -79,10 +92,27 @@ def sliding_var(
7992
s = start[i]
8093
e = end[i]
8194
if i == 0 or not is_monotonic_increasing_bounds:
95+
96+
prev_value = values[s]
97+
num_consecutive_same_value = 0
98+
8299
for j in range(s, e):
83100
val = values[j]
84-
nobs, mean_x, ssqdm_x, compensation_add = add_var(
85-
val, nobs, mean_x, ssqdm_x, compensation_add
101+
(
102+
nobs,
103+
mean_x,
104+
ssqdm_x,
105+
compensation_add,
106+
num_consecutive_same_value,
107+
prev_value,
108+
) = add_var(
109+
val,
110+
nobs,
111+
mean_x,
112+
ssqdm_x,
113+
compensation_add,
114+
num_consecutive_same_value,
115+
prev_value,
86116
)
87117
else:
88118
for j in range(start[i - 1], s):
@@ -93,12 +123,25 @@ def sliding_var(
93123

94124
for j in range(end[i - 1], e):
95125
val = values[j]
96-
nobs, mean_x, ssqdm_x, compensation_add = add_var(
97-
val, nobs, mean_x, ssqdm_x, compensation_add
126+
(
127+
nobs,
128+
mean_x,
129+
ssqdm_x,
130+
compensation_add,
131+
num_consecutive_same_value,
132+
prev_value,
133+
) = add_var(
134+
val,
135+
nobs,
136+
mean_x,
137+
ssqdm_x,
138+
compensation_add,
139+
num_consecutive_same_value,
140+
prev_value,
98141
)
99142

100143
if nobs >= min_periods and nobs > ddof:
101-
if nobs == 1:
144+
if nobs == 1 or num_consecutive_same_value >= nobs:
102145
result = 0.0
103146
else:
104147
result = ssqdm_x / (nobs - ddof)

pandas/core/window/rolling.py

+16-22
Original file line numberDiff line numberDiff line change
@@ -2150,24 +2150,21 @@ def median(
21502150
The default ``ddof`` of 1 used in :meth:`Series.std` is different
21512151
than the default ``ddof`` of 0 in :func:`numpy.std`.
21522152
2153-
A minimum of one period is required for the rolling calculation.
2154-
2155-
The implementation is susceptible to floating point imprecision as
2156-
shown in the example below.\n
2153+
A minimum of one period is required for the rolling calculation.\n
21572154
"""
21582155
).replace("\n", "", 1),
21592156
create_section_header("Examples"),
21602157
dedent(
21612158
"""
21622159
>>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
21632160
>>> s.rolling(3).std()
2164-
0 NaN
2165-
1 NaN
2166-
2 5.773503e-01
2167-
3 1.000000e+00
2168-
4 1.000000e+00
2169-
5 1.154701e+00
2170-
6 2.580957e-08
2161+
0 NaN
2162+
1 NaN
2163+
2 0.577350
2164+
3 1.000000
2165+
4 1.000000
2166+
5 1.154701
2167+
6 0.000000
21712168
dtype: float64
21722169
"""
21732170
).replace("\n", "", 1),
@@ -2212,24 +2209,21 @@ def std(
22122209
The default ``ddof`` of 1 used in :meth:`Series.var` is different
22132210
than the default ``ddof`` of 0 in :func:`numpy.var`.
22142211
2215-
A minimum of one period is required for the rolling calculation.
2216-
2217-
The implementation is susceptible to floating point imprecision as
2218-
shown in the example below.\n
2212+
A minimum of one period is required for the rolling calculation.\n
22192213
"""
22202214
).replace("\n", "", 1),
22212215
create_section_header("Examples"),
22222216
dedent(
22232217
"""
22242218
>>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])
22252219
>>> s.rolling(3).var()
2226-
0 NaN
2227-
1 NaN
2228-
2 3.333333e-01
2229-
3 1.000000e+00
2230-
4 1.000000e+00
2231-
5 1.333333e+00
2232-
6 6.661338e-16
2220+
0 NaN
2221+
1 NaN
2222+
2 0.333333
2223+
3 1.000000
2224+
4 1.000000
2225+
5 1.333333
2226+
6 0.000000
22332227
dtype: float64
22342228
"""
22352229
).replace("\n", "", 1),

pandas/tests/window/test_rolling.py

+100
Original file line numberDiff line numberDiff line change
@@ -1781,3 +1781,103 @@ def test_step_not_integer_raises():
17811781
def test_step_not_positive_raises():
17821782
with pytest.raises(ValueError, match="step must be >= 0"):
17831783
DataFrame(range(2)).rolling(1, step=-1)
1784+
1785+
1786+
@pytest.mark.parametrize(
1787+
["values", "window", "min_periods", "expected"],
1788+
[
1789+
[
1790+
np.array([20, 10, 10, np.inf, 1, 1, 2, 3]),
1791+
3,
1792+
1,
1793+
np.array(
1794+
[
1795+
np.nan,
1796+
50.0,
1797+
33.33333333333333,
1798+
0.0,
1799+
40.5,
1800+
0.0,
1801+
0.3333333333333333,
1802+
1.0,
1803+
]
1804+
),
1805+
],
1806+
[
1807+
np.array([20, 10, 10, np.nan, 10, 1, 2, 3]),
1808+
3,
1809+
1,
1810+
np.array(
1811+
[
1812+
np.nan,
1813+
50.0,
1814+
33.33333333333333,
1815+
0.0,
1816+
0.0,
1817+
40.5,
1818+
24.333333333333332,
1819+
1.0,
1820+
]
1821+
),
1822+
],
1823+
[
1824+
np.array([np.nan, 5, 6, 7, 5, 5, 5]),
1825+
3,
1826+
3,
1827+
np.array([np.nan, np.nan, np.nan, 1.0, 1.0, 1.3333333333333335, 0.0]),
1828+
],
1829+
[
1830+
np.array([5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3]),
1831+
3,
1832+
3,
1833+
np.array(
1834+
[
1835+
np.nan,
1836+
np.nan,
1837+
1.3333333333333335,
1838+
0.0,
1839+
np.nan,
1840+
np.nan,
1841+
np.nan,
1842+
np.nan,
1843+
0.33333333333333337,
1844+
0.0,
1845+
]
1846+
),
1847+
],
1848+
[
1849+
np.array([5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3]),
1850+
3,
1851+
3,
1852+
np.array(
1853+
[
1854+
np.nan,
1855+
np.nan,
1856+
1.3333333333333335,
1857+
0.0,
1858+
np.nan,
1859+
np.nan,
1860+
np.nan,
1861+
np.nan,
1862+
5.333333333333333,
1863+
0.0,
1864+
]
1865+
),
1866+
],
1867+
],
1868+
)
1869+
def test_rolling_var_same_value_count_logic(values, window, min_periods, expected):
1870+
# GH 42064
1871+
1872+
sr = Series(values)
1873+
result_var = sr.rolling(window, min_periods=min_periods).var()
1874+
# 1. result should be close to correct value
1875+
# non-zero values can still differ slightly as the result of online algorithm
1876+
assert np.isclose(result_var, expected, equal_nan=True).all()
1877+
# 2. zeros should be exactly the same since the new algo takes effect here
1878+
assert (result_var[expected == 0] == 0).all()
1879+
1880+
# std should also pass as it's just a sqrt of var
1881+
result_std = sr.rolling(window, min_periods=min_periods).std()
1882+
assert np.isclose(result_std, np.sqrt(expected), equal_nan=True).all()
1883+
assert (result_std[expected == 0] == 0).all()

0 commit comments

Comments
 (0)