BUG: rolling std can't handle mixture of (relatively) big and small numbers #42064
Comments
confirmed...
This is actually a result of the
This is to do with computer precision and the implementation of
This was fixed for 1.3.
@jreback I got my results, and the same problem, on:
Note: this is called out in the user guide in the warning block https://pandas.pydata.org/docs/user_guide/window.html#overview and even though 1.3 does have a change in this code, there will likely be no definitive fix for this given the algorithm we're using to calculate rolling aggregations.
Just ran into this bug as well. Kind of nasty, given that you'd totally expect var(same values) to be 0.0 exactly.
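For what it's worth, the expectation itself is sound: a direct, non-rolling variance of identical values is exactly 0.0; it is only the incremental rolling update that leaves residue. A quick check with plain numpy:

```python
import numpy as np

# No values enter or leave a window here, so no rounding residue accumulates.
print(np.zeros(5).var(ddof=1))  # 0.0 exactly
```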
Thanks for the report, but since this is an implementation detail related to our algorithm (and it's been documented), I think this issue can be closed.
@mroeschke

Code

The code below is a numba version of rolling var, mostly copied from pandas/pandas/_libs/window/aggregations.pyx (lines 277 to 405 at commit 2278923), except that I modified it to work on 2-d inputs.

```python
import timeit
import numpy as np
import pandas as pd
from numba import njit

@njit(nogil=True, inline='always')
def add_var_old(row, nobs, mean_x, ssqdm_x, compensation):
    """ add a value from the var calc """
    # cdef:
    #     float64_t delta, prev_mean, y, t
    for j, val in enumerate(row):
        # GH#21813, if msvc 2017 bug is resolved, we should be OK with != instead of `isnan`
        if val != val:
            continue
        nobs[j] += 1
        # Welford's method for the online variance-calculation
        # using Kahan summation
        # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
        prev_mean = mean_x[j] - compensation[j]
        y = val - compensation[j]
        t = y - mean_x[j]
        compensation[j] = t + mean_x[j] - y
        delta = t
        if nobs[j]:
            mean_x[j] = mean_x[j] + delta / nobs[j]
        else:
            mean_x[j] = 0
        ssqdm_x[j] = ssqdm_x[j] + (val - prev_mean) * (val - mean_x[j])

@njit(nogil=True, inline='always')
def calc_var_old(minp, ddof, nobs, ssqdm_x):
    # cdef:
    #     float64_t result
    result = np.empty(len(nobs))
    for i, (_nobs, _ssqdm_x) in enumerate(zip(nobs, ssqdm_x)):
        if (_nobs >= minp) and (_nobs > ddof):
            # pathological case
            if _nobs == 1:
                result[i] = 0
            else:
                result[i] = _ssqdm_x / (_nobs - ddof)
        else:
            result[i] = np.nan
    return result

@njit(nogil=True)
def roll_var_old(values, wind, minp, ddof=1) -> np.ndarray:
    """
    Numerically stable implementation using Welford's method.
    """
    # cdef:
    #     float64_t mean_x, ssqdm_x, nobs, compensation_add,
    #     float64_t compensation_remove,
    #     float64_t val, prev, delta, mean_x_old
    #     int64_t s, e
    #     Py_ssize_t i, j, N = len(start)
    #     ndarray[float64_t] output
    #     bint is_monotonic_increasing_bounds
    minp = max(minp, 1)
    # is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
    #     start, end
    # )
    n, m = values.shape
    output = np.empty((n, m), dtype=np.float64)
    mean_x = np.zeros(m)
    ssqdm_x = np.zeros(m)
    nobs = np.zeros(m)
    compensation_add = np.zeros(m)
    compensation_remove = np.zeros(m)

    for i in range(n - wind + 1):
        s = i
        e = wind + s
        # Over the first window, observations can only be added,
        # never removed
        if i == 0:
            for j in range(s, e):
                add_var_old(values[j], nobs, mean_x, ssqdm_x, compensation_add)
                output[j] = np.nan
        else:
            # After the first window, observations can both be added
            # and removed
            # calculate deletes
            remove_var(values[s - 1], nobs, mean_x, ssqdm_x, compensation_remove)
            # calculate adds
            add_var_old(values[e - 1], nobs, mean_x, ssqdm_x, compensation_add)
        # compute the result for the current window in both branches
        output[e - 1] = calc_var_old(minp, ddof, nobs, ssqdm_x)
    return output

@njit(nogil=True, inline='always')
def add_var_new(row, nobs, mean_x, ssqdm_x, compensation,
                n_same_value, prev_value):
    """ add a value from the var calc """
    # cdef:
    #     float64_t delta, prev_mean, y, t
    for j, val in enumerate(row):
        # GH#21813, if msvc 2017 bug is resolved, we should be OK with != instead of `isnan`
        if val != val:
            continue
        nobs[j] += 1
        if val == prev_value[j]:
            n_same_value[j] += 1  # incr count
        else:
            n_same_value[j] = 1  # if not same, set count to 1 (include itself)
        prev_value[j] = val  # store prev value
        # Welford's method for the online variance-calculation
        # using Kahan summation
        # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
        prev_mean = mean_x[j] - compensation[j]
        y = val - compensation[j]
        t = y - mean_x[j]
        compensation[j] = t + mean_x[j] - y
        delta = t
        if nobs[j]:
            mean_x[j] = mean_x[j] + delta / nobs[j]
        else:
            mean_x[j] = 0
        ssqdm_x[j] = ssqdm_x[j] + (val - prev_mean) * (val - mean_x[j])

# no change here
@njit(nogil=True, inline='always')
def remove_var(row, nobs, mean_x, ssqdm_x, compensation):
    """ remove a value from the var calc """
    # cdef:
    #     float64_t delta, prev_mean, y, t
    for j, val in enumerate(row):
        if val == val:
            nobs[j] -= 1
            if nobs[j]:
                # Welford's method for the online variance-calculation
                # using Kahan summation
                # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
                prev_mean = mean_x[j] - compensation[j]
                y = val - compensation[j]
                t = y - mean_x[j]
                compensation[j] = t + mean_x[j] - y
                delta = t
                mean_x[j] = mean_x[j] - delta / nobs[j]
                ssqdm_x[j] = ssqdm_x[j] - (val - prev_mean) * (val - mean_x[j])
            else:
                mean_x[j] = 0
                ssqdm_x[j] = 0

@njit(nogil=True, inline='always')
def calc_var_new(minp, ddof, nobs, ssqdm_x, n_same_value):
    # cdef:
    #     float64_t result
    result = np.empty(len(nobs))
    for i, (_nobs, _ssqdm_x, _nsv) in enumerate(zip(nobs, ssqdm_x, n_same_value)):
        if (_nobs >= minp) and (_nobs > ddof):
            # pathological case
            if _nobs == 1 or _nsv >= _nobs:  # if num same value >= num obs
                result[i] = 0
            else:
                result[i] = _ssqdm_x / (_nobs - ddof)
        else:
            result[i] = np.nan
    return result

@njit(nogil=True)
def roll_var_new(values, wind, minp, ddof=1) -> np.ndarray:
    """
    Numerically stable implementation using Welford's method.
    """
    # cdef:
    #     float64_t mean_x, ssqdm_x, nobs, compensation_add,
    #     float64_t compensation_remove,
    #     float64_t val, prev, delta, mean_x_old
    #     int64_t s, e
    #     Py_ssize_t i, j, N = len(start)
    #     ndarray[float64_t] output
    #     bint is_monotonic_increasing_bounds
    minp = max(minp, 1)
    # is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
    #     start, end
    # )
    n, m = values.shape
    output = np.empty((n, m), dtype=np.float64)
    mean_x = np.zeros(m)
    ssqdm_x = np.zeros(m)
    nobs = np.zeros(m)
    compensation_add = np.zeros(m)
    compensation_remove = np.zeros(m)
    n_same_value = np.zeros(m, dtype=np.int64)
    prev_value = values[0].copy()  # record first value

    for i in range(n - wind + 1):
        s = i
        e = wind + s
        # Over the first window, observations can only be added,
        # never removed
        if i == 0:
            for j in range(s, e):
                add_var_new(values[j], nobs, mean_x, ssqdm_x,
                            compensation_add, n_same_value, prev_value)
                output[j] = np.nan
        else:
            # After the first window, observations can both be added
            # and removed
            # calculate deletes
            remove_var(values[s - 1], nobs, mean_x, ssqdm_x, compensation_remove)
            # calculate adds
            add_var_new(values[e - 1], nobs, mean_x, ssqdm_x,
                        compensation_add, n_same_value, prev_value)
        # compute the result for the current window in both branches
        output[e - 1] = calc_var_new(minp, ddof, nobs, ssqdm_x, n_same_value)
    return output
```

How it works

I record the previous value in `prev_value` and count consecutive equal values in `n_same_value`. When every observation in a window is the same value, `calc_var_new` returns exactly 0 instead of dividing the accumulated `ssqdm_x`, which carries the floating point residue.

Some examples

First, let's have a df with a big number at the top:

```python
df = pd.DataFrame(np.zeros(10))
df.iloc[0] = 1000
df.rolling(5).var()
Out[32]:
0
0 NaN
1 NaN
2 NaN
3 NaN
4 2.000000e+05
5 2.910383e-11
6 2.910383e-11
7 2.910383e-11
8 2.910383e-11
9 2.910383e-11
```

It's the same as the numba version:

```python
roll_var_old(df.values, 5, 5, 1)
Out[39]:
array([[ nan],
[ nan],
[ nan],
[ nan],
[2.00000000e+05],
[2.91038305e-11],
[2.91038305e-11],
[2.91038305e-11],
[2.91038305e-11],
[2.91038305e-11]])
```

My version:

```python
roll_var_new(df.values, 5, 5, 1)
Out[40]:
array([[ nan],
[ nan],
[ nan],
[ nan],
[200000.],
[ 0.],
[ 0.],
[ 0.],
[ 0.],
[ 0.]])
```

Performance impact

```python
array = np.random.randn(1000, 10)
times_old = timeit.repeat('roll_var_old(array, 10, 10, 1)', number=1000, globals=globals())
np.sum(times_old)
# 0.1767864041030407
times_new = timeit.repeat('roll_var_new(array, 10, 10, 1)', number=1000, globals=globals())
np.sum(times_new)
# 0.20479978621006012
```
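As a quick sanity check (a sketch of my own, not part of the patch), the new kernel can be compared against a brute-force per-window variance on random input:

```python
# Brute-force comparison: each output row should match np.var over its window.
arr = np.random.randn(200, 3)
res = roll_var_new(arr, 10, 10, 1)
for i in range(9, 200):
    assert np.allclose(res[i], arr[i - 9:i + 1].var(axis=0, ddof=1))
```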
So the 2D input is the reason why there are no floating point artifacts? I can't easily discern what the old vs new difference is based on all the code. But generally, if different processing removes these floating point artifacts (and passes all the tests), IMO a performance hit is okay.
So in short, the new algo will record and count how many consecutive same values have appeared:

```cython
# in `add_var`
if val == prev_value[0] and val != MAXfloat64 and val != MINfloat64:
    n_same_value[0] += 1  # incr count
else:
    n_same_value[0] = 1  # if not same, set count to 1 (include itself)
prev_value[0] = val  # store prev value
```

If the window contains only the same value, we use 0 as the result, instead of `ssqdm_x / (nobs - ddof)`:

```cython
# in `calc_var`
if (nobs >= minp) and (nobs > ddof):
    # pathological case & repeatedly same values case
    if nobs == 1 or num_consecutive_same_value >= nobs:
        result = 0
    else:
        result = ssqdm_x / (nobs - <float64_t>ddof)
else:
    result = NaN
```

The floating point artifacts come from the running `ssqdm_x` accumulator, which does not return exactly to zero after the big value is removed from the window.
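To make the source of that residue concrete, here is a minimal scalar sketch of the Welford add/remove round trip (my own illustration, mirroring `add_var`/`remove_var` above, with the Kahan compensation omitted for brevity):

```python
def add(val, nobs, mean_x, ssqdm_x):
    # Welford "add": update count, mean, and sum of squared differences
    nobs += 1
    prev_mean = mean_x
    mean_x += (val - mean_x) / nobs
    ssqdm_x += (val - prev_mean) * (val - mean_x)
    return nobs, mean_x, ssqdm_x

def remove(val, nobs, mean_x, ssqdm_x):
    # Welford "remove": the algebraic inverse of add, but not the
    # floating-point inverse -- rounding errors do not cancel exactly
    nobs -= 1
    prev_mean = mean_x
    mean_x -= (val - mean_x) / nobs
    ssqdm_x -= (val - prev_mean) * (val - mean_x)
    return nobs, mean_x, ssqdm_x

state = (0, 0.0, 0.0)
for v in (1000.0, 0.0, 0.0, 0.0, 0.0):  # fill the first window
    state = add(v, *state)
state = remove(1000.0, *state)  # the big value leaves the window
state = add(0.0, *state)        # a zero enters
print(state[2])  # ssqdm_x: typically a tiny nonzero residue, not 0.0
```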
I have checked that this issue has not already been reported.
I have confirmed this bug exists on the latest version of pandas.
(optional) I have confirmed this bug exists on the master branch of pandas.
Problem description
If we have a relatively big number at the top and the remaining values are zeros, then the rolling result is not zero.
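A minimal reproduction (the same example worked through in the comments above, observed on pandas 1.2.4 as reported below):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(np.zeros(10))
df.iloc[0] = 1000
# Once the 1000 has left the window, every window is all zeros,
# yet the result is ~2.9e-11 instead of exactly 0.0.
print(df.rolling(5).var())
```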
Expected Output

The result should be exactly 0.0 once the big value has left the window.
Output of pd.show_versions()
INSTALLED VERSIONS
commit : 2cb9652
python : 3.8.10.final.0
python-bits : 64
OS : Linux
OS-release : 5.8.0-53-generic
Version : #60-Ubuntu SMP Thu May 6 07:46:32 UTC 2021
machine : x86_64
processor : x86_64
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.2.4
numpy : 1.20.2
pytz : 2021.1
dateutil : 2.8.1
pip : 21.0.1
setuptools : 52.0.0.post20210125
Cython : None
pytest : 6.2.3
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 4.6.2
html5lib : None
pymysql : 0.9.3
psycopg2 : 2.8.6 (dt dec pq3 ext lo64)
jinja2 : 3.0.0
IPython : 7.23.1
pandas_datareader: 0.9.0
bs4 : None
bottleneck : None
fsspec : None
fastparquet : None
gcsfs : None
matplotlib : 3.3.4
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 4.0.0
pyxlsb : None
s3fs : None
scipy : 1.6.2
sqlalchemy : 1.4.15
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
numba : 0.53.1