Skip to content

ENH: Ensure that rolling_var of identical values is exactly zero #8271

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 44 additions & 56 deletions pandas/algos.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1087,7 +1087,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
sum_wt = 1.
sum_wt2 = 1.
old_wt = 1.

for i from 1 <= i < N:
cur_x = input_x[i]
cur_y = input_y[i]
Expand Down Expand Up @@ -1117,7 +1117,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
elif is_observation:
mean_x = cur_x
mean_y = cur_y

if nobs >= minp:
if not bias:
numerator = sum_wt * sum_wt
Expand Down Expand Up @@ -1259,84 +1259,72 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1):
"""
Numerically stable implementation using Welford's method.
"""
cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta
cdef Py_ssize_t i
cdef Py_ssize_t N = len(input)
cdef double val, prev, mean_x = 0, ssqdm_x = 0, delta, rep = NaN, out
cdef Py_ssize_t nobs = 0, nrep = 0, i, N = len(input)

cdef ndarray[double_t] output = np.empty(N, dtype=float)

minp = _check_minp(win, minp, N)

# Check for windows larger than array, addresses #7297
win = min(win, N)

# Over the first window, observations can only be added, never removed
for i from 0 <= i < win:
for i from 0 <= i < N:
val = input[i]

# Not NaN
prev = NaN if i < win else input[i - win]

# First, count the number of observations and consecutive repeats
if prev == prev:
# prev is not NaN, removing an observation...
if nobs == nrep:
# ...and removing a repeat
nrep -= 1
if nrep == 0:
rep = NaN
nobs -= 1
if val == val:
nobs += 1
delta = (val - mean_x)
mean_x += delta / nobs
ssqdm_x += delta * (val - mean_x)

if (nobs >= minp) and (nobs > ddof):
#pathological case
if nobs == 1:
val = 0
# next is not NaN, adding an observation...
if val == rep:
# ...and adding a repeat
nrep += 1
else:
val = ssqdm_x / (nobs - ddof)
if val < 0:
val = 0
else:
val = NaN

output[i] = val

# After the first window, observations can both be added and removed
for i from win <= i < N:
val = input[i]
prev = input[i - win]
# ...and resetting repeats
nrep = 1
rep = val
nobs += 1

if val == val:
# Then, compute the new mean and sum of squared differences
if nobs == nrep:
# All non-NaN values in window are identical...
ssqdm_x = 0
mean_x = rep if nobs > 0 else 0
elif val == val:
# Adding one observation...
if prev == prev:
# Adding one observation and removing another one
# ...and removing another
delta = val - prev
prev -= mean_x
mean_x += delta / nobs
val -= mean_x
ssqdm_x += (val + prev) * delta
else:
# Adding one observation and not removing any
nobs += 1
delta = (val - mean_x)
# ...and not removing any
delta = val - mean_x
mean_x += delta / nobs
ssqdm_x += delta * (val - mean_x)
elif prev == prev:
# Adding no new observation, but removing one
nobs -= 1
if nobs:
delta = (prev - mean_x)
mean_x -= delta / nobs
ssqdm_x -= delta * (prev - mean_x)
else:
mean_x = 0
ssqdm_x = 0
delta = prev - mean_x
mean_x -= delta / nobs
ssqdm_x -= delta * (prev - mean_x)
# Variance is unchanged if no observation is added or removed

if (nobs >= minp) and (nobs > ddof):
#pathological case
if nobs == 1:
val = 0
else:
val = ssqdm_x / (nobs - ddof)
if val < 0:
val = 0
# Finally, compute and write the rolling variance to the output array
if nobs >= minp and nobs > ddof:
out = ssqdm_x / (nobs - ddof)
if out < 0:
out = 0
else:
val = NaN
out = NaN

output[i] = val
output[i] = out

return output

Expand Down
25 changes: 16 additions & 9 deletions pandas/stats/tests/test_moments.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
import numpy as np

from pandas import Series, DataFrame, Panel, bdate_range, isnull, notnull
from pandas.util.testing import (
assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, assert_index_equal
)
from pandas.util.testing import (assert_almost_equal, assert_series_equal,
assert_frame_equal, assert_panel_equal,
assert_index_equal,)
import pandas.core.datetools as datetools
import pandas.stats.moments as mom
import pandas.util.testing as tm
Expand Down Expand Up @@ -524,7 +524,7 @@ def test_ewma(self):
self.assertTrue(np.abs(result - 1) < 1e-2)

s = Series([1.0, 2.0, 4.0, 8.0])

expected = Series([1.0, 1.6, 2.736842, 4.923077])
for f in [lambda s: mom.ewma(s, com=2.0, adjust=True),
lambda s: mom.ewma(s, com=2.0, adjust=True, ignore_na=False),
Expand Down Expand Up @@ -750,7 +750,7 @@ def _non_null_values(x):

for (std, var, cov) in [(std_biased, var_biased, cov_biased),
(std_unbiased, var_unbiased, cov_unbiased)]:

# check that var(x), std(x), and cov(x) are all >= 0
var_x = var(x)
std_x = std(x)
Expand All @@ -762,7 +762,7 @@ def _non_null_values(x):

# check that var(x) == cov(x, x)
assert_equal(var_x, cov_x_x)

# check that var(x) == std(x)^2
assert_equal(var_x, std_x * std_x)

Expand Down Expand Up @@ -796,7 +796,7 @@ def _non_null_values(x):
cov_x_y = cov(x, y)
cov_y_x = cov(y, x)
assert_equal(cov_x_y, cov_y_x)

# check that cov(x, y) == (var(x+y) - var(x) - var(y)) / 2
var_x_plus_y = var(x + y)
var_y = var(y)
Expand Down Expand Up @@ -1007,7 +1007,7 @@ def test_rolling_consistency(self):
expected.iloc[:, i, j] = rolling_f(x.iloc[:, i], x.iloc[:, j],
window=window, min_periods=min_periods, center=center)
assert_panel_equal(rolling_f_result, expected)

# binary moments
def test_rolling_cov(self):
A = self.series
Expand Down Expand Up @@ -1432,7 +1432,7 @@ def test_expanding_corr_pairwise_diff_length(self):
assert_frame_equal(result2, expected)
assert_frame_equal(result3, expected)
assert_frame_equal(result4, expected)

def test_pairwise_stats_column_names_order(self):
# GH 7738
df1s = [DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0,1]),
Expand Down Expand Up @@ -1731,6 +1731,13 @@ def test_rolling_median_how_resample(self):
x = mom.rolling_median(series, window=1, freq='D')
assert_series_equal(expected, x)

def test_rolling_var_exactly_zero(self):
# Related to GH 7900
a = np.array([1, 2, 3, 3, 3, 2, 2, 2, np.nan, 2, np.nan, 2, np.nan,
np.nan, np.nan, 1, 1, 1])
v = mom.rolling_var(a, window=3, min_periods=2)
self.assertTrue(np.all(v[[4, 7, 8, 9, 11, 16, 17]] == 0))

if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down