diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 77d8cea4de507..ae716ddd38a5c 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1087,7 +1087,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, sum_wt = 1. sum_wt2 = 1. old_wt = 1. - + for i from 1 <= i < N: cur_x = input_x[i] cur_y = input_y[i] @@ -1117,7 +1117,7 @@ def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, elif is_observation: mean_x = cur_x mean_y = cur_y - + if nobs >= minp: if not bias: numerator = sum_wt * sum_wt @@ -1259,84 +1259,72 @@ def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): """ Numerically stable implementation using Welford's method. """ - cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta - cdef Py_ssize_t i - cdef Py_ssize_t N = len(input) + cdef double val, prev, mean_x = 0, ssqdm_x = 0, delta, rep = NaN, out + cdef Py_ssize_t nobs = 0, nrep = 0, i, N = len(input) cdef ndarray[double_t] output = np.empty(N, dtype=float) minp = _check_minp(win, minp, N) - # Check for windows larger than array, addresses #7297 - win = min(win, N) - - # Over the first window, observations can only be added, never removed - for i from 0 <= i < win: + for i from 0 <= i < N: val = input[i] - - # Not NaN + prev = NaN if i < win else input[i - win] + + # First, count the number of observations and consecutive repeats + if prev == prev: + # prev is not NaN, removing an observation... + if nobs == nrep: + # ...and removing a repeat + nrep -= 1 + if nrep == 0: + rep = NaN + nobs -= 1 if val == val: - nobs += 1 - delta = (val - mean_x) - mean_x += delta / nobs - ssqdm_x += delta * (val - mean_x) - - if (nobs >= minp) and (nobs > ddof): - #pathological case - if nobs == 1: - val = 0 + # next is not NaN, adding an observation... + if val == rep: + # ...and adding a repeat + nrep += 1 else: - val = ssqdm_x / (nobs - ddof) - if val < 0: - val = 0 - else: - val = NaN - - output[i] = val - - # After the first window, observations can both be added and removed - for i from win <= i < N: - val = input[i] - prev = input[i - win] + # ...and resetting repeats + nrep = 1 + rep = val + nobs += 1 - if val == val: + # Then, compute the new mean and sum of squared differences + if nobs == nrep: + # All non-NaN values in window are identical... + ssqdm_x = 0 + mean_x = rep if nobs > 0 else 0 + elif val == val: + # Adding one observation... if prev == prev: - # Adding one observation and removing another one + # ...and removing another delta = val - prev prev -= mean_x mean_x += delta / nobs val -= mean_x ssqdm_x += (val + prev) * delta else: - # Adding one observation and not removing any - nobs += 1 - delta = (val - mean_x) + # ...and not removing any + delta = val - mean_x mean_x += delta / nobs ssqdm_x += delta * (val - mean_x) elif prev == prev: # Adding no new observation, but removing one - nobs -= 1 - if nobs: - delta = (prev - mean_x) - mean_x -= delta / nobs - ssqdm_x -= delta * (prev - mean_x) - else: - mean_x = 0 - ssqdm_x = 0 + delta = prev - mean_x + mean_x -= delta / nobs + ssqdm_x -= delta * (prev - mean_x) # Variance is unchanged if no observation is added or removed - if (nobs >= minp) and (nobs > ddof): - #pathological case - if nobs == 1: - val = 0 - else: - val = ssqdm_x / (nobs - ddof) - if val < 0: - val = 0 + # Finally, compute and write the rolling variance to the output array + if nobs >= minp and nobs > ddof: + out = ssqdm_x / (nobs - ddof) + if out < 0: + out = 0 else: - val = NaN + out = NaN - output[i] = val + output[i] = out return output diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index 1d0be4ce48f4f..f17b7911cf086 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -7,9 +7,9 @@ import numpy as np from pandas import Series, DataFrame, Panel, bdate_range, isnull, notnull -from pandas.util.testing import ( - assert_almost_equal, assert_series_equal, assert_frame_equal, assert_panel_equal, assert_index_equal -) +from pandas.util.testing import (assert_almost_equal, assert_series_equal, + assert_frame_equal, assert_panel_equal, + assert_index_equal,) import pandas.core.datetools as datetools import pandas.stats.moments as mom import pandas.util.testing as tm @@ -524,7 +524,7 @@ def test_ewma(self): self.assertTrue(np.abs(result - 1) < 1e-2) s = Series([1.0, 2.0, 4.0, 8.0]) - + expected = Series([1.0, 1.6, 2.736842, 4.923077]) for f in [lambda s: mom.ewma(s, com=2.0, adjust=True), lambda s: mom.ewma(s, com=2.0, adjust=True, ignore_na=False), @@ -750,7 +750,7 @@ def _non_null_values(x): for (std, var, cov) in [(std_biased, var_biased, cov_biased), (std_unbiased, var_unbiased, cov_unbiased)]: - + # check that var(x), std(x), and cov(x) are all >= 0 var_x = var(x) std_x = std(x) @@ -762,7 +762,7 @@ def _non_null_values(x): # check that var(x) == cov(x, x) assert_equal(var_x, cov_x_x) - + # check that var(x) == std(x)^2 assert_equal(var_x, std_x * std_x) @@ -796,7 +796,7 @@ def _non_null_values(x): cov_x_y = cov(x, y) cov_y_x = cov(y, x) assert_equal(cov_x_y, cov_y_x) - + # check that cov(x, y) == (var(x+y) - var(x) - var(y)) / 2 var_x_plus_y = var(x + y) var_y = var(y) @@ -1007,7 +1007,7 @@ def test_rolling_consistency(self): expected.iloc[:, i, j] = rolling_f(x.iloc[:, i], x.iloc[:, j], window=window, min_periods=min_periods, center=center) assert_panel_equal(rolling_f_result, expected) - + # binary moments def test_rolling_cov(self): A = self.series @@ -1432,7 +1432,7 @@ def test_expanding_corr_pairwise_diff_length(self): assert_frame_equal(result2, expected) assert_frame_equal(result3, expected) assert_frame_equal(result4, expected) - + def test_pairwise_stats_column_names_order(self): # GH 7738 df1s = [DataFrame([[2,4],[1,2],[5,2],[8,1]], columns=[0,1]), @@ -1731,6 +1731,13 @@ def test_rolling_median_how_resample(self): x = mom.rolling_median(series, window=1, freq='D') assert_series_equal(expected, x) + def test_rolling_var_exactly_zero(self): + # Related to GH 7900 + a = np.array([1, 2, 3, 3, 3, 2, 2, 2, np.nan, 2, np.nan, 2, np.nan, + np.nan, np.nan, 1, 1, 1]) + v = mom.rolling_var(a, window=3, min_periods=2) + self.assertTrue(np.all(v[[4, 7, 8, 9, 11, 16, 17]] == 0)) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],