BUG: ewma() weights incorrect when some values are missing (GH7543)

seth-p · jreback · commit 24b309f5a9bd · 2014-07-24T13:05:15.000-04:00
diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt
@@ -58,6 +58,18 @@ API changes
 
      rolling_min(s, window=10, min_periods=5)
 
+- :func:`ewma`, :func:`ewmstd`, :func:`ewmvar`, :func:`ewmcorr`, and :func:`ewmcov`
+  now have an optional ``ignore_na`` argument.
+  When ``ignore_na=False`` (the default), missing values are taken into account in the weights calculation.
+  When ``ignore_na=True`` (which reproduces the pre-0.15.0 behavior), missing values are ignored in the weights calculation.
+  (:issue:`7543`)
+
+  .. ipython:: python
+
+     ewma(Series([None, 1., 100.]), com=2.5)
+     ewma(Series([1., None, 100.]), com=2.5, ignore_na=True) # pre-0.15.0 behavior
+     ewma(Series([1., None, 100.]), com=2.5, ignore_na=False) # default
+
 - Bug in passing a ``DatetimeIndex`` with a timezone that was not being retained in DataFrame construction from a dict (:issue:`7822`)
 
   In prior versions this would drop the timezone.
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
@@ -979,14 +979,16 @@ def roll_mean(ndarray[double_t] input,
 #-------------------------------------------------------------------------------
 # Exponentially weighted moving average
 
-def ewma(ndarray[double_t] input, double_t com, int adjust):
+def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na):
     '''
     Compute exponentially-weighted moving average using center-of-mass.
 
     Parameters
     ----------
     input : ndarray (float64 type)
     com : float64
+    adjust : int
+    ignore_na : int
 
     Returns
     -------
@@ -1002,37 +1004,27 @@ def ewma(ndarray[double_t] input, double_t com, int adjust):
     if N == 0:
         return output
 
-    neww = 1. / (1. + com)
-    oldw = 1. - neww
-    adj = oldw
+    alpha = 1. / (1. + com)
+    old_wt_factor = 1. - alpha
+    new_wt = 1.0 if adjust else alpha
 
-    if adjust:
-        output[0] = neww * input[0]
-    else:
-        output[0] = input[0]
+    output[0] = input[0]
+    weighted_avg = output[0]
+    old_wt = 1.
 
     for i from 1 <= i < N:
         cur = input[i]
-        prev = output[i - 1]
-
-        if cur == cur:
-            if prev == prev:
-                output[i] = oldw * prev + neww * cur
-            else:
-                output[i] = neww * cur
+        if weighted_avg == weighted_avg:
+            if cur == cur:
+                old_wt *= old_wt_factor
+                weighted_avg = ((old_wt * weighted_avg) + (new_wt * cur)) / (old_wt + new_wt)
+                old_wt += new_wt
+            elif not ignore_na:
+                old_wt *= old_wt_factor
         else:
-            output[i] = prev
-
-    if adjust:
-        for i from 0 <= i < N:
-            cur = input[i]
+            weighted_avg = cur
 
-            if cur == cur:
-                output[i] = output[i] / (1. - adj)
-                adj *= oldw
-            else:
-                if i >= 1:
-                    output[i] = output[i - 1]
+        output[i] = weighted_avg
 
     return output
 
diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py
@@ -89,6 +89,9 @@
     imbalance in relative weightings (viewing EWMA as a moving average)
 how : string, default 'mean'
     Method for down- or re-sampling
+ignore_na : boolean, default False
+    Ignore missing values when calculating weights;
+    specify True to reproduce pre-0.15.0 behavior
 """
 
 _ewm_notes = r"""
@@ -420,12 +423,12 @@ def _get_center_of_mass(com, span, halflife):
               _type_of_input_retval, _ewm_notes)
 @Appender(_doc_template)
 def ewma(arg, com=None, span=None, halflife=None, min_periods=0, freq=None,
-         adjust=True, how=None):
+         adjust=True, how=None, ignore_na=False):
     com = _get_center_of_mass(com, span, halflife)
     arg = _conv_timerule(arg, freq, how)
 
     def _ewma(v):
-        result = algos.ewma(v, com, int(adjust))
+        result = algos.ewma(v, com, int(adjust), int(ignore_na))
         first_index = _first_valid_index(v)
         result[first_index: first_index + min_periods] = NaN
         return result
@@ -444,11 +447,11 @@ def _first_valid_index(arr):
               _ewm_kw+_bias_kw, _type_of_input_retval, _ewm_notes)
 @Appender(_doc_template)
 def ewmvar(arg, com=None, span=None, halflife=None, min_periods=0, bias=False,
-           freq=None, how=None):
+           freq=None, how=None, ignore_na=False):
     com = _get_center_of_mass(com, span, halflife)
     arg = _conv_timerule(arg, freq, how)
-    moment2nd = ewma(arg * arg, com=com, min_periods=min_periods)
-    moment1st = ewma(arg, com=com, min_periods=min_periods)
+    moment2nd = ewma(arg * arg, com=com, min_periods=min_periods, ignore_na=ignore_na)
+    moment1st = ewma(arg, com=com, min_periods=min_periods, ignore_na=ignore_na)
 
     result = moment2nd - moment1st ** 2
     if not bias:
@@ -460,9 +463,10 @@ def ewmvar(arg, com=None, span=None, halflife=None, min_periods=0, bias=False,
 @Substitution("Exponentially-weighted moving std", _unary_arg,
               _ewm_kw+_bias_kw, _type_of_input_retval, _ewm_notes)
 @Appender(_doc_template)
-def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False):
+def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False,
+           ignore_na=False):
     result = ewmvar(arg, com=com, span=span, halflife=halflife,
-                    min_periods=min_periods, bias=bias)
+                    min_periods=min_periods, bias=bias, ignore_na=ignore_na)
     return _zsqrt(result)
 
 ewmvol = ewmstd
@@ -472,7 +476,7 @@ def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False):
               _ewm_kw+_pairwise_kw, _type_of_input_retval, _ewm_notes)
 @Appender(_doc_template)
 def ewmcov(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0,
-           bias=False, freq=None, pairwise=None, how=None):
+           bias=False, freq=None, pairwise=None, how=None, ignore_na=False):
     if arg2 is None:
         arg2 = arg1
         pairwise = True if pairwise is None else pairwise
@@ -484,7 +488,8 @@ def ewmcov(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0,
     arg2 = _conv_timerule(arg2, freq, how)
 
     def _get_ewmcov(X, Y):
-        mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods)
+        mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods,
+                              ignore_na=ignore_na)
         return (mean(X * Y) - mean(X) * mean(Y))
     result = _flex_binary_moment(arg1, arg2, _get_ewmcov,
                                  pairwise=bool(pairwise))
@@ -499,7 +504,7 @@ def _get_ewmcov(X, Y):
               _ewm_kw+_pairwise_kw, _type_of_input_retval, _ewm_notes)
 @Appender(_doc_template)
 def ewmcorr(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0,
-            freq=None, pairwise=None, how=None):
+            freq=None, pairwise=None, how=None, ignore_na=False):
     if arg2 is None:
         arg2 = arg1
         pairwise = True if pairwise is None else pairwise
@@ -511,9 +516,10 @@ def ewmcorr(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0,
     arg2 = _conv_timerule(arg2, freq, how)
 
     def _get_ewmcorr(X, Y):
-        mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods)
+        mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods,
+                              ignore_na=ignore_na)
         var = lambda x: ewmvar(x, com=com, span=span, halflife=halflife, min_periods=min_periods,
-                               bias=True)
+                               bias=True, ignore_na=ignore_na)
         return (mean(X * Y) - mean(X) * mean(Y)) / _zsqrt(var(X) * var(Y))
     result = _flex_binary_moment(arg1, arg2, _get_ewmcorr,
                                  pairwise=bool(pairwise))
diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py
@@ -520,11 +520,64 @@ def test_ewma(self):
         result = mom.ewma(arr, span=100, adjust=False).sum()
         self.assertTrue(np.abs(result - 1) < 1e-2)
 
+        s = Series([1.0, 2.0, 4.0, 8.0])
+        
+        expected = Series([1.0, 1.6, 2.736842, 4.923077])
+        for f in [lambda s: mom.ewma(s, com=2.0, adjust=True),
+                  lambda s: mom.ewma(s, com=2.0, adjust=True, ignore_na=False),
+                  lambda s: mom.ewma(s, com=2.0, adjust=True, ignore_na=True),
+                 ]:
+            result = f(s)
+            assert_series_equal(result, expected)
+
+        expected = Series([1.0, 1.333333, 2.222222, 4.148148])
+        for f in [lambda s: mom.ewma(s, com=2.0, adjust=False),
+                  lambda s: mom.ewma(s, com=2.0, adjust=False, ignore_na=False),
+                  lambda s: mom.ewma(s, com=2.0, adjust=False, ignore_na=True),
+                 ]:
+            result = f(s)
+            assert_series_equal(result, expected)
+
     def test_ewma_nan_handling(self):
         s = Series([1.] + [np.nan] * 5 + [1.])
+        result = mom.ewma(s, com=5)
+        assert_almost_equal(result, [1.] * len(s))
 
+        s = Series([np.nan] * 2 + [1.] + [np.nan] * 2 + [1.])
         result = mom.ewma(s, com=5)
-        assert_almost_equal(result, [1] * len(s))
+        assert_almost_equal(result, [np.nan] * 2 + [1.] * 4)
+
+        # GH 7603
+        s0 = Series([np.nan, 1., 101.])
+        s1 = Series([1., np.nan, 101.])
+        s2 = Series([np.nan, 1., np.nan, np.nan, 101., np.nan])
+        com = 2.
+        alpha = 1. / (1. + com)
+
+        def simple_wma(s, w):
+            return (s.multiply(w).cumsum() / w.cumsum()).fillna(method='ffill')
+
+        for (s, adjust, ignore_na, w) in [
+                (s0, True, False, [np.nan, (1.0 - alpha), 1.]),
+                (s0, True, True, [np.nan, (1.0 - alpha), 1.]),
+                (s0, False, False, [np.nan, (1.0 - alpha), alpha]),
+                (s0, False, True, [np.nan, (1.0 - alpha), alpha]),
+                (s1, True, False, [(1.0 - alpha)**2, np.nan, 1.]),
+                (s1, True, True, [(1.0 - alpha), np.nan, 1.]),
+                (s1, False, False, [(1.0 - alpha)**2, np.nan, alpha]),
+                (s1, False, True, [(1.0 - alpha), np.nan, alpha]),
+                (s2, True, False, [np.nan, (1.0 - alpha)**3, np.nan, np.nan, 1., np.nan]),
+                (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1., np.nan]),
+                (s2, False, False, [np.nan, (1.0 - alpha)**3, np.nan, np.nan, alpha, np.nan]),
+                (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]),
+                ]:
+            expected = simple_wma(s, Series(w))
+            result = mom.ewma(s, com=com, adjust=adjust, ignore_na=ignore_na)
+            assert_series_equal(result, expected)
+            if ignore_na is False:
+                # check that ignore_na defaults to False
+                result = mom.ewma(s, com=com, adjust=adjust)
+                assert_series_equal(result, expected)
 
     def test_ewmvar(self):
         self._check_ew(mom.ewmvar)