From 0192f539bf0f7eb779403eaf3089b9f42627a02e Mon Sep 17 00:00:00 2001 From: seth-p Date: Fri, 1 Aug 2014 14:57:02 -0400 Subject: [PATCH] BUG: ewm*() interpretation of min_periods is off by one --- doc/source/v0.15.0.txt | 7 +++- pandas/stats/moments.py | 5 ++- pandas/stats/tests/test_moments.py | 62 +++++++++++++++++++++++++++--- 3 files changed, 66 insertions(+), 8 deletions(-) diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index e5ba8efd25b02..b27a6ff9adbc1 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -68,7 +68,7 @@ API changes rolling_min(s, window=10, min_periods=5) -- :func:`ewma`, :func:`ewmastd`, :func:`ewmavar`, :func:`ewmacorr`, and :func:`ewmacov` +- :func:`ewma`, :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcorr`, and :func:`ewmcov` now have an optional ``ignore_na`` argument. When ``ignore_na=False`` (the default), missing values are taken into account in the weights calculation. When ``ignore_na=True`` (which reproduces the pre-0.15.0 behavior), missing values are ignored in the weights calculation. @@ -80,6 +80,11 @@ API changes ewma(Series([1., None, 100.]), com=2.5, ignore_na=True) # pre-0.15.0 behavior ewma(Series([1., None, 100.]), com=2.5, ignore_na=False) # default +- :func:`ewma`, :func:`ewmstd`, :func:`ewmvol`, :func:`ewmvar`, :func:`ewmcorr`, and :func:`ewmcov` + now set to ``NaN`` the first ``min_periods-1`` entries of the result (for ``min_periods>1``). + Previously the first ``min_periods`` entries of the result were set to ``NaN``. + The new behavior accords with the existing documentation. (:issue:`7884`) + - Bug in passing a ``DatetimeIndex`` with a timezone that was not being retained in DataFrame construction from a dict (:issue:`7822`) In prior versions this would drop the timezone. 
diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 646e20acae3f9..44b8bbd0c9078 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -463,8 +463,9 @@ def ewma(arg, com=None, span=None, halflife=None, min_periods=0, freq=None, def _ewma(v): result = algos.ewma(v, com, int(adjust), int(ignore_na)) - first_index = _first_valid_index(v) - result[first_index: first_index + min_periods] = NaN + if min_periods > 1: + first_index = _first_valid_index(v) + result[first_index: first_index + min_periods - 1] = NaN return result return_hook, values = _process_data_structure(arg) diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index 9f2dae3d7d9a3..ce7f9c8a225a8 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -629,8 +629,37 @@ def _check_ew_ndarray(self, func, preserve_nan=False): arr = randn(50) arr[:10] = np.NaN arr[-10:] = np.NaN - - # ??? check something + s = Series(arr) + + # check min_periods + # GH 7898 + result = func(s, 50, min_periods=2) + self.assertTrue(np.isnan(result.values[:11]).all()) + self.assertFalse(np.isnan(result.values[11:]).any()) + + for min_periods in (0, 1): + result = func(s, 50, min_periods=min_periods) + if func == mom.ewma: + self.assertTrue(np.isnan(result.values[:10]).all()) + self.assertFalse(np.isnan(result.values[10:]).any()) + else: + # ewmstd, ewmvol, ewmvar *should* require at least two values, + # but currently require only one, for some reason + self.assertTrue(np.isnan(result.values[:10]).all()) + self.assertFalse(np.isnan(result.values[10:]).any()) + + # check series of length 0 + result = func(Series([]), 50, min_periods=min_periods) + assert_series_equal(result, Series([])) + + # check series of length 1 + result = func(Series([1.]), 50, min_periods=min_periods) + if func == mom.ewma: + assert_series_equal(result, Series([1.])) + else: + # ewmstd, ewmvol, ewmvar *should* require at least two values, + # so should return 
NaN, but currently require one, so return 0. + assert_series_equal(result, Series([0.])) # pass in ints result2 = func(np.arange(50), span=10) @@ -752,9 +781,32 @@ def _check_binary_ew(self, func): B[-10:] = np.NaN result = func(A, B, 20, min_periods=5) - - self.assertTrue(np.isnan(result.values[:15]).all()) - self.assertFalse(np.isnan(result.values[15:]).any()) + self.assertTrue(np.isnan(result.values[:14]).all()) + self.assertFalse(np.isnan(result.values[14:]).any()) + + # GH 7898 + for min_periods in (0, 1, 2): + result = func(A, B, 20, min_periods=min_periods) + # binary functions (ewmcov, ewmcorr) *should* require at least two values + if (func == mom.ewmcov) and (min_periods <= 1): + # currently ewmcov requires only one value, for some reason. + self.assertTrue(np.isnan(result.values[:10]).all()) + self.assertFalse(np.isnan(result.values[10:]).any()) + else: + self.assertTrue(np.isnan(result.values[:11]).all()) + self.assertFalse(np.isnan(result.values[11:]).any()) + + # check series of length 0 + result = func(Series([]), Series([]), 50, min_periods=min_periods) + assert_series_equal(result, Series([])) + + # check series of length 1 + result = func(Series([1.]), Series([1.]), 50, min_periods=min_periods) + if (func == mom.ewmcov) and (min_periods <= 1): + # currently ewmcov requires only one value, for some reason. + assert_series_equal(result, Series([0.])) + else: + assert_series_equal(result, Series([np.NaN])) self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5)