BUG: treat min_periods=0 as 1 in moving window functions, GH #365

wesm · wesm · commit b1fb3bc9ac49 · 2011-11-16T14:58:06.000-05:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -127,6 +127,7 @@ pandas 0.5.1
   - setupegg.py will invoke Cython (GH #192)
   - Fix block consolidation bug after inserting column into MultiIndex (GH #366)
   - Fix bug in join operations between Index and Int64Index (GH #367)
+  - Treat min_periods=0 as min_periods=1 in moving window functions (GH #365)
 
 Thanks
 ------
@@ -135,6 +136,7 @@ Thanks
 - Joel Cross
 - Jeff Hammerbacher
 - Adam Klein
+- Thomas Kluyver
 - Jev Kuznetsov
 - Kieran O'Mahony
 - Wouter Overmeire
diff --git a/pandas/src/moments.pyx b/pandas/src/moments.pyx
@@ -25,7 +25,7 @@
 #               Series: Prentice-Hall Series in Automatic Computation
 
 
-def kth_smallest(ndarray[double_t, ndim=1] a, Py_ssize_t k):
+def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
     cdef:
         Py_ssize_t i,j,l,m,n
         double_t x, t
@@ -82,8 +82,7 @@ def roll_sum(ndarray[double_t] input, int win, int minp):
 
     cdef ndarray[double_t] output = np.empty(N, dtype=float)
 
-    if minp > N:
-        minp = N + 1
+    minp = _check_minp(minp, N)
 
     for i from 0 <= i < minp - 1:
         val = input[i]
@@ -126,8 +125,7 @@ def roll_mean(ndarray[double_t] input,
 
     cdef ndarray[double_t] output = np.empty(N, dtype=float)
 
-    if minp > N:
-        minp = N + 1
+    minp = _check_minp(minp, N)
 
     for i from 0 <= i < minp - 1:
         val = input[i]
@@ -213,15 +211,23 @@ def ewma(ndarray[double_t] input, double_t com):
 #-------------------------------------------------------------------------------
 # Rolling variance
 
+def _check_minp(minp, N):
+    if minp > N:
+        minp = N + 1
+    elif minp == 0:
+        minp = 1
+    elif minp < 0:
+        raise ValueError('min_periods must be >= 0')
+    return minp
+
 def roll_var(ndarray[double_t] input, int win, int minp):
     cdef double val, prev, sum_x = 0, sum_xx = 0, nobs = 0
     cdef Py_ssize_t i
     cdef Py_ssize_t N = len(input)
 
     cdef ndarray[double_t] output = np.empty(N, dtype=float)
 
-    if minp > N:
-        minp = N + 1
+    minp = _check_minp(minp, N)
 
     for i from 0 <= i < minp - 1:
         val = input[i]
@@ -270,8 +276,7 @@ def roll_skew(ndarray[double_t] input, int win, int minp):
     # 3 components of the skewness equation
     cdef double A, B, C, R
 
-    if minp > N:
-        minp = N + 1
+    minp = _check_minp(minp, N)
 
     for i from 0 <= i < minp - 1:
         val = input[i]
@@ -333,8 +338,7 @@ def roll_kurt(ndarray[double_t] input,
     # 5 components of the kurtosis equation
     cdef double A, B, C, D, R, K
 
-    if minp > N:
-        minp = N + 1
+    minp = _check_minp(minp, N)
 
     for i from 0 <= i < minp - 1:
         val = input[i]
@@ -405,8 +409,7 @@ cdef _roll_skiplist_op(ndarray arg, int win, int minp, skiplist_f op):
 
     skiplist = IndexableSkiplist(win)
 
-    if minp > N:
-        minp = N + 1
+    minp = _check_minp(minp, N)
 
     for i from 0 <= i < minp - 1:
         val = input[i]
@@ -484,51 +487,50 @@ cdef double_t _get_min(object skiplist, int nobs, int minp):
 
 def roll_quantile(ndarray[float64_t, cast=True] input, int win,
                   int minp, double quantile):
-   '''
-   O(N log(window)) implementation using skip list
-   '''
-   cdef double val, prev, midpoint
-   cdef IndexableSkiplist skiplist
-   cdef Py_ssize_t nobs = 0, i
-   cdef Py_ssize_t N = len(input)
-   cdef ndarray[double_t] output = np.empty(N, dtype=float)
+    '''
+    O(N log(window)) implementation using skip list
+    '''
+    cdef double val, prev, midpoint
+    cdef IndexableSkiplist skiplist
+    cdef Py_ssize_t nobs = 0, i
+    cdef Py_ssize_t N = len(input)
+    cdef ndarray[double_t] output = np.empty(N, dtype=float)
 
-   skiplist = IndexableSkiplist(win)
+    skiplist = IndexableSkiplist(win)
 
-   if minp > N:
-       minp = N + 1
+    minp = _check_minp(minp, N)
 
-   for i from 0 <= i < minp - 1:
-       val = input[i]
+    for i from 0 <= i < minp - 1:
+        val = input[i]
 
-       # Not NaN
-       if val == val:
-           nobs += 1
-           skiplist.insert(val)
+        # Not NaN
+        if val == val:
+            nobs += 1
+            skiplist.insert(val)
 
-       output[i] = NaN
+        output[i] = NaN
 
-   for i from minp - 1 <= i < N:
-       val = input[i]
+    for i from minp - 1 <= i < N:
+        val = input[i]
 
-       if i > win - 1:
-           prev = input[i - win]
+        if i > win - 1:
+            prev = input[i - win]
 
-           if prev == prev:
-               skiplist.remove(prev)
-               nobs -= 1
+            if prev == prev:
+                skiplist.remove(prev)
+                nobs -= 1
 
-       if val == val:
-           nobs += 1
-           skiplist.insert(val)
+        if val == val:
+            nobs += 1
+            skiplist.insert(val)
 
-       if nobs >= minp:
-           idx = int((quantile / 1.) * (nobs - 1))
-           output[i] = skiplist.get(idx)
-       else:
-           output[i] = NaN
+        if nobs >= minp:
+            idx = int((quantile / 1.) * (nobs - 1))
+            output[i] = skiplist.get(idx)
+        else:
+            output[i] = NaN
 
-   return output
+    return output
 
 def roll_generic(ndarray[float64_t, cast=True] input, int win,
                  int minp, object func):
@@ -542,6 +544,7 @@ def roll_generic(ndarray[float64_t, cast=True] input, int win,
     buf = <float64_t*> input.data
 
     n = len(input)
+    minp = _check_minp(minp, n)
     output = np.empty(n, dtype=float)
     counts = roll_sum(np.isfinite(input).astype(float), win, minp)
 
diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py
@@ -291,11 +291,13 @@ def _conv_timerule(arg, time_rule):
 
     return arg
 
-def _two_periods(minp, window):
-    if minp is None:
-        return window
-    else:
-        return max(2, minp)
+def _require_min_periods(p):
+    def _check_func(minp, window):
+        if minp is None:
+            return window
+        else:
+            return max(p, minp)
+    return _check_func
 
 def _use_window(minp, window):
     if minp is None:
@@ -324,13 +326,13 @@ def call_cython(arg, window, minp):
 
 _ts_std = lambda *a, **kw: np.sqrt(_tseries.roll_var(*a, **kw))
 rolling_std = _rolling_func(_ts_std, 'Unbiased moving standard deviation',
-                            check_minp=_two_periods)
+                            check_minp=_require_min_periods(2))
 rolling_var = _rolling_func(_tseries.roll_var, 'Unbiased moving variance',
-                            check_minp=_two_periods)
+                            check_minp=_require_min_periods(2))
 rolling_skew = _rolling_func(_tseries.roll_skew, 'Unbiased moving skewness',
-                             check_minp=_two_periods)
+                             check_minp=_require_min_periods(3))
 rolling_kurt = _rolling_func(_tseries.roll_kurt, 'Unbiased moving kurtosis',
-                             check_minp=_two_periods)
+                             check_minp=_require_min_periods(4))
 
 def rolling_quantile(arg, window, quantile, min_periods=None, time_rule=None):
     """Moving quantile
diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py
@@ -142,6 +142,11 @@ def _check_ndarray(self, func, static_comp, window=50,
 
             self.assert_(not np.isnan(result[-6]))
             self.assert_(np.isnan(result[-5]))
+
+            # min_periods=0 must give the same result as min_periods=1
+            result0 = func(arr, 20, min_periods=0)
+            result1 = func(arr, 20, min_periods=1)
+            assert_almost_equal(result0, result1)
         else:
             result = func(arr, 50)
             assert_almost_equal(result[-1], static_comp(arr[10:-10]))