ENH: min_periods for corr/cov #2002 and TST tweak to use better sys.stderr idiom 5645be2

Chang She · Chang She · commit 8d277e3107b4 · 2012-11-24T16:31:41.000-05:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -31,6 +31,9 @@ pandas 0.10.0
 
   - Add error handling to Series.str.encode/decode (#2276)
   - Add ``where`` and ``mask`` to Series (#2337)
+  - Grouped histogram via `by` keyword in Series/DataFrame.hist (#2186)
+  - Support optional ``min_periods`` keyword in ``corr`` and ``cov``
+    for both Series and DataFrame (#2002)
 
 **API Changes**
 
@@ -42,7 +45,6 @@ pandas 0.10.0
 
 **Improvements to existing features**
 
-  - Grouped histogram via `by` keyword in Series/DataFrame.hist (#2186)
   - Add ``nrows`` option to DataFrame.from_records for iterators (#1794)
   - Unstack/reshape algorithm rewrite to avoid high memory use in cases where
     the number of observed key-tuples is much smaller than the total possible
diff --git a/doc/source/computation.rst b/doc/source/computation.rst
@@ -62,6 +62,21 @@ among the series in the DataFrame, also excluding NA/null values.
    frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e'])
    frame.cov()
 
+``DataFrame.cov`` also supports an optional ``min_periods`` keyword that
+specifies the required minimum number of observations for each column pair
+in order to have a valid result.
+
+.. ipython:: python
+
+   frame = DataFrame(randn(20, 3), columns=['a', 'b', 'c'])
+   frame.ix[:5, 'a'] = np.nan
+   frame.ix[5:10, 'b'] = np.nan
+
+   frame.cov()
+
+   frame.cov(min_periods=12)
+
+
 .. _computation.correlation:
 
 Correlation
@@ -97,6 +112,19 @@ All of these are currently computed using pairwise complete observations.
 Note that non-numeric columns will be automatically excluded from the
 correlation calculation.
 
+Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword:
+
+.. ipython:: python
+
+   frame = DataFrame(randn(20, 3), columns=['a', 'b', 'c'])
+   frame.ix[:5, 'a'] = np.nan
+   frame.ix[5:10, 'b'] = np.nan
+
+   frame.corr()
+
+   frame.corr(min_periods=12)
+
+
 A related method ``corrwith`` is implemented on DataFrame to compute the
 correlation between like-labeled Series contained in different DataFrame
 objects.
@@ -290,9 +318,9 @@ columns using ``ix`` indexing:
 
 Expanding window moment functions
 ---------------------------------
-A common alternative to rolling statistics is to use an *expanding* window, 
-which yields the value of the statistic with all the data available up to that 
-point in time. As these calculations are a special case of rolling statistics, 
+A common alternative to rolling statistics is to use an *expanding* window,
+which yields the value of the statistic with all the data available up to that
+point in time. As these calculations are a special case of rolling statistics,
 they are implemented in pandas such that the following two calls are equivalent:
 
 .. ipython:: python
@@ -301,7 +329,7 @@ they are implemented in pandas such that the following two calls are equivalent:
 
    expanding_mean(df)[:5]
 
-Like the ``rolling_`` functions, the following methods are included in the 
+Like the ``rolling_`` functions, the following methods are included in the
 ``pandas`` namespace or can be located in ``pandas.stats.moments``.
 
 .. csv-table::
@@ -324,12 +352,12 @@ Like the ``rolling_`` functions, the following methods are included in the
     ``expanding_corr``, Correlation (binary)
     ``expanding_corr_pairwise``, Pairwise correlation of DataFrame columns
 
-Aside from not having a ``window`` parameter, these functions have the same 
-interfaces as their ``rolling_`` counterpart. Like above, the parameters they 
+Aside from not having a ``window`` parameter, these functions have the same
+interfaces as their ``rolling_`` counterpart. Like above, the parameters they
 all accept are:
 
-  - ``min_periods``: threshold of non-null data points to require. Defaults to 
-    minimum needed to compute statistic. No ``NaNs`` will be output once 
+  - ``min_periods``: threshold of non-null data points to require. Defaults to
+    minimum needed to compute statistic. No ``NaNs`` will be output once
     ``min_periods`` non-null data points have been seen.
   - ``freq``: optionally specify a :ref:`frequency string <timeseries.alias>`
     or :ref:`DateOffset <timeseries.offsets>` to pre-conform the data to.
@@ -338,15 +366,15 @@ all accept are:
 
 .. note::
 
-   The output of the ``rolling_`` and ``expanding_`` functions do not return a 
-   ``NaN`` if there are at least ``min_periods`` non-null values in the current 
-   window. This differs from ``cumsum``, ``cumprod``, ``cummax``, and 
-   ``cummin``, which return ``NaN`` in the output wherever a ``NaN`` is 
+   The output of the ``rolling_`` and ``expanding_`` functions does not contain a
+   ``NaN`` if there are at least ``min_periods`` non-null values in the current
+   window. This differs from ``cumsum``, ``cumprod``, ``cummax``, and
+   ``cummin``, which return ``NaN`` in the output wherever a ``NaN`` is
    encountered in the input.
 
-An expanding window statistic will be more stable (and less responsive) than 
-its rolling window counterpart as the increasing window size decreases the 
-relative impact of an individual data point. As an example, here is the 
+An expanding window statistic will be more stable (and less responsive) than
+its rolling window counterpart as the increasing window size decreases the
+relative impact of an individual data point. As an example, here is the
 ``expanding_mean`` output for the previous time series dataset:
 
 .. ipython:: python
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4241,7 +4241,7 @@ def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
     #----------------------------------------------------------------------
     # Statistical methods, etc.
 
-    def corr(self, method='pearson'):
+    def corr(self, method='pearson', min_periods=None):
         """
         Compute pairwise correlation of columns, excluding NA/null values
 
@@ -4251,6 +4251,10 @@ def corr(self, method='pearson'):
             pearson : standard correlation coefficient
             kendall : Kendall Tau correlation coefficient
             spearman : Spearman rank correlation
+        min_periods : int, optional
+            Minimum number of observations required per pair of columns
+            to have a valid result. Pairs with fewer valid observations
+            yield NA
 
         Returns
         -------
@@ -4261,8 +4265,10 @@ def corr(self, method='pearson'):
         mat = numeric_df.values
 
         if method == 'pearson':
-            correl = lib.nancorr(com._ensure_float64(mat))
+            correl = lib.nancorr(com._ensure_float64(mat), minp=min_periods)
         else:
+            if min_periods is None:
+                min_periods = 1
             mat = mat.T
             corrf = nanops.get_corr_func(method)
             K = len(cols)
@@ -4271,7 +4277,7 @@ def corr(self, method='pearson'):
             for i, ac in enumerate(mat):
                 for j, bc in enumerate(mat):
                     valid = mask[i] & mask[j]
-                    if not valid.any():
+                    if valid.sum() < min_periods:
                         c = NA
                     elif not valid.all():
                         c = corrf(ac[valid], bc[valid])
@@ -4282,10 +4288,16 @@ def corr(self, method='pearson'):
 
         return self._constructor(correl, index=cols, columns=cols)
 
-    def cov(self):
+    def cov(self, min_periods=None):
         """
         Compute pairwise covariance of columns, excluding NA/null values
 
+        Parameters
+        ----------
+        min_periods : int, optional
+            Minimum number of observations required per pair of columns
+            to have a valid result.
+
         Returns
         -------
         y : DataFrame
@@ -4298,9 +4310,14 @@ def cov(self):
         mat = numeric_df.values
 
         if notnull(mat).all():
-            baseCov = np.cov(mat.T)
+            if min_periods is not None and min_periods > len(mat):
+                baseCov = np.empty((mat.shape[1], mat.shape[1]))
+                baseCov.fill(np.nan)
+            else:
+                baseCov = np.cov(mat.T)
         else:
-            baseCov = lib.nancorr(com._ensure_float64(mat), cov=True)
+            baseCov = lib.nancorr(com._ensure_float64(mat), cov=True,
+                                  minp=min_periods)
 
         return self._constructor(baseCov, index=cols, columns=cols)
 
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -384,19 +384,22 @@ def _zero_out_fperr(arg):
         return 0 if np.abs(arg) < 1e-14 else arg
 
 
-def nancorr(a, b, method='pearson'):
+def nancorr(a, b, method='pearson', min_periods=None):
     """
     a, b: ndarrays
     """
     if len(a) != len(b):
         raise AssertionError('Operands to nancorr must have same size')
 
+    if min_periods is None:
+        min_periods = 1
+
     valid = notnull(a) & notnull(b)
     if not valid.all():
         a = a[valid]
         b = b[valid]
 
-    if len(a) == 0:
+    if len(a) < min_periods:
         return np.nan
 
     f = get_corr_func(method)
@@ -427,16 +430,19 @@ def _spearman(a, b):
     return _cor_methods[method]
 
 
-def nancov(a, b):
+def nancov(a, b, min_periods=None):
     if len(a) != len(b):
         raise AssertionError('Operands to nancov must have same size')
 
+    if min_periods is None:
+        min_periods = 1
+
     valid = notnull(a) & notnull(b)
     if not valid.all():
         a = a[valid]
         b = b[valid]
 
-    if len(a) == 0:
+    if len(a) < min_periods:
         return np.nan
 
     return np.cov(a, b)[0, 1]
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1529,7 +1529,8 @@ def pretty_name(x):
 
         return Series(data, index=names)
 
-    def corr(self, other, method='pearson'):
+    def corr(self, other, method='pearson',
+             min_periods=None):
         """
         Compute correlation two Series, excluding missing values
 
@@ -1540,21 +1541,29 @@ def corr(self, other, method='pearson'):
             pearson : standard correlation coefficient
             kendall : Kendall Tau correlation coefficient
             spearman : Spearman rank correlation
+        min_periods : int, optional
+            Minimum number of observations needed to have a valid result
+
 
         Returns
         -------
         correlation : float
         """
         this, other = self.align(other, join='inner', copy=False)
-        return nanops.nancorr(this.values, other.values, method=method)
+        if len(this) == 0:
+            return np.nan
+        return nanops.nancorr(this.values, other.values, method=method,
+                              min_periods=min_periods)
 
-    def cov(self, other):
+    def cov(self, other, min_periods=None):
         """
         Compute covariance with Series, excluding missing values
 
         Parameters
         ----------
         other : Series
+        min_periods : int, optional
+            Minimum number of observations needed to have a valid result
 
         Returns
         -------
@@ -1565,7 +1574,8 @@ def cov(self, other):
         this, other = self.align(other, join='inner')
         if len(this) == 0:
             return np.nan
-        return nanops.nancov(this.values, other.values)
+        return nanops.nancov(this.values, other.values,
+                             min_periods=min_periods)
 
     def diff(self, periods=1):
         """
diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py
@@ -822,9 +822,12 @@ def test_sparse_to_dense(self):
     def test_sparse_series_ops(self):
         import sys
         buf = StringIO()
+        tmp = sys.stderr
         sys.stderr = buf
-        self._check_all(self._check_frame_ops)
-        sys.stderr = sys.__stderr__
+        try:
+            self._check_all(self._check_frame_ops)
+        finally:
+            sys.stderr = tmp
 
     def _check_frame_ops(self, frame):
         fill = frame.default_fill_value
diff --git a/pandas/src/moments.pyx b/pandas/src/moments.pyx
@@ -300,7 +300,7 @@ def ewma(ndarray[double_t] input, double_t com, int adjust):
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def nancorr(ndarray[float64_t, ndim=2] mat, cov=False):
+def nancorr(ndarray[float64_t, ndim=2] mat, cov=False, minp=None):
     cdef:
         Py_ssize_t i, j, xi, yi, N, K
         ndarray[float64_t, ndim=2] result
@@ -310,6 +310,9 @@ def nancorr(ndarray[float64_t, ndim=2] mat, cov=False):
 
     N, K = (<object> mat).shape
 
+    if minp is None:
+        minp = 1
+
     result = np.empty((K, K), dtype=np.float64)
     mask = np.isfinite(mat).view(np.uint8)
 
@@ -324,7 +327,7 @@ def nancorr(ndarray[float64_t, ndim=2] mat, cov=False):
                     sumx += vx
                     sumy += vy
 
-            if nobs == 0:
+            if nobs < minp:
                 result[xi, yi] = result[yi, xi] = np.NaN
             else:
                 meanx = sumx / nobs
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py