ENH: add kendall/spearman correlation methods, GH #428

wesm · wesm · commit 113511e9c07a · 2011-12-03T12:11:04.000-05:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -31,14 +31,16 @@ pandas 0.6.1
 
   - Can pass Series to DataFrame.append with ignore_index=True for appending a
     single row (GH #430)
+  - Add Spearman and Kendall correlation options to Series.corr and
+    DataFrame.corr (GH #428)
 
 **Improvements to existing features**
   - Improve memory usage of `DataFrame.describe` (do not copy data
     unnecessarily) (PR #425)
   - Use same formatting function for outputting floating point Series to console
     as in DataFrame (PR #420)
   - DataFrame.delevel will try to infer better dtype for new columns (GH #440)
-  - Exclude non-numeric types in DataFrame.corr
+  - Exclude non-numeric types in DataFrame.{corr, cov}
 
 **Bug fixes**
 
@@ -52,7 +54,6 @@ pandas 0.6.1
   - Fix groupby exception raised with as_index=False and single column selected
     (GH #421)
 
-
 Thanks
 ------
 - Ralph Bean
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2563,26 +2563,36 @@ def _join_index(self, other, how, lsuffix, rsuffix):
     #----------------------------------------------------------------------
     # Statistical methods, etc.
 
-    def corr(self):
+    def corr(self, method='pearson'):
         """
         Compute pairwise correlation of columns, excluding NA/null values
 
+        Parameters
+        ----------
+        method : {'pearson', 'kendall', 'spearman'}, default 'pearson'
+            pearson : standard correlation coefficient
+            kendall : Kendall Tau correlation coefficient
+            spearman : Spearman rank correlation
+
         Returns
         -------
         y : DataFrame
         """
         cols = self._get_numeric_columns()
         mat = self.as_matrix(cols).T
-        baseCov = np.cov(mat)
-
-        sigma = np.sqrt(np.diag(baseCov))
-        correl = baseCov / np.outer(sigma, sigma)
-
-        # Get the covariance with items that have NaN values
-        for i, j, ac, bc in self._cov_helper(mat):
-            c = np.corrcoef(ac, bc)[0, 1]
-            correl[i, j] = c
-            correl[j, i] = c
+        corrf = nanops.get_corr_func(method)
+        K = len(cols)
+        correl = np.empty((K, K), dtype=float)
+        mask = np.isfinite(mat)
+        for i, ac in enumerate(mat):
+            # each pair is computed once (j <= i) and mirrored into both cells
+            for j, bc in enumerate(mat[:i + 1]):
+                valid = mask[i] & mask[j]
+                if not valid.all():
+                    c = corrf(ac[valid], bc[valid])
+                else:
+                    c = corrf(ac, bc)
+                correl[i, j] = correl[j, i] = c
 
         return self._constructor(correl, index=cols, columns=cols)
 
@@ -2594,7 +2604,7 @@ def cov(self):
         -------
         y : DataFrame
         """
-        cols = self.columns
+        cols = self._get_numeric_columns()
         mat = self.as_matrix(cols).T
         baseCov = np.cov(mat)
 
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -218,3 +218,48 @@ def _zero_out_fperr(arg):
         return np.where(np.abs(arg) < 1e-14, 0, arg)
     else:
         return 0 if np.abs(arg) < 1e-14 else arg
+
+def nancorr(a, b, method='pearson'):
+    """
+    Correlation of equal-length ndarrays a, b over the positions where both
+    are non-null, using `method`; NaN if no such positions remain.
+    """
+    assert(len(a) == len(b))
+
+    valid = notnull(a) & notnull(b)
+    if not valid.all():
+        a = a[valid]
+        b = b[valid]
+    # check emptiness after dropping nulls so all-null input also gives NaN
+    if len(a) == 0:
+        return np.nan
+    return get_corr_func(method)(a, b)
+
+def get_corr_func(method):
+    """
+    Return f(a, b) -> float computing the 'pearson', 'kendall' or
+    'spearman' correlation; raises KeyError for any other method name.
+    """
+    if method == 'pearson':
+        return lambda a, b: np.corrcoef(a, b)[0, 1]
+    if method not in ('kendall', 'spearman'):
+        raise KeyError('unknown correlation method: %r' % (method,))
+
+    # imported lazily: only the rank-based methods need scipy
+    from scipy.stats import kendalltau, spearmanr
+    rankf = kendalltau if method == 'kendall' else spearmanr
+
+    def _rank_corr(a, b):
+        return rankf(a, b)[0]
+    return _rank_corr
+
+def nancov(a, b):
+    """Covariance of a, b over positions where both are non-null (NaN if none)."""
+    assert(len(a) == len(b))
+    valid = notnull(a) & notnull(b)
+    if not valid.all():
+        a = a[valid]
+        b = b[valid]
+    if len(a) == 0:
+        return np.nan
+    return np.cov(a, b)[0, 1]
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -867,22 +867,24 @@ def describe(self):
 
         return Series(data, index=names)
 
-    def corr(self, other):
+    def corr(self, other, method='pearson'):
         """
         Compute correlation two Series, excluding missing values
 
         Parameters
         ----------
         other : Series
+        method : {'pearson', 'kendall', 'spearman'}, default 'pearson'
+            pearson : standard correlation coefficient
+            kendall : Kendall Tau correlation coefficient
+            spearman : Spearman rank correlation
 
         Returns
         -------
         correlation : float
         """
-        this, that = self._get_nonna_aligned(other)
-        if this is None or that is None:
-            return nan
-        return np.corrcoef(this, that)[0, 1]
+        this, other = self.align(other, join='inner')
+        return nanops.nancorr(this.values, other.values, method=method)
 
     def cov(self, other):
         """
@@ -896,23 +898,10 @@ def cov(self, other):
         -------
         covariance : float
         """
-        this, that = self._get_nonna_aligned(other)
-        if this is None or that is None:
-            return nan
-        return np.cov(this, that)[0, 1]
-
-    def _get_nonna_aligned(self, other):
-        """
-        Returns two sub-Series with the same index and only non-na values
-        """
-        commonIdx = self.dropna().index.intersection(other.dropna().index)
-
-        if len(commonIdx) == 0:
-            return None, None
-
-        this = self.reindex(commonIdx)
-        that = other.reindex(commonIdx)
-        return this, that
+        this, other = self.align(other, join='inner')
+        if len(this) == 0:
+            return np.nan
+        return nanops.nancov(this.values, other.values)
 
     def diff(self, periods=1):
         """
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -2075,10 +2075,14 @@ def test_corr(self):
         self.frame['A'][:5] = nan
         self.frame['B'][:10] = nan
 
-        correls = self.frame.corr()
+        def _check_method(method='pearson'):
+            correls = self.frame.corr(method=method)
+            exp = self.frame['A'].corr(self.frame['C'], method=method)
+            assert_almost_equal(correls['A']['C'], exp)
 
-        assert_almost_equal(correls['A']['C'],
-                            self.frame['A'].corr(self.frame['C']))
+        _check_method('pearson')
+        _check_method('kendall')
+        _check_method('spearman')
 
         # exclude non-numeric types
         result = self.mixed_frame.corr()
@@ -2093,6 +2097,11 @@ def test_cov(self):
         assert_almost_equal(cov['A']['C'],
                             self.frame['A'].cov(self.frame['C']))
 
+        # exclude non-numeric types
+        result = self.mixed_frame.cov()
+        expected = self.mixed_frame.ix[:, ['A', 'B', 'C', 'D']].cov()
+        assert_frame_equal(result, expected)
+
     def test_corrwith(self):
         a = self.tsframe
         noise = Series(randn(len(a)), index=a.index)
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
@@ -13,19 +13,7 @@
                                  assert_frame_equal)
 import pandas.core.common as com
 import pandas.util.testing as tm
-
-try:
-    from itertools import product as cart_product
-except ImportError:  # python 2.5
-    def cart_product(*args, **kwds):
-        # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy
-        # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111
-        pools = map(tuple, args) * kwds.get('repeat', 1)
-        result = [[]]
-        for pool in pools:
-            result = [x+[y] for x in result for y in pool]
-        for prod in result:
-            yield tuple(prod)
+from pandas.util.compat import product as cart_product
 
 class TestMultiLevel(unittest.TestCase):
 
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -879,6 +879,8 @@ def test_combine_first(self):
         assert_series_equal(s, result)
 
     def test_corr(self):
+        import scipy.stats as stats
+
         # full overlap
         self.assertAlmostEqual(self.ts.corr(self.ts), 1)
 
@@ -888,7 +890,38 @@ def test_corr(self):
         # No overlap
         self.assert_(np.isnan(self.ts[::2].corr(self.ts[1::2])))
 
-        # additional checks?
+        A = tm.makeTimeSeries()
+        B = tm.makeTimeSeries()
+        result = A.corr(B)
+        expected, _ = stats.pearsonr(A, B)
+        self.assertAlmostEqual(result, expected)
+
+    def test_corr_rank(self):
+        import scipy.stats as stats
+        # kendall and spearman: Series.corr must match scipy.stats directly
+        # NOTE(review): the slice assignment presumably creates ties -- confirm
+        A = tm.makeTimeSeries()
+        B = tm.makeTimeSeries()
+        A[-5:] = A[:5]
+        result = A.corr(B, method='kendall')
+        expected = stats.kendalltau(A, B)[0]
+        self.assertAlmostEqual(result, expected)
+
+        result = A.corr(B, method='spearman')
+        expected = stats.spearmanr(A, B)[0]
+        self.assertAlmostEqual(result, expected)
+
+        # fixed inputs; the expected coefficients below are results from R
+        A = Series([-0.89926396,  0.94209606, -1.03289164, -0.95445587,
+                    0.76910310, -0.06430576, -2.09704447, 0.40660407,
+                    -0.89926396,  0.94209606])
+        B = Series([-1.01270225, -0.62210117, -1.56895827,  0.59592943,
+                    -0.01680292,  1.17258718, -1.06009347, -0.10222060,
+                    -0.89076239,  0.89372375])
+        kexp = 0.4319297
+        sexp = 0.5853767
+        self.assertAlmostEqual(A.corr(B, method='kendall'), kexp)
+        self.assertAlmostEqual(A.corr(B, method='spearman'), sexp)
 
     def test_cov(self):
         # full overlap
diff --git a/pandas/util/compat.py b/pandas/util/compat.py
@@ -0,0 +1,14 @@
+# Compatibility shim: itertools.product is not available in Python 2.5
+
+try:
+    from itertools import product
+except ImportError:  # python 2.5
+    def product(*args, **kwds):
+        # Cartesian product of the input iterables, yielded as tuples, e.g.
+        # product('AB', 'xy') --> Ax Ay Bx By; `repeat` repeats the inputs
+        pools = [tuple(arg) for arg in args] * kwds.get('repeat', 1)
+        combos = [[]]
+        for pool in pools:
+            combos = [head + [item] for head in combos for item in pool]
+        for combo in combos:
+            yield tuple(combo)