ENH: Allow parameters method and min_periods in DataFrame.corrwith() (pandas-dev#9490)

anthonyho · anthonyho · commit d7e03eb11c68 · 2017-03-04T15:07:39.000-08:00
diff --git a/doc/source/computation.rst b/doc/source/computation.rst
@@ -157,6 +157,7 @@ objects.
    df2 = pd.DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns)
    df1.corrwith(df2)
    df2.corrwith(df1, axis=1)
+   df2.corrwith(df1, axis=1, method='kendall')
 
 .. _computation.ranking:
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -227,6 +227,7 @@ Other enhancements
 - ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`)
 - ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs <categorical.union>` for more information.
 - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
+- ``pd.DataFrame.corrwith()`` now accepts ``method`` and ``min_periods`` as optional arguments, as in ``pd.DataFrame.corr()`` and ``pd.Series.corr()`` (:issue:`9490`)
 
 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4819,7 +4819,8 @@ def cov(self, min_periods=None):
 
         return self._constructor(baseCov, index=idx, columns=cols)
 
-    def corrwith(self, other, axis=0, drop=False):
+    def corrwith(self, other, axis=0, drop=False, method='pearson',
+                 min_periods=1):
         """
         Compute pairwise correlation between rows or columns of two DataFrame
         objects.
@@ -4831,36 +4832,41 @@ def corrwith(self, other, axis=0, drop=False):
             0 or 'index' to compute column-wise, 1 or 'columns' for row-wise
         drop : boolean, default False
             Drop missing indices from result, default returns union of all
+        method : {'pearson', 'kendall', 'spearman'}
+            * pearson : standard correlation coefficient
+            * kendall : Kendall Tau correlation coefficient
+            * spearman : Spearman rank correlation
+
+            .. versionadded:: 0.20.0
+
+        min_periods : int, optional
+            Minimum number of observations needed to have a valid result
+
+            .. versionadded:: 0.20.0
 
         Returns
         -------
         correls : Series
         """
         axis = self._get_axis_number(axis)
         if isinstance(other, Series):
-            return self.apply(other.corr, axis=axis)
+            return self.apply(other.corr, axis=axis,
+                              method=method, min_periods=min_periods)
 
         this = self._get_numeric_data()
         other = other._get_numeric_data()
 
         left, right = this.align(other, join='inner', copy=False)
 
-        # mask missing values
-        left = left + right * 0
-        right = right + left * 0
-
         if axis == 1:
             left = left.T
             right = right.T
 
-        # demeaned data
-        ldem = left - left.mean()
-        rdem = right - right.mean()
-
-        num = (ldem * rdem).sum()
-        dom = (left.count() - 1) * left.std() * right.std()
-
-        correl = num / dom
+        correl = Series({col: nanops.nancorr(left[col].values,
+                                             right[col].values,
+                                             method=method,
+                                             min_periods=min_periods)
+                         for col in left.columns})
 
         if not drop:
             raxis = 1 if axis == 0 else 0
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -181,28 +181,33 @@ def test_corrwith(self):
         b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:])
         del b['B']
 
-        colcorr = a.corrwith(b, axis=0)
-        tm.assert_almost_equal(colcorr['A'], a['A'].corr(b['A']))
-
-        rowcorr = a.corrwith(b, axis=1)
-        tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))
-
-        dropped = a.corrwith(b, axis=0, drop=True)
-        tm.assert_almost_equal(dropped['A'], a['A'].corr(b['A']))
-        self.assertNotIn('B', dropped)
-
-        dropped = a.corrwith(b, axis=1, drop=True)
-        self.assertNotIn(a.index[-1], dropped.index)
-
-        # non time-series data
-        index = ['a', 'b', 'c', 'd', 'e']
-        columns = ['one', 'two', 'three', 'four']
-        df1 = DataFrame(randn(5, 4), index=index, columns=columns)
-        df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns)
-        correls = df1.corrwith(df2, axis=1)
-        for row in index[:4]:
-            tm.assert_almost_equal(correls[row],
-                                   df1.loc[row].corr(df2.loc[row]))
+        for meth in ['pearson', 'kendall', 'spearman']:
+            colcorr = a.corrwith(b, axis=0, method=meth)
+            tm.assert_almost_equal(colcorr['A'],
+                                   a['A'].corr(b['A'], method=meth))
+
+            rowcorr = a.corrwith(b, axis=1, method=meth)
+            tm.assert_series_equal(rowcorr,
+                                   a.T.corrwith(b.T, axis=0, method=meth))
+
+            dropped = a.corrwith(b, axis=0, drop=True, method=meth)
+            tm.assert_almost_equal(dropped['A'],
+                                   a['A'].corr(b['A'], method=meth))
+            self.assertNotIn('B', dropped)
+
+            dropped = a.corrwith(b, axis=1, drop=True, method=meth)
+            self.assertNotIn(a.index[-1], dropped.index)
+
+            # non time-series data
+            index = ['a', 'b', 'c', 'd', 'e']
+            columns = ['one', 'two', 'three', 'four']
+            df1 = DataFrame(randn(5, 4), index=index, columns=columns)
+            df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns)
+            correls = df1.corrwith(df2, axis=1, method=meth)
+            for row in index[:4]:
+                tm.assert_almost_equal(correls[row],
+                                       df1.loc[row].corr(df2.loc[row],
+                                                         method=meth))
 
     def test_corrwith_with_objects(self):
         df1 = tm.makeTimeDataFrame()
@@ -212,19 +217,22 @@ def test_corrwith_with_objects(self):
         df1['obj'] = 'foo'
         df2['obj'] = 'bar'
 
-        result = df1.corrwith(df2)
-        expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
-        tm.assert_series_equal(result, expected)
+        for meth in ['pearson', 'kendall', 'spearman']:
+            result = df1.corrwith(df2, method=meth)
+            expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], method=meth)
+            tm.assert_series_equal(result, expected)
 
-        result = df1.corrwith(df2, axis=1)
-        expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
-        tm.assert_series_equal(result, expected)
+            result = df1.corrwith(df2, axis=1, method=meth)
+            expected = df1.loc[:, cols].corrwith(df2.loc[:, cols],
+                                                 axis=1, method=meth)
+            tm.assert_series_equal(result, expected)
 
     def test_corrwith_series(self):
-        result = self.tsframe.corrwith(self.tsframe['A'])
-        expected = self.tsframe.apply(self.tsframe['A'].corr)
+        for meth in ['pearson', 'kendall', 'spearman']:
+            result = self.tsframe.corrwith(self.tsframe['A'], method=meth)
+            expected = self.tsframe.apply(self.tsframe['A'].corr, method=meth)
 
-        tm.assert_series_equal(result, expected)
+            tm.assert_series_equal(result, expected)
 
     def test_corrwith_matches_corrcoef(self):
         df1 = DataFrame(np.arange(10000), columns=['a'])