diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 500e4d74d4c4f..7fdc713f076ed 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -106,6 +106,7 @@ def setup(self, method, use_bottleneck): from pandas.core import nanops nanops._USE_BOTTLENECK = use_bottleneck self.df = pd.DataFrame(np.random.randn(1000, 30)) + self.df2 = pd.DataFrame(np.random.randn(1000, 30)) self.s = pd.Series(np.random.randn(1000)) self.s2 = pd.Series(np.random.randn(1000)) @@ -115,6 +116,12 @@ def time_corr(self, method, use_bottleneck): def time_corr_series(self, method, use_bottleneck): self.s.corr(self.s2, method=method) + def time_corrwith_cols(self, method, use_bottleneck): + self.df.corrwith(self.df2, method=method) + + def time_corrwith_rows(self, method, use_bottleneck): + self.df.corrwith(self.df2, axis=1, method=method) + class Covariance(object): diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 29ab51c582a97..04c5207f6ae40 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -413,6 +413,7 @@ Other Enhancements - The ``scatter_matrix``, ``andrews_curves``, ``parallel_coordinates``, ``lag_plot``, ``autocorrelation_plot``, ``bootstrap_plot``, and ``radviz`` plots from the ``pandas.plotting`` module are now accessible from calling :meth:`DataFrame.plot` (:issue:`11978`) - :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`) - :func:`pandas.DataFrame.to_sql` has gained the ``method`` argument to control SQL insertion clause. See the :ref:`insertion method ` section in the documentation. (:issue:`8953`) +- :meth:`DataFrame.corrwith` now supports Spearman's rank correlation, Kendall's tau as well as callable correlation methods. (:issue:`21925`) .. 
_whatsnew_0240.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a34a34186cf45..2fe18339de791 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6868,6 +6868,11 @@ def corr(self, method='pearson', min_periods=1): dogs cats dogs 1.0 0.3 cats 0.3 1.0 + + See Also + -------- + DataFrame.corrwith + Series.corr """ numeric_df = self._get_numeric_data() cols = numeric_df.columns @@ -7021,10 +7026,11 @@ def cov(self, min_periods=None): return self._constructor(baseCov, index=idx, columns=cols) - def corrwith(self, other, axis=0, drop=False): + def corrwith(self, other, axis=0, drop=False, method='pearson'): """ - Compute pairwise correlation between rows or columns of two DataFrame - objects. + Compute pairwise correlation between rows or columns of DataFrame + with rows or columns of Series or DataFrame. DataFrames are first + aligned along both axes before computing the correlations. Parameters ---------- @@ -7032,43 +7038,77 @@ def corrwith(self, other, axis=0, drop=False): axis : {0 or 'index', 1 or 'columns'}, default 0 0 or 'index' to compute column-wise, 1 or 'columns' for row-wise drop : boolean, default False - Drop missing indices from result, default returns union of all + Drop missing indices from result. + method : {'pearson', 'kendall', 'spearman'} or callable + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable : callable taking two 1d ndarrays as input + and returning a float + + .. 
versionadded:: 0.24.0 Returns ------- correls : Series + + See Also + -------- + DataFrame.corr """ axis = self._get_axis_number(axis) this = self._get_numeric_data() if isinstance(other, Series): - return this.apply(other.corr, axis=axis) + return this.apply(lambda x: other.corr(x, method=method), + axis=axis) other = other._get_numeric_data() - left, right = this.align(other, join='inner', copy=False) - # mask missing values - left = left + right * 0 - right = right + left * 0 - if axis == 1: left = left.T right = right.T - # demeaned data - ldem = left - left.mean() - rdem = right - right.mean() + if method == 'pearson': + # mask missing values + left = left + right * 0 + right = right + left * 0 + + # demeaned data + ldem = left - left.mean() + rdem = right - right.mean() - num = (ldem * rdem).sum() - dom = (left.count() - 1) * left.std() * right.std() + num = (ldem * rdem).sum() + dom = (left.count() - 1) * left.std() * right.std() - correl = num / dom + correl = num / dom + + elif method in ['kendall', 'spearman'] or callable(method): + def c(x): + return nanops.nancorr(x[0], x[1], method=method) + + correl = Series(map(c, + zip(left.values.T, right.values.T)), + index=left.columns) + + else: + raise ValueError("Invalid method {method} was passed, " + "valid methods are: 'pearson', 'kendall', " + "'spearman', or callable". + format(method=method)) if not drop: + # Find non-matching labels along the given axis + # and append missing correlations (GH 22375) raxis = 1 if axis == 0 else 0 - result_index = this._get_axis(raxis).union(other._get_axis(raxis)) - correl = correl.reindex(result_index) + result_index = (this._get_axis(raxis). 
+ union(other._get_axis(raxis))) + idx_diff = result_index.difference(correl.index) + + if len(idx_diff) > 0: + correl = correl.append(Series([np.nan] * len(idx_diff), + index=idx_diff)) return correl diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 6f68828b94a84..ea24a50e82cbc 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -466,6 +466,32 @@ def test_corrwith_mixed_dtypes(self): expected = pd.Series(data=corrs, index=['a', 'b']) tm.assert_series_equal(result, expected) + def test_corrwith_dup_cols(self): + # GH 21925 + df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T) + df2 = df1.copy() + df2 = pd.concat((df2, df2[0]), axis=1) + + result = df1.corrwith(df2) + expected = pd.Series(np.ones(4), index=[0, 0, 1, 2]) + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_corrwith_spearman(self): + # GH 21925 + df = pd.DataFrame(np.random.random(size=(100, 3))) + result = df.corrwith(df**2, method="spearman") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_corrwith_kendall(self): + # GH 21925 + df = pd.DataFrame(np.random.random(size=(100, 3))) + result = df.corrwith(df**2, method="kendall") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) + def test_bool_describe_in_mixed_frame(self): df = DataFrame({ 'string_data': ['a', 'b', 'c', 'd', 'e'],