From 97fbc5142298e1d04ff57d1bae5f575a16f2a0d6 Mon Sep 17 00:00:00 2001 From: daniel saxton Date: Wed, 15 Aug 2018 19:00:27 -0500 Subject: [PATCH 01/10] ENH: Enable corrwith to compute rank and callable correlation methods --- asv_bench/benchmarks/stat_ops.py | 7 +++ doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/frame.py | 81 +++++++++++++++++++++------- pandas/tests/frame/test_analytics.py | 35 ++++++++++++ 4 files changed, 104 insertions(+), 20 deletions(-) diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 500e4d74d4c4f..7fdc713f076ed 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -106,6 +106,7 @@ def setup(self, method, use_bottleneck): from pandas.core import nanops nanops._USE_BOTTLENECK = use_bottleneck self.df = pd.DataFrame(np.random.randn(1000, 30)) + self.df2 = pd.DataFrame(np.random.randn(1000, 30)) self.s = pd.Series(np.random.randn(1000)) self.s2 = pd.Series(np.random.randn(1000)) @@ -115,6 +116,12 @@ def time_corr(self, method, use_bottleneck): def time_corr_series(self, method, use_bottleneck): self.s.corr(self.s2, method=method) + def time_corrwith_cols(self, method, use_bottleneck): + self.df.corrwith(self.df2, method=method) + + def time_corrwith_rows(self, method, use_bottleneck): + self.df.corrwith(self.df2, axis=1, method=method) + class Covariance(object): diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 29ab51c582a97..04c5207f6ae40 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -413,6 +413,7 @@ Other Enhancements - The ``scatter_matrix``, ``andrews_curves``, ``parallel_coordinates``, ``lag_plot``, ``autocorrelation_plot``, ``bootstrap_plot``, and ``radviz`` plots from the ``pandas.plotting`` module are now accessible from calling :meth:`DataFrame.plot` (:issue:`11978`) - :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`) - :func:`pandas.DataFrame.to_sql` has gained the ``method`` argument to control SQL insertion clause. See the :ref:`insertion method ` section in the documentation. (:issue:`8953`) +- :meth:`DataFrame.corrwith` now supports Spearman's rank correlation, Kendall's tau as well as callable correlation methods. (:issue:`21925`) .. _whatsnew_0240.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a34a34186cf45..242bf4fd5b2a9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6868,6 +6868,11 @@ def corr(self, method='pearson', min_periods=1): dogs cats dogs 1.0 0.3 cats 0.3 1.0 + + See Also + ------- + DataFrame.corrwith + Series.corr """ numeric_df = self._get_numeric_data() cols = numeric_df.columns @@ -7021,10 +7026,11 @@ def cov(self, min_periods=None): return self._constructor(baseCov, index=idx, columns=cols) - def corrwith(self, other, axis=0, drop=False): + def corrwith(self, other, axis=0, drop=False, method='pearson'): """ - Compute pairwise correlation between rows or columns of two DataFrame - objects. + Compute pairwise correlation between rows or columns of DataFrame + with rows or columns of Series or DataFrame. DataFrames are first + aligned along both axes before computing the correlations. Parameters ---------- @@ -7032,43 +7038,78 @@ def corrwith(self, other, axis=0, drop=False): axis : {0 or 'index', 1 or 'columns'}, default 0 0 or 'index' to compute column-wise, 1 or 'columns' for row-wise drop : boolean, default False - Drop missing indices from result, default returns union of all + Drop missing indices from result + method : {'pearson', 'kendall', 'spearman'} or callable + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float + + .. versionadded:: 0.24.0 Returns ------- correls : Series + + See Also + ------- + DataFrame.corr """ + if method not in ['pearson', 'spearman', 'kendall']: + raise ValueError("method must be either 'pearson', " + "'spearman', or 'kendall', '{method}' " + "was supplied".format(method=method)) + axis = self._get_axis_number(axis) this = self._get_numeric_data() if isinstance(other, Series): - return this.apply(other.corr, axis=axis) + return this.apply(lambda x: other.corr(x, method=method), + axis=axis) other = other._get_numeric_data() - left, right = this.align(other, join='inner', copy=False) - # mask missing values - left = left + right * 0 - right = right + left * 0 - if axis == 1: left = left.T right = right.T - # demeaned data - ldem = left - left.mean() - rdem = right - right.mean() + if method == 'pearson': + # mask missing values + left = left + right * 0 + right = right + left * 0 + + if axis == 1: + left = left.T + right = right.T - num = (ldem * rdem).sum() - dom = (left.count() - 1) * left.std() * right.std() + # demeaned data + ldem = left - left.mean() + rdem = right - right.mean() - correl = num / dom + num = (ldem * rdem).sum() + dom = (left.count() - 1) * left.std() * right.std() - if not drop: - raxis = 1 if axis == 0 else 0 - result_index = this._get_axis(raxis).union(other._get_axis(raxis)) - correl = correl.reindex(result_index) + correl = num / dom + + if not drop: + raxis = 1 if axis == 0 else 0 + result_index = (this._get_axis(raxis). + union(other._get_axis(raxis))) + correl = correl.reindex(result_index) + + else: + def c(x): + return Series(x[0]).corr(Series(x[1]), + method=method) + + correl = Series(map(c, + zip(left.values.T, right.values.T)), + index=left.columns) + + if drop: + correl.dropna(inplace=True) return correl diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 6f68828b94a84..4a3024ff3b2c8 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -466,6 +466,41 @@ def test_corrwith_mixed_dtypes(self): expected = pd.Series(data=corrs, index=['a', 'b']) tm.assert_series_equal(result, expected) + def test_corrwith_dup_cols(self): + # GH 21925 + df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T) + df2 = df1.copy() + df2 = pd.concat((df2, df2[0]), axis=1) + + result = df1.corrwith(df2).values + expected = np.ones(4) + tm.assert_almost_equal(result, expected) + + @td.skip_if_no_scipy + def test_corrwith_spearman(self): + # GH 21925 + df = pd.DataFrame(np.random.random(size=(100, 3))) + result = df.corrwith(df**2, method="spearman") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) + + @td.skip_if_no_scipy + def test_corrwith_kendall(self): + # GH 21925 + df = pd.DataFrame(np.random.random(size=(100, 3))) + result = df.corrwith(df**2, method="kendall") + expected = Series(np.ones(len(result))) + tm.assert_series_equal(result, expected) + + def test_corrwith_invalid_method(self): + # GH 21925 + df = pd.DataFrame(np.random.normal(size=(10, 2))) + s = pd.Series(np.random.randn(10)) + msg = ("method must be either 'pearson', 'spearman', " + "or 'kendall'") + with tm.assert_raises_regex(ValueError, msg): + df.corrwith(s, method="____") + def test_bool_describe_in_mixed_frame(self): df = DataFrame({ 'string_data': ['a', 'b', 'c', 'd', 'e'], From 0c84f8565368a8b09422ec4ff90bc4c95cb7c41b Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 18 Nov 2018 16:45:24 -0600 Subject: [PATCH 02/10] Fix corrwith * Remove incorrect error (didn't account for callables) * Add xfail to duplicate columns test * Fix transpose (was taken twice for Pearson) * Remove inplace usage for dropna --- pandas/core/frame.py | 22 +++++----------------- pandas/tests/frame/test_analytics.py | 1 + 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 242bf4fd5b2a9..ce9ce66d53801 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7056,11 +7056,6 @@ def corrwith(self, other, axis=0, drop=False, method='pearson'): ------- DataFrame.corr """ - if method not in ['pearson', 'spearman', 'kendall']: - raise ValueError("method must be either 'pearson', " - "'spearman', or 'kendall', '{method}' " - "was supplied".format(method=method)) - axis = self._get_axis_number(axis) this = self._get_numeric_data() @@ -7080,10 +7075,6 @@ def corrwith(self, other, axis=0, drop=False, method='pearson'): left = left + right * 0 right = right + left * 0 - if axis == 1: - left = left.T - right = right.T - # demeaned data ldem = left - left.mean() rdem = right - right.mean() @@ -7093,12 +7084,6 @@ def corrwith(self, other, axis=0, drop=False, method='pearson'): correl = num / dom - if not drop: - raxis = 1 if axis == 0 else 0 - result_index = (this._get_axis(raxis). - union(other._get_axis(raxis))) - correl = correl.reindex(result_index) - else: def c(x): return Series(x[0]).corr(Series(x[1]), @@ -7108,8 +7093,11 @@ def c(x): zip(left.values.T, right.values.T)), index=left.columns) - if drop: - correl.dropna(inplace=True) + if not drop: + raxis = 1 if axis == 0 else 0 + result_index = (this._get_axis(raxis). + union(other._get_axis(raxis))) + correl = correl.reindex(result_index) return correl diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 4a3024ff3b2c8..453a059baace7 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -466,6 +466,7 @@ def test_corrwith_mixed_dtypes(self): expected = pd.Series(data=corrs, index=['a', 'b']) tm.assert_series_equal(result, expected) + @pytest.mark.xfail def test_corrwith_dup_cols(self): # GH 21925 df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T) From 48f459ebe9624796f03a0feb457e3bfe36e693fa Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 18 Nov 2018 19:40:10 -0600 Subject: [PATCH 03/10] Remove test that's no longer correct --- pandas/tests/frame/test_analytics.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 453a059baace7..421fc31e8b88f 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -493,15 +493,6 @@ def test_corrwith_kendall(self): expected = Series(np.ones(len(result))) tm.assert_series_equal(result, expected) - def test_corrwith_invalid_method(self): - # GH 21925 - df = pd.DataFrame(np.random.normal(size=(10, 2))) - s = pd.Series(np.random.randn(10)) - msg = ("method must be either 'pearson', 'spearman', " - "or 'kendall'") - with tm.assert_raises_regex(ValueError, msg): - df.corrwith(s, method="____") - def test_bool_describe_in_mixed_frame(self): df = DataFrame({ 'string_data': ['a', 'b', 'c', 'd', 'e'], From 2356dea6f3ab43be1c0b37c405fa3c2fd52643d1 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 27 Dec 2018 11:33:51 -0500 Subject: [PATCH 04/10] Update corrwith * Check for invalid method * Do not cast arrays to Series in function c --- pandas/core/frame.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ce9ce66d53801..998e8b1839fd2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7084,15 +7084,17 @@ def corrwith(self, other, axis=0, drop=False, method='pearson'): correl = num / dom - else: + elif method in ['kendall', 'spearman'] or callable(method): def c(x): - return Series(x[0]).corr(Series(x[1]), - method=method) + return nanops.nancorr(x[0], x[1], method=method) correl = Series(map(c, zip(left.values.T, right.values.T)), index=left.columns) + else: + raise ValueError('Invalid method') + if not drop: raxis = 1 if axis == 0 else 0 result_index = (this._get_axis(raxis). From 9010169d7ac25d72c322feb3631c91a385ae9bb3 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 27 Dec 2018 12:18:26 -0500 Subject: [PATCH 05/10] Fix error message --- pandas/core/frame.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 998e8b1839fd2..aa09176d8ef19 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7093,7 +7093,10 @@ def c(x): index=left.columns) else: - raise ValueError('Invalid method') + raise ValueError("Invalid method {method} was passed, " + "valid methods are: 'pearson', 'kendall', " + "'spearman', or callable". + format(method=str(method))) if not drop: raxis = 1 if axis == 0 else 0 From 6ebdd685b4869dd0b926487ca8ab30d023f9bb53 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 27 Dec 2018 21:41:36 -0500 Subject: [PATCH 06/10] Compare Series not arrays in test --- pandas/tests/frame/test_analytics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 421fc31e8b88f..21fe7ae2affbd 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -473,9 +473,9 @@ def test_corrwith_dup_cols(self): df2 = df1.copy() df2 = pd.concat((df2, df2[0]), axis=1) - result = df1.corrwith(df2).values - expected = np.ones(4) - tm.assert_almost_equal(result, expected) + result = df1.corrwith(df2) + expected = pd.Series(np.ones(4), index=[0, 0, 1, 2]) + tm.assert_series_equal(result, expected) @td.skip_if_no_scipy def test_corrwith_spearman(self): From c027574de76df95c66131afd3f8c051add474751 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 29 Dec 2018 11:54:51 -0500 Subject: [PATCH 07/10] Allow duplicate index labels --- pandas/core/frame.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index aa09176d8ef19..04deae78f3417 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7102,7 +7102,9 @@ def c(x): raxis = 1 if axis == 0 else 0 result_index = (this._get_axis(raxis). union(other._get_axis(raxis))) - correl = correl.reindex(result_index) + idx_diff = result_index.difference(correl.index) + correl = correl.append(pd.Series([np.nan] * len(idx_diff), + index=idx_diff)) return correl From 11ac73a8c3032315d0bb5bd73570b2c970d103fa Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 29 Dec 2018 12:04:11 -0500 Subject: [PATCH 08/10] Remove xfail from duplicate column test --- pandas/tests/frame/test_analytics.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 21fe7ae2affbd..ea24a50e82cbc 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -466,7 +466,6 @@ def test_corrwith_mixed_dtypes(self): expected = pd.Series(data=corrs, index=['a', 'b']) tm.assert_series_equal(result, expected) - @pytest.mark.xfail def test_corrwith_dup_cols(self): # GH 21925 df1 = pd.DataFrame(np.vstack([np.arange(10)] * 3).T) From fdb5415ea31587920af519cf00f5c2be22b515e8 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 29 Dec 2018 13:52:52 -0500 Subject: [PATCH 09/10] Use Series not pd.Series --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 04deae78f3417..55f46c28620a8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7103,8 +7103,8 @@ def c(x): result_index = (this._get_axis(raxis). union(other._get_axis(raxis))) idx_diff = result_index.difference(correl.index) - correl = correl.append(pd.Series([np.nan] * len(idx_diff), - index=idx_diff)) + correl = correl.append(Series([np.nan] * len(idx_diff), + index=idx_diff)) return correl From 870d1a3f7a85cd15e3d2b6e8ed0779deda14ce49 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 30 Dec 2018 17:45:41 -0500 Subject: [PATCH 10/10] Update corrwith * Add comment for when drop is False * Check if len(idx_diff) > 0 * Remove unnecessary string casting in error message --- pandas/core/frame.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 55f46c28620a8..2fe18339de791 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7096,15 +7096,19 @@ def c(x): raise ValueError("Invalid method {method} was passed, " "valid methods are: 'pearson', 'kendall', " "'spearman', or callable". - format(method=str(method))) + format(method=method)) if not drop: + # Find non-matching labels along the given axis + # and append missing correlations (GH 22375) raxis = 1 if axis == 0 else 0 result_index = (this._get_axis(raxis). union(other._get_axis(raxis))) idx_diff = result_index.difference(correl.index) - correl = correl.append(Series([np.nan] * len(idx_diff), - index=idx_diff)) + + if len(idx_diff) > 0: + correl = correl.append(Series([np.nan] * len(idx_diff), + index=idx_diff)) return correl