From 63f474db81b17978b5b31d591e610946321cb6cd Mon Sep 17 00:00:00 2001 From: liwh Date: Mon, 13 Sep 2021 15:32:35 +0800 Subject: [PATCH] BUG: Allow pairwise calculation when comparing the column with itself (#25781) --- doc/source/whatsnew/v1.4.0.rst | 1 + pandas/core/frame.py | 9 ++++++++- pandas/tests/frame/methods/test_cov_corr.py | 15 +++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 822c6372ea2a4..7b628e32d21df 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -129,6 +129,7 @@ Other enhancements - :meth:`DataFrame.__pos__`, :meth:`DataFrame.__neg__` now retain ``ExtensionDtype`` dtypes (:issue:`43883`) - The error raised when an optional dependency can't be imported now includes the original exception, for easier investigation (:issue:`43882`) - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`) +- :meth:`DataFrame.corr` now accepts the argument ``calculate_diagonal`` to allow results returned from the callable to be used as diagonal elements of the correlation matrix instead of setting them to ones (:issue:`25781`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 76a00071c8adc..390d735853090 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9402,6 +9402,7 @@ def corr( self, method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", min_periods: int = 1, + calculate_diagonal: bool = False, ) -> DataFrame: """ Compute pairwise correlation of columns, excluding NA/null values. @@ -9422,6 +9423,12 @@ def corr( Minimum number of observations required per pair of columns to have a valid result. Currently only available for Pearson and Spearman correlation. + calculate_diagonal : bool, optional + Whether to calculate pairwise correlation using the supplied callable. + Ignored when method argument is not callable. 
If False, the pairwise + correlation between a column and itself defaults to 1. + + .. versionadded:: 1.4.0 Returns ------- @@ -9471,7 +9478,7 @@ def corr( valid = mask[i] & mask[j] if valid.sum() < min_periods: c = np.nan - elif i == j: + elif i == j and not calculate_diagonal: c = 1.0 elif not valid.all(): c = corrf(ac[valid], bc[valid]) diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 3dbf49df72558..0621c2bf0602e 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -236,6 +236,21 @@ def test_corr_min_periods_greater_than_length(self, method): ) tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings("ignore: An input array is constant") + @td.skip_if_no_scipy + @pytest.mark.parametrize("array_creator", [np.ones, np.zeros, np.random.random]) + def test_corr_diagonal_not_ones(self, array_creator): + from scipy.stats import pearsonr + + frame_size = 4 + df = DataFrame(array_creator((frame_size, frame_size))) + cor_mat = df.corr( + method=lambda x, y: pearsonr(x, y)[0], calculate_diagonal=True + ) + result_diag = [cor_mat.loc[i, i] for i in range(frame_size)] + expected_diag = [pearsonr(df[i], df[i])[0] for i in range(frame_size)] + tm.assert_almost_equal(result_diag, expected_diag) + class TestDataFrameCorrWith: def test_corrwith(self, datetime_frame):