diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 707257a35983e..6aa7437f8084e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -21,6 +21,7 @@ New features the user to override the engine's default behavior to include or omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) +- :meth:`DataFrame.corr` now accepts a ``tri`` argument (``'upper'`` or ``'lower'``) to return only the strictly upper or lower triangular part of the correlation matrix, with all other entries set to ``NaN`` (:issue:`22840`) .. _whatsnew_0240.enhancements.extension_array_operators: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cc58674398b70..16b8302369ce1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6667,7 +6667,7 @@ def _series_round(s, decimals): # ---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method='pearson', min_periods=1): + def corr(self, method='pearson', min_periods=1, tri=None): """ Compute pairwise correlation of columns, excluding NA/null values @@ -6686,6 +6686,14 @@ def corr(self, method='pearson', min_periods=1): to have a valid result. Currently only available for pearson and spearman correlation + tri : {'upper', 'lower'} or None, default : None + Whether or not to return the upper / lower triangular + correlation matrix. (This is useful for example when + one wishes to easily identify highly collinear columns + in a DataFrame.) + + .. 
versionadded:: 0.24.0 + Returns ------- y : DataFrame @@ -6701,6 +6709,13 @@ def corr(self, method='pearson', min_periods=1): dogs cats dogs 1.0 0.3 cats 0.3 1.0 + >>> df = pd.DataFrame(np.random.normal(size=(10, 2))) + >>> df[2] = df[0] + >>> df.corr(tri='lower').where(lambda x: x == 1) + 0 1 2 + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 1.0 NaN NaN """ numeric_df = self._get_numeric_data() cols = numeric_df.columns @@ -6741,7 +6756,23 @@ def corr(self, method='pearson', min_periods=1): "'spearman', or 'kendall', '{method}' " "was supplied".format(method=method)) - return self._constructor(correl, index=idx, columns=cols) + corr_mat = self._constructor(correl, index=idx, columns=cols) + + if tri is not None: + mask = np.tril(np.ones_like(correl, + dtype=np.bool), + k=-1) + + if tri == 'lower': + return corr_mat.where(mask) + elif tri == 'upper': + return corr_mat.where(mask.T) + else: + raise ValueError("tri must be either 'lower', " + "or 'upper', '{tri_method}' " + "was supplied".format(tri_method=tri)) + else: + return corr_mat def cov(self, min_periods=None): """ diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index baebf414969be..3cd8a8c77f5a5 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -138,6 +138,26 @@ def test_corr_invalid_method(self): with tm.assert_raises_regex(ValueError, msg): df.corr(method="____") + def test_corr_tril(self): + # GH PR #22840 + df = pd.DataFrame(np.array([np.arange(100)] * 5).T) + result = df.corr(tri='lower') + mask = np.tril(np.ones_like(result, + dtype=np.bool), + k=-1) + expected = pd.DataFrame(np.ones_like(result)).where(mask) + tm.assert_frame_equal(result, expected) + + def test_corr_triu(self): + # GH PR #22840 + df = pd.DataFrame(np.array([np.arange(100)] * 5).T) + result = df.corr(tri='upper') + mask = np.triu(np.ones_like(result, + dtype=np.bool), + k=1) + expected = pd.DataFrame(np.ones_like(result)).where(mask) + 
tm.assert_frame_equal(result, expected) + def test_cov(self): # min_periods no NAs (corner case) expected = self.frame.cov()