From 8a2eac3a609798a92c7e90a02fc6a203e439e9e8 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 26 Sep 2018 12:44:43 -0500 Subject: [PATCH 1/8] modify DataFrame.corr to allow lower triangular correlation matrix --- pandas/core/frame.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cc58674398b70..862626a1d3865 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6667,7 +6667,7 @@ def _series_round(s, decimals): # ---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method='pearson', min_periods=1): + def corr(self, method='pearson', min_periods=1, tri=False): """ Compute pairwise correlation of columns, excluding NA/null values @@ -6686,6 +6686,10 @@ def corr(self, method='pearson', min_periods=1): to have a valid result. Currently only available for pearson and spearman correlation + tri : boolean, default : False + Whether or not to return the lower triangular correlation + matrix + Returns ------- y : DataFrame @@ -6741,7 +6745,15 @@ def corr(self, method='pearson', min_periods=1): "'spearman', or 'kendall', '{method}' " "was supplied".format(method=method)) - return self._constructor(correl, index=idx, columns=cols) + corr_mat = self._constructor(correl, index=idx, columns=cols) + + if tri: + mask = np.tril(np.ones_like(corr_mat, + dtype=np.bool), + k=-1) + return corr_mat.where(mask) + else: + return corr_mat def cov(self, min_periods=None): """ From 9a95e8c1c26193398ad534540770eb89b5d85e7b Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 26 Sep 2018 14:16:07 -0500 Subject: [PATCH 2/8] add test for DataFrame.corr(tri=True) add whatsnew entry --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/tests/frame/test_analytics.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 707257a35983e..45cce4fb2f89f 100644 --- 
a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -21,6 +21,7 @@ New features the user to override the engine's default behavior to include or omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) +- :meth:`DataFrame.corr` now accepts a boolean argument indicating whether or not the lower triangular correlation matrix should be returned (:issue:`22840`) .. _whatsnew_0240.enhancements.extension_array_operators: diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index baebf414969be..0dbf9b0393add 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -138,6 +138,12 @@ def test_corr_invalid_method(self): with tm.assert_raises_regex(ValueError, msg): df.corr(method="____") + def test_corr_tri(self): + # GH PR + df = pd.DataFrame(np.random.normal(size=(100, 5))) + corr_mat = df.corr(tri=True) + assert corr_mat.notnull().sum().sum() == 10 + def test_cov(self): # min_periods no NAs (corner case) expected = self.frame.cov() From 9acc618b350de496ee6e52383f5725ac2b0b1d83 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 26 Sep 2018 16:21:57 -0500 Subject: [PATCH 3/8] PR #22840 change tri options from boolean to string modify tests add versionadded tag --- pandas/core/frame.py | 22 ++++++++++++++++------ pandas/tests/frame/test_analytics.py | 12 +++++++++--- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 862626a1d3865..7f151a709386f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6667,7 +6667,7 @@ def _series_round(s, decimals): # ---------------------------------------------------------------------- # Statistical methods, etc. 
- def corr(self, method='pearson', min_periods=1, tri=False): + def corr(self, method='pearson', min_periods=1, tri=None): """ Compute pairwise correlation of columns, excluding NA/null values @@ -6686,9 +6686,11 @@ def corr(self, method='pearson', min_periods=1, tri=False): to have a valid result. Currently only available for pearson and spearman correlation - tri : boolean, default : False - Whether or not to return the lower triangular correlation - matrix + tri : {'upper', 'lower'} or None, default : None + Whether or not to return the upper / lower triangular + correlation matrix + + .. versionadded:: 0.24.0 Returns ------- @@ -6747,11 +6749,19 @@ def corr(self, method='pearson', min_periods=1, tri=False): corr_mat = self._constructor(correl, index=idx, columns=cols) - if tri: + if tri is not None: mask = np.tril(np.ones_like(corr_mat, dtype=np.bool), k=-1) - return corr_mat.where(mask) + + if tri == 'lower': + return corr_mat.where(mask) + elif tri == 'upper': + return corr_mat.where(mask.T) + else: + raise ValueError("tri must be either 'lower', " + "or 'upper', '{tri_method}' " + "was supplied".format(tri_method=tri)) else: return corr_mat diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 0dbf9b0393add..4165014e54b3d 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -138,10 +138,16 @@ def test_corr_invalid_method(self): with tm.assert_raises_regex(ValueError, msg): df.corr(method="____") - def test_corr_tri(self): - # GH PR + def test_corr_tril(self): + # GH PR #22840 df = pd.DataFrame(np.random.normal(size=(100, 5))) - corr_mat = df.corr(tri=True) + corr_mat = df.corr(tri='lower') + assert corr_mat.notnull().sum().sum() == 10 + + def test_corr_triu(self): + # GH PR #22840 + df = pd.DataFrame(np.random.normal(size=(100, 5))) + corr_mat = df.corr(tri='upper') assert corr_mat.notnull().sum().sum() == 10 def test_cov(self): From c97fec102e60cec58df262ef8ad042d4e2838085 
Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 26 Sep 2018 16:49:31 -0500 Subject: [PATCH 4/8] PR #22840 update whatsnew with argument type change --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 45cce4fb2f89f..6aa7437f8084e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -21,7 +21,7 @@ New features the user to override the engine's default behavior to include or omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) -- :meth:`DataFrame.corr` now accepts a boolean argument indicating whether or not the lower triangular correlation matrix should be returned (:issue:`22840`) +- :meth:`DataFrame.corr` now accepts an argument indicating whether or not a triangular correlation matrix should be returned (:issue:`22840`) .. 
_whatsnew_0240.enhancements.extension_array_operators: From 9b0276a0add70b51a8051761dde3aa7cc605d96e Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 26 Sep 2018 17:54:20 -0500 Subject: [PATCH 5/8] PR #22840 use numpy array in ones_like modify tests to specify expected output --- pandas/core/frame.py | 2 +- pandas/tests/frame/test_analytics.py | 21 +++++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7f151a709386f..bcf4f8bc5d0db 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6750,7 +6750,7 @@ def corr(self, method='pearson', min_periods=1, tri=None): corr_mat = self._constructor(correl, index=idx, columns=cols) if tri is not None: - mask = np.tril(np.ones_like(corr_mat, + mask = np.tril(np.ones_like(correl, dtype=np.bool), k=-1) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 4165014e54b3d..ed0d060886eba 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -140,15 +140,24 @@ def test_corr_invalid_method(self): def test_corr_tril(self): # GH PR #22840 - df = pd.DataFrame(np.random.normal(size=(100, 5))) - corr_mat = df.corr(tri='lower') - assert corr_mat.notnull().sum().sum() == 10 + df = pd.DataFrame(np.array([np.arange(100)] * 5).T) + result = df.corr(tri='lower') + mask = np.tril(np.ones_like(result, + dtype=np.bool), + k=-1) + expected = pd.DataFrame(np.ones_like(corr_mat)).where(mask) + tm.assert_frame_equal(result, expected) def test_corr_triu(self): # GH PR #22840 - df = pd.DataFrame(np.random.normal(size=(100, 5))) - corr_mat = df.corr(tri='upper') - assert corr_mat.notnull().sum().sum() == 10 + df = pd.DataFrame(np.array([np.arange(100)] * 5).T) + result = df.corr(tri='upper') + mask = np.triu(np.ones_like(result, + dtype=np.bool), + k=1) + expected = pd.DataFrame(np.ones_like(corr_mat)).where(mask) + tm.assert_frame_equal(result, expected) + def test_cov(self): # 
min_periods no NAs (corner case) From b15436b44805fe9ec20b7766138f5c9a342f85b8 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 26 Sep 2018 18:41:46 -0500 Subject: [PATCH 6/8] PR #22840 fix errors in tests --- pandas/tests/frame/test_analytics.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index ed0d060886eba..3cd8a8c77f5a5 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -145,7 +145,7 @@ def test_corr_tril(self): mask = np.tril(np.ones_like(result, dtype=np.bool), k=-1) - expected = pd.DataFrame(np.ones_like(corr_mat)).where(mask) + expected = pd.DataFrame(np.ones_like(result)).where(mask) tm.assert_frame_equal(result, expected) def test_corr_triu(self): @@ -155,10 +155,9 @@ def test_corr_triu(self): mask = np.triu(np.ones_like(result, dtype=np.bool), k=1) - expected = pd.DataFrame(np.ones_like(corr_mat)).where(mask) + expected = pd.DataFrame(np.ones_like(result)).where(mask) tm.assert_frame_equal(result, expected) - def test_cov(self): # min_periods no NAs (corner case) expected = self.frame.cov() From 6be67f6b33722bec46d6a7ebd81087fbda7472f7 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 26 Sep 2018 12:44:43 -0500 Subject: [PATCH 7/8] modify DataFrame.corr to allow triangular correlation matrix --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/frame.py | 26 ++++++++++++++++++++++++-- pandas/tests/frame/test_analytics.py | 20 ++++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 707257a35983e..6aa7437f8084e 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -21,6 +21,7 @@ New features the user to override the engine's default behavior to include or omit the dataframe's indexes from the resulting Parquet file. 
(:issue:`20768`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) +- :meth:`DataFrame.corr` now accepts an argument indicating whether or not a triangular correlation matrix should be returned (:issue:`22840`) .. _whatsnew_0240.enhancements.extension_array_operators: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cc58674398b70..bcf4f8bc5d0db 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6667,7 +6667,7 @@ def _series_round(s, decimals): # ---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method='pearson', min_periods=1): + def corr(self, method='pearson', min_periods=1, tri=None): """ Compute pairwise correlation of columns, excluding NA/null values @@ -6686,6 +6686,12 @@ def corr(self, method='pearson', min_periods=1): to have a valid result. Currently only available for pearson and spearman correlation + tri : {'upper', 'lower'} or None, default : None + Whether or not to return the upper / lower triangular + correlation matrix + + .. 
versionadded:: 0.24.0 + Returns ------- y : DataFrame @@ -6741,7 +6747,23 @@ def corr(self, method='pearson', min_periods=1): "'spearman', or 'kendall', '{method}' " "was supplied".format(method=method)) - return self._constructor(correl, index=idx, columns=cols) + corr_mat = self._constructor(correl, index=idx, columns=cols) + + if tri is not None: + mask = np.tril(np.ones_like(correl, + dtype=np.bool), + k=-1) + + if tri == 'lower': + return corr_mat.where(mask) + elif tri == 'upper': + return corr_mat.where(mask.T) + else: + raise ValueError("tri must be either 'lower', " + "or 'upper', '{tri_method}' " + "was supplied".format(tri_method=tri)) + else: + return corr_mat def cov(self, min_periods=None): """ diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index baebf414969be..3cd8a8c77f5a5 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -138,6 +138,26 @@ def test_corr_invalid_method(self): with tm.assert_raises_regex(ValueError, msg): df.corr(method="____") + def test_corr_tril(self): + # GH PR #22840 + df = pd.DataFrame(np.array([np.arange(100)] * 5).T) + result = df.corr(tri='lower') + mask = np.tril(np.ones_like(result, + dtype=np.bool), + k=-1) + expected = pd.DataFrame(np.ones_like(result)).where(mask) + tm.assert_frame_equal(result, expected) + + def test_corr_triu(self): + # GH PR #22840 + df = pd.DataFrame(np.array([np.arange(100)] * 5).T) + result = df.corr(tri='upper') + mask = np.triu(np.ones_like(result, + dtype=np.bool), + k=1) + expected = pd.DataFrame(np.ones_like(result)).where(mask) + tm.assert_frame_equal(result, expected) + def test_cov(self): # min_periods no NAs (corner case) expected = self.frame.cov() From 9df2bcbd836aff4b841a23a084ee8ac66d3095d9 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 27 Sep 2018 19:54:56 -0500 Subject: [PATCH 8/8] PR #22840: add to DataFrame.corr docstring --- pandas/core/frame.py | 11 ++++++++++- 1 file changed, 10
insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bcf4f8bc5d0db..16b8302369ce1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6688,7 +6688,9 @@ def corr(self, method='pearson', min_periods=1, tri=None): tri : {'upper', 'lower'} or None, default : None Whether or not to return the upper / lower triangular - correlation matrix + correlation matrix. (This is useful for example when + one wishes to easily identify highly collinear columns + in a DataFrame.) .. versionadded:: 0.24.0 @@ -6707,6 +6709,13 @@ def corr(self, method='pearson', min_periods=1, tri=None): dogs cats dogs 1.0 0.3 cats 0.3 1.0 + >>> df = pd.DataFrame(np.random.normal(size=(10, 2))) + >>> df[2] = df[0] + >>> df.corr(tri='lower').where(lambda x: x == 1) + 0 1 2 + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 1.0 NaN NaN """ numeric_df = self._get_numeric_data() cols = numeric_df.columns