Skip to content

Commit d7e03eb

Browse files
committed
ENH: Allow parameters method and min_periods in DataFrame.corrwith() (pandas-dev#9490)
1 parent 0b77680 commit d7e03eb

File tree

4 files changed

+61
-45
lines changed

4 files changed

+61
-45
lines changed

doc/source/computation.rst

+1
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ objects.
157157
df2 = pd.DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns)
158158
df1.corrwith(df2)
159159
df2.corrwith(df1, axis=1)
160+
df2.corrwith(df1, axis=1, method='kendall')
160161
161162
.. _computation.ranking:
162163

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ Other enhancements
227227
- ``pd.TimedeltaIndex`` now has a custom datetick formatter specifically designed for nanosecond level precision (:issue:`8711`)
228228
- ``pd.types.concat.union_categoricals`` gained the ``ignore_ordered`` argument to allow ignoring the ordered attribute of unioned categoricals (:issue:`13410`). See the :ref:`categorical union docs <categorical.union>` for more information.
229229
- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
230+
- ``pd.DataFrame.corrwith()`` now accepts ``method`` and ``min_periods`` as optional arguments, as in ``pd.DataFrame.corr()`` and ``pd.Series.corr()`` (:issue:`9490`)
230231

231232
.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
232233

pandas/core/frame.py

+20-14
Original file line numberDiff line numberDiff line change
@@ -4819,7 +4819,8 @@ def cov(self, min_periods=None):
48194819

48204820
return self._constructor(baseCov, index=idx, columns=cols)
48214821

4822-
def corrwith(self, other, axis=0, drop=False):
4822+
def corrwith(self, other, axis=0, drop=False, method='pearson',
4823+
min_periods=1):
48234824
"""
48244825
Compute pairwise correlation between rows or columns of two DataFrame
48254826
objects.
@@ -4831,36 +4832,41 @@ def corrwith(self, other, axis=0, drop=False):
48314832
0 or 'index' to compute column-wise, 1 or 'columns' for row-wise
48324833
drop : boolean, default False
48334834
Drop missing indices from result, default returns union of all
4835+
method : {'pearson', 'kendall', 'spearman'}
4836+
* pearson : standard correlation coefficient
4837+
* kendall : Kendall Tau correlation coefficient
4838+
* spearman : Spearman rank correlation
4839+
4840+
.. versionadded:: 0.20.0
4841+
4842+
min_periods : int, optional
4843+
Minimum number of observations needed to have a valid result
4844+
4845+
.. versionadded:: 0.20.0
48344846
48354847
Returns
48364848
-------
48374849
correls : Series
48384850
"""
48394851
axis = self._get_axis_number(axis)
48404852
if isinstance(other, Series):
4841-
return self.apply(other.corr, axis=axis)
4853+
return self.apply(other.corr, axis=axis,
4854+
method=method, min_periods=min_periods)
48424855

48434856
this = self._get_numeric_data()
48444857
other = other._get_numeric_data()
48454858

48464859
left, right = this.align(other, join='inner', copy=False)
48474860

4848-
# mask missing values
4849-
left = left + right * 0
4850-
right = right + left * 0
4851-
48524861
if axis == 1:
48534862
left = left.T
48544863
right = right.T
48554864

4856-
# demeaned data
4857-
ldem = left - left.mean()
4858-
rdem = right - right.mean()
4859-
4860-
num = (ldem * rdem).sum()
4861-
dom = (left.count() - 1) * left.std() * right.std()
4862-
4863-
correl = num / dom
4865+
correl = Series({col: nanops.nancorr(left[col].values,
4866+
right[col].values,
4867+
method=method,
4868+
min_periods=min_periods)
4869+
for col in left.columns})
48644870

48654871
if not drop:
48664872
raxis = 1 if axis == 0 else 0

pandas/tests/frame/test_analytics.py

+39-31
Original file line numberDiff line numberDiff line change
@@ -181,28 +181,33 @@ def test_corrwith(self):
181181
b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:])
182182
del b['B']
183183

184-
colcorr = a.corrwith(b, axis=0)
185-
tm.assert_almost_equal(colcorr['A'], a['A'].corr(b['A']))
186-
187-
rowcorr = a.corrwith(b, axis=1)
188-
tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))
189-
190-
dropped = a.corrwith(b, axis=0, drop=True)
191-
tm.assert_almost_equal(dropped['A'], a['A'].corr(b['A']))
192-
self.assertNotIn('B', dropped)
193-
194-
dropped = a.corrwith(b, axis=1, drop=True)
195-
self.assertNotIn(a.index[-1], dropped.index)
196-
197-
# non time-series data
198-
index = ['a', 'b', 'c', 'd', 'e']
199-
columns = ['one', 'two', 'three', 'four']
200-
df1 = DataFrame(randn(5, 4), index=index, columns=columns)
201-
df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns)
202-
correls = df1.corrwith(df2, axis=1)
203-
for row in index[:4]:
204-
tm.assert_almost_equal(correls[row],
205-
df1.loc[row].corr(df2.loc[row]))
184+
for meth in ['pearson', 'kendall', 'spearman']:
185+
colcorr = a.corrwith(b, axis=0, method=meth)
186+
tm.assert_almost_equal(colcorr['A'],
187+
a['A'].corr(b['A'], method=meth))
188+
189+
rowcorr = a.corrwith(b, axis=1, method=meth)
190+
tm.assert_series_equal(rowcorr,
191+
a.T.corrwith(b.T, axis=0, method=meth))
192+
193+
dropped = a.corrwith(b, axis=0, drop=True, method=meth)
194+
tm.assert_almost_equal(dropped['A'],
195+
a['A'].corr(b['A'], method=meth))
196+
self.assertNotIn('B', dropped)
197+
198+
dropped = a.corrwith(b, axis=1, drop=True, method=meth)
199+
self.assertNotIn(a.index[-1], dropped.index)
200+
201+
# non time-series data
202+
index = ['a', 'b', 'c', 'd', 'e']
203+
columns = ['one', 'two', 'three', 'four']
204+
df1 = DataFrame(randn(5, 4), index=index, columns=columns)
205+
df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns)
206+
correls = df1.corrwith(df2, axis=1, method=meth)
207+
for row in index[:4]:
208+
tm.assert_almost_equal(correls[row],
209+
df1.loc[row].corr(df2.loc[row],
210+
method=meth))
206211

207212
def test_corrwith_with_objects(self):
208213
df1 = tm.makeTimeDataFrame()
@@ -212,19 +217,22 @@ def test_corrwith_with_objects(self):
212217
df1['obj'] = 'foo'
213218
df2['obj'] = 'bar'
214219

215-
result = df1.corrwith(df2)
216-
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
217-
tm.assert_series_equal(result, expected)
220+
for meth in ['pearson', 'kendall', 'spearman']:
221+
result = df1.corrwith(df2, method=meth)
222+
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], method=meth)
223+
tm.assert_series_equal(result, expected)
218224

219-
result = df1.corrwith(df2, axis=1)
220-
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
221-
tm.assert_series_equal(result, expected)
225+
result = df1.corrwith(df2, axis=1, method=meth)
226+
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols],
227+
axis=1, method=meth)
228+
tm.assert_series_equal(result, expected)
222229

223230
def test_corrwith_series(self):
224-
result = self.tsframe.corrwith(self.tsframe['A'])
225-
expected = self.tsframe.apply(self.tsframe['A'].corr)
231+
for meth in ['pearson', 'kendall', 'spearman']:
232+
result = self.tsframe.corrwith(self.tsframe['A'], method=meth)
233+
expected = self.tsframe.apply(self.tsframe['A'].corr, method=meth)
226234

227-
tm.assert_series_equal(result, expected)
235+
tm.assert_series_equal(result, expected)
228236

229237
def test_corrwith_matches_corrcoef(self):
230238
df1 = DataFrame(np.arange(10000), columns=['a'])

0 commit comments

Comments
 (0)