From 50c0a03d6d23c4eef2155ca5e7837e62f9b79af2 Mon Sep 17 00:00:00 2001 From: Tobias Brandt Date: Mon, 23 Sep 2013 11:34:16 +0200 Subject: [PATCH 1/5] TST: Added tests for pairwise behaviour in rolling moments. Added a test for rolling_cov_pairwise similar to the test for rolling_corr_pairwise. Added tests for ewmcov_pairwise and ewmcorr_pairwise. Added test for expanding_cov_pairwise based on the test for expanding_corr_pairwise. --- pandas/stats/tests/test_moments.py | 35 +++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index a8359c102a902..af77152fdd5b7 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -586,6 +586,10 @@ def test_rolling_cov(self): result = mom.rolling_cov(A, B, 50, min_periods=25) assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) + def test_rolling_cov_pairwise(self): + self._check_pairwise_moment(mom.rolling_cov_pairwise, + mom.rolling_cov, 10, min_periods=5) + def test_rolling_corr(self): A = self.series B = A + randn(len(A)) @@ -603,12 +607,15 @@ def test_rolling_corr(self): assert_almost_equal(result[-1], a.corr(b)) def test_rolling_corr_pairwise(self): - panel = mom.rolling_corr_pairwise(self.frame, 10, min_periods=5) + self._check_pairwise_moment(mom.rolling_corr_pairwise, + mom.rolling_corr, 10, min_periods=5) + + def _check_pairwise_moment(self, func_pairwise, func, *args, **kwargs): + panel = func_pairwise(self.frame, self.frame, *args, **kwargs) - correl = panel.ix[:, 1, 5] - exp = mom.rolling_corr(self.frame[1], self.frame[5], - 10, min_periods=5) - tm.assert_series_equal(correl, exp) + actual = panel.ix[:, 1, 5] + expected = func(self.frame[1], self.frame[5], *args, **kwargs) + tm.assert_series_equal(actual, expected) def test_flex_binary_moment(self): # GH3155 @@ -666,9 +673,17 @@ def _check(method): def test_ewmcov(self): self._check_binary_ew(mom.ewmcov) + def 
test_ewmcov_pairwise(self): + self._check_pairwise_moment(mom.ewmcov_pairwise, mom.ewmcov, 10, + min_periods=5) + def test_ewmcorr(self): self._check_binary_ew(mom.ewmcorr) + def test_ewmcorr_pairwise(self): + self._check_pairwise_moment(mom.ewmcorr_pairwise, mom.ewmcorr, 10, + min_periods=5) + def _check_binary_ew(self, func): A = Series(randn(50), index=np.arange(50)) B = A[2:] + randn(48) @@ -746,6 +761,16 @@ def test_expanding_cov(self): def test_expanding_max(self): self._check_expanding(mom.expanding_max, np.max, preserve_nan=False) + def test_expanding_cov_pairwise(self): + result = mom.expanding_cov_pairwise(self.frame) + + rolling_result = mom.rolling_cov_pairwise(self.frame, + len(self.frame), + min_periods=1) + + for i in result.items: + assert_almost_equal(result[i], rolling_result[i]) + def test_expanding_corr_pairwise(self): result = mom.expanding_corr_pairwise(self.frame) From 9a92ad29f3c63219180b11b5555fde289ce0fc4a Mon Sep 17 00:00:00 2001 From: Tobias Brandt Date: Mon, 23 Sep 2013 11:23:02 +0200 Subject: [PATCH 2/5] ENH: Implemented pairwise rolling moment functions. Implemented rolling_cov_pairwise function. Implemented ewmcov_pairwise and ewmcorr_pairwise. Implemented expanding_cov_pairwise. Refactored the rolling moment functions to use _flex_pairwise_moment. 
--- pandas/stats/moments.py | 192 +++++++++++++++++++++++++--------------- 1 file changed, 122 insertions(+), 70 deletions(-) diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index ec01113abc8f2..814b0f67ea11f 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -20,13 +20,15 @@ 'rolling_sum', 'rolling_mean', 'rolling_std', 'rolling_cov', 'rolling_corr', 'rolling_var', 'rolling_skew', 'rolling_kurt', 'rolling_quantile', 'rolling_median', 'rolling_apply', - 'rolling_corr_pairwise', 'rolling_window', + 'rolling_cov_pairwise', 'rolling_corr_pairwise', 'rolling_window', 'ewma', 'ewmvar', 'ewmstd', 'ewmvol', 'ewmcorr', 'ewmcov', + 'ewmcorr_pairwise', 'ewmcov_pairwise', 'expanding_count', 'expanding_max', 'expanding_min', 'expanding_sum', 'expanding_mean', 'expanding_std', 'expanding_cov', 'expanding_corr', 'expanding_var', 'expanding_skew', 'expanding_kurt', 'expanding_quantile', - 'expanding_median', 'expanding_apply', 'expanding_corr_pairwise'] + 'expanding_median', 'expanding_apply', + 'expanding_cov_pairwise', 'expanding_corr_pairwise'] #------------------------------------------------------------------------------ # Docs @@ -102,7 +104,7 @@ Returns ------- -y : type of input argument +%s """ @@ -139,6 +141,8 @@ DataFrame / Series -> Computes result for each column Series / Series -> Series""" +_pairwise_retval = "y : Panel whose items are df1.index values" + _unary_arg = "arg : Series, DataFrame" _binary_arg_flex = """arg1 : Series, DataFrame, or ndarray @@ -147,6 +151,9 @@ _binary_arg = """arg1 : Series, DataFrame, or ndarray arg2 : Series, DataFrame, or ndarray""" +_pairwise_arg = """df1 : DataFrame +df2 : DataFrame""" + _bias_doc = r"""bias : boolean, default False Use a standard estimation bias correction """ @@ -232,7 +239,8 @@ def _flex_binary_moment(arg1, arg2, f): raise TypeError("arguments to moment function must be of type " "np.ndarray/Series/DataFrame") - if isinstance(arg1, (np.ndarray,Series)) and isinstance(arg2, 
(np.ndarray,Series)): + if isinstance(arg1, (np.ndarray, Series)) and \ + isinstance(arg2, (np.ndarray,Series)): X, Y = _prep_binary(arg1, arg2) return f(X, Y) elif isinstance(arg1, DataFrame): @@ -258,40 +266,55 @@ def _flex_binary_moment(arg1, arg2, f): return _flex_binary_moment(arg2, arg1, f) -def rolling_corr_pairwise(df, window, min_periods=None): - """ - Computes pairwise rolling correlation matrices as Panel whose items are - dates. - - Parameters - ---------- - df : DataFrame - window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. - min_periods : int, default None - Minimum number of observations in window required to have a value - (otherwise result is NA). - - Returns - ------- - correls : Panel - """ - from pandas import Panel +def _flex_pairwise_moment(moment_func, df1, df2, **kwargs): from collections import defaultdict + # Detect symmetry + if df2 is df1: + symmetric = True + else: + symmetric = False + all_results = defaultdict(dict) - for i, k1 in enumerate(df.columns): - for k2 in df.columns[i:]: - corr = rolling_corr(df[k1], df[k2], window, - min_periods=min_periods) - all_results[k1][k2] = corr - all_results[k2][k1] = corr + for i, k1 in enumerate(df1.columns): + for j, k2 in enumerate(df2.columns): + if j Date: Wed, 25 Sep 2013 18:04:13 +0200 Subject: [PATCH 3/5] ENH: Folded the *_pairwise rolling moment functions into the base function API. 
--- pandas/stats/moments.py | 222 ++++++++++++++--------------- pandas/stats/tests/test_moments.py | 30 ++-- 2 files changed, 116 insertions(+), 136 deletions(-) diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 814b0f67ea11f..59f804815b845 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -5,14 +5,14 @@ from __future__ import division from functools import wraps +from collections import defaultdict from numpy import NaN import numpy as np from pandas.core.api import DataFrame, Series, Panel, notnull import pandas.algos as algos -import pandas.core.common as com -from pandas.core.common import _values_from_object +import pandas.core.common as pdcom from pandas.util.decorators import Substitution, Appender @@ -20,15 +20,13 @@ 'rolling_sum', 'rolling_mean', 'rolling_std', 'rolling_cov', 'rolling_corr', 'rolling_var', 'rolling_skew', 'rolling_kurt', 'rolling_quantile', 'rolling_median', 'rolling_apply', - 'rolling_cov_pairwise', 'rolling_corr_pairwise', 'rolling_window', + 'rolling_corr_pairwise', 'rolling_window', 'ewma', 'ewmvar', 'ewmstd', 'ewmvol', 'ewmcorr', 'ewmcov', - 'ewmcorr_pairwise', 'ewmcov_pairwise', 'expanding_count', 'expanding_max', 'expanding_min', 'expanding_sum', 'expanding_mean', 'expanding_std', 'expanding_cov', 'expanding_corr', 'expanding_var', 'expanding_skew', 'expanding_kurt', 'expanding_quantile', - 'expanding_median', 'expanding_apply', - 'expanding_cov_pairwise', 'expanding_corr_pairwise'] + 'expanding_median', 'expanding_apply', 'expanding_corr_pairwise'] #------------------------------------------------------------------------------ # Docs @@ -203,25 +201,43 @@ def rolling_count(arg, window, freq=None, center=False, time_rule=None): @Substitution("Unbiased moving covariance.", _binary_arg_flex, _flex_retval) @Appender(_doc_template) -def rolling_cov(arg1, arg2, window, min_periods=None, freq=None, - center=False, time_rule=None): +def rolling_cov(arg1, arg2=None, window=None, min_periods=None, 
freq=None, + center=False, time_rule=None, pairwise=None): + if window is None and isinstance(arg2, (int, float)): + window = arg2 + arg2 = arg1 + pairwise = True if pairwise is None else pairwise # only default unset + elif arg2 is None: + arg2 = arg1 + pairwise = True if pairwise is None else pairwise # only default unset arg1 = _conv_timerule(arg1, freq, time_rule) arg2 = _conv_timerule(arg2, freq, time_rule) window = min(window, len(arg1), len(arg2)) def _get_cov(X, Y): - mean = lambda x: rolling_mean(x, window, min_periods,center=center) - count = rolling_count(X + Y, window,center=center) + mean = lambda x: rolling_mean(x, window, min_periods, center=center) + count = rolling_count(X + Y, window, center=center) bias_adj = count / (count - 1) return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj - rs = _flex_binary_moment(arg1, arg2, _get_cov) + rs = _flex_binary_moment(arg1, arg2, _get_cov, pairwise=bool(pairwise)) return rs @Substitution("Moving sample correlation.", _binary_arg_flex, _flex_retval) @Appender(_doc_template) -def rolling_corr(arg1, arg2, window, min_periods=None, freq=None, - center=False, time_rule=None): +def rolling_corr(arg1, arg2=None, window=None, min_periods=None, freq=None, + center=False, time_rule=None, pairwise=None): + if window is None and isinstance(arg2, (int, float)): + window = arg2 + arg2 = arg1 + pairwise = True if pairwise is None else pairwise # only default unset + elif arg2 is None: + arg2 = arg1 + pairwise = True if pairwise is None else pairwise # only default unset + arg1 = _conv_timerule(arg1, freq, time_rule) + arg2 = _conv_timerule(arg2, freq, time_rule) + window = min(window, len(arg1), len(arg2)) + def _get_corr(a, b): num = rolling_cov(a, b, window, min_periods, freq=freq, center=center, time_rule=time_rule) @@ -230,10 +246,10 @@ def _get_corr(a, b): rolling_std(b, window, min_periods, freq=freq, center=center, time_rule=time_rule)) return num / den - return _flex_binary_moment(arg1, arg2, _get_corr) + return 
_flex_binary_moment(arg1, arg2, _get_corr, pairwise=bool(pairwise)) -def _flex_binary_moment(arg1, arg2, f): +def _flex_binary_moment(arg1, arg2, f, pairwise=False): if not (isinstance(arg1,(np.ndarray, Series, DataFrame)) and isinstance(arg2,(np.ndarray, Series, DataFrame))): raise TypeError("arguments to moment function must be of type " @@ -249,10 +265,23 @@ def _flex_binary_moment(arg1, arg2, f): X, Y = arg1.align(arg2, join='outer') X = X + 0 * Y Y = Y + 0 * X - res_columns = arg1.columns.union(arg2.columns) - for col in res_columns: - if col in X and col in Y: - results[col] = f(X[col], Y[col]) + if pairwise is False: + res_columns = arg1.columns.union(arg2.columns) + for col in res_columns: + if col in X and col in Y: + results[col] = f(X[col], Y[col]) + elif pairwise is True: + results = defaultdict(dict) + for i, k1 in enumerate(arg1.columns): + for j, k2 in enumerate(arg2.columns): + if j rs.ndim-1: - raise ValueError("Requested axis is larger then no. of argument dimensions") + raise ValueError("Requested axis is larger then no. of argument " + "dimensions") offset = int((window - 1) / 2.) 
if isinstance(rs, (Series, DataFrame, Panel)): @@ -480,15 +471,23 @@ def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False, @Substitution("Exponentially-weighted moving covariance", _binary_arg, "", _type_of_input) @Appender(_ewm_doc) -def ewmcov(arg1, arg2, com=None, span=None, halflife=None, min_periods=0, bias=False, - freq=None, time_rule=None): +def ewmcov(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, bias=False, + freq=None, time_rule=None, pairwise=None): + if arg2 is None: + arg2 = arg1 + pairwise = True if pairwise is None else pairwise + elif isinstance(arg2, (int, float)) and com is None: + com = arg2 + arg2 = arg1 + pairwise = True if pairwise is None else pairwise arg1 = _conv_timerule(arg1, freq, time_rule) arg2 = _conv_timerule(arg2, freq, time_rule) def _get_ewmcov(X, Y): mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods) return (mean(X * Y) - mean(X) * mean(Y)) - result = _flex_binary_moment(arg1, arg2, _get_ewmcov) + result = _flex_binary_moment(arg1, arg2, _get_ewmcov, + pairwise=bool(pairwise)) if not bias: com = _get_center_of_mass(com, span, halflife) result *= (1.0 + 2.0 * com) / (2.0 * com) @@ -496,45 +495,31 @@ def _get_ewmcov(X, Y): return result -@Substitution("Pairwise exponentially-weighted moving covariance", - _pairwise_arg, "", _pairwise_retval) -@Appender(_ewm_doc) -def ewmcov_pairwise(df1, df2=None, com=None, span=None, min_periods=0, - bias=False, freq=None, time_rule=None): - if df2 is None: - df2 = df1 - return _flex_pairwise_moment(ewmcov, df1, df2, com=com, span=span, - min_periods=min_periods, bias=bias, freq=freq, time_rule=time_rule) - - @Substitution("Exponentially-weighted moving correlation", _binary_arg, "", _type_of_input) @Appender(_ewm_doc) -def ewmcorr(arg1, arg2, com=None, span=None, halflife=None, min_periods=0, - freq=None, time_rule=None): +def ewmcorr(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, + 
freq=None, time_rule=None, pairwise=None): + if arg2 is None: + arg2 = arg1 + pairwise = True if pairwise is None else pairwise + elif isinstance(arg2, (int, float)) and com is None: + com = arg2 + arg2 = arg1 + pairwise = True if pairwise is None else pairwise arg1 = _conv_timerule(arg1, freq, time_rule) arg2 = _conv_timerule(arg2, freq, time_rule) def _get_ewmcorr(X, Y): mean = lambda x: ewma(x, com=com, span=span, halflife=halflife, min_periods=min_periods) var = lambda x: ewmvar(x, com=com, span=span, halflife=halflife, min_periods=min_periods, - bias=True) + bias=True) return (mean(X * Y) - mean(X) * mean(Y)) / _zsqrt(var(X) * var(Y)) - result = _flex_binary_moment(arg1, arg2, _get_ewmcorr) + result = _flex_binary_moment(arg1, arg2, _get_ewmcorr, + pairwise=bool(pairwise)) return result -@Substitution("Pairwise exponentially-weighted moving correlation", - _pairwise_arg, "", _pairwise_retval) -@Appender(_ewm_doc) -def ewmcorr_pairwise(df1, df2=None, com=None, span=None, min_periods=0, - freq=None, time_rule=None): - if df2 is None: - df2 = df1 - return _flex_pairwise_moment(ewmcorr, df1, df2, com=com, span=span, - min_periods=min_periods, freq=freq, time_rule=time_rule) - - def _zsqrt(x): result = np.sqrt(x) mask = x < 0 @@ -779,8 +764,8 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None, if win_type is not None: raise ValueError(('Do not specify window type if using custom ' 'weights')) - window = com._asarray_tuplesafe(window).astype(float) - elif com.is_integer(window): # window size + window = pdcom._asarray_tuplesafe(window).astype(float) + elif pdcom.is_integer(window): # window size if win_type is None: raise ValueError('Must specify window type') try: @@ -928,34 +913,37 @@ def expanding_quantile(arg, quantile, min_periods=1, freq=None, @Substitution("Unbiased expanding covariance.", _binary_arg_flex, _flex_retval) @Appender(_expanding_doc) -def expanding_cov(arg1, arg2, min_periods=1, freq=None, center=False, - time_rule=None): 
+def expanding_cov(arg1, arg2=None, min_periods=1, freq=None, center=False, + time_rule=None, pairwise=None): + if arg2 is None: + arg2 = arg1 + pairwise = True if pairwise is None else pairwise + elif isinstance(arg2, (int, float)) and min_periods is None: + min_periods = arg2 + arg2 = arg1 + pairwise = True if pairwise is None else pairwise window = max(len(arg1), len(arg2)) return rolling_cov(arg1, arg2, window, min_periods=min_periods, freq=freq, - center=center, time_rule=time_rule) - - -@Substitution("Pairwise unbiased expanding covariance", _pairwise_arg, - _pairwise_retval) -@Appender(_expanding_doc) -def expanding_cov_pairwise(df1, df2=None, min_periods=1, freq=None, - center=False, time_rule=None): - if df2 is None: - df2 = df1 - return _flex_pairwise_moment(expanding_cov, df1, df2, - min_periods=min_periods, freq=freq, - center=center, time_rule=time_rule) + center=center, time_rule=time_rule, pairwise=pairwise) @Substitution("Expanding sample correlation.", _binary_arg_flex, _flex_retval) @Appender(_expanding_doc) -def expanding_corr(arg1, arg2, min_periods=1, freq=None, center=False, - time_rule=None): +def expanding_corr(arg1, arg2=None, min_periods=1, freq=None, center=False, + time_rule=None, pairwise=None): + if arg2 is None: + arg2 = arg1 + pairwise = True if pairwise is None else pairwise + elif isinstance(arg2, (int, float)) and min_periods is None: + min_periods = arg2 + arg2 = arg1 + pairwise = True if pairwise is None else pairwise window = max(len(arg1), len(arg2)) return rolling_corr(arg1, arg2, window, min_periods=min_periods, - freq=freq, center=center, time_rule=time_rule) + freq=freq, center=center, time_rule=time_rule, + pairwise=pairwise) @Substitution("Pairwise expanding sample correlation", _pairwise_arg, @@ -963,11 +951,9 @@ def expanding_corr(arg1, arg2, min_periods=1, freq=None, center=False, @Appender(_expanding_doc) def expanding_corr_pairwise(df1, df2=None, min_periods=1, freq=None, center=False, time_rule=None): - if df2 is 
None: - df2 = df1 - return _flex_pairwise_moment(expanding_corr, df1, df2, - min_periods=min_periods, freq=freq, - center=center, time_rule=time_rule) + return expanding_corr(df1, df2, min_periods=min_periods, + freq=freq, center=center, time_rule=time_rule, + pairwise=True) def expanding_apply(arg, func, min_periods=1, freq=None, center=False, diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index af77152fdd5b7..97f08e7052c87 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -587,8 +587,7 @@ def test_rolling_cov(self): assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) def test_rolling_cov_pairwise(self): - self._check_pairwise_moment(mom.rolling_cov_pairwise, - mom.rolling_cov, 10, min_periods=5) + self._check_pairwise_moment(mom.rolling_cov, 10, min_periods=5) def test_rolling_corr(self): A = self.series @@ -607,11 +606,10 @@ def test_rolling_corr(self): assert_almost_equal(result[-1], a.corr(b)) def test_rolling_corr_pairwise(self): - self._check_pairwise_moment(mom.rolling_corr_pairwise, - mom.rolling_corr, 10, min_periods=5) + self._check_pairwise_moment(mom.rolling_corr, 10, min_periods=5) - def _check_pairwise_moment(self, func_pairwise, func, *args, **kwargs): - panel = func_pairwise(self.frame, self.frame, *args, **kwargs) + def _check_pairwise_moment(self, func, *args, **kwargs): + panel = func(self.frame, *args, **kwargs) actual = panel.ix[:, 1, 5] expected = func(self.frame[1], self.frame[5], *args, **kwargs) @@ -674,15 +672,13 @@ def test_ewmcov(self): self._check_binary_ew(mom.ewmcov) def test_ewmcov_pairwise(self): - self._check_pairwise_moment(mom.ewmcov_pairwise, mom.ewmcov, 10, - min_periods=5) + self._check_pairwise_moment(mom.ewmcov, span=10, min_periods=5) def test_ewmcorr(self): self._check_binary_ew(mom.ewmcorr) def test_ewmcorr_pairwise(self): - self._check_pairwise_moment(mom.ewmcorr_pairwise, mom.ewmcorr, 10, - min_periods=5) + 
self._check_pairwise_moment(mom.ewmcorr, span=10, min_periods=5) def _check_binary_ew(self, func): A = Series(randn(50), index=np.arange(50)) @@ -762,21 +758,19 @@ def test_expanding_max(self): self._check_expanding(mom.expanding_max, np.max, preserve_nan=False) def test_expanding_cov_pairwise(self): - result = mom.expanding_cov_pairwise(self.frame) + result = mom.expanding_cov(self.frame) - rolling_result = mom.rolling_cov_pairwise(self.frame, - len(self.frame), - min_periods=1) + rolling_result = mom.rolling_cov(self.frame, len(self.frame), + min_periods=1) for i in result.items: assert_almost_equal(result[i], rolling_result[i]) def test_expanding_corr_pairwise(self): - result = mom.expanding_corr_pairwise(self.frame) + result = mom.expanding_corr(self.frame) - rolling_result = mom.rolling_corr_pairwise(self.frame, - len(self.frame), - min_periods=1) + rolling_result = mom.rolling_corr(self.frame, len(self.frame), + min_periods=1) for i in result.items: assert_almost_equal(result[i], rolling_result[i]) From 27442b1aaf6f5c61bf52b1488edb41bbe4c181d3 Mon Sep 17 00:00:00 2001 From: Tobias Brandt Date: Thu, 26 Sep 2013 11:34:48 +0200 Subject: [PATCH 4/5] DOC: Updated documentation for pairwise API changes. Added a release note and example usage for pairwise=True. --- doc/source/computation.rst | 68 +++++++++++++++---- doc/source/v0.14.0.txt | 13 ++++ pandas/stats/moments.py | 134 ++++++++++++++++++------------------- 3 files changed, 135 insertions(+), 80 deletions(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 66e0d457e33b6..7bd3c1aa03d90 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -59,6 +59,19 @@ The ``Series`` object has a method ``cov`` to compute covariance between series Analogously, ``DataFrame`` has a method ``cov`` to compute pairwise covariances among the series in the DataFrame, also excluding NA/null values. +.. _computation.covariance.caveats: + +.. 
note:: + + Assuming the missing data are missing at random this results in an estimate + for the covariance matrix which is unbiased. However, for many applications + this estimate may not be acceptable because the estimated covariance matrix + is not guaranteed to be positive semi-definite. This could lead to + estimated correlations having absolute values which are greater than one, + and/or a non-invertible covariance matrix. See `Estimation of covariance + matrices <https://en.wikipedia.org/wiki/Estimation_of_covariance_matrices>`_ + for more details. + .. ipython:: python frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) @@ -99,6 +112,12 @@ correlation methods are provided: All of these are currently computed using pairwise complete observations. +.. note:: + + Please see the :ref:`caveats <computation.covariance.caveats>` associated + with this method of calculating correlation matrices in the + :ref:`covariance section <computation.covariance>`. + .. ipython:: python frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) @@ -325,11 +344,14 @@ Binary rolling moments two ``Series`` or any combination of ``DataFrame/Series`` or ``DataFrame/DataFrame``. Here is the behavior in each case: -- two ``Series``: compute the statistic for the pairing +- two ``Series``: compute the statistic for the pairing. - ``DataFrame/Series``: compute the statistics for each column of the DataFrame - with the passed Series, thus returning a DataFrame -- ``DataFrame/DataFrame``: compute statistic for matching column names, - returning a DataFrame + with the passed Series, thus returning a DataFrame. +- ``DataFrame/DataFrame``: by default compute the statistic for matching column + names, returning a DataFrame. If the keyword argument ``pairwise=True`` is + passed then computes the statistic for each pair of columns, returning a + ``Panel`` whose ``items`` are the dates in question (see :ref:`the next section + <stats.moments.corr_pairwise>`). For example: @@ -340,20 +362,42 @@ For example: .. 
_stats.moments.corr_pairwise: -Computing rolling pairwise correlations -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Computing rolling pairwise covariances and correlations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In financial data analysis and other fields it's common to compute correlation -matrices for a collection of time series. More difficult is to compute a -moving-window correlation matrix. This can be done using the -``rolling_corr_pairwise`` function, which yields a ``Panel`` whose ``items`` -are the dates in question: +In financial data analysis and other fields it's common to compute covariance +and correlation matrices for a collection of time series. Often one is also +interested in moving-window covariance and correlation matrices. This can be +done by passing the ``pairwise`` keyword argument, which in the case of +``DataFrame`` inputs will yield a ``Panel`` whose ``items`` are the dates in +question. In the case of a single DataFrame argument the ``pairwise`` argument +can even be omitted: + +.. note:: + + Missing values are ignored and each entry is computed using the pairwise + complete observations. Please see the :ref:`covariance section + <computation.covariance>` for :ref:`caveats + <computation.covariance.caveats>` associated with this method of + calculating covariance and correlation matrices. .. ipython:: python - correls = rolling_corr_pairwise(df, 50) + covs = rolling_cov(df[['B','C','D']], df[['A','B','C']], 50, pairwise=True) + covs[df.index[-50]] + +.. ipython:: python + + correls = rolling_corr(df, 50) correls[df.index[-50]] +.. note:: + + Prior to version 0.14 this was available through ``rolling_corr_pairwise`` + which is now simply syntactic sugar for calling ``rolling_corr(..., + pairwise=True)`` and deprecated. This is likely to be removed in a future + release. 
+ You can efficiently retrieve the time series of correlations between two columns using ``ix`` indexing: diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 95537878871b1..344198d6e5ef1 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -183,6 +183,19 @@ These are out-of-bounds selections Because of the default `align` value changes, coordinates of bar plots are now located on integer values (0.0, 1.0, 2.0 ...). This is intended to make bar plot be located on the same coodinates as line plot. However, bar plot may differs unexpectedly when you manually adjust the bar location or drawing area, such as using `set_xlim`, `set_ylim`, etc. In this cases, please modify your script to meet with new coordinates. +- ``pairwise`` keyword was added to the statistical moment functions + ``rolling_cov``, ``rolling_corr``, ``ewmcov``, ``ewmcorr``, + ``expanding_cov``, ``expanding_corr`` to allow the calculation of moving + window covariance and correlation matrices (:issue:`4950`). See + :ref:`Computing rolling pairwise covariances and correlations + <stats.moments.corr_pairwise>` in the docs. + + .. ipython:: python + + df = DataFrame(np.random.randn(10,4),columns=list('ABCD')) + covs = rolling_cov(df[['A','B','C']], df[['B','C','D']], 5, pairwise=True) + covs[df.index[-1]] + MultiIndexing Using Slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 59f804815b845..5da5054d991dd 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -31,13 +31,22 @@ #------------------------------------------------------------------------------ # Docs +# The order of arguments for the _doc_template is: +# (header, args, kwargs, returns, notes) + _doc_template = """ %s Parameters ---------- +%s%s +Returns +------- +%s %s -window : int +""" + +_roll_kw = """window : int Size of the moving window. This is the number of observations used for calculating the statistic. min_periods : int, default None @@ -49,11 +58,9 @@ for `freq`. 
center : boolean, default False Set the labels at the center of the window. - -Returns -------- -%s +""" +_roll_notes = r""" Notes ----- By default, the result is set to the right edge of the window. This can be @@ -65,12 +72,7 @@ """ -_ewm_doc = r"""%s - -Parameters ----------- -%s -com : float. optional +_ewm_kw = r"""com : float. optional Center of mass: :math:`\alpha = 1 / (1 + com)`, span : float, optional Specify decay in terms of span, :math:`\alpha = 2 / (span + 1)` @@ -85,8 +87,9 @@ adjust : boolean, default True Divide by decaying adjustment factor in beginning periods to account for imbalance in relative weightings (viewing EWMA as a moving average) +""" -%s +_ewm_notes = """ Notes ----- Either center of mass or span must be specified @@ -99,60 +102,51 @@ :math:`c = (s - 1) / 2` So a "20-day EWMA" would have center 9.5. - -Returns -------- -%s """ - -_expanding_doc = """ -%s - -Parameters ----------- -%s -min_periods : int, default None +_expanding_kw = """min_periods : int, default None Minimum number of observations in window required to have a value (otherwise result is NA). freq : string or DateOffset object, optional (default None) Frequency to conform the data to before computing the statistic. Specified as a frequency string or DateOffset object. `time_rule` is a legacy alias for `freq`. - -Returns -------- -%s - -Notes ------ -The `freq` keyword is used to conform time series data to a specified -frequency by resampling the data. This is done with the default parameters -of :meth:`~pandas.Series.resample` (i.e. using the `mean`). 
""" -_type_of_input = "y : type of input argument" +_type_of_input_retval = "y : type of input argument" _flex_retval = """y : type depends on inputs - DataFrame / DataFrame -> DataFrame (matches on columns) + DataFrame / DataFrame -> DataFrame (matches on columns) or Panel (pairwise) DataFrame / Series -> Computes result for each column Series / Series -> Series""" _pairwise_retval = "y : Panel whose items are df1.index values" -_unary_arg = "arg : Series, DataFrame" +_unary_arg = "arg : Series, DataFrame\n" _binary_arg_flex = """arg1 : Series, DataFrame, or ndarray -arg2 : Series, DataFrame, or ndarray""" +arg2 : Series, DataFrame, or ndarray, optional + if not supplied then will default to arg1 and produce pairwise output +""" _binary_arg = """arg1 : Series, DataFrame, or ndarray -arg2 : Series, DataFrame, or ndarray""" +arg2 : Series, DataFrame, or ndarray +""" _pairwise_arg = """df1 : DataFrame -df2 : DataFrame""" +df2 : DataFrame +""" + +_pairwise_kw = """pairwise : bool, default False + If False then only matching columns between arg1 and arg2 will be used and + the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the output + will be a Panel in the case of DataFrame inputs. In the case of missing + elements, only complete pairwise observations will be used. 
+""" -_bias_doc = r"""bias : boolean, default False +_bias_kw = r"""bias : boolean, default False Use a standard estimation bias correction """ @@ -199,7 +193,8 @@ def rolling_count(arg, window, freq=None, center=False, time_rule=None): return return_hook(result) -@Substitution("Unbiased moving covariance.", _binary_arg_flex, _flex_retval) +@Substitution("Unbiased moving covariance.", _binary_arg_flex, + _roll_kw+_pairwise_kw, _flex_retval, _roll_notes) @Appender(_doc_template) def rolling_cov(arg1, arg2=None, window=None, min_periods=None, freq=None, center=False, time_rule=None, pairwise=None): @@ -223,7 +218,8 @@ def _get_cov(X, Y): return rs -@Substitution("Moving sample correlation.", _binary_arg_flex, _flex_retval) +@Substitution("Moving sample correlation.", _binary_arg_flex, + _roll_kw+_pairwise_kw, _flex_retval, _roll_notes) @Appender(_doc_template) def rolling_corr(arg1, arg2=None, window=None, min_periods=None, freq=None, center=False, time_rule=None, pairwise=None): @@ -296,7 +292,7 @@ def _flex_binary_moment(arg1, arg2, f, pairwise=False): @Substitution("Pairwise moving sample correlation", _pairwise_arg, - _pairwise_retval) + _roll_kw, _pairwise_retval, _roll_notes) @Appender(_doc_template) def rolling_corr_pairwise(df1, df2=None, window=None, min_periods=None, freq=None, center=False, time_rule=None): @@ -415,9 +411,9 @@ def _get_center_of_mass(com, span, halflife): return float(com) -@Substitution("Exponentially-weighted moving average", _unary_arg, "", - _type_of_input) -@Appender(_ewm_doc) +@Substitution("Exponentially-weighted moving average", _unary_arg, _ewm_kw, + _type_of_input_retval, _ewm_notes) +@Appender(_doc_template) def ewma(arg, com=None, span=None, halflife=None, min_periods=0, freq=None, time_rule=None, adjust=True): com = _get_center_of_mass(com, span, halflife) @@ -439,9 +435,9 @@ def _first_valid_index(arr): return notnull(arr).argmax() if len(arr) else 0 -@Substitution("Exponentially-weighted moving variance", _unary_arg, 
_bias_doc, - _type_of_input) -@Appender(_ewm_doc) +@Substitution("Exponentially-weighted moving variance", _unary_arg, + _ewm_kw+_bias_kw, _type_of_input_retval, _ewm_notes) +@Appender(_doc_template) def ewmvar(arg, com=None, span=None, halflife=None, min_periods=0, bias=False, freq=None, time_rule=None): com = _get_center_of_mass(com, span, halflife) @@ -456,9 +452,9 @@ def ewmvar(arg, com=None, span=None, halflife=None, min_periods=0, bias=False, return result -@Substitution("Exponentially-weighted moving std", _unary_arg, _bias_doc, - _type_of_input) -@Appender(_ewm_doc) +@Substitution("Exponentially-weighted moving std", _unary_arg, + _ewm_kw+_bias_kw, _type_of_input_retval, _ewm_notes) +@Appender(_doc_template) def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False, time_rule=None): result = ewmvar(arg, com=com, span=span, halflife=halflife, time_rule=time_rule, @@ -468,9 +464,9 @@ def ewmstd(arg, com=None, span=None, halflife=None, min_periods=0, bias=False, ewmvol = ewmstd -@Substitution("Exponentially-weighted moving covariance", _binary_arg, "", - _type_of_input) -@Appender(_ewm_doc) +@Substitution("Exponentially-weighted moving covariance", _binary_arg_flex, + _ewm_kw+_pairwise_kw, _type_of_input_retval, _ewm_notes) +@Appender(_doc_template) def ewmcov(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, bias=False, freq=None, time_rule=None, pairwise=None): if arg2 is None: @@ -495,9 +491,9 @@ def _get_ewmcov(X, Y): return result -@Substitution("Exponentially-weighted moving correlation", _binary_arg, "", - _type_of_input) -@Appender(_ewm_doc) +@Substitution("Exponentially-weighted moving correlation", _binary_arg_flex, + _ewm_kw+_pairwise_kw, _type_of_input_retval, _ewm_notes) +@Appender(_doc_template) def ewmcorr(arg1, arg2=None, com=None, span=None, halflife=None, min_periods=0, freq=None, time_rule=None, pairwise=None): if arg2 is None: @@ -581,7 +577,7 @@ def _use_window(minp, window): def _rolling_func(func, 
desc, check_minp=_use_window): - @Substitution(desc, _unary_arg, _type_of_input) + @Substitution(desc, _unary_arg, _roll_kw, _type_of_input_retval, _roll_notes) @Appender(_doc_template) @wraps(func) def f(arg, window, min_periods=None, freq=None, center=False, @@ -814,8 +810,8 @@ def _pop_args(win_type, arg_names, kwargs): def _expanding_func(func, desc, check_minp=_use_window): - @Substitution(desc, _unary_arg, _type_of_input) - @Appender(_expanding_doc) + @Substitution(desc, _unary_arg, _expanding_kw, _type_of_input_retval, "") + @Appender(_doc_template) @wraps(func) def f(arg, min_periods=1, freq=None, center=False, time_rule=None, **kwargs): @@ -911,8 +907,9 @@ def expanding_quantile(arg, quantile, min_periods=1, freq=None, freq=freq, center=center, time_rule=time_rule) -@Substitution("Unbiased expanding covariance.", _binary_arg_flex, _flex_retval) -@Appender(_expanding_doc) +@Substitution("Unbiased expanding covariance.", _binary_arg_flex, + _expanding_kw+_pairwise_kw, _flex_retval, "") +@Appender(_doc_template) def expanding_cov(arg1, arg2=None, min_periods=1, freq=None, center=False, time_rule=None, pairwise=None): if arg2 is None: @@ -928,8 +925,9 @@ def expanding_cov(arg1, arg2=None, min_periods=1, freq=None, center=False, center=center, time_rule=time_rule, pairwise=pairwise) -@Substitution("Expanding sample correlation.", _binary_arg_flex, _flex_retval) -@Appender(_expanding_doc) +@Substitution("Expanding sample correlation.", _binary_arg_flex, + _expanding_kw+_pairwise_kw, _flex_retval, "") +@Appender(_doc_template) def expanding_corr(arg1, arg2=None, min_periods=1, freq=None, center=False, time_rule=None, pairwise=None): if arg2 is None: @@ -947,8 +945,8 @@ def expanding_corr(arg1, arg2=None, min_periods=1, freq=None, center=False, @Substitution("Pairwise expanding sample correlation", _pairwise_arg, - _pairwise_retval) -@Appender(_expanding_doc) + _expanding_kw, _pairwise_retval, "") +@Appender(_doc_template) def expanding_corr_pairwise(df1, 
df2=None, min_periods=1, freq=None, center=False, time_rule=None): return expanding_corr(df1, df2, min_periods=min_periods, From 1fcb94e07bba92354d250182f186d9c9d52194ed Mon Sep 17 00:00:00 2001 From: Tobias Brandt Date: Fri, 28 Mar 2014 14:52:32 +0200 Subject: [PATCH 5/5] ENH: Marked rolling_corr_pairwise and expanding_corr_pairwise as deprecated. --- pandas/stats/moments.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 5da5054d991dd..523f055eaf605 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -291,11 +291,14 @@ def _flex_binary_moment(arg1, arg2, f, pairwise=False): return _flex_binary_moment(arg2, arg1, f) -@Substitution("Pairwise moving sample correlation", _pairwise_arg, +@Substitution("Deprecated. Use rolling_corr(..., pairwise=True) instead.\n\n" + "Pairwise moving sample correlation", _pairwise_arg, _roll_kw, _pairwise_retval, _roll_notes) @Appender(_doc_template) def rolling_corr_pairwise(df1, df2=None, window=None, min_periods=None, freq=None, center=False, time_rule=None): + import warnings + warnings.warn("rolling_corr_pairwise is deprecated, use rolling_corr(..., pairwise=True)", FutureWarning) return rolling_corr(df1, df2, window=window, min_periods=min_periods, freq=freq, center=center, time_rule=time_rule, pairwise=True) @@ -944,11 +947,14 @@ def expanding_corr(arg1, arg2=None, min_periods=1, freq=None, center=False, pairwise=pairwise) -@Substitution("Pairwise expanding sample correlation", _pairwise_arg, +@Substitution("Deprecated. 
Use expanding_corr(..., pairwise=True) instead.\n\n" + "Pairwise expanding sample correlation", _pairwise_arg, _expanding_kw, _pairwise_retval, "") @Appender(_doc_template) def expanding_corr_pairwise(df1, df2=None, min_periods=1, freq=None, center=False, time_rule=None): + import warnings + warnings.warn("expanding_corr_pairwise is deprecated, use expanding_corr(..., pairwise=True)", FutureWarning) return expanding_corr(df1, df2, min_periods=min_periods, freq=freq, center=center, time_rule=time_rule, pairwise=True)