From dbd1322a225127fb6807332cb00b0144019ec6c0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 27 Mar 2017 18:26:55 -0400 Subject: [PATCH 1/3] BUG/PERF: handle a slice correctly in get_level_indexer --- pandas/indexes/multi.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index f12b10ae682fa..9e1ccde9ddc23 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -2069,20 +2069,14 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): else: loc = level_index.get_loc(key) - if level > 0 or self.lexsort_depth == 0: + if isinstance(loc, slice): + return loc + elif level > 0 or self.lexsort_depth == 0: return np.array(labels == loc, dtype=bool) - else: - # sorted, so can return slice object -> view - try: - loc = labels.dtype.type(loc) - except TypeError: - # this occurs when loc is a slice (partial string indexing) - # but the TypeError raised by searchsorted in this case - # is catched in Index._has_valid_type() - pass - i = labels.searchsorted(loc, side='left') - j = labels.searchsorted(loc, side='right') - return slice(i, j) + + i = labels.searchsorted(loc, side='left') + j = labels.searchsorted(loc, side='right') + return slice(i, j) def get_locs(self, tup): """ From ed7d927c25b53cad8a7152fdd0c52d597f7e0cff Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 8 Mar 2017 10:34:09 -0500 Subject: [PATCH 2/3] API: df.rolling(..).corr()/cov() when pairwise=True to return MI DataFrame xref #15601 --- doc/source/computation.rst | 20 +- doc/source/whatsnew/v0.20.0.txt | 46 ++++ pandas/core/window.py | 27 ++- pandas/tests/test_window.py | 409 ++++++++++++++++---------------- 4 files changed, 297 insertions(+), 205 deletions(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 57480a244f308..86370709cde41 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -505,13 +505,18 @@ two ``Series`` or any 
combination of ``DataFrame/Series`` or - ``DataFrame/DataFrame``: by default compute the statistic for matching column names, returning a DataFrame. If the keyword argument ``pairwise=True`` is passed then computes the statistic for each pair of columns, returning a - ``Panel`` whose ``items`` are the dates in question (see :ref:`the next section + ``MultiIndexed DataFrame`` whose ``index`` are the dates in question (see :ref:`the next section `). For example: .. ipython:: python + df = pd.DataFrame(np.random.randn(1000, 4), + index=pd.date_range('1/1/2000', periods=1000), + columns=['A', 'B', 'C', 'D']) + df = df.cumsum() + df2 = df[:20] df2.rolling(window=5).corr(df2['B']) @@ -520,11 +525,16 @@ For example: Computing rolling pairwise covariances and correlations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. warning:: + + Prior to version 0.20.0 if ``pairwise=True`` was passed, a ``Panel`` would be returned. + This will now return a 2-level MultiIndexed DataFrame, see the whatsnew :ref:`here ` + In financial data analysis and other fields it's common to compute covariance and correlation matrices for a collection of time series. Often one is also interested in moving-window covariance and correlation matrices. This can be done by passing the ``pairwise`` keyword argument, which in the case of -``DataFrame`` inputs will yield a ``Panel`` whose ``items`` are the dates in +``DataFrame`` inputs will yield a ``MultiIndexed DataFrame`` whose ``index`` are the dates in question. In the case of a single DataFrame argument the ``pairwise`` argument can even be omitted: @@ -539,12 +549,12 @@ can even be omitted: .. ipython:: python covs = df[['B','C','D']].rolling(window=50).cov(df[['A','B','C']], pairwise=True) - covs[df.index[-50]] + covs.iloc[-50].unstack() .. 
ipython:: python correls = df.rolling(window=50).corr() - correls[df.index[-50]] + correls.iloc[-50].unstack() You can efficiently retrieve the time series of correlations between two columns using ``.loc`` indexing: @@ -557,7 +567,7 @@ columns using ``.loc`` indexing: .. ipython:: python @savefig rolling_corr_pairwise_ex.png - correls.loc[:, 'A', 'C'].plot() + correls[('A', 'C')].plot() .. _stats.aggregate: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 74fe7916523c5..da6ee6e911d50 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -12,11 +12,13 @@ Highlights include: - The ``.ix`` indexer has been deprecated, see :ref:`here ` - Improved user API when accessing levels in ``.groupby()``, see :ref:`here ` - Improved support for UInt64 dtypes, see :ref:`here ` +- Window Binary Corr/Cov operations return a MultiIndex DataFrame rather than a Panel, see :ref:`here ` - A new orient for JSON serialization, ``orient='table'``, that uses the Table Schema spec, see :ref:`here ` - Support for S3 handling now uses ``s3fs``, see :ref:`here ` - Google BigQuery support now uses the ``pandas-gbq`` library, see :ref:`here ` - Switched the test framework to use `pytest `__ (:issue:`13097`) + Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. .. contents:: What's new in v0.20.0 @@ -766,6 +768,50 @@ New Behavior: df.groupby('A').agg([np.mean, np.std, np.min, np.max]) +.. _whatsnew_0200.api_breaking.rolling_pairwise: + +Window Binary Corr/Cov operations return a MultiIndex DataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A binary window operation, like ``.corr()`` or ``.cov()``, when operating on a ``.rolling(..)``, ``.expanding(..)``, or ``.ewm(..)`` object, +will now return a 2-level ``MultiIndexed DataFrame`` rather than a ``Panel``. These are equivalent in function, +but MultiIndexed DataFrames enjoy more support in pandas. 
+See the section on :ref:`Windowed Binary Operations ` for more information. (:issue:`15677`) + +.. ipython:: python + + np.random.seed(1234) + df = DataFrame(np.random.rand(100, 2), + columns=['A', 'B'], + index=pd.date_range('20160101', periods=100, freq='D')) + df + +Old Behavior: + +.. code-block:: ipython + + In [2]: df.rolling(12).corr() + Out[2]: + + Dimensions: 100 (items) x 2 (major_axis) x 2 (minor_axis) + Items axis: 2016-01-01 00:00:00 to 2016-04-09 00:00:00 + Major_axis axis: A to B + Minor_axis axis: A to B + +New Behavior: + +.. ipython:: python + + res = df.rolling(12).corr() + res + +Retrieving a correlation matrix for a specified index + +.. ipython:: python + + res.iloc[-1].unstack() + + .. _whatsnew_0200.api_breaking.hdfstore_where: HDFStore where string comparison diff --git a/pandas/core/window.py b/pandas/core/window.py index 9c9f861451309..a978dd6827e77 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1652,7 +1652,8 @@ def _cov(x, y): def _flex_binary_moment(arg1, arg2, f, pairwise=False): - from pandas import Series, DataFrame, Panel + from pandas import Series, DataFrame + if not (isinstance(arg1, (np.ndarray, Series, DataFrame)) and isinstance(arg2, (np.ndarray, Series, DataFrame))): raise TypeError("arguments to moment function must be of type " @@ -1703,12 +1704,34 @@ def dataframe_from_int_dict(data, frame_template): else: results[i][j] = f(*_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])) + + # TODO: not the most efficient (perf-wise) + # though not bad code-wise + from pandas import Panel, MultiIndex p = Panel.from_dict(results).swapaxes('items', 'major') if len(p.major_axis) > 0: p.major_axis = arg1.columns[p.major_axis] if len(p.minor_axis) > 0: p.minor_axis = arg2.columns[p.minor_axis] - return p + + if len(p.items): + result = pd.concat( + [p.iloc[i].T for i in range(len(p.items))], + keys=p.items) + else: + + result = DataFrame( + index=MultiIndex(levels=[arg1.columns, arg2.index], + labels=[[], []]), + 
columns=arg1.columns, + dtype='float64') + + # reset our names + result.columns.name = None + result.index.names = ['major', 'minor'] + + return result + else: raise ValueError("'pairwise' is not True/False") else: diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index ceb12c6c03074..8b84631c60969 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -10,8 +10,8 @@ from distutils.version import LooseVersion import pandas as pd -from pandas import (Series, DataFrame, Panel, bdate_range, isnull, - notnull, concat, Timestamp) +from pandas import (Series, DataFrame, bdate_range, isnull, + notnull, concat, Timestamp, Index) import pandas.stats.moments as mom import pandas.core.window as rwindow import pandas.tseries.offsets as offsets @@ -172,7 +172,7 @@ def test_agg_consistency(self): tm.assert_index_equal(result, expected) result = r['A'].agg([np.sum, np.mean]).columns - expected = pd.Index(['sum', 'mean']) + expected = Index(['sum', 'mean']) tm.assert_index_equal(result, expected) result = r.agg({'A': [np.sum, np.mean]}).columns @@ -1688,6 +1688,162 @@ def _check_ew_structures(self, func, name): self.assertEqual(type(frame_result), DataFrame) +class TestPairwise(object): + + # GH 7738 + df1s = [DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], + columns=['C', 'C']), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1., 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 1]), + DataFrame([[2., 4.], [1., 2.], [5., 2.], [8., 1.]], + columns=[1, 0.]), + DataFrame([[2, 4.], [1, 2.], [5, 2.], [8, 1.]], + columns=[0, 1.]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.]], + columns=[1., 'X']), ] + df2 = DataFrame([[None, 1, 1], [None, 1, 2], + [None, 3, 2], [None, 8, 1]], 
columns=['Y', 'Z', 'X']) + s = Series([1, 1, 3, 8]) + + def compare(self, result, expected): + + # since we have sorted the results + # we can only compare non-nans + result = result.dropna().values + expected = expected.dropna().values + + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize('f', [lambda x: x.cov(), lambda x: x.corr()]) + def test_no_flex(self, f): + + # DataFrame methods (which do not call _flex_binary_moment()) + + with warnings.catch_warnings(record=True): + + results = [f(df) for df in self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index, df.columns) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + 'f', [lambda x: x.expanding().cov(pairwise=True), + lambda x: x.expanding().corr(pairwise=True), + lambda x: x.rolling(window=3).cov(pairwise=True), + lambda x: x.rolling(window=3).corr(pairwise=True), + lambda x: x.ewm(com=3).cov(pairwise=True), + lambda x: x.ewm(com=3).corr(pairwise=True)]) + def test_pairwise_with_self(self, f): + + # DataFrame with itself, pairwise=True + results = [f(df) for df in self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index.levels[0], + df.index, + check_names=False) + tm.assert_index_equal(result.index.levels[1], + df.columns, + check_names=False) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + 'f', [lambda x: x.expanding().cov(pairwise=False), + lambda x: x.expanding().corr(pairwise=False), + lambda x: x.rolling(window=3).cov(pairwise=False), + lambda x: x.rolling(window=3).corr(pairwise=False), + lambda x: x.ewm(com=3).cov(pairwise=False), + lambda x: x.ewm(com=3).corr(pairwise=False), ]) + def test_no_pairwise_with_self(self, f): + + # DataFrame with itself, pairwise=False 
+ results = [f(df) for df in self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + 'f', [lambda x, y: x.expanding().cov(y, pairwise=True), + lambda x, y: x.expanding().corr(y, pairwise=True), + lambda x, y: x.rolling(window=3).cov(y, pairwise=True), + lambda x, y: x.rolling(window=3).corr(y, pairwise=True), + lambda x, y: x.ewm(com=3).cov(y, pairwise=True), + lambda x, y: x.ewm(com=3).corr(y, pairwise=True), ]) + def test_pairwise_with_other(self, f): + + # DataFrame with another DataFrame, pairwise=True + results = [f(df, self.df2) for df in self.df1s] + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index.levels[0], + df.index, + check_names=False) + tm.assert_index_equal(result.index.levels[1], + self.df2.columns, + check_names=False) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + @pytest.mark.parametrize( + 'f', [lambda x, y: x.expanding().cov(y, pairwise=False), + lambda x, y: x.expanding().corr(y, pairwise=False), + lambda x, y: x.rolling(window=3).cov(y, pairwise=False), + lambda x, y: x.rolling(window=3).corr(y, pairwise=False), + lambda x, y: x.ewm(com=3).cov(y, pairwise=False), + lambda x, y: x.ewm(com=3).corr(y, pairwise=False), ]) + def test_no_pairwise_with_other(self, f): + + with warnings.catch_warnings(record=True): + + # DataFrame with another DataFrame, pairwise=False + results = [f(df, self.df2) if df.columns.is_unique else None + for df in self.df1s] + for (df, result) in zip(self.df1s, results): + if result is not None: + expected_index = df.index.union(self.df2.index) + expected_columns = df.columns.union(self.df2.columns) + tm.assert_index_equal(result.index, expected_index) + tm.assert_index_equal(result.columns, expected_columns) + else: + 
tm.assertRaisesRegexp( + ValueError, "'arg1' columns are not unique", f, df, + self.df2) + tm.assertRaisesRegexp( + ValueError, "'arg2' columns are not unique", f, + self.df2, df) + + @pytest.mark.parametrize( + 'f', [lambda x, y: x.expanding().cov(y), + lambda x, y: x.expanding().corr(y), + lambda x, y: x.rolling(window=3).cov(y), + lambda x, y: x.rolling(window=3).corr(y), + lambda x, y: x.ewm(com=3).cov(y), + lambda x, y: x.ewm(com=3).corr(y), ]) + def test_pairwise_with_series(self, f): + + # DataFrame with a Series + results = ([f(df, self.s) for df in self.df1s] + + [f(self.s, df) for df in self.df1s]) + for (df, result) in zip(self.df1s, results): + tm.assert_index_equal(result.index, df.index) + tm.assert_index_equal(result.columns, df.columns) + for i, result in enumerate(results): + if i > 0: + self.compare(result, results[0]) + + # create the data only once as we are not setting it def _create_consistency_data(): def create_series(): @@ -2083,21 +2239,6 @@ def test_expanding_consistency(self): assert_equal(expanding_f_result, expanding_apply_f_result) - if (name in ['cov', 'corr']) and isinstance(x, - DataFrame): - # test pairwise=True - expanding_f_result = expanding_f(x, pairwise=True) - expected = Panel(items=x.index, - major_axis=x.columns, - minor_axis=x.columns) - for i, _ in enumerate(x.columns): - for j, _ in enumerate(x.columns): - expected.iloc[:, i, j] = getattr( - x.iloc[:, i].expanding( - min_periods=min_periods), - name)(x.iloc[:, j]) - tm.assert_panel_equal(expanding_f_result, expected) - @tm.slow def test_rolling_consistency(self): @@ -2203,25 +2344,6 @@ def cases(): assert_equal(rolling_f_result, rolling_apply_f_result) - if (name in ['cov', 'corr']) and isinstance( - x, DataFrame): - # test pairwise=True - rolling_f_result = rolling_f(x, - pairwise=True) - expected = Panel(items=x.index, - major_axis=x.columns, - minor_axis=x.columns) - for i, _ in enumerate(x.columns): - for j, _ in enumerate(x.columns): - expected.iloc[:, i, j] = ( - 
getattr( - x.iloc[:, i] - .rolling(window=window, - min_periods=min_periods, - center=center), - name)(x.iloc[:, j])) - tm.assert_panel_equal(rolling_f_result, expected) - # binary moments def test_rolling_cov(self): A = self.series @@ -2257,11 +2379,11 @@ def _check_pairwise_moment(self, dispatch, name, **kwargs): def get_result(obj, obj2=None): return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) - panel = get_result(self.frame) - actual = panel.loc[:, 1, 5] + result = get_result(self.frame) + result = result.loc[(slice(None), 1), 5] + result.index = result.index.droplevel(1) expected = get_result(self.frame[1], self.frame[5]) - tm.assert_series_equal(actual, expected, check_names=False) - self.assertEqual(actual.name, 5) + tm.assert_series_equal(result, expected, check_names=False) def test_flex_binary_moment(self): # GH3155 @@ -2429,17 +2551,14 @@ def test_expanding_cov_pairwise(self): rolling_result = self.frame.rolling(window=len(self.frame), min_periods=1).corr() - for i in result.items: - tm.assert_almost_equal(result[i], rolling_result[i]) + tm.assert_frame_equal(result, rolling_result) def test_expanding_corr_pairwise(self): result = self.frame.expanding().corr() rolling_result = self.frame.rolling(window=len(self.frame), min_periods=1).corr() - - for i in result.items: - tm.assert_almost_equal(result[i], rolling_result[i]) + tm.assert_frame_equal(result, rolling_result) def test_expanding_cov_diff_index(self): # GH 7512 @@ -2507,8 +2626,6 @@ def test_rolling_functions_window_non_shrinkage(self): s_expected = Series(np.nan, index=s.index) df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=['A', 'B']) df_expected = DataFrame(np.nan, index=df.index, columns=df.columns) - df_expected_panel = Panel(items=df.index, major_axis=df.columns, - minor_axis=df.columns) functions = [lambda x: (x.rolling(window=10, min_periods=5) .cov(x, pairwise=False)), @@ -2540,13 +2657,22 @@ def test_rolling_functions_window_non_shrinkage(self): # scipy needed for 
rolling_window continue + def test_rolling_functions_window_non_shrinkage_binary(self): + + # corr/cov return a MI DataFrame + df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=['A', 'B']) + df_expected = DataFrame( + columns=df.columns, + index=pd.MultiIndex.from_product([df.index, df.columns], + names=['major', 'minor']), + dtype='float64') functions = [lambda x: (x.rolling(window=10, min_periods=5) .cov(x, pairwise=True)), lambda x: (x.rolling(window=10, min_periods=5) .corr(x, pairwise=True))] for f in functions: - df_result_panel = f(df) - tm.assert_panel_equal(df_result_panel, df_expected_panel) + df_result = f(df) + tm.assert_frame_equal(df_result, df_expected) def test_moment_functions_zero_length(self): # GH 8056 @@ -2554,13 +2680,9 @@ def test_moment_functions_zero_length(self): s_expected = s df1 = DataFrame() df1_expected = df1 - df1_expected_panel = Panel(items=df1.index, major_axis=df1.columns, - minor_axis=df1.columns) df2 = DataFrame(columns=['a']) df2['a'] = df2['a'].astype('float64') df2_expected = df2 - df2_expected_panel = Panel(items=df2.index, major_axis=df2.columns, - minor_axis=df2.columns) functions = [lambda x: x.expanding().count(), lambda x: x.expanding(min_periods=5).cov( @@ -2613,6 +2735,23 @@ def test_moment_functions_zero_length(self): # scipy needed for rolling_window continue + def test_moment_functions_zero_length_pairwise(self): + + df1 = DataFrame() + df1_expected = df1 + df2 = DataFrame(columns=['a']) + df2['a'] = df2['a'].astype('float64') + + df1_expected = DataFrame( + index=pd.MultiIndex.from_product([df1.columns, df1.index], + names=['major', 'minor']), + columns=df1.columns) + df2_expected = DataFrame( + index=pd.MultiIndex.from_product([df2.columns, df2.index], + names=['major', 'minor']), + columns=df2.columns, + dtype='float64') + functions = [lambda x: (x.expanding(min_periods=5) .cov(x, pairwise=True)), lambda x: (x.expanding(min_periods=5) @@ -2623,11 +2762,11 @@ def test_moment_functions_zero_length(self): 
.corr(x, pairwise=True)), ] for f in functions: - df1_result_panel = f(df1) - tm.assert_panel_equal(df1_result_panel, df1_expected_panel) + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) - df2_result_panel = f(df2) - tm.assert_panel_equal(df2_result_panel, df2_expected_panel) + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) def test_expanding_cov_pairwise_diff_length(self): # GH 7512 @@ -2635,12 +2774,13 @@ def test_expanding_cov_pairwise_diff_length(self): df1a = DataFrame([[1, 5], [3, 9]], index=[0, 2], columns=['A', 'B']) df2 = DataFrame([[5, 6], [None, None], [2, 1]], columns=['X', 'Y']) df2a = DataFrame([[5, 6], [2, 1]], index=[0, 2], columns=['X', 'Y']) - result1 = df1.expanding().cov(df2a, pairwise=True)[2] - result2 = df1.expanding().cov(df2a, pairwise=True)[2] - result3 = df1a.expanding().cov(df2, pairwise=True)[2] - result4 = df1a.expanding().cov(df2a, pairwise=True)[2] - expected = DataFrame([[-3., -5.], [-6., -10.]], index=['A', 'B'], - columns=['X', 'Y']) + result1 = df1.expanding().cov(df2a, pairwise=True).loc[2] + result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] + result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] + expected = DataFrame([[-3.0, -6.0], [-5.0, -10.0]], + columns=['A', 'B'], + index=Index(['X', 'Y'], name='minor')) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result3, expected) @@ -2652,145 +2792,18 @@ def test_expanding_corr_pairwise_diff_length(self): df1a = DataFrame([[1, 2], [3, 4]], index=[0, 2], columns=['A', 'B']) df2 = DataFrame([[5, 6], [None, None], [2, 1]], columns=['X', 'Y']) df2a = DataFrame([[5, 6], [2, 1]], index=[0, 2], columns=['X', 'Y']) - result1 = df1.expanding().corr(df2, pairwise=True)[2] - result2 = df1.expanding().corr(df2a, pairwise=True)[2] - result3 = df1a.expanding().corr(df2, pairwise=True)[2] - result4 = 
df1a.expanding().corr(df2a, pairwise=True)[2] - expected = DataFrame([[-1.0, -1.0], [-1.0, -1.0]], index=['A', 'B'], - columns=['X', 'Y']) + result1 = df1.expanding().corr(df2, pairwise=True).loc[2] + result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] + result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] + expected = DataFrame([[-1.0, -1.0], [-1.0, -1.0]], + columns=['A', 'B'], + index=Index(['X', 'Y'], name='minor')) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result3, expected) tm.assert_frame_equal(result4, expected) - def test_pairwise_stats_column_names_order(self): - # GH 7738 - df1s = [DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], - columns=['C', 'C']), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1., 0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 1]), - DataFrame([[2., 4.], [1., 2.], [5., 2.], [8., 1.]], - columns=[1, 0.]), - DataFrame([[2, 4.], [1, 2.], [5, 2.], [8, 1.]], - columns=[0, 1.]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.]], - columns=[1., 'X']), ] - df2 = DataFrame([[None, 1, 1], [None, 1, 2], - [None, 3, 2], [None, 8, 1]], columns=['Y', 'Z', 'X']) - s = Series([1, 1, 3, 8]) - - # suppress warnings about incomparable objects, as we are deliberately - # testing with such column labels - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", - message=".*incomparable objects.*", - category=RuntimeWarning) - - # DataFrame methods (which do not call _flex_binary_moment()) - for f in [lambda x: x.cov(), lambda x: x.corr(), ]: - results = [f(df) for df in df1s] - for (df, result) in zip(df1s, results): - 
tm.assert_index_equal(result.index, df.columns) - tm.assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - # compare internal values, as columns can be different - self.assert_numpy_array_equal(result.values, - results[0].values) - - # DataFrame with itself, pairwise=True - for f in [lambda x: x.expanding().cov(pairwise=True), - lambda x: x.expanding().corr(pairwise=True), - lambda x: x.rolling(window=3).cov(pairwise=True), - lambda x: x.rolling(window=3).corr(pairwise=True), - lambda x: x.ewm(com=3).cov(pairwise=True), - lambda x: x.ewm(com=3).corr(pairwise=True), ]: - results = [f(df) for df in df1s] - for (df, result) in zip(df1s, results): - tm.assert_index_equal(result.items, df.index) - tm.assert_index_equal(result.major_axis, df.columns) - tm.assert_index_equal(result.minor_axis, df.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equal(result.values, - results[0].values) - - # DataFrame with itself, pairwise=False - for f in [lambda x: x.expanding().cov(pairwise=False), - lambda x: x.expanding().corr(pairwise=False), - lambda x: x.rolling(window=3).cov(pairwise=False), - lambda x: x.rolling(window=3).corr(pairwise=False), - lambda x: x.ewm(com=3).cov(pairwise=False), - lambda x: x.ewm(com=3).corr(pairwise=False), ]: - results = [f(df) for df in df1s] - for (df, result) in zip(df1s, results): - tm.assert_index_equal(result.index, df.index) - tm.assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equal(result.values, - results[0].values) - - # DataFrame with another DataFrame, pairwise=True - for f in [lambda x, y: x.expanding().cov(y, pairwise=True), - lambda x, y: x.expanding().corr(y, pairwise=True), - lambda x, y: x.rolling(window=3).cov(y, pairwise=True), - lambda x, y: x.rolling(window=3).corr(y, pairwise=True), - lambda x, y: x.ewm(com=3).cov(y, pairwise=True), - lambda x, y: x.ewm(com=3).corr(y, 
pairwise=True), ]: - results = [f(df, df2) for df in df1s] - for (df, result) in zip(df1s, results): - tm.assert_index_equal(result.items, df.index) - tm.assert_index_equal(result.major_axis, df.columns) - tm.assert_index_equal(result.minor_axis, df2.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equal(result.values, - results[0].values) - - # DataFrame with another DataFrame, pairwise=False - for f in [lambda x, y: x.expanding().cov(y, pairwise=False), - lambda x, y: x.expanding().corr(y, pairwise=False), - lambda x, y: x.rolling(window=3).cov(y, pairwise=False), - lambda x, y: x.rolling(window=3).corr(y, pairwise=False), - lambda x, y: x.ewm(com=3).cov(y, pairwise=False), - lambda x, y: x.ewm(com=3).corr(y, pairwise=False), ]: - results = [f(df, df2) if df.columns.is_unique else None - for df in df1s] - for (df, result) in zip(df1s, results): - if result is not None: - expected_index = df.index.union(df2.index) - expected_columns = df.columns.union(df2.columns) - tm.assert_index_equal(result.index, expected_index) - tm.assert_index_equal(result.columns, expected_columns) - else: - tm.assertRaisesRegexp( - ValueError, "'arg1' columns are not unique", f, df, - df2) - tm.assertRaisesRegexp( - ValueError, "'arg2' columns are not unique", f, - df2, df) - - # DataFrame with a Series - for f in [lambda x, y: x.expanding().cov(y), - lambda x, y: x.expanding().corr(y), - lambda x, y: x.rolling(window=3).cov(y), - lambda x, y: x.rolling(window=3).corr(y), - lambda x, y: x.ewm(com=3).cov(y), - lambda x, y: x.ewm(com=3).corr(y), ]: - results = [f(df, s) for df in df1s] + [f(s, df) for df in df1s] - for (df, result) in zip(df1s, results): - tm.assert_index_equal(result.index, df.index) - tm.assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.assert_numpy_array_equal(result.values, - results[0].values) - def test_rolling_skew_edge_cases(self): all_nan = Series([np.NaN] * 5) From 
0f5092cc386fc01521aa8d8553f10627a776537f Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 28 Mar 2017 11:28:41 -0400 Subject: [PATCH 3/3] fix names on return structure --- doc/source/computation.rst | 6 ++-- doc/source/whatsnew/v0.20.0.txt | 12 +++---- pandas/core/window.py | 14 ++++---- pandas/tests/test_window.py | 58 +++++++++++++++++++++------------ 4 files changed, 55 insertions(+), 35 deletions(-) diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 86370709cde41..315dd122b96cc 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -549,12 +549,12 @@ can even be omitted: .. ipython:: python covs = df[['B','C','D']].rolling(window=50).cov(df[['A','B','C']], pairwise=True) - covs.iloc[-50].unstack() + covs.unstack(-1).iloc[-50] .. ipython:: python correls = df.rolling(window=50).corr() - correls.iloc[-50].unstack() + correls.unstack(-1).iloc[-50] You can efficiently retrieve the time series of correlations between two columns using ``.loc`` indexing: @@ -567,7 +567,7 @@ columns using ``.loc`` indexing: .. ipython:: python @savefig rolling_corr_pairwise_ex.png - correls[('A', 'C')].plot() + correls.unstack(-1)[('A', 'C')].plot() .. _stats.aggregate: diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index da6ee6e911d50..2b344087bdbea 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -781,9 +781,10 @@ See the section on :ref:`Windowed Binary Operations ` for ..
ipython:: python np.random.seed(1234) - df = DataFrame(np.random.rand(100, 2), - columns=['A', 'B'], - index=pd.date_range('20160101', periods=100, freq='D')) + df = pd.DataFrame(np.random.rand(100, 2), + columns=pd.Index(['A', 'B'], name='bar'), + index=pd.date_range('20160101', + periods=100, freq='D', name='foo')) df Old Behavior: @@ -805,12 +806,11 @@ New Behavior: res = df.rolling(12).corr() res -Retrieving a correlation matrix for a specified index +Retrieving a correlation matrix for a cross-section .. ipython:: python - res.iloc[-1].unstack() - + df.rolling(12).corr().loc['2016-04-07'] .. _whatsnew_0200.api_breaking.hdfstore_where: diff --git a/pandas/core/window.py b/pandas/core/window.py index a978dd6827e77..a61d5b6d90dae 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1707,7 +1707,7 @@ def dataframe_from_int_dict(data, frame_template): # TODO: not the most efficient (perf-wise) # though not bad code-wise - from pandas import Panel, MultiIndex + from pandas import Panel, MultiIndex, Index p = Panel.from_dict(results).swapaxes('items', 'major') if len(p.major_axis) > 0: p.major_axis = arg1.columns[p.major_axis] @@ -1721,14 +1721,16 @@ def dataframe_from_int_dict(data, frame_template): else: result = DataFrame( - index=MultiIndex(levels=[arg1.columns, arg2.index], + index=MultiIndex(levels=[arg1.index, arg1.columns], labels=[[], []]), - columns=arg1.columns, + columns=arg2.columns, dtype='float64') - # reset our names - result.columns.name = None - result.index.names = ['major', 'minor'] + # reset our names to arg1 names + # careful not to mutate the original names + result.columns = Index(result.columns).set_names(None) + result.index = result.index.set_names( + [arg1.index.name, arg1.columns.name]) return result diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 8b84631c60969..c75731882e231 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -2660,11 +2660,13 @@ def 
test_rolling_functions_window_non_shrinkage(self): def test_rolling_functions_window_non_shrinkage_binary(self): # corr/cov return a MI DataFrame - df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=['A', 'B']) + df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], + columns=Index(['A', 'B'], name='foo'), + index=Index(range(4), name='bar')) df_expected = DataFrame( - columns=df.columns, + columns=Index(['A', 'B']), index=pd.MultiIndex.from_product([df.index, df.columns], - names=['major', 'minor']), + names=['bar', 'foo']), dtype='float64') functions = [lambda x: (x.rolling(window=10, min_periods=5) .cov(x, pairwise=True)), @@ -2739,17 +2741,17 @@ def test_moment_functions_zero_length_pairwise(self): df1 = DataFrame() df1_expected = df1 - df2 = DataFrame(columns=['a']) + df2 = DataFrame(columns=Index(['a'], name='foo'), + index=Index([], name='bar')) df2['a'] = df2['a'].astype('float64') df1_expected = DataFrame( - index=pd.MultiIndex.from_product([df1.columns, df1.index], - names=['major', 'minor']), - columns=df1.columns) + index=pd.MultiIndex.from_product([df1.index, df1.columns]), + columns=Index([])) df2_expected = DataFrame( - index=pd.MultiIndex.from_product([df2.columns, df2.index], - names=['major', 'minor']), - columns=df2.columns, + index=pd.MultiIndex.from_product([df2.index, df2.columns], + names=['bar', 'foo']), + columns=Index(['a']), dtype='float64') functions = [lambda x: (x.expanding(min_periods=5) @@ -2770,17 +2772,25 @@ def test_moment_functions_zero_length_pairwise(self): def test_expanding_cov_pairwise_diff_length(self): # GH 7512 - df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=['A', 'B']) - df1a = DataFrame([[1, 5], [3, 9]], index=[0, 2], columns=['A', 'B']) - df2 = DataFrame([[5, 6], [None, None], [2, 1]], columns=['X', 'Y']) - df2a = DataFrame([[5, 6], [2, 1]], index=[0, 2], columns=['X', 'Y']) + df1 = DataFrame([[1, 5], [3, 2], [3, 9]], + columns=Index(['A', 'B'], name='foo')) + df1a = DataFrame([[1, 5], [3, 9]], + index=[0, 2], + 
columns=Index(['A', 'B'], name='foo')) + df2 = DataFrame([[5, 6], [None, None], [2, 1]], + columns=Index(['X', 'Y'], name='foo')) + df2a = DataFrame([[5, 6], [2, 1]], + index=[0, 2], + columns=Index(['X', 'Y'], name='foo')) + # TODO: xref gh-15826 + # .loc is not preserving the names result1 = df1.expanding().cov(df2a, pairwise=True).loc[2] result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] expected = DataFrame([[-3.0, -6.0], [-5.0, -10.0]], columns=['A', 'B'], - index=Index(['X', 'Y'], name='minor')) + index=Index(['X', 'Y'], name='foo')) tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result3, expected) @@ -2788,17 +2798,25 @@ def test_expanding_cov_pairwise_diff_length(self): def test_expanding_corr_pairwise_diff_length(self): # GH 7512 - df1 = DataFrame([[1, 2], [3, 2], [3, 4]], columns=['A', 'B']) - df1a = DataFrame([[1, 2], [3, 4]], index=[0, 2], columns=['A', 'B']) - df2 = DataFrame([[5, 6], [None, None], [2, 1]], columns=['X', 'Y']) - df2a = DataFrame([[5, 6], [2, 1]], index=[0, 2], columns=['X', 'Y']) + df1 = DataFrame([[1, 2], [3, 2], [3, 4]], + columns=['A', 'B'], + index=Index(range(3), name='bar')) + df1a = DataFrame([[1, 2], [3, 4]], + index=Index([0, 2], name='bar'), + columns=['A', 'B']) + df2 = DataFrame([[5, 6], [None, None], [2, 1]], + columns=['X', 'Y'], + index=Index(range(3), name='bar')) + df2a = DataFrame([[5, 6], [2, 1]], + index=Index([0, 2], name='bar'), + columns=['X', 'Y']) result1 = df1.expanding().corr(df2, pairwise=True).loc[2] result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] expected = DataFrame([[-1.0, -1.0], [-1.0, -1.0]], columns=['A', 'B'], - index=Index(['X', 'Y'], name='minor')) + index=Index(['X', 'Y'])) 
tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) tm.assert_frame_equal(result3, expected)