From 3ba28431d4791b9dd1b0f1a9a73728af9123b4cc Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 14 Jan 2018 18:03:07 -0500 Subject: [PATCH 1/4] PERF: remove use of Panel & perf in rolling corr/cov closes #17917 --- asv_bench/benchmarks/rolling.py | 25 ++++++++++++++++-- pandas/core/indexes/base.py | 5 ++++ pandas/core/window.py | 47 +++++++++++++++++++++------------ pandas/tests/test_window.py | 22 ++++++++------- 4 files changed, 71 insertions(+), 28 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 59cf7d090a622..75990d83f8212 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -11,8 +11,8 @@ class Methods(object): [10, 1000], ['int', 'float'], ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', - 'sum', 'corr', 'cov']) - param_names = ['constructor', 'window', 'dtype', 'method'] + 'sum']) + param_names = ['contructor', 'window', 'dtype', 'method'] def setup(self, constructor, window, dtype, method): N = 10**5 @@ -23,6 +23,27 @@ def time_rolling(self, constructor, window, dtype, method): getattr(self.roll, method)() +class Pairwise(object): + + sample_time = 0.2 + params = ([10, 1000, None], + ['corr', 'cov'], + [True, False]) + param_names = ['window', 'method', 'pairwise'] + + def setup(self, window, method, pairwise): + N = 10**4 + arr = np.random.random(N) + self.df = pd.DataFrame(arr) + + def time_pairwise(self, window, method, pairwise): + if window is None: + r = self.df.expanding() + else: + r = self.df.rolling(window=window) + getattr(r, method)(self.df, pairwise=pairwise) + + class Quantile(object): sample_time = 0.2 diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 626f3dc86556a..619c8ea6193ee 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1102,6 +1102,11 @@ def _assert_can_do_op(self, value): def nlevels(self): return 1 + @property + def levels(self): + """ return a list my levels """ + return 
[list(self)] + def _get_names(self): return FrozenList((self.name, )) diff --git a/pandas/core/window.py b/pandas/core/window.py index 4d6a1de60f59b..3093930578a0e 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1863,25 +1863,38 @@ def dataframe_from_int_dict(data, frame_template): results[i][j] = f(*_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])) - # TODO: not the most efficient (perf-wise) - # though not bad code-wise - from pandas import Panel, MultiIndex, concat - - with warnings.catch_warnings(record=True): - p = Panel.from_dict(results).swapaxes('items', 'major') - if len(p.major_axis) > 0: - p.major_axis = arg1.columns[p.major_axis] - if len(p.minor_axis) > 0: - p.minor_axis = arg2.columns[p.minor_axis] - - if len(p.items): + from pandas import MultiIndex, concat + + result_index = arg1.index.union(arg2.index) + if len(result_index): + + # construct result frame result = concat( - [p.iloc[i].T for i in range(len(p.items))], - keys=p.items) + [concat([results[i][j] + for j, c in enumerate(arg2.columns)], + ignore_index=True) + for i, c in enumerate(arg1.columns)], + ignore_index=True, + axis=1) + result.columns = arg1.columns + + # set the index and reorder + if arg2.columns.nlevels > 1: + result.index = MultiIndex.from_product( + arg2.columns.levels + result_index.levels) + result = result.reorder_levels([2, 0, 1]).sort_index() + else: + result.index = MultiIndex.from_product( + [range(len(arg2.columns)), + range(len(result_index))]) + result = result.swaplevel(1, 0).sort_index() + result.index = MultiIndex.from_product( + result_index.levels + arg2.columns.levels) else: + # empty result result = DataFrame( - index=MultiIndex(levels=[arg1.index, arg1.columns], + index=MultiIndex(levels=[arg1.index, arg2.columns], labels=[[], []]), columns=arg2.columns, dtype='float64') @@ -1890,9 +1903,9 @@ def dataframe_from_int_dict(data, frame_template): # reset our column names to arg2 names # careful not to mutate the original names result.columns = 
result.columns.set_names( - arg2.columns.names) + arg1.columns.names) result.index = result.index.set_names( - arg1.index.names + arg1.columns.names) + result_index.names + arg2.columns.names) return result diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 22526d14a7168..dabdb1e8e689c 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -14,6 +14,7 @@ import pandas.tseries.offsets as offsets from pandas.core.base import SpecificationError from pandas.errors import UnsupportedFunctionCall +from pandas.core.sorting import safe_sort import pandas.util.testing as tm import pandas.util._test_decorators as td from pandas.compat import range, zip @@ -1645,7 +1646,7 @@ def compare(self, result, expected): result = result.dropna().values expected = expected.dropna().values - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) @pytest.mark.parametrize('f', [lambda x: x.cov(), lambda x: x.corr()]) def test_no_flex(self, f): @@ -1670,15 +1671,19 @@ def test_no_flex(self, f): def test_pairwise_with_self(self, f): # DataFrame with itself, pairwise=True - results = [f(df) for df in self.df1s] - for (df, result) in zip(self.df1s, results): + # note that we may construct the 1st level of the MI + # in a non-monotonic way, so compare accordingly + results = [] + for i, df in enumerate(self.df1s): + result = f(df) tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) - tm.assert_index_equal(result.index.levels[1], - df.columns, - check_names=False) + tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]), + safe_sort(df.columns.unique())) tm.assert_index_equal(result.columns, df.columns) + results.append(df) + for i, result in enumerate(results): if i > 0: self.compare(result, results[0]) @@ -1716,9 +1721,8 @@ def test_pairwise_with_other(self, f): tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) - 
tm.assert_index_equal(result.index.levels[1], - self.df2.columns, - check_names=False) + tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]), + safe_sort(self.df2.columns.unique())) for i, result in enumerate(results): if i > 0: self.compare(result, results[0]) From e4d7ce946162dbcda5ca45e0cfaaf156b77b1423 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 15 Jan 2018 18:38:32 -0500 Subject: [PATCH 2/4] whatsnew --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 2bd2bb199bf1f..5db29cb76b106 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -383,7 +383,7 @@ Performance Improvements - :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`) - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) - +- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) .. 
_whatsnew_0230.docs: From 44b22e00db2ae02abcab90966f03feca9dc9a1a8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 15 Jan 2018 20:07:18 -0500 Subject: [PATCH 3/4] fix incorrect usage in pivot --- pandas/core/reshape/pivot.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 0e92fc4edce85..a4c9848dca900 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -99,19 +99,15 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', if not dropna: from pandas import MultiIndex - try: + if table.index.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex(m, axis=0) - except AttributeError: - pass # it's a single level - try: + if table.columns.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex(m, axis=1) - except AttributeError: - pass # it's a single level or a series if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) From 2e8aaa11a870d68f3759eb506effdbbb891249b8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 16 Jan 2018 06:06:47 -0500 Subject: [PATCH 4/4] levels compat --- pandas/core/indexes/base.py | 5 ----- pandas/core/window.py | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 619c8ea6193ee..626f3dc86556a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1102,11 +1102,6 @@ def _assert_can_do_op(self, value): def nlevels(self): return 1 - @property - def levels(self): - """ return a list my levels """ - return [list(self)] - def _get_names(self): return FrozenList((self.name, )) diff --git a/pandas/core/window.py b/pandas/core/window.py index 3093930578a0e..a3f19ef50459d 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ 
-1881,7 +1881,7 @@ def dataframe_from_int_dict(data, frame_template): # set the index and reorder if arg2.columns.nlevels > 1: result.index = MultiIndex.from_product( - arg2.columns.levels + result_index.levels) + arg2.columns.levels + [result_index]) result = result.reorder_levels([2, 0, 1]).sort_index() else: result.index = MultiIndex.from_product( @@ -1889,7 +1889,7 @@ def dataframe_from_int_dict(data, frame_template): range(len(result_index))]) result = result.swaplevel(1, 0).sort_index() result.index = MultiIndex.from_product( - result_index.levels + arg2.columns.levels) + [result_index] + [arg2.columns]) else: # empty result