From ca810fa0ab450f1cf507f0d7eb42d1ab9b716289 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 29 Jan 2021 00:15:47 -0800 Subject: [PATCH 01/19] add helper function to apply pairwise --- pandas/core/window/rolling.py | 37 +++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 439cd586825e1..f1f699843355d 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -415,6 +415,14 @@ def _apply_tablewise( self._insert_on_column(out, obj) return out + def _apply_pairwise(self, target, other, pairwise, func): + if other is None: + other = target + # only default unset + pairwise = True if pairwise is None else pairwise + + return flex_binary_moment(target, other, func, pairwise=bool(pairwise)) + def _apply( self, func: Callable[..., Any], @@ -761,8 +769,8 @@ def __init__(self, obj, *args, **kwargs): self._groupby.grouper.mutated = True super().__init__(obj, *args, **kwargs) - corr = dispatch("corr", other=None, pairwise=None) - cov = dispatch("cov", other=None, pairwise=None) + # corr = dispatch("corr", other=None, pairwise=None) + # cov = dispatch("cov", other=None, pairwise=None) def _apply( self, @@ -826,6 +834,14 @@ def _apply( result.index = result_index return result + def _apply_pairwise(self, target, other, pairwise, func): + # Need to manually drop the grouping column first + target = target.drop(columns=self._groupby.indices.keys(), errors="ignore") + result = super()._apply_pairwise(target, other, pairwise, func) + # Modify the resulting index to include the groupby level + + return result + def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: """ Split data into blocks & return conformed data. @@ -1832,11 +1848,6 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): """ def cov(self, other=None, pairwise=None, ddof=1, **kwargs): - if other is None: - other = self._selected_obj - # only default unset - pairwise = True if pairwise is None else pairwise - from pandas import Series def cov_func(x, y): @@ -1866,9 +1877,7 @@ def cov_func(x, y): result = (mean_x_y - mean_x * mean_y) * (count_x_y / (count_x_y - ddof)) return Series(result, index=x.index, name=x.name) - return flex_binary_moment( - self._selected_obj, other, cov_func, pairwise=bool(pairwise) - ) + return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func) _shared_docs["corr"] = dedent( """ @@ -1981,10 +1990,6 @@ def cov_func(x, y): ) def corr(self, other=None, pairwise=None, ddof=1, **kwargs): - if other is None: - other = self._selected_obj - # only default unset - pairwise = True if pairwise is None else pairwise from pandas import Series @@ -2025,9 +2030,7 @@ def corr_func(x, y): result = numerator / denominator return Series(result, index=x.index, name=x.name) - return flex_binary_moment( - self._selected_obj, other, corr_func, pairwise=bool(pairwise) - ) + return self._apply_pairwise(self._selected_obj, other, pairwise, corr_func) class Rolling(RollingAndExpandingMixin): From e8c4bbbe392433d54fc17cf088ccbc26b8c82f5a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 31 Jan 2021 21:25:23 -0800 Subject: [PATCH 02/19] Use keys instead of indices.keys() --- pandas/core/window/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f1f699843355d..5f751fdcfa32d 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -836,7 +836,7 @@ def _apply( def _apply_pairwise(self, target, other, pairwise, func): # Need to manually drop the grouping column first - target = target.drop(columns=self._groupby.indices.keys(), errors="ignore") + target = target.drop(columns=list(self._groupby.keys), errors="ignore") result = super()._apply_pairwise(target, other, pairwise, func) # Modify the resulting index to include the groupby level From 6ca7441b9954585ac3061f0dbd449c007557b52c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 1 Feb 2021 11:49:16 -0800 Subject: [PATCH 03/19] Add logic to include groupby index --- pandas/core/window/rolling.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 5f751fdcfa32d..561a31821a5c1 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -835,11 +835,37 @@ def _apply( return result def _apply_pairwise(self, target, other, pairwise, func): - # Need to manually drop the grouping column first + # Manually drop the grouping column first target = target.drop(columns=list(self._groupby.keys), errors="ignore") result = super()._apply_pairwise(target, other, pairwise, func) + # Modify the resulting index to include the groupby level + groupby_codes = self._groupby.grouper.codes + groupby_levels = self._groupby.grouper.levels + + group_indices = self._groupby.grouper.indices.values() + if group_indices: + indexer = np.concatenate(list(group_indices)) + else: + indexer = np.array([], dtype=np.intp) + if target.ndim == 1: + num_target_columns = 1 + else: + num_target_columns = len(target.columns) + groupby_codes = [ + np.repeat(c.take(indexer), num_target_columns) for c in groupby_codes + ] + + result_codes = groupby_codes + list(result.index.codes) + result_levels = groupby_levels + list(result.index.levels) + result_names = list(self._groupby.keys) + list(result.index.names) + + result_index = MultiIndex( + result_levels, result_codes, names=result_names, verify_integrity=False + ) + + result.index = result_index return result def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: From cb35785cb2bc455b0257322e6b651e7e36ffc1e5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 1 Feb 2021 22:10:38 -0800 Subject: [PATCH 04/19] Add for nonpairwise case --- pandas/core/window/rolling.py | 73 +++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 561a31821a5c1..87f5282dd49ce 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -837,35 +837,66 @@ def _apply( def _apply_pairwise(self, target, other, pairwise, func): # Manually drop the grouping column first target = target.drop(columns=list(self._groupby.keys), errors="ignore") + breakpoint() result = super()._apply_pairwise(target, other, pairwise, func) - # Modify the resulting index to include the groupby level - groupby_codes = self._groupby.grouper.codes - groupby_levels = self._groupby.grouper.levels + if other is not None: + + from pandas.core.algorithms import factorize + from pandas.core.reshape.concat import concat + + gb_pairs = (list(pair) for pair in self._groupby.indices.keys()) + groupby_codes = [] + groupby_levels = [] + # e.g. [[1, 2], [4, 5]] as [[1, 4], [2, 5]] + for gb_level_pair in map(list, zip(*gb_pairs)): + labels = np.repeat(np.array(gb_level_pair), len(result)) + codes, levels = factorize(labels) + groupby_codes.append(codes) + groupby_levels.append(levels) + + result_codes, result_levels = factorize(result.index) + result_codes = groupby_codes + [result_codes] + result_levels = groupby_levels + [result_levels] + result_names = list(self._groupby.keys) + list(result.index.name) + result_index = MultiIndex( + result_levels, result_codes, names=result_names, verify_integrity=False + ) + result = concat( + [ + result.take(gb_indices).reindex(result.index) + for gb_indices in self._groupby.indices.values() + ] + ) - group_indices = self._groupby.grouper.indices.values() - if group_indices: - indexer = np.concatenate(list(group_indices)) + result.index = result_index else: - indexer = np.array([], dtype=np.intp) + groupby_codes = self._groupby.grouper.codes + groupby_levels = self._groupby.grouper.levels - if target.ndim == 1: - num_target_columns = 1 - else: - num_target_columns = len(target.columns) - groupby_codes = [ - np.repeat(c.take(indexer), num_target_columns) for c in groupby_codes - ] + group_indices = self._groupby.grouper.indices.values() + if group_indices: + indexer = np.concatenate(list(group_indices)) + else: + indexer = np.array([], dtype=np.intp) - result_codes = groupby_codes + list(result.index.codes) - result_levels = groupby_levels + list(result.index.levels) - result_names = list(self._groupby.keys) + list(result.index.names) + if target.ndim == 1: + repeat_by = 1 + else: + repeat_by = len(target.columns) + groupby_codes = [ + np.repeat(c.take(indexer), repeat_by) for c in groupby_codes + ] - result_index = MultiIndex( - result_levels, result_codes, names=result_names, verify_integrity=False - ) + result_codes = groupby_codes + list(result.index.codes) + result_levels = groupby_levels + list(result.index.levels) + result_names = list(self._groupby.keys) + list(result.index.names) - result.index = result_index + result_index = MultiIndex( + result_levels, result_codes, names=result_names, verify_integrity=False + ) + + result.index = result_index return result def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: From f53547a755b07ba71ace8cb1bcda171605cf0ec3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 2 Feb 2021 20:14:29 -0800 Subject: [PATCH 05/19] Get index to work for non pairwise --- pandas/core/window/rolling.py | 46 +++++++++++++++++------------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 87f5282dd49ce..3054bc42a2e26 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -48,11 +48,14 @@ ) from pandas.core.dtypes.missing import notna +from pandas.core.algorithms import factorize from pandas.core.apply import ResamplerWindowApply from pandas.core.base import DataError, SelectionMixin +import pandas.core.common as common from pandas.core.construction import extract_array from pandas.core.groupby.base import GotItemMixin, ShallowMixin from pandas.core.indexes.api import Index, MultiIndex +from pandas.core.reshape.concat import concat from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba from pandas.core.window.common import ( _doc_template, @@ -836,40 +839,38 @@ def _apply( def _apply_pairwise(self, target, other, pairwise, func): # Manually drop the grouping column first - target = target.drop(columns=list(self._groupby.keys), errors="ignore") - breakpoint() + target = target.drop( + columns=common.maybe_make_list(self._groupby.keys), errors="ignore" + ) result = super()._apply_pairwise(target, other, pairwise, func) # Modify the resulting index to include the groupby level if other is not None: + old_result_len = len(result) + result = concat( + [ + result.take(gb_indices).reindex(result.index) + for gb_indices in self._groupby.indices.values() + ] + ) - from pandas.core.algorithms import factorize - from pandas.core.reshape.concat import concat - - gb_pairs = (list(pair) for pair in self._groupby.indices.keys()) + gb_pairs = ( + common.maybe_make_list(pair) for pair in self._groupby.indices.keys() + ) groupby_codes = [] groupby_levels = [] # e.g. [[1, 2], [4, 5]] as [[1, 4], [2, 5]] for gb_level_pair in map(list, zip(*gb_pairs)): - labels = np.repeat(np.array(gb_level_pair), len(result)) + labels = np.repeat(np.array(gb_level_pair), old_result_len) codes, levels = factorize(labels) groupby_codes.append(codes) groupby_levels.append(levels) result_codes, result_levels = factorize(result.index) + result_codes = groupby_codes + [result_codes] result_levels = groupby_levels + [result_levels] - result_names = list(self._groupby.keys) + list(result.index.name) - result_index = MultiIndex( - result_levels, result_codes, names=result_names, verify_integrity=False - ) - result = concat( - [ - result.take(gb_indices).reindex(result.index) - for gb_indices in self._groupby.indices.values() - ] - ) + result_names = list(self._groupby.keys) + [result.index.name] - result.index = result_index else: groupby_codes = self._groupby.grouper.codes groupby_levels = self._groupby.grouper.levels @@ -892,11 +893,10 @@ def _apply_pairwise(self, target, other, pairwise, func): result_levels = groupby_levels + list(result.index.levels) result_names = list(self._groupby.keys) + list(result.index.names) - result_index = MultiIndex( - result_levels, result_codes, names=result_names, verify_integrity=False - ) - - result.index = result_index + result_index = MultiIndex( + result_levels, result_codes, names=result_names, verify_integrity=False + ) + result.index = result_index return result def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: From 3459e712f5c60b21e0ababb9437958d972c2b041 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 2 Feb 2021 20:15:23 -0800 Subject: [PATCH 06/19] Remove commented out methods --- pandas/core/window/rolling.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 3054bc42a2e26..f02cf31eee800 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -772,9 +772,6 @@ def __init__(self, obj, *args, **kwargs): self._groupby.grouper.mutated = True super().__init__(obj, *args, **kwargs) - # corr = dispatch("corr", other=None, pairwise=None) - # cov = dispatch("cov", other=None, pairwise=None) - def _apply( self, func: Callable[..., Any], From a7c0dd1713b86599fc9956829567370e631a6daf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 2 Feb 2021 20:22:57 -0800 Subject: [PATCH 07/19] add comments --- pandas/core/window/rolling.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f02cf31eee800..0918bff81c816 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -842,6 +842,9 @@ def _apply_pairwise(self, target, other, pairwise, func): result = super()._apply_pairwise(target, other, pairwise, func) # Modify the resulting index to include the groupby level if other is not None: + # When we have other, we must reindex (expand) the result + # from flex_binary_moment to a "transform"-like result + # per groupby combination old_result_len = len(result) result = concat( [ @@ -869,6 +872,8 @@ def _apply_pairwise(self, target, other, pairwise, func): result_names = list(self._groupby.keys) + [result.index.name] else: + # When we evaluate the pairwise=True result, repeat the groupby + # labels by the number of columns in the original object groupby_codes = self._groupby.grouper.codes groupby_levels = self._groupby.grouper.levels From d0f33ccde9999df01c4168558b0ce15e75539971 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 3 Feb 2021 12:03:16 -0800 Subject: [PATCH 08/19] Change _apply pairwise --- pandas/core/window/rolling.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 0918bff81c816..64f23febd37c5 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -51,7 +51,6 @@ from pandas.core.algorithms import factorize from pandas.core.apply import ResamplerWindowApply from pandas.core.base import DataError, SelectionMixin -import pandas.core.common as common from pandas.core.construction import extract_array from pandas.core.groupby.base import GotItemMixin, ShallowMixin from pandas.core.indexes.api import Index, MultiIndex @@ -835,10 +834,11 @@ def _apply( return result def _apply_pairwise(self, target, other, pairwise, func): + def as_list(arg): + return [arg] if not isinstance(arg, (tuple, list)) else arg + # Manually drop the grouping column first - target = target.drop( - columns=common.maybe_make_list(self._groupby.keys), errors="ignore" - ) + target = target.drop(columns=as_list(self._groupby.keys), errors="ignore") result = super()._apply_pairwise(target, other, pairwise, func) # Modify the resulting index to include the groupby level if other is not None: @@ -853,9 +853,7 @@ def _apply_pairwise(self, target, other, pairwise, func): ] ) - gb_pairs = ( - common.maybe_make_list(pair) for pair in self._groupby.indices.keys() - ) + gb_pairs = (as_list(pair) for pair in self._groupby.indices.keys()) groupby_codes = [] groupby_levels = [] # e.g. [[1, 2], [4, 5]] as [[1, 4], [2, 5]] @@ -866,10 +864,9 @@ def _apply_pairwise(self, target, other, pairwise, func): groupby_levels.append(levels) result_codes, result_levels = factorize(result.index) - result_codes = groupby_codes + [result_codes] result_levels = groupby_levels + [result_levels] - result_names = list(self._groupby.keys) + [result.index.name] + result_names = as_list(self._groupby.keys) + [result.index.name] else: # When we evaluate the pairwise=True result, repeat the groupby @@ -890,10 +887,19 @@ def _apply_pairwise(self, target, other, pairwise, func): groupby_codes = [ np.repeat(c.take(indexer), repeat_by) for c in groupby_codes ] - - result_codes = groupby_codes + list(result.index.codes) - result_levels = groupby_levels + list(result.index.levels) - result_names = list(self._groupby.keys) + list(result.index.names) + if isinstance(result.index, MultiIndex): + result_codes = list(result.index.codes) + result_levels = list(result.index.levels) + result_names = list(result.index.names) + else: + result_codes, result_levels = factorize(result.index) + result_codes = [result_codes] + result_levels = [result_levels] + result_names = [result.index.name] + + result_codes = groupby_codes + result_codes + result_levels = groupby_levels + result_levels + result_names = as_list(self._groupby.keys) + result_names result_index = MultiIndex( result_levels, result_codes, names=result_names, verify_integrity=False From d1841956fbd4a22729855b1bdb02e5e6acc2df22 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 3 Feb 2021 18:56:24 -0800 Subject: [PATCH 09/19] Use grouper.names instead --- pandas/core/window/rolling.py | 15 ++++++++------- pandas/tests/window/test_groupby.py | 2 ++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 64f23febd37c5..5cc2359d5dc38 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -51,6 +51,7 @@ from pandas.core.algorithms import factorize from pandas.core.apply import ResamplerWindowApply from pandas.core.base import DataError, SelectionMixin +import pandas.core.common as common from pandas.core.construction import extract_array from pandas.core.groupby.base import GotItemMixin, ShallowMixin from pandas.core.indexes.api import Index, MultiIndex @@ -819,6 +820,7 @@ def _apply( # if the index of the original dataframe needs to be preserved, append # this index (but reordered) to the codes/levels from the groupby + breakpoint() if grouped_object_index is not None: idx = grouped_object_index.take(indexer) if not isinstance(idx, MultiIndex): @@ -834,11 +836,8 @@ def _apply( return result def _apply_pairwise(self, target, other, pairwise, func): - def as_list(arg): - return [arg] if not isinstance(arg, (tuple, list)) else arg - # Manually drop the grouping column first - target = target.drop(columns=as_list(self._groupby.keys), errors="ignore") + target = target.drop(columns=self._groupby.grouper.names, errors="ignore") result = super()._apply_pairwise(target, other, pairwise, func) # Modify the resulting index to include the groupby level if other is not None: @@ -853,7 +852,9 @@ def as_list(arg): ] ) - gb_pairs = (as_list(pair) for pair in self._groupby.indices.keys()) + gb_pairs = ( + common.maybe_make_list(pair) for pair in self._groupby.indices.keys() + ) groupby_codes = [] groupby_levels = [] # e.g. [[1, 2], [4, 5]] as [[1, 4], [2, 5]] @@ -866,7 +867,7 @@ def as_list(arg): result_codes, result_levels = factorize(result.index) result_codes = groupby_codes + [result_codes] result_levels = groupby_levels + [result_levels] - result_names = as_list(self._groupby.keys) + [result.index.name] + result_names = self._groupby.grouper.names + [result.index.name] else: # When we evaluate the pairwise=True result, repeat the groupby @@ -899,7 +900,7 @@ def as_list(arg): result_codes = groupby_codes + result_codes result_levels = groupby_levels + result_levels - result_names = as_list(self._groupby.keys) + result_names + result_names = self._groupby.grouper.names + result_names result_index = MultiIndex( result_levels, result_codes, names=result_names, verify_integrity=False diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index b89fb35ac3a70..a25f7debfc5b2 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -117,6 +117,8 @@ def func(x): return getattr(x.rolling(4), f)(self.frame) expected = g.apply(func) + # The grouped column should be all np.nan (groupby.apply inserts 0s) + expected["A"] = np.nan tm.assert_frame_equal(result, expected) result = getattr(r.B, f)(pairwise=True) From ff200f7b5ca83257e9f79295b609e3e4e5adca9e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 3 Feb 2021 20:11:13 -0800 Subject: [PATCH 10/19] Modify behavior of some incorrect tests --- pandas/core/window/rolling.py | 1 - pandas/tests/window/test_groupby.py | 7 ++++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index b91ec64f1baf7..e017378cc2ff5 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -585,7 +585,6 @@ def _apply( # if the index of the original dataframe needs to be preserved, append # this index (but reordered) to the codes/levels from the groupby - breakpoint() if grouped_object_index is not None: idx = grouped_object_index.take(indexer) if not isinstance(idx, MultiIndex): diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index a25f7debfc5b2..c5269693dfb83 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -117,7 +117,7 @@ def func(x): return getattr(x.rolling(4), f)(self.frame) expected = g.apply(func) - # The grouped column should be all np.nan (groupby.apply inserts 0s) + # The grouped column should be all np.nan (groupby.apply inserts 0s for cov) expected["A"] = np.nan tm.assert_frame_equal(result, expected) @@ -690,6 +690,11 @@ def func(x): return getattr(x.expanding(), f)(self.frame) expected = g.apply(func) + # groupby.apply returns 1 instead of nan for windows with all nan values + null_idx = list(range(20, 61)) + list(range(72, 113)) + expected.iloc[null_idx, 1] = np.nan + # The grouped column should be all np.nan (groupby.apply inserts 0s for cov) + expected["A"] = np.nan tm.assert_frame_equal(result, expected) result = getattr(r.B, f)(pairwise=True) From 0b1fcd1b20f1b06d1760abfad19a4584ba476172 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 3 Feb 2021 20:20:04 -0800 Subject: [PATCH 11/19] Add whatsnew entry --- doc/source/whatsnew/v1.3.0.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 4d0384abbf0c6..639f86cf25147 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -249,7 +249,8 @@ Performance improvements - Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`) - Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`) - Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`) -- Performance improvement in :meth:`core.window.Rolling.corr` and :meth:`core.window.Rolling.cov` (:issue:`39388`) +- Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`) +- Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:``) .. --------------------------------------------------------------------------- @@ -406,6 +407,8 @@ Groupby/resample/rolling - Bug in :meth:`.Resampler.aggregate` and :meth:`DataFrame.transform` raising ``TypeError`` instead of ``SpecificationError`` when missing keys had mixed dtypes (:issue:`39025`) - Bug in :meth:`.DataFrameGroupBy.idxmin` and :meth:`.DataFrameGroupBy.idxmax` with ``ExtensionDtype`` columns (:issue:`38733`) - Bug in :meth:`Series.resample` would raise when the index was a :class:`PeriodIndex` consisting of ``NaT`` (:issue:`39227`) +- Bug in :meth:`core.window.rolling.RollingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.corr` where the groupby column would return 0 instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:``) +- Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:``) Reshaping ^^^^^^^^^ From 6a1e85921629a6f5a16e91ecb521a3b26b2d211c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 3 Feb 2021 20:31:34 -0800 Subject: [PATCH 12/19] Add asv benchmarks --- asv_bench/benchmarks/rolling.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 5f8cdb2a0bdac..5738775fe2b27 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -140,8 +140,11 @@ class Pairwise: def setup(self, window, method, pairwise): N = 10 ** 4 + n_groups = 20 + groups = [i for _ in range(N // n_groups) for i in range(n_groups)] arr = np.random.random(N) self.df = pd.DataFrame(arr) + self.df_group = pd.DataFrame({"A": groups, "B": arr}).groupby("A") def time_pairwise(self, window, method, pairwise): if window is None: @@ -150,6 +153,13 @@ def time_pairwise(self, window, method, pairwise): r = self.df.rolling(window=window) getattr(r, method)(self.df, pairwise=pairwise) + def time_groupby(self, window, method, pairwise): + if window is None: + r = self.df_group.expanding() + else: + r = self.df_group.rolling(window=window) + getattr(r, method)(self.df, pairwise=pairwise) + class Quantile: params = ( From ac944e497dadcc72309e7041c4cc6b2d448acbb1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 3 Feb 2021 22:42:31 -0800 Subject: [PATCH 13/19] Add issue number ref --- doc/source/whatsnew/v1.3.0.rst | 6 +++--- pandas/tests/window/test_groupby.py | 9 ++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 639f86cf25147..251fbf538ca92 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -250,7 +250,7 @@ Performance improvements - Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`) - Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`) - Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`) -- Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:``) +- Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`) .. --------------------------------------------------------------------------- @@ -407,8 +407,8 @@ Groupby/resample/rolling - Bug in :meth:`.Resampler.aggregate` and :meth:`DataFrame.transform` raising ``TypeError`` instead of ``SpecificationError`` when missing keys had mixed dtypes (:issue:`39025`) - Bug in :meth:`.DataFrameGroupBy.idxmin` and :meth:`.DataFrameGroupBy.idxmax` with ``ExtensionDtype`` columns (:issue:`38733`) - Bug in :meth:`Series.resample` would raise when the index was a :class:`PeriodIndex` consisting of ``NaT`` (:issue:`39227`) -- Bug in :meth:`core.window.rolling.RollingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.corr` where the groupby column would return 0 instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:``) -- Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:``) +- Bug in :meth:`core.window.rolling.RollingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.corr` where the groupby column would return 0 instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) +- Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`) Reshaping ^^^^^^^^^ diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index c5269693dfb83..d3c2b5467e5bb 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -117,7 +117,8 @@ def func(x): return getattr(x.rolling(4), f)(self.frame) expected = g.apply(func) - # The grouped column should be all np.nan (groupby.apply inserts 0s for cov) + # GH 39591: The grouped column should be all np.nan + # (groupby.apply inserts 0s for cov) expected["A"] = np.nan tm.assert_frame_equal(result, expected) @@ -690,10 +691,12 @@ def func(x): return getattr(x.expanding(), f)(self.frame) expected = g.apply(func) - # groupby.apply returns 1 instead of nan for windows with all nan values + # GH 39591: groupby.apply returns 1 instead of nan for windows + # with all nan values null_idx = list(range(20, 61)) + list(range(72, 113)) expected.iloc[null_idx, 1] = np.nan - # The grouped column should be all np.nan (groupby.apply inserts 0s for cov) + # GH 39591: The grouped column should be all np.nan + # (groupby.apply inserts 0s for cov) expected["A"] = np.nan tm.assert_frame_equal(result, expected) From bb6d843354c4f40971f5b498b05b375b90ed14cd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 4 Feb 2021 10:49:40 -0800 Subject: [PATCH 14/19] Move dispatch function to ewm --- pandas/core/window/ewm.py | 18 +++++++++++++++++- pandas/core/window/rolling.py | 16 ---------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 2948216e200de..26f366af3aa62 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -35,7 +35,7 @@ GroupbyIndexer, ) from pandas.core.window.numba_ import generate_numba_groupby_ewma_func -from pandas.core.window.rolling import BaseWindow, BaseWindowGroupby, dispatch +from pandas.core.window.rolling import BaseWindow, BaseWindowGroupby if TYPE_CHECKING: from pandas import Series @@ -83,6 +83,22 @@ def wrap_result(obj: Series, result: np.ndarray) -> Series: return obj._constructor(result, obj.index, name=obj.name) +def dispatch(name: str, *args, **kwargs): + """ + Dispatch to groupby apply. + """ + + def outer(self, *args, **kwargs): + def f(x): + x = self._shallow_copy(x, groupby=self._groupby) + return getattr(x, name)(*args, **kwargs) + + return self._groupby.apply(f) + + outer.__name__ = name + return outer + + class ExponentialMovingWindow(BaseWindow): r""" Provide exponential weighted (EW) functions. diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index e017378cc2ff5..0b398088cd03d 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -506,22 +506,6 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate -def dispatch(name: str, *args, **kwargs): - """ - Dispatch to groupby apply. - """ - - def outer(self, *args, **kwargs): - def f(x): - x = self._shallow_copy(x, groupby=self._groupby) - return getattr(x, name)(*args, **kwargs) - - return self._groupby.apply(f) - - outer.__name__ = name - return outer - - class BaseWindowGroupby(GotItemMixin, BaseWindow): """ Provide the groupby windowing facilities. From a8d24b1f5fc8867fda5b11c12e903e2791945a0b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 4 Feb 2021 11:13:32 -0800 Subject: [PATCH 15/19] Add typing to _apply_pairwise --- pandas/core/window/rolling.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 0b398088cd03d..56d04389f2c43 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -409,6 +409,9 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: def _apply_tablewise( self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None ) -> FrameOrSeriesUnion: + """ + Apply the given function to the DataFrame across the entire object + """ if self._selected_obj.ndim == 1: raise ValueError("method='table' not applicable for Series objects.") obj = self._create_data(self._selected_obj) @@ -426,7 +429,16 @@ def _apply_tablewise( self._insert_on_column(out, obj) return out - def _apply_pairwise(self, target, other, pairwise, func): + def _apply_pairwise( + self, + target: FrameOrSeriesUnion, + other: Optional[FrameOrSeriesUnion], + pairwise: Optional[bool], + func: Callable[[FrameOrSeriesUnion, FrameOrSeriesUnion], FrameOrSeriesUnion], + ) -> FrameOrSeriesUnion: + """ + Apply the given pairwise function given 2 pandas objects (DataFrame/Series) + """ if other is None: other = target # only default unset @@ -583,7 +595,16 @@ def _apply( result.index = result_index return result - def _apply_pairwise(self, target, other, pairwise, func): + def _apply_pairwise( + self, + target: FrameOrSeriesUnion, + other: Optional[FrameOrSeriesUnion], + pairwise: Optional[bool], + func: Callable[[FrameOrSeriesUnion, FrameOrSeriesUnion], FrameOrSeriesUnion], + ) -> FrameOrSeriesUnion: + """ + Apply the given pairwise function given 2 pandas objects (DataFrame/Series) + """ # Manually drop the grouping column first target = target.drop(columns=self._groupby.grouper.names, errors="ignore") result = super()._apply_pairwise(target, other, pairwise, func) From cfed3bf209abab919bdfff9fd5b386d1ef52d920 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 4 Feb 2021 11:55:21 -0800 Subject: [PATCH 16/19] Share more code and add comments --- pandas/core/window/rolling.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 56d04389f2c43..df40b4ce06fd4 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -608,7 +608,7 @@ def _apply_pairwise( # Manually drop the grouping column first target = target.drop(columns=self._groupby.grouper.names, errors="ignore") result = super()._apply_pairwise(target, other, pairwise, func) - # Modify the resulting index to include the groupby level + # 1) Determine the levels + codes of the groupby levels if other is not None: # When we have other, we must reindex (expand) the result # from flex_binary_moment to a "transform"-like result @@ -633,11 +633,6 @@ def _apply_pairwise( groupby_codes.append(codes) groupby_levels.append(levels) - result_codes, result_levels = factorize(result.index) - result_codes = groupby_codes + [result_codes] - result_levels = groupby_levels + [result_levels] - result_names = self._groupby.grouper.names + [result.index.name] - else: # When we evaluate the pairwise=True result, repeat the groupby # labels by the number of columns in the original object @@ -657,19 +652,21 @@ def _apply_pairwise( groupby_codes = [ np.repeat(c.take(indexer), repeat_by) for c in groupby_codes ] - if isinstance(result.index, MultiIndex): - result_codes = list(result.index.codes) - result_levels = list(result.index.levels) - result_names = list(result.index.names) - else: - result_codes, result_levels = factorize(result.index) - result_codes = [result_codes] - result_levels = [result_levels] - result_names = [result.index.name] - - result_codes = groupby_codes + result_codes - result_levels = groupby_levels + result_levels - result_names = self._groupby.grouper.names + result_names + # 2) Determine the levels + codes of the result from super()._apply_pairwise + if isinstance(result.index, MultiIndex): + result_codes = list(result.index.codes) + result_levels = list(result.index.levels) + result_names = list(result.index.names) + else: + result_codes, result_levels = factorize(result.index) + result_codes = [result_codes] + result_levels = [result_levels] + result_names = [result.index.name] + + # 3) Create the resulting index by combining 1) + 2) + result_codes = groupby_codes + result_codes + result_levels = groupby_levels + result_levels + result_names = self._groupby.grouper.names + result_names result_index = MultiIndex( result_levels, result_codes, names=result_names, verify_integrity=False From 7900277865aebbac6b57476b62a7265f57d7e556 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 4 Feb 2021 14:42:05 -0800 Subject: [PATCH 17/19] Rename for mypy --- pandas/core/window/rolling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index df40b4ce06fd4..b35d62e484280 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -658,9 +658,9 @@ def _apply_pairwise( result_levels = list(result.index.levels) result_names = list(result.index.names) else: - result_codes, result_levels = factorize(result.index) - result_codes = [result_codes] - result_levels = [result_levels] + idx_codes, idx_levels = factorize(result.index) + result_codes = [idx_codes] + result_levels = [idx_levels] result_names = [result.index.name] # 3) Create the resulting index by combining 1) + 2) From d436f73452b95fee88f6adb0ffa4cbbfc093771d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 4 Feb 2021 19:00:41 -0800 Subject: [PATCH 18/19] Use _apply_pairwise for ewm --- pandas/core/window/ewm.py | 78 +++++++++++++-------------------------- 1 file changed, 26 insertions(+), 52 deletions(-) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 26f366af3aa62..6565077d73de8 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -3,7 +3,7 @@ import datetime from functools import partial from textwrap import dedent -from typing import TYPE_CHECKING, Optional, Union +from typing import Optional, Union import warnings import numpy as np @@ -19,7 +19,7 @@ import pandas.core.common as common from pandas.core.util.numba_ import maybe_use_numba -from pandas.core.window.common import flex_binary_moment, zsqrt +from pandas.core.window.common import zsqrt from pandas.core.window.doc import ( _shared_docs, args_compat, @@ -37,9 +37,6 @@ from pandas.core.window.numba_ import generate_numba_groupby_ewma_func from pandas.core.window.rolling import BaseWindow, BaseWindowGroupby -if TYPE_CHECKING: - from pandas import Series - def get_center_of_mass( comass: Optional[float], @@ -74,15 +71,6 @@ def get_center_of_mass( return float(comass) -def wrap_result(obj: Series, result: np.ndarray) -> Series: - """ - Wrap a single 1D result. - """ - obj = obj._selected_obj - - return obj._constructor(result, obj.index, name=obj.name) - - def dispatch(name: str, *args, **kwargs): """ Dispatch to groupby apply. @@ -464,31 +452,25 @@ def cov( bias: bool = False, **kwargs, ): - if other is None: - other = self._selected_obj - # only default unset - pairwise = True if pairwise is None else pairwise - other = self._shallow_copy(other) - - def _get_cov(X, Y): - X = self._shallow_copy(X) - Y = self._shallow_copy(Y) - cov = window_aggregations.ewmcov( - X._prep_values(), + from pandas import Series + + def cov_func(x, y): + x_array = self._prep_values(x) + y_array = self._prep_values(y) + result = window_aggregations.ewmcov( + x_array, np.array([0], dtype=np.int64), np.array([0], dtype=np.int64), self.min_periods, - Y._prep_values(), + y_array, self.com, self.adjust, self.ignore_na, bias, ) - return wrap_result(X, cov) + return Series(result, index=x.index, name=x.name) - return flex_binary_moment( - self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) - ) + return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func) @doc( template_header, @@ -522,41 +504,33 @@ def corr( pairwise: Optional[bool] = None, **kwargs, ): - if other is None: - other = self._selected_obj - # only default unset - pairwise = True if pairwise is None else pairwise - other = self._shallow_copy(other) + from pandas import Series - def _get_corr(X, Y): - X = self._shallow_copy(X) - Y = self._shallow_copy(Y) + def cov_func(x, y): + x_array = self._prep_values(x) + y_array = self._prep_values(y) - def _cov(x, y): + def _cov(X, Y): return window_aggregations.ewmcov( - x, + X, np.array([0], dtype=np.int64), np.array([0], dtype=np.int64), self.min_periods, - y, + Y, self.com, self.adjust, self.ignore_na, 1, ) - x_values = X._prep_values() - y_values = Y._prep_values() with np.errstate(all="ignore"): - cov = _cov(x_values, y_values) - x_var = _cov(x_values, x_values) - y_var = _cov(y_values, y_values) - corr = cov / zsqrt(x_var * y_var) - return wrap_result(X, corr) - - return flex_binary_moment( - self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) - ) + cov = _cov(x_array, y_array) + x_var = _cov(x_array, x_array) + y_var = _cov(y_array, y_array) + result = cov / zsqrt(x_var * y_var) + return Series(result, index=x.index, name=x.name) + + return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func) class ExponentialMovingWindowGroupby(BaseWindowGroupby, ExponentialMovingWindow): From 72b072316da8a805e730aeb80648eecc1c21534c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 4 Feb 2021 20:57:40 -0800 Subject: [PATCH 19/19] Fix Typing --- pandas/core/window/ewm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 6565077d73de8..e02555fb1e990 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -10,7 +10,7 @@ from pandas._libs.tslibs import Timedelta import pandas._libs.window.aggregations as window_aggregations -from pandas._typing import FrameOrSeries, TimedeltaConvertibleTypes +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, TimedeltaConvertibleTypes from pandas.compat.numpy import function as nv from pandas.util._decorators import doc @@ -447,7 +447,7 @@ def var_func(values, begin, end, min_periods): ) def cov( self, - other: Optional[Union[np.ndarray, FrameOrSeries]] = None, + other: Optional[FrameOrSeriesUnion] = None, pairwise: Optional[bool] = None, bias: bool = False, **kwargs, @@ -500,7 +500,7 @@ def cov_func(x, y): ) def corr( self, - other: Optional[Union[np.ndarray, FrameOrSeries]] = None, + other: Optional[FrameOrSeriesUnion] = None, pairwise: Optional[bool] = None, **kwargs, ):