From 7b86fdb9cb420e87b1cd03277eca5f29d383bee8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Tue, 22 Jun 2021 21:51:32 -0400 Subject: [PATCH 1/7] PERF: Try fast/slow paths only once in DataFrameGroupby.transform --- pandas/core/groupby/generic.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5cb0eac5d9074..758f98425ef50 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1309,17 +1309,21 @@ def _transform_general(self, func, *args, **kwargs): gen = self.grouper.get_iterator(obj, axis=self.axis) fast_path, slow_path = self._define_paths(func, *args, **kwargs) + path = None for name, group in gen: object.__setattr__(group, "name", name) - # Try slow path and fast path. - try: - path, res = self._choose_path(fast_path, slow_path, group) - except TypeError: - return self._transform_item_by_item(obj, fast_path) - except ValueError as err: - msg = "transform must return a scalar value for each group" - raise ValueError(msg) from err + if path is None: + # Try slow path and fast path. + try: + path, res = self._choose_path(fast_path, slow_path, group) + except TypeError: + return self._transform_item_by_item(obj, fast_path) + except ValueError as err: + msg = "transform must return a scalar value for each group" + raise ValueError(msg) from err + else: + res = path(group) if isinstance(res, Series): From b70e42079759c1e2e84facf5e5b7b626914fc80f Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 23 Jun 2021 07:08:19 -0400 Subject: [PATCH 2/7] Avoid checking path each iteration --- pandas/core/groupby/generic.py | 68 ++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 758f98425ef50..f69b783ccf551 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1309,45 +1309,51 @@ def _transform_general(self, func, *args, **kwargs): gen = self.grouper.get_iterator(obj, axis=self.axis) fast_path, slow_path = self._define_paths(func, *args, **kwargs) - path = None - for name, group in gen: - object.__setattr__(group, "name", name) - - if path is None: - # Try slow path and fast path. - try: - path, res = self._choose_path(fast_path, slow_path, group) - except TypeError: - return self._transform_item_by_item(obj, fast_path) - except ValueError as err: - msg = "transform must return a scalar value for each group" - raise ValueError(msg) from err - else: - res = path(group) - + def process_result(group, res): if isinstance(res, Series): # we need to broadcast across the # other dimension; this will preserve dtypes # GH14457 if not np.prod(group.shape): - continue - elif res.index.is_(obj.index): - r = concat([res] * len(group.columns), axis=1) - r.columns = group.columns - r.index = group.index + pass else: - r = self.obj._constructor( - np.concatenate([res.values] * len(group.index)).reshape( - group.shape - ), - columns=group.columns, - index=group.index, - ) - - applied.append(r) + if res.index.is_(obj.index): + r = concat([res] * len(group.columns), axis=1) + r.columns = group.columns + r.index = group.index + else: + r = self.obj._constructor( + np.concatenate([res.values] * len(group.index)).reshape( + group.shape + ), + columns=group.columns, + index=group.index, + ) + + return r else: - applied.append(res) + return res + + try: + name, group = next(gen) + except StopIteration: + pass + else: + object.__setattr__(group, "name", name) + try: + path, res = self._choose_path(fast_path, slow_path, group) + except TypeError: + return self._transform_item_by_item(obj, fast_path) + except ValueError as err: + msg = "transform must return a scalar value for each group" + raise ValueError(msg) from err + applied.append(process_result(group, res)) + + for name, group in gen: + object.__setattr__(group, "name", name) + res = path(group) + applied.append(process_result(group, res)) concat_index = obj.columns if self.axis == 0 else obj.index other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 From 15d238a934746beb9b08ab17eb4acbec5e463e3b Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 23 Jun 2021 07:12:43 -0400 Subject: [PATCH 3/7] fix --- pandas/core/groupby/generic.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f69b783ccf551..6d70741cd1cf0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1309,7 +1309,7 @@ def _transform_general(self, func, *args, **kwargs): gen = self.grouper.get_iterator(obj, axis=self.axis) fast_path, slow_path = self._define_paths(func, *args, **kwargs) - def process_result(group, res): + def process_result(applied, group, res): if isinstance(res, Series): # we need to broadcast across the @@ -1330,10 +1330,9 @@ def process_result(group, res): columns=group.columns, index=group.index, ) - - return r + applied.append(r) else: - return res + applied.append(res) try: name, group = next(gen) @@ -1348,12 +1347,12 @@ def process_result(group, res): except ValueError as err: msg = "transform must return a scalar value for each group" raise ValueError(msg) from err - applied.append(process_result(group, res)) + process_result(applied, group, res) for name, group in gen: object.__setattr__(group, "name", name) res = path(group) - applied.append(process_result(group, res)) + process_result(applied, group, res) concat_index = obj.columns if self.axis == 0 else obj.index other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 From 36de30e0961341f2cef8fb70249999b771bfb2f8 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 7 Jul 2021 09:50:13 -0400 Subject: [PATCH 4/7] Make process_result return instead of mutate --- pandas/core/groupby/generic.py | 41 +++++++++++++++++----------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6d70741cd1cf0..eaa0b5a0a95fa 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1309,30 +1309,28 @@ def _transform_general(self, func, *args, **kwargs): gen = self.grouper.get_iterator(obj, axis=self.axis) fast_path, slow_path = self._define_paths(func, *args, **kwargs) - def process_result(applied, group, res): + def process_result( + group: FrameOrSeriesUnion, res: FrameOrSeriesUnion + ) -> FrameOrSeriesUnion: if isinstance(res, Series): - # we need to broadcast across the # other dimension; this will preserve dtypes # GH14457 - if not np.prod(group.shape): - pass + if res.index.is_(obj.index): + r = concat([res] * len(group.columns), axis=1) + r.columns = group.columns + r.index = group.index else: - if res.index.is_(obj.index): - r = concat([res] * len(group.columns), axis=1) - r.columns = group.columns - r.index = group.index - else: - r = self.obj._constructor( - np.concatenate([res.values] * len(group.index)).reshape( - group.shape - ), - columns=group.columns, - index=group.index, - ) - applied.append(r) + r = self.obj._constructor( + np.concatenate([res.values] * len(group.index)).reshape( + group.shape + ), + columns=group.columns, + index=group.index, + ) + return r else: - applied.append(res) + return res try: name, group = next(gen) @@ -1347,12 +1345,15 @@ def process_result(applied, group, res): except ValueError as err: msg = "transform must return a scalar value for each group" raise ValueError(msg) from err - process_result(applied, group, res) + if group.size > 0: + applied.append(process_result(group, res)) for name, group in gen: + if group.size == 0: + continue object.__setattr__(group, "name", name) res = path(group) - process_result(applied, group, res) + applied.append(process_result(group, res)) concat_index = obj.columns if self.axis == 0 else obj.index other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 From 2d40eaacc1f4b07430600ed10364f5afafdcccf1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 7 Jul 2021 09:55:41 -0400 Subject: [PATCH 5/7] whatsnew --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 5b09b62fa9e88..495322dada350 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -159,7 +159,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :meth:`.GroupBy.sample`, especially when ``weights`` argument provided (:issue:`34483`) -- +- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions (:issue:`41598`) .. --------------------------------------------------------------------------- From ec9515b13221ae814398b6339f1fe79e893429ae Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 7 Jul 2021 17:42:09 -0400 Subject: [PATCH 6/7] Comment, refactor, and narrow typing --- pandas/core/groupby/generic.py | 58 +++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5c0b0da0fc90f..076193abd7ace 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1308,29 +1308,9 @@ def _transform_general(self, func, *args, **kwargs): gen = self.grouper.get_iterator(obj, axis=self.axis) fast_path, slow_path = self._define_paths(func, *args, **kwargs) - def process_result( - group: DataFrame | Series, res: DataFrame | Series - ) -> DataFrame | Series: - if isinstance(res, Series): - # we need to broadcast across the - # other dimension; this will preserve dtypes - # GH14457 - if res.index.is_(obj.index): - r = concat([res] * len(group.columns), axis=1) - r.columns = group.columns - r.index = group.index - else: - r = self.obj._constructor( - np.concatenate([res.values] * len(group.index)).reshape( - group.shape - ), - columns=group.columns, - index=group.index, - ) - return r - else: - return res - + # Determine whether to use slow or fast path by evaluating on the first group. + # Need to handle the case of an empty generator and process the result so that + # it does not need to be computed again. try: name, group = next(gen) except StopIteration: @@ -1345,14 +1325,17 @@ def process_result( msg = "transform must return a scalar value for each group" raise ValueError(msg) from err if group.size > 0: - applied.append(process_result(group, res)) + res = _wrap_transform_general_frame(self.obj, group, res) + applied.append(res) + # Compute and process with the remaining groups for name, group in gen: if group.size == 0: continue object.__setattr__(group, "name", name) res = path(group) - applied.append(process_result(group, res)) + res = _wrap_transform_general_frame(self.obj, group, res) + applied.append(res) concat_index = obj.columns if self.axis == 0 else obj.index other_axis = 1 if self.axis == 0 else 0 # switches between 0 & 1 @@ -1863,3 +1846,28 @@ def func(df): return self._python_apply_general(func, self._obj_with_exclusions) boxplot = boxplot_frame_groupby + + +def _wrap_transform_general_frame( + obj: DataFrame, group: DataFrame, res: DataFrame | Series +) -> DataFrame: + from pandas import concat + + if isinstance(res, Series): + # we need to broadcast across the + # other dimension; this will preserve dtypes + # GH14457 + if res.index.is_(obj.index): + r = concat([res] * len(group.columns), axis=1) + r.columns = group.columns + r.index = group.index + else: + r = obj._constructor( + np.concatenate([res.values] * len(group.index)).reshape(group.shape), + columns=group.columns, + index=group.index, + ) + assert isinstance(r, DataFrame) + return r + else: + return res From 2bea96453f4841953b1728e7ab55a1ea0258c086 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 8 Jul 2021 10:16:41 -0400 Subject: [PATCH 7/7] r -> res_frame --- pandas/core/groupby/generic.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 076193abd7ace..88d1baae86467 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1858,16 +1858,16 @@ def _wrap_transform_general_frame( # other dimension; this will preserve dtypes # GH14457 if res.index.is_(obj.index): - r = concat([res] * len(group.columns), axis=1) - r.columns = group.columns - r.index = group.index + res_frame = concat([res] * len(group.columns), axis=1) + res_frame.columns = group.columns + res_frame.index = group.index else: - r = obj._constructor( + res_frame = obj._constructor( np.concatenate([res.values] * len(group.index)).reshape(group.shape), columns=group.columns, index=group.index, ) - assert isinstance(r, DataFrame) - return r + assert isinstance(res_frame, DataFrame) + return res_frame else: return res