diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index fb08c6fdeaedf..9930c61e34b15 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -68,9 +68,18 @@ def time_groupby_apply_dict_return(self):
 
 
 class Apply:
-    def setup_cache(self):
-        N = 10 ** 4
-        labels = np.random.randint(0, 2000, size=N)
+
+    param_names = ["factor"]
+    params = [4, 5]
+
+    def setup(self, factor):
+        N = 10 ** factor
+        # two cases:
+        # - small groups: small data (10**4) + many labels (2000) -> average group
+        #   size of 5 (-> larger overhead of slicing method)
+        # - larger groups: larger data (10**5) + fewer labels (20) -> average group
+        #   size of 5000
+        labels = np.random.randint(0, 2000 if factor == 4 else 20, size=N)
         labels2 = np.random.randint(0, 3, size=N)
         df = DataFrame(
             {
@@ -80,13 +89,13 @@ def setup_cache(self):
                 "value2": ["foo", "bar", "baz", "qux"] * (N // 4),
             }
         )
-        return df
+        self.df = df
 
-    def time_scalar_function_multi_col(self, df):
-        df.groupby(["key", "key2"]).apply(lambda x: 1)
+    def time_scalar_function_multi_col(self, factor):
+        self.df.groupby(["key", "key2"]).apply(lambda x: 1)
 
-    def time_scalar_function_single_col(self, df):
-        df.groupby("key").apply(lambda x: 1)
+    def time_scalar_function_single_col(self, factor):
+        self.df.groupby("key").apply(lambda x: 1)
 
     @staticmethod
     def df_copy_function(g):
@@ -94,11 +103,11 @@ def df_copy_function(g):
         g.name
         return g.copy()
 
-    def time_copy_function_multi_col(self, df):
-        df.groupby(["key", "key2"]).apply(self.df_copy_function)
+    def time_copy_function_multi_col(self, factor):
+        self.df.groupby(["key", "key2"]).apply(self.df_copy_function)
 
-    def time_copy_overhead_single_col(self, df):
-        df.groupby("key").apply(self.df_copy_function)
+    def time_copy_overhead_single_col(self, factor):
+        self.df.groupby("key").apply(self.df_copy_function)
 
 
 class Groups:
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 41db72612a66b..2d9f60cb2fb49 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -339,6 +339,7 @@ Performance improvements
 - Performance improvement in :class:`core.window.rolling.ExpandingGroupby` aggregation methods (:issue:`39664`)
 - Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`)
 - Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)
+- Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 05d75bdda5131..886570ce7c060 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -208,14 +208,13 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
         group_keys = self._get_group_keys()
         result_values = None
 
-        sdata: FrameOrSeries = splitter._get_sorted_data()
-        if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)):
+        if data.ndim == 2 and np.any(data.dtypes.apply(is_extension_array_dtype)):
             # calling splitter.fast_apply will raise TypeError via apply_frame_axis0
             # if we pass EA instead of ndarray
             # TODO: can we have a workaround for EAs backed by ndarray?
             pass
 
-        elif isinstance(sdata._mgr, ArrayManager):
+        elif isinstance(data._mgr, ArrayManager):
             # TODO(ArrayManager) don't use fast_apply / libreduction.apply_frame_axis0
             # for now -> relies on BlockManager internals
             pass
@@ -224,9 +223,10 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
             and isinstance(splitter, FrameSplitter)
             and axis == 0
             # fast_apply/libreduction doesn't allow non-numpy backed indexes
-            and not sdata.index._has_complex_internals
+            and not data.index._has_complex_internals
         ):
             try:
+                sdata = splitter.sorted_data
                 result_values, mutated = splitter.fast_apply(f, sdata, group_keys)
 
             except IndexError:
@@ -988,7 +988,7 @@ def sort_idx(self):
         return get_group_index_sorter(self.labels, self.ngroups)
 
     def __iter__(self):
-        sdata = self._get_sorted_data()
+        sdata = self.sorted_data
 
         if self.ngroups == 0:
             # we are inside a generator, rather than raise StopIteration
@@ -1000,7 +1000,8 @@
         for i, (start, end) in enumerate(zip(starts, ends)):
             yield i, self._chop(sdata, slice(start, end))
 
-    def _get_sorted_data(self) -> FrameOrSeries:
+    @cache_readonly
+    def sorted_data(self) -> FrameOrSeries:
         return self.data.take(self.sort_idx, axis=self.axis)
 
     def _chop(self, sdata, slice_obj: slice) -> NDFrame:
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index daf5c71af7488..79ec0af267234 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -113,7 +113,7 @@ def f(g):
     splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
     group_keys = grouper._get_group_keys()
-    sdata = splitter._get_sorted_data()
+    sdata = splitter.sorted_data
 
     values, mutated = splitter.fast_apply(f, sdata, group_keys)