Skip to content

PERF: don't sort data twice in groupby apply when not using libreduction fast_apply #40176

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 21 additions & 12 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,18 @@ def time_groupby_apply_dict_return(self):


class Apply:
def setup_cache(self):
N = 10 ** 4
labels = np.random.randint(0, 2000, size=N)

param_names = ["factor"]
params = [4, 5]

def setup(self, factor):
N = 10 ** factor
# two cases:
# - small groups: small data (10**4 rows) + many labels (2000) -> average group
# size of 5 (-> larger overhead of slicing method)
# - larger groups: larger data (10**5 rows) + fewer labels (20) -> average group
# size of 5000
labels = np.random.randint(0, 2000 if factor == 4 else 20, size=N)
labels2 = np.random.randint(0, 3, size=N)
df = DataFrame(
{
Expand All @@ -80,25 +89,25 @@ def setup_cache(self):
"value2": ["foo", "bar", "baz", "qux"] * (N // 4),
}
)
return df
self.df = df

def time_scalar_function_multi_col(self, df):
df.groupby(["key", "key2"]).apply(lambda x: 1)
def time_scalar_function_multi_col(self, factor):
self.df.groupby(["key", "key2"]).apply(lambda x: 1)

def time_scalar_function_single_col(self, df):
df.groupby("key").apply(lambda x: 1)
def time_scalar_function_single_col(self, factor):
self.df.groupby("key").apply(lambda x: 1)

@staticmethod
def df_copy_function(g):
# ensure that the group name is available (see GH #15062)
g.name
return g.copy()

def time_copy_function_multi_col(self, df):
df.groupby(["key", "key2"]).apply(self.df_copy_function)
def time_copy_function_multi_col(self, factor):
self.df.groupby(["key", "key2"]).apply(self.df_copy_function)

def time_copy_overhead_single_col(self, df):
df.groupby("key").apply(self.df_copy_function)
def time_copy_overhead_single_col(self, factor):
self.df.groupby("key").apply(self.df_copy_function)


class Groups:
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,7 @@ Performance improvements
- Performance improvement in :class:`core.window.rolling.ExpandingGroupby` aggregation methods (:issue:`39664`)
- Performance improvement in :class:`Styler` where render times are more than 50% reduced (:issue:`39972` :issue:`39952`)
- Performance improvement in :meth:`core.window.ewm.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`)
- Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`)

.. ---------------------------------------------------------------------------

Expand Down
13 changes: 7 additions & 6 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,14 +208,13 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
group_keys = self._get_group_keys()
result_values = None

sdata: FrameOrSeries = splitter._get_sorted_data()
if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)):
if data.ndim == 2 and np.any(data.dtypes.apply(is_extension_array_dtype)):
# calling splitter.fast_apply will raise TypeError via apply_frame_axis0
# if we pass EA instead of ndarray
# TODO: can we have a workaround for EAs backed by ndarray?
pass

elif isinstance(sdata._mgr, ArrayManager):
elif isinstance(data._mgr, ArrayManager):
# TODO(ArrayManager) don't use fast_apply / libreduction.apply_frame_axis0
# for now -> relies on BlockManager internals
pass
Expand All @@ -224,9 +223,10 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
and isinstance(splitter, FrameSplitter)
and axis == 0
# fast_apply/libreduction doesn't allow non-numpy backed indexes
and not sdata.index._has_complex_internals
and not data.index._has_complex_internals
):
try:
sdata = splitter.sorted_data
result_values, mutated = splitter.fast_apply(f, sdata, group_keys)

except IndexError:
Expand Down Expand Up @@ -988,7 +988,7 @@ def sort_idx(self):
return get_group_index_sorter(self.labels, self.ngroups)

def __iter__(self):
sdata = self._get_sorted_data()
sdata = self.sorted_data

if self.ngroups == 0:
# we are inside a generator, rather than raise StopIteration
Expand All @@ -1000,7 +1000,8 @@ def __iter__(self):
for i, (start, end) in enumerate(zip(starts, ends)):
yield i, self._chop(sdata, slice(start, end))

def _get_sorted_data(self) -> FrameOrSeries:
@cache_readonly
def sorted_data(self) -> FrameOrSeries:
return self.data.take(self.sort_idx, axis=self.axis)

def _chop(self, sdata, slice_obj: slice) -> NDFrame:
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def f(g):

splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
group_keys = grouper._get_group_keys()
sdata = splitter._get_sorted_data()
sdata = splitter.sorted_data

values, mutated = splitter.fast_apply(f, sdata, group_keys)

Expand Down