diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 03ff62568b405..4e66c96b82761 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -108,16 +108,34 @@ def setup(self):
         self.N = 10000
         self.labels = np.random.randint(0, 2000, size=self.N)
         self.labels2 = np.random.randint(0, 3, size=self.N)
-        self.df = DataFrame({'key': self.labels, 'key2': self.labels2, 'value1': randn(self.N), 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N / 4)), })
-
-    def f(self, g):
+        self.df = DataFrame({
+            'key': self.labels,
+            'key2': self.labels2,
+            'value1': randn(self.N),
+            'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N / 4)),
+        })
+
+    @staticmethod
+    def scalar_function(g):
         return 1
 
-    def time_groupby_frame_apply(self):
-        self.df.groupby(['key', 'key2']).apply(self.f)
+    def time_groupby_frame_apply_scalar_function(self):
+        self.df.groupby(['key', 'key2']).apply(self.scalar_function)
+
+    def time_groupby_frame_apply_scalar_function_overhead(self):
+        self.df.groupby('key').apply(self.scalar_function)
+
+    @staticmethod
+    def df_copy_function(g):
+        # ensure that the group name is available (see GH #15062)
+        g.name
+        return g.copy()
+
+    def time_groupby_frame_df_copy_function(self):
+        self.df.groupby(['key', 'key2']).apply(self.df_copy_function)
 
-    def time_groupby_frame_apply_overhead(self):
-        self.df.groupby('key').apply(self.f)
+    def time_groupby_frame_apply_df_copy_overhead(self):
+        self.df.groupby('key').apply(self.df_copy_function)
 
 
 #----------------------------------------------------------------------
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 7e4fa44ea8ded..360fcd8a37908 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -370,6 +370,10 @@ Performance Improvements
 
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object.
 
 
+- Improve performance of ``pd.core.groupby.GroupBy.apply`` when the applied
+  function used the ``.name`` attribute of the group DataFrame (:issue:`15062`).
+
+
 
 .. _whatsnew_0200.bug_fixes:
diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx
index 1cd3e53494a72..2bba07256305a 100644
--- a/pandas/src/reduce.pyx
+++ b/pandas/src/reduce.pyx
@@ -497,7 +497,7 @@ def apply_frame_axis0(object frame, object f, object names,
     # Need to infer if our low-level mucking is going to cause a segfault
     if n > 0:
         chunk = frame.iloc[starts[0]:ends[0]]
-        shape_before = chunk.shape
+        object.__setattr__(chunk, 'name', names[0])
         try:
             result = f(chunk)
             if result is chunk:
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 7140eb5a6fd12..483c3cb330c31 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -6022,6 +6022,21 @@ def test_cummin_cummax(self):
         result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
         tm.assert_frame_equal(expected, result)
 
+    def test_group_name_available_in_inference_pass(self):
+        df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
+
+        names = []
+
+        def f(group):
+            names.append(group.name)
+            return group.copy()
+
+        df.groupby('a', sort=False, group_keys=False).apply(f)
+        # we expect 2 zeros because we call ``f`` once to see if a faster route
+        # can be used.
+        expected_names = [0, 0, 1, 2]
+        tm.assert_equal(names, expected_names)
+
 
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()
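
For reference (not part of the patch), a minimal sketch of the behaviour the new test pins down, assuming a pandas build that includes this change:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})

    seen = []

    def f(group):
        # ``group.name`` is the group key; the first group is evaluated one
        # extra time by the fast-path inference call in apply_frame_axis0,
        # which is why 0 appears twice below.
        seen.append(group.name)
        return group.copy()

    df.groupby('a', sort=False, group_keys=False).apply(f)
    print(seen)  # [0, 0, 1, 2] now that the name is set during the inference pass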