Skip to content

Commit 6c3d1f5

Browse files
Joe Jevnik, mattip
Joe Jevnik
authored and committed
PERF: add the 'name' attribute to dataframes that go through apply_frame_axis0
Previously, if the applied function accessed `group.name`, the fast path would fail and fall back to the slower path because the attribute did not exist. `shape_before` was unused and has been removed.

Author: Joe Jevnik <[email protected]>

This patch had conflicts when merged, resolved by
Committer: Jeff Reback <[email protected]>

Closes pandas-dev#15062 from llllllllll/add-name-in-apply-inference-call and squashes the following commits:

722a945 [Joe Jevnik] DOC: update whatsnew for groupby perf change
7e75635 [Joe Jevnik] DEV: add groupby asv benchmark
710528a [Joe Jevnik] BUG: add the 'name' attribute to dataframes that go through apply_frame_axis0
1 parent 1ad8cbb commit 6c3d1f5

File tree

4 files changed

+45
-8
lines changed

4 files changed

+45
-8
lines changed

asv_bench/benchmarks/groupby.py

+25-7
Original file line numberDiff line numberDiff line change
@@ -108,16 +108,34 @@ def setup(self):
108108
self.N = 10000
109109
self.labels = np.random.randint(0, 2000, size=self.N)
110110
self.labels2 = np.random.randint(0, 3, size=self.N)
111-
self.df = DataFrame({'key': self.labels, 'key2': self.labels2, 'value1': randn(self.N), 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N / 4)), })
112-
113-
def f(self, g):
111+
self.df = DataFrame({
112+
'key': self.labels,
113+
'key2': self.labels2,
114+
'value1': np.random.randn(self.N),
115+
'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N // 4)),
116+
})
117+
118+
@staticmethod
119+
def scalar_function(g):
114120
return 1
115121

116-
def time_groupby_frame_apply(self):
117-
self.df.groupby(['key', 'key2']).apply(self.f)
122+
def time_groupby_frame_apply_scalar_function(self):
123+
self.df.groupby(['key', 'key2']).apply(self.scalar_function)
124+
125+
def time_groupby_frame_apply_scalar_function_overhead(self):
126+
self.df.groupby('key').apply(self.scalar_function)
127+
128+
@staticmethod
129+
def df_copy_function(g):
130+
# ensure that the group name is available (see GH #15062)
131+
g.name
132+
return g.copy()
133+
134+
def time_groupby_frame_df_copy_function(self):
135+
self.df.groupby(['key', 'key2']).apply(self.df_copy_function)
118136

119-
def time_groupby_frame_apply_overhead(self):
120-
self.df.groupby('key').apply(self.f)
137+
def time_groupby_frame_apply_df_copy_overhead(self):
138+
self.df.groupby('key').apply(self.df_copy_function)
121139

122140

123141
#----------------------------------------------------------------------

doc/source/whatsnew/v0.20.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -831,6 +831,9 @@ Performance Improvements
831831
- Improved performance when using ``.unstack()`` (:issue:`15503`)
832832
- Improved performance of merge/join on ``category`` columns (:issue:`10409`)
833833
- Improved performance of ``drop_duplicates()`` on ``bool`` columns (:issue:`12963`)
834+
- Improved performance of ``pd.core.groupby.GroupBy.apply`` when the applied
835+
function uses the ``.name`` attribute of the group DataFrame (:issue:`15062`).
836+
834837

835838

836839
.. _whatsnew_0200.bug_fixes:

pandas/_libs/src/reduce.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -497,7 +497,7 @@ def apply_frame_axis0(object frame, object f, object names,
497497
# Need to infer if our low-level mucking is going to cause a segfault
498498
if n > 0:
499499
chunk = frame.iloc[starts[0]:ends[0]]
500-
shape_before = chunk.shape
500+
object.__setattr__(chunk, 'name', names[0])
501501
try:
502502
result = f(chunk)
503503
if result is chunk:

pandas/tests/groupby/test_groupby.py

+16
Original file line numberDiff line numberDiff line change
@@ -3244,6 +3244,22 @@ def _check_all(grouped):
32443244
_check_all(self.df.groupby('A'))
32453245
_check_all(self.df.groupby(['A', 'B']))
32463246

3247+
def test_group_name_available_in_inference_pass(self):
3248+
# gh-15062
3249+
df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
3250+
3251+
names = []
3252+
3253+
def f(group):
3254+
names.append(group.name)
3255+
return group.copy()
3256+
3257+
df.groupby('a', sort=False, group_keys=False).apply(f)
3258+
# we expect 2 zeros because we call ``f`` once to see if a faster route
3259+
# can be used.
3260+
expected_names = [0, 0, 1, 2]
3261+
tm.assert_equal(names, expected_names)
3262+
32473263
def test_no_dummy_key_names(self):
32483264
# GH #1291
32493265

0 commit comments

Comments
 (0)