Skip to content

Commit fcf8455

Browse files
committed
ENH: Only apply first group once in fast GroupBy.apply
1 parent 2626215 commit fcf8455

File tree

7 files changed

+203
-33
lines changed

7 files changed

+203
-33
lines changed

doc/source/user_guide/groupby.rst

+28-8
Original file line numberDiff line numberDiff line change
@@ -948,18 +948,38 @@ that is itself a series, and possibly upcast the result to a DataFrame:
948948

949949
.. warning::
950950

951-
In the current implementation apply calls func twice on the
952-
first group to decide whether it can take a fast or slow code
953-
path. This can lead to unexpected behavior if func has
954-
side-effects, as they will take effect twice for the first
955-
group.
951+
The current implementation uses a cythonized code path which requires
952+
that the input data is not modified inplace. The heuristic assumes that
953+
this might be happening if ``func(group) is group`` in which case we fall
954+
back to a slow code path which evaluates func on the first group a second
955+
time.
956+
This can lead to unexpected behavior if func has side-effects,
957+
as they will take effect twice for the first group.
958+
This behavior is illustrated in the examples below.
956959

957960
.. ipython:: python
958961
959962
d = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
960-
def identity(df):
961-
print(df)
962-
return df
963+
964+
def func_fast_apply(group):
965+
"""
966+
This func doesn't modify inplace and returns
967+
a scalar which is safe to fast apply
968+
"""
969+
print(group.name)
970+
return len(group)
971+
972+
d.groupby("a").apply(func_fast_apply)
973+
974+
def identity(group):
975+
"""
976+
This triggers the slow path because ``identity(group) is group``.
977+
If there is no inplace modification happening,
978+
this may be avoided by returning a copy,
979+
i.e. ``return group.copy()``
980+
"""
981+
print(group.name)
982+
return group
963983
964984
d.groupby("a").apply(identity)
965985

doc/source/whatsnew/v0.25.0.rst

+64
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,70 @@ Other Enhancements
2626
Backwards incompatible API changes
2727
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2828

29+
Fast GroupBy.apply on ``DataFrame`` evaluates first group only once
30+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
31+
32+
(:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`,
33+
:issue:`20084`, :issue:`21417`)
34+
35+
The implementation of ``DataFrame.groupby.apply`` previously always evaluated
36+
``func`` twice on the first group to infer whether it is safe to use a fast
37+
code path. Particularly for functions with side effects, this was undesired
38+
behavior and may have led to surprises.
39+
40+
The new behavior is that the first group is no longer evaluated twice if the
41+
fast path can be used.
42+
43+
Previous behavior:
44+
45+
.. code-block:: ipython
46+
47+
In [2]: df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
48+
49+
In [3]: side_effects = []
50+
51+
In [4]: def func_fast_apply(group):
52+
...: side_effects.append(group.name)
53+
...: return len(group)
54+
...:
55+
56+
In [5]: df.groupby("a").apply(func_fast_apply)
57+
58+
In [6]: assert side_effects == ["x", "x", "y"]
59+
60+
New behavior:
61+
62+
.. ipython:: python
63+
64+
df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
65+
66+
side_effects = []
67+
def func_fast_apply(group):
68+
"""
69+
This func doesn't modify inplace and returns
70+
a scalar which is safe to fast apply
71+
"""
72+
side_effects.append(group.name)
73+
return len(group)
74+
75+
df.groupby("a").apply(func_fast_apply)
76+
side_effects
77+
78+
side_effects.clear()
79+
def identity(group):
80+
"""
81+
This triggers the slow path because ``identity(group) is group``.
82+
If there is no inplace modification happening,
83+
this may be avoided by returning a copy,
84+
i.e. ``return group.copy()``
85+
"""
86+
side_effects.append(group.name)
87+
return group
88+
89+
df.groupby("a").apply(identity)
90+
side_effects
91+
92+
2993
.. _whatsnew_0250.api.other:
3094

3195
Other API Changes

pandas/_libs/reduction.pyx

+14-15
Original file line numberDiff line numberDiff line change
@@ -507,17 +507,6 @@ def apply_frame_axis0(object frame, object f, object names,
507507

508508
results = []
509509

510-
# Need to infer if our low-level mucking is going to cause a segfault
511-
if n > 0:
512-
chunk = frame.iloc[starts[0]:ends[0]]
513-
object.__setattr__(chunk, 'name', names[0])
514-
try:
515-
result = f(chunk)
516-
if result is chunk:
517-
raise InvalidApply('Function unsafe for fast apply')
518-
except:
519-
raise InvalidApply('Let this error raise above us')
520-
521510
slider = BlockSlider(frame)
522511

523512
mutated = False
@@ -527,13 +516,23 @@ def apply_frame_axis0(object frame, object f, object names,
527516
slider.move(starts[i], ends[i])
528517

529518
item_cache.clear() # ugh
530-
531-
object.__setattr__(slider.dummy, 'name', names[i])
532-
piece = f(slider.dummy)
519+
chunk = slider.dummy
520+
object.__setattr__(chunk, 'name', names[i])
521+
522+
# Need to infer if our low-level mucking will cause a segfault
523+
if i == 0:
524+
try:
525+
piece = f(chunk)
526+
if piece is chunk:
527+
raise InvalidApply('Function unsafe for fast apply')
528+
except:
529+
raise InvalidApply('Let this error raise above us')
530+
else:
531+
piece = f(chunk)
533532

534533
# I'm paying the price for index-sharing, ugh
535534
try:
536-
if piece.index is slider.dummy.index:
535+
if piece.index is chunk.index:
537536
piece = piece.copy(deep='all')
538537
else:
539538
mutated = True

pandas/tests/frame/test_apply.py

+33
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,39 @@ def test_apply_dup_names_multi_agg(self):
568568

569569
tm.assert_frame_equal(result, expected)
570570

571+
@pytest.mark.parametrize("axis, expected", [
572+
(0, ['a', 'b']),
573+
(1, [0, 1, 2, 3, 4, 5]),
574+
])
575+
def test_apply_first_row_once(self, axis, expected):
576+
# GH24748, GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
577+
df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
578+
579+
rows = []
580+
581+
def f_fast(row):
    """Record the label of the row/column passed in and reduce it to 0.

    Appends ``row.name`` to the enclosing ``rows`` list so the test can
    check how many times, and in which order, ``apply`` invoked this UDF.
    """
    # No debugger hook here: a breakpoint would stall or error an
    # unattended pytest run and pulls in an undeclared dependency.
    rows.append(row.name)
    return 0
585+
df.apply(f_fast, axis=axis)
586+
587+
# every row should appear once, i.e. apply is called once per row
588+
assert rows == expected
589+
590+
rows_slow = []
591+
592+
def f_slow(row):
593+
"""
594+
This function triggers a `function does not reduce`
595+
exception and uses the slow path
596+
"""
597+
rows_slow.append(row.name)
598+
return row.copy()
599+
600+
df.apply(f_slow, axis=axis)
601+
expected_first_row_twice = [expected[0]] + expected
602+
assert rows_slow == expected_first_row_twice
603+
571604

572605
class TestInferOutputShape(object):
573606
# the user has supplied an opaque UDF where

pandas/tests/groupby/test_apply.py

+22
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,28 @@ def f(g):
105105
assert not mutated
106106

107107

108+
def test_group_apply_once_per_group():
109+
# GH24748, GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
110+
df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
111+
112+
names = []
113+
114+
def f_copy(group):
115+
names.append(group.name)
116+
return group.copy()
117+
df.groupby("a").apply(f_copy)
118+
assert names == [0, 1, 2]
119+
120+
def f_nocopy(group):
121+
names.append(group.name)
122+
return group
123+
names.clear()
124+
# this takes the slow apply path, i.e. we need to apply the
125+
# function to the first group twice
126+
df.groupby("a").apply(f_nocopy)
127+
assert names == [0, 0, 1, 2]
128+
129+
108130
def test_apply_with_mixed_dtype():
109131
# GH3480, apply with mixed dtype on axis=1 breaks in 0.11
110132
df = DataFrame({'foo1': np.random.randn(6),

pandas/tests/groupby/test_groupby.py

+15-5
Original file line numberDiff line numberDiff line change
@@ -1420,20 +1420,30 @@ def foo(x):
14201420

14211421
def test_group_name_available_in_inference_pass():
14221422
# gh-15062
1423+
# GH24748, GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
14231424
df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
14241425

14251426
names = []
14261427

1427-
def f(group):
1428+
def f_fast(group):
14281429
names.append(group.name)
14291430
return group.copy()
14301431

1431-
df.groupby('a', sort=False, group_keys=False).apply(f)
1432-
# we expect 2 zeros because we call ``f`` once to see if a faster route
1433-
# can be used.
1434-
expected_names = [0, 0, 1, 2]
1432+
df.groupby('a', sort=False, group_keys=False).apply(f_fast)
1433+
1434+
# every group should appear once, i.e. apply is called once per group
1435+
expected_names = [0, 1, 2]
14351436
assert names == expected_names
14361437

1438+
names_slow = []
1439+
1440+
def f_slow(group):
1441+
names_slow.append(group.name)
1442+
return group
1443+
1444+
df.groupby('a', sort=False, group_keys=False).apply(f_slow)
1445+
assert names_slow == [0, 0, 1, 2]
1446+
14371447

14381448
def test_no_dummy_key_names(df):
14391449
# see gh-1291

pandas/tests/series/test_apply.py

+27-5
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,12 @@
77
import numpy as np
88
import pytest
99

10-
import pandas.compat as compat
11-
from pandas.compat import lrange
12-
1310
import pandas as pd
11+
import pandas as pd
12+
import pandas.util.testing as tm
1413
from pandas import DataFrame, Index, Series, isna
14+
import pandas.compat as compat
1515
from pandas.conftest import _get_cython_table_params
16-
import pandas.util.testing as tm
17-
from pandas.util.testing import assert_frame_equal, assert_series_equal
1816

1917

2018
class TestSeriesApply():
@@ -665,3 +663,27 @@ def test_map_missing_mixed(self, vals, mapping, exp):
665663
result = s.map(mapping)
666664

667665
tm.assert_series_equal(result, pd.Series(exp))
666+
667+
def test_apply_only_once(self):
668+
# GH24748, GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
669+
ser = pd.Series([0, 0, 1, 1, 2, 2], name="series")
670+
rows = []
671+
672+
def f(row):
673+
rows.append(row)
674+
return row
675+
ser.apply(f)
676+
# every row should appear once, i.e. apply is called once per row
677+
expected_names = [0, 0, 1, 1, 2, 2]
678+
assert rows == expected_names
679+
680+
# Rows should also only be applied once if the return
681+
# shape is different
682+
rows = []
683+
684+
def g(row):
685+
rows.append(row)
686+
return (row, row)
687+
ser.apply(g)
688+
expected_names = [0, 0, 1, 1, 2, 2]
689+
assert rows == expected_names

0 commit comments

Comments
 (0)