Skip to content

Commit 25fa482

Browse files
committed
Address review comments
1 parent ab2c65d commit 25fa482

File tree

5 files changed

+57
-57
lines changed

5 files changed

+57
-57
lines changed

doc/source/whatsnew/v0.25.0.rst

+20-26
Original file line numberDiff line numberDiff line change
@@ -29,46 +29,40 @@ Backwards incompatible API changes
2929
GroupBy.apply on ``DataFrame`` evaluates first group only once
3030
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3131

32-
(:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`,
33-
:issue:`20084`, :issue:`21417`)
34-
35-
The implementation of ``DataFrame.groupby.apply`` previously evaluated func
36-
consistently twice on the first group to infer if it is safe to use a fast
37-
code path. Particularly for functions with side effects, this was an undesired
38-
behavior and may have led to surprises.
32+
The implementation of :meth:`DataFrameGroupBy.apply() <pandas.core.groupby.DataFrameGroupBy.apply>`
33+
previously evaluated func consistently twice on the first group to infer if it
34+
is safe to use a fast code path. Particularly for functions with side effects,
35+
this was an undesired behavior and may have led to surprises.
3936

4037
Now every group is evaluated only a single time.
4138

42-
Previous behavior:
43-
44-
.. code-block:: ipython
45-
46-
In [2]: df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
47-
48-
In [3]: side_effects = []
39+
:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`,
40+
:issue:`20084`, :issue:`21417`
4941

50-
In [4]: def func_fast_apply(group):
51-
...: side_effects.append(group.name)
52-
...: return len(group)
53-
...:
54-
55-
In [5]: df.groupby("a").apply(func_fast_apply)
56-
57-
In [6]: assert side_effects == ["x", "x", "y"]
58-
59-
New behavior:
6042

6143
.. ipython:: python
6244
6345
df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
46+
df
6447
6548
side_effects = []
6649
def func(group):
6750
side_effects.append(group.name)
6851
return group
69-
7052
df.groupby("a").apply(func)
71-
assert side_effects == ["x", "y"]
53+
54+
Previous behavior:
55+
56+
.. code-block:: python
57+
58+
side_effects
59+
>>> ["x", "x", "y"]
60+
61+
New behavior:
62+
63+
.. ipython:: python
64+
65+
side_effects
7266
7367
7468
.. _whatsnew_0250.api.other:

pandas/_libs/reduction.pyx

+4-4
Original file line numberDiff line numberDiff line change
@@ -510,7 +510,7 @@ def apply_frame_axis0(object frame, object f, object names,
510510
slider = BlockSlider(frame)
511511

512512
mutated = False
513-
status = 0
513+
successfull_fast_apply = True
514514
item_cache = slider.dummy._item_cache
515515
try:
516516
for i in range(n):
@@ -526,7 +526,7 @@ def apply_frame_axis0(object frame, object f, object names,
526526
raise InvalidApply('Let this error raise above us')
527527
# Need to infer if low level index slider will cause segfaults
528528
if i == 0 and piece is chunk:
529-
status = 1
529+
successfull_fast_apply = False
530530
try:
531531
if piece.index is chunk.index:
532532
piece = piece.copy(deep='all')
@@ -536,12 +536,12 @@ def apply_frame_axis0(object frame, object f, object names,
536536
pass
537537

538538
results.append(piece)
539-
if status > 0:
539+
if not successfull_fast_apply:
540540
break
541541
finally:
542542
slider.reset()
543543

544-
return results, mutated, status
544+
return results, mutated, successfull_fast_apply
545545

546546

547547
cdef class BlockSlider:

pandas/core/groupby/ops.py

+16-9
Original file line numberDiff line numberDiff line change
@@ -165,30 +165,37 @@ def apply(self, f, data, axis=0):
165165
mutated = self.mutated
166166
splitter = self._get_splitter(data, axis=axis)
167167
group_keys = self._get_group_keys()
168-
status = 0
168+
reuse_result = False
169169
result_values = []
170170
# oh boy
171171
f_name = com.get_callable_name(f)
172172
if (f_name not in base.plotting_methods and
173173
hasattr(splitter, 'fast_apply') and axis == 0):
174174
try:
175175
result = splitter.fast_apply(f, group_keys)
176-
result_values, mutated, status = result
177-
if status == 0:
178-
return group_keys, result_values, mutated
176+
fast_apply_result, mutated, successful_fast_apply = result
177+
# If the fast apply path could be used we can return here.
178+
# Otherwise we need to fall back to the slow implementation.
179+
if successful_fast_apply:
180+
return group_keys, fast_apply_result, mutated
181+
else:
182+
# The slow implementation can still reuse the result
183+
# for the first group
184+
result_values = fast_apply_result
185+
reuse_result = True
179186
except reduction.InvalidApply:
180-
# we detect a mutation of some kind
181-
# so take slow path
187+
# Cannot fast apply on MultiIndex (_has_complex_internals).
188+
# This Exception is also raised if `f` triggers an exception but
189+
# it is preferable if the exception is raised in Python.
182190
pass
183191
except Exception:
184192
# raise this error to the caller
185193
pass
186194

187195
for key, (i, group) in zip(group_keys, splitter):
188196
object.__setattr__(group, 'name', key)
189-
if status > 0 and i == 0:
197+
if reuse_result and i == 0:
190198
continue
191-
192199
# group might be modified
193200
group_axes = _get_axes(group)
194201
res = f(group)
@@ -855,7 +862,7 @@ def fast_apply(self, f, names):
855862
starts, ends = lib.generate_slices(self.slabels, self.ngroups)
856863
except Exception:
857864
# fails when all -1
858-
return [], True
865+
return [], True, False
859866

860867
sdata = self._get_sorted_data()
861868
return reduction.apply_frame_axis0(sdata, f, names, starts, ends)

pandas/tests/groupby/test_apply.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -101,26 +101,37 @@ def f(g):
101101
splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
102102
group_keys = grouper._get_group_keys()
103103

104-
values, mutated, status = splitter.fast_apply(f, group_keys)
105-
assert status == 0
104+
values, mutated, succsessful_apply = splitter.fast_apply(f, group_keys)
105+
# The bool successful_apply signals whether or not the fast apply was
106+
# successful on the entire data set. It is false for cases which need to
107+
# fall back to a slow apply code path for safety reasons.
108+
assert succsessful_apply
106109
assert not mutated
107110

108111

109112
def test_group_apply_once_per_group():
110-
# GH24748 ,GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
111-
df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
113+
# GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417
114+
115+
# This test should ensure that a function is only evaluted
116+
# once per group. Previously the function has been evaluated twice
117+
# on the first group to check if the Cython index slider is safe to use
118+
# This test ensures that the side effect (append to list) is only triggered
119+
# once per group
120+
df = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)})
112121

113122
names = []
114123

115124
def f_copy(group):
116125
names.append(group.name)
117126
return group.copy()
127+
118128
df.groupby("a").apply(f_copy)
119129
assert names == [0, 1, 2]
120130

121131
def f_nocopy(group):
122132
names.append(group.name)
123133
return group
134+
124135
names = []
125136
# this takes the slow apply path
126137
df.groupby("a").apply(f_nocopy)

pandas/tests/groupby/test_groupby.py

+2-14
Original file line numberDiff line numberDiff line change
@@ -1420,30 +1420,18 @@ def foo(x):
14201420

14211421
def test_group_name_available_in_inference_pass():
14221422
# gh-15062
1423-
# GH24748 ,GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
14241423
df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
14251424

14261425
names = []
14271426

1428-
def f_fast(group):
1427+
def f(group):
14291428
names.append(group.name)
14301429
return group.copy()
1430+
df.groupby('a', sort=False, group_keys=False).apply(f)
14311431

1432-
df.groupby('a', sort=False, group_keys=False).apply(f_fast)
1433-
1434-
# every group should appear once, i.e. apply is called once per group
14351432
expected_names = [0, 1, 2]
14361433
assert names == expected_names
14371434

1438-
names_slow = []
1439-
1440-
def f_slow(group):
1441-
names_slow.append(group.name)
1442-
return group
1443-
1444-
df.groupby('a', sort=False, group_keys=False).apply(f_slow)
1445-
assert names_slow == [0, 1, 2]
1446-
14471435

14481436
def test_no_dummy_key_names(df):
14491437
# see gh-1291

0 commit comments

Comments
 (0)