ENH: Only apply first group once in fast GroupBy.apply

fjetter · fjetter · commit ab2c65d123a9 · 2019-01-27T16:59:15.000+01:00
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -946,23 +946,6 @@ that is itself a series, and possibly upcast the result to a DataFrame:
    So depending on the path taken, and exactly what you are grouping. Thus the grouped columns(s) may be included in
    the output as well as set the indices.
 
-.. warning::
-
-    In the current implementation apply calls func twice on the
-    first group to decide whether it can take a fast or slow code
-    path. This can lead to unexpected behavior if func has
-    side-effects, as they will take effect twice for the first
-    group.
-
-    .. ipython:: python
-
-        d = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
-        def identity(df):
-            print(df)
-            return df
-
-        d.groupby("a").apply(identity)
-
 
 Other useful features
 ---------------------
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -26,6 +26,51 @@ Other Enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+GroupBy.apply on ``DataFrame`` evaluates first group only once
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+(:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`,
+:issue:`20084`, :issue:`21417`)
+
+The implementation of ``DataFrame.groupby.apply`` previously evaluated the
+supplied function twice on the first group to infer whether it is safe to use
+a fast code path. Particularly for functions with side effects, this was
+undesired behavior and may have led to surprises.
+
+Now every group is evaluated only a single time.
+
+Previous behavior:
+
+.. code-block:: ipython
+
+    In [2]: df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
+
+    In [3]: side_effects = []
+
+    In [4]: def func_fast_apply(group):
+    ...:     side_effects.append(group.name)
+    ...:     return len(group)
+    ...:
+
+    In [5]: df.groupby("a").apply(func_fast_apply)
+
+    In [6]: assert side_effects == ["x", "x", "y"]
+
+New behavior:
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
+
+    side_effects = []
+    def func(group):
+        side_effects.append(group.name)
+        return group
+
+    df.groupby("a").apply(func)
+    assert side_effects == ["x", "y"]
+
+
 .. _whatsnew_0250.api.other:
 
 Other API Changes
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
@@ -507,44 +507,41 @@ def apply_frame_axis0(object frame, object f, object names,
 
     results = []
 
-    # Need to infer if our low-level mucking is going to cause a segfault
-    if n > 0:
-        chunk = frame.iloc[starts[0]:ends[0]]
-        object.__setattr__(chunk, 'name', names[0])
-        try:
-            result = f(chunk)
-            if result is chunk:
-                raise InvalidApply('Function unsafe for fast apply')
-        except:
-            raise InvalidApply('Let this error raise above us')
-
     slider = BlockSlider(frame)
 
     mutated = False
+    status = 0
     item_cache = slider.dummy._item_cache
     try:
         for i in range(n):
             slider.move(starts[i], ends[i])
 
             item_cache.clear()  # ugh
+            chunk = slider.dummy
+            object.__setattr__(chunk, 'name', names[i])
 
-            object.__setattr__(slider.dummy, 'name', names[i])
-            piece = f(slider.dummy)
-
-            # I'm paying the price for index-sharing, ugh
             try:
-                if piece.index is slider.dummy.index:
+                piece = f(chunk)
+            except:
+                raise InvalidApply('Let this error raise above us')
+            # Need to infer if low level index slider will cause segfaults
+            if i == 0 and piece is chunk:
+                status = 1
+            try:
+                if piece.index is chunk.index:
                     piece = piece.copy(deep='all')
                 else:
                     mutated = True
             except AttributeError:
                 pass
 
             results.append(piece)
+            if status > 0:
+                break
     finally:
         slider.reset()
 
-    return results, mutated
+    return results, mutated, status
 
 
 cdef class BlockSlider:
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -165,14 +165,17 @@ def apply(self, f, data, axis=0):
         mutated = self.mutated
         splitter = self._get_splitter(data, axis=axis)
         group_keys = self._get_group_keys()
-
+        status = 0
+        result_values = []
         # oh boy
         f_name = com.get_callable_name(f)
         if (f_name not in base.plotting_methods and
                 hasattr(splitter, 'fast_apply') and axis == 0):
             try:
-                values, mutated = splitter.fast_apply(f, group_keys)
-                return group_keys, values, mutated
+                result = splitter.fast_apply(f, group_keys)
+                result_values, mutated, status = result
+                if status == 0:
+                    return group_keys, result_values, mutated
             except reduction.InvalidApply:
                 # we detect a mutation of some kind
                 # so take slow path
@@ -181,9 +184,10 @@ def apply(self, f, data, axis=0):
                 # raise this error to the caller
                 pass
 
-        result_values = []
         for key, (i, group) in zip(group_keys, splitter):
             object.__setattr__(group, 'name', key)
+            if status > 0 and i == 0:
+                continue
 
             # group might be modified
             group_axes = _get_axes(group)
@@ -854,10 +858,7 @@ def fast_apply(self, f, names):
             return [], True
 
         sdata = self._get_sorted_data()
-        results, mutated = reduction.apply_frame_axis0(sdata, f, names,
-                                                       starts, ends)
-
-        return results, mutated
+        return reduction.apply_frame_axis0(sdata, f, names, starts, ends)
 
     def _chop(self, sdata, slice_obj):
         if self.axis == 0:
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -101,10 +101,32 @@ def f(g):
     splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
     group_keys = grouper._get_group_keys()
 
-    values, mutated = splitter.fast_apply(f, group_keys)
+    values, mutated, status = splitter.fast_apply(f, group_keys)
+    assert status == 0
     assert not mutated
 
 
+def test_group_apply_once_per_group():
+    # GH24748, GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
+    df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
+
+    names = []
+
+    def f_copy(group):
+        names.append(group.name)
+        return group.copy()
+    df.groupby("a").apply(f_copy)
+    assert names == [0, 1, 2]
+
+    def f_nocopy(group):
+        names.append(group.name)
+        return group
+    names = []
+    # this takes the slow apply path
+    df.groupby("a").apply(f_nocopy)
+    assert names == [0, 1, 2]
+
+
 def test_apply_with_mixed_dtype():
     # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
     df = DataFrame({'foo1': np.random.randn(6),
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -1420,20 +1420,30 @@ def foo(x):
 
 def test_group_name_available_in_inference_pass():
     # gh-15062
+    # GH24748, GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
     df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
 
     names = []
 
-    def f(group):
+    def f_fast(group):
         names.append(group.name)
         return group.copy()
 
-    df.groupby('a', sort=False, group_keys=False).apply(f)
-    # we expect 2 zeros because we call ``f`` once to see if a faster route
-    # can be used.
-    expected_names = [0, 0, 1, 2]
+    df.groupby('a', sort=False, group_keys=False).apply(f_fast)
+
+    # every group should appear once, i.e. apply is called once per group
+    expected_names = [0, 1, 2]
     assert names == expected_names
 
+    names_slow = []
+
+    def f_slow(group):
+        names_slow.append(group.name)
+        return group
+
+    df.groupby('a', sort=False, group_keys=False).apply(f_slow)
+    assert names_slow == [0, 1, 2]
+
 
 def test_no_dummy_key_names(df):
     # see gh-1291