Address review comments

fjetter · fjetter · commit 25fa4823d5eb · 2019-02-21T14:56:24.000+01:00
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -29,46 +29,40 @@ Backwards incompatible API changes
 GroupBy.apply on ``DataFrame`` evaluates first group only once
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-(:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`,
-:issue:`20084`, :issue:`21417`)
-
-The implementation of ``DataFrame.groupby.apply`` previously evaluated func
-consistently twice on the first group to infer if it is safe to use a fast
-code path. Particularly for functions with side effects, this was an undesired
-behavior and may have led to surprises.
+The implementation of :meth:`DataFrameGroupBy.apply() <pandas.core.groupby.DataFrameGroupBy.apply>`
+previously evaluated ``func`` twice on the first group to infer if it is safe
+to use a fast code path. Particularly for functions with side effects, this
+was undesired behavior and may have led to surprises.
 
 Now every group is evaluated only a single time.
 
-Previous behavior:
-
-.. code-block:: ipython
-
-    In [2]: df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
-
-    In [3]: side_effects = []
+:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`,
+:issue:`20084`, :issue:`21417`
 
-    In [4]: def func_fast_apply(group):
-    ...:     side_effects.append(group.name)
-    ...:     return len(group)
-    ...:
-
-    In [5]: df.groupby("a").apply(func_fast_apply)
-
-    In [6]: assert side_effects == ["x", "x", "y"]
-
-New behavior:
 
 .. ipython:: python
 
     df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
+    df
 
     side_effects = []
     def func(group):
         side_effects.append(group.name)
         return group
-
     df.groupby("a").apply(func)
-    assert side_effects == ["x", "y"]
+
+Previous behavior:
+
+.. code-block:: python
+
+    side_effects
+    >>> ["x", "x", "y"]
+
+New behavior:
+
+.. ipython:: python
+
+    side_effects
 
 
 .. _whatsnew_0250.api.other:
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
@@ -510,7 +510,7 @@ def apply_frame_axis0(object frame, object f, object names,
     slider = BlockSlider(frame)
 
     mutated = False
-    status = 0
+    successfull_fast_apply = True
     item_cache = slider.dummy._item_cache
     try:
         for i in range(n):
@@ -526,7 +526,7 @@ def apply_frame_axis0(object frame, object f, object names,
                 raise InvalidApply('Let this error raise above us')
             # Need to infer if low level index slider will cause segfaults
             if i == 0 and piece is chunk:
-                status = 1
+                successfull_fast_apply = False
             try:
                 if piece.index is chunk.index:
                     piece = piece.copy(deep='all')
@@ -536,12 +536,12 @@ def apply_frame_axis0(object frame, object f, object names,
                 pass
 
             results.append(piece)
-            if status > 0:
+            if not successfull_fast_apply:
                 break
     finally:
         slider.reset()
 
-    return results, mutated, status
+    return results, mutated, successfull_fast_apply
 
 
 cdef class BlockSlider:
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -165,30 +165,37 @@ def apply(self, f, data, axis=0):
         mutated = self.mutated
         splitter = self._get_splitter(data, axis=axis)
         group_keys = self._get_group_keys()
-        status = 0
+        reuse_result = False
         result_values = []
         # oh boy
         f_name = com.get_callable_name(f)
         if (f_name not in base.plotting_methods and
                 hasattr(splitter, 'fast_apply') and axis == 0):
             try:
                 result = splitter.fast_apply(f, group_keys)
-                result_values, mutated, status = result
-                if status == 0:
-                    return group_keys, result_values, mutated
+                fast_apply_result, mutated, successful_fast_apply = result
+                # If the fast apply path could be used we can return here.
+                # Otherwise we need to fall back to the slow implementation.
+                if successful_fast_apply:
+                    return group_keys, fast_apply_result, mutated
+                else:
+                    # The slow implementation can still reuse the result
+                    # for the first group
+                    result_values = fast_apply_result
+                    reuse_result = True
             except reduction.InvalidApply:
-                # we detect a mutation of some kind
-                # so take slow path
+                # Cannot fast apply on MultiIndex (_has_complex_internals).
+                # InvalidApply is also raised if `f` itself raises; we fall
+                # through so the error surfaces from the Python slow path.
                 pass
             except Exception:
                 # raise this error to the caller
                 pass
 
         for key, (i, group) in zip(group_keys, splitter):
             object.__setattr__(group, 'name', key)
-            if status > 0 and i == 0:
+            if reuse_result and i == 0:
                 continue
-
             # group might be modified
             group_axes = _get_axes(group)
             res = f(group)
@@ -855,7 +862,7 @@ def fast_apply(self, f, names):
             starts, ends = lib.generate_slices(self.slabels, self.ngroups)
         except Exception:
             # fails when all -1
-            return [], True
+            return [], True, False
 
         sdata = self._get_sorted_data()
         return reduction.apply_frame_axis0(sdata, f, names, starts, ends)
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -101,26 +101,37 @@ def f(g):
     splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
     group_keys = grouper._get_group_keys()
 
-    values, mutated, status = splitter.fast_apply(f, group_keys)
-    assert status == 0
+    values, mutated, succsessful_apply = splitter.fast_apply(f, group_keys)
+    # The bool successful_apply signals whether or not the fast apply was
+    # successful on the entire data set. It is false for cases which need to
+    # fall back to a slow apply code path for safety reasons.
+    assert succsessful_apply
     assert not mutated
 
 
 def test_group_apply_once_per_group():
-    # GH24748 ,GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
-    df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
+    # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417
+
+    # This test should ensure that a function is only evaluated
+    # once per group. Previously the function was evaluated twice
+    # on the first group to check if the Cython index slider is safe to use.
+    # This test ensures that the side effect (append to list) is only triggered
+    # once per group.
+    df = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)})
 
     names = []
 
     def f_copy(group):
         names.append(group.name)
         return group.copy()
+
     df.groupby("a").apply(f_copy)
     assert names == [0, 1, 2]
 
     def f_nocopy(group):
         names.append(group.name)
         return group
+
     names = []
     # this takes the slow apply path
     df.groupby("a").apply(f_nocopy)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -1420,30 +1420,18 @@ def foo(x):
 
 def test_group_name_available_in_inference_pass():
     # gh-15062
-    # GH24748 ,GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
     df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
 
     names = []
 
-    def f_fast(group):
+    def f(group):
         names.append(group.name)
         return group.copy()
+    df.groupby('a', sort=False, group_keys=False).apply(f)
 
-    df.groupby('a', sort=False, group_keys=False).apply(f_fast)
-
-    # every group should appear once, i.e. apply is called once per group
     expected_names = [0, 1, 2]
     assert names == expected_names
 
-    names_slow = []
-
-    def f_slow(group):
-        names_slow.append(group.name)
-        return group
-
-    df.groupby('a', sort=False, group_keys=False).apply(f_slow)
-    assert names_slow == [0, 1, 2]
-
 
 def test_no_dummy_key_names(df):
     # see gh-1291