ENH: Only apply first group once in fast GroupBy.apply (#24748)

fjetter · WillAyd · commit 324bb84c8e4c · 2019-03-25T21:38:33.000-07:00
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -946,23 +946,6 @@ that is itself a series, and possibly upcast the result to a DataFrame:
    So depending on the path taken, and exactly what you are grouping. Thus the grouped columns(s) may be included in
    the output as well as set the indices.
 
-.. warning::
-
-    In the current implementation apply calls func twice on the
-    first group to decide whether it can take a fast or slow code
-    path. This can lead to unexpected behavior if func has
-    side-effects, as they will take effect twice for the first
-    group.
-
-    .. ipython:: python
-
-        d = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
-        def identity(df):
-            print(df)
-            return df
-
-        d.groupby("a").apply(identity)
-
 
 Other useful features
 ---------------------
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -73,6 +73,50 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`)
     df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific'))
     df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00']
 
+.. _whatsnew_0250.api_breaking.groupby_apply_first_group_once:
+
+GroupBy.apply on ``DataFrame`` evaluates first group only once
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The implementation of :meth:`DataFrameGroupBy.apply() <pandas.core.groupby.DataFrameGroupBy.apply>`
+previously evaluated the supplied function consistently twice on the first group
+to infer if it is safe to use a fast code path. Particularly for functions with
+side effects, this was an undesired behavior and may have led to surprises.
+
+(:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`,
+:issue:`20084`, :issue:`21417`)
+
+Now every group is evaluated only a single time.
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
+    df
+
+    def func(group):
+        print(group.name)
+        return group
+
+*Previous Behaviour*:
+
+.. code-block:: python
+
+   In [3]: df.groupby('a').apply(func)
+   x
+   x
+   y
+   Out[3]:
+      a  b
+   0  x  1
+   1  y  2
+
+*New Behaviour*:
+
+.. ipython:: python
+
+    df.groupby("a").apply(func)
+
+
 Concatenating Sparse Values
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -83,14 +127,14 @@ Series or DataFrame with sparse values, rather than a ``SparseDataFrame`` (:issu
 
    df = pd.DataFrame({"A": pd.SparseArray([0, 1])})
 
-*Previous Behavior:*
+*Previous Behavior*:
 
 .. code-block:: ipython
 
    In [2]: type(pd.concat([df, df]))
    pandas.core.sparse.frame.SparseDataFrame
 
-*New Behavior:*
+*New Behavior*:
 
 .. ipython:: python
 
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
@@ -509,17 +509,6 @@ def apply_frame_axis0(object frame, object f, object names,
 
     results = []
 
-    # Need to infer if our low-level mucking is going to cause a segfault
-    if n > 0:
-        chunk = frame.iloc[starts[0]:ends[0]]
-        object.__setattr__(chunk, 'name', names[0])
-        try:
-            result = f(chunk)
-            if result is chunk:
-                raise InvalidApply('Function unsafe for fast apply')
-        except:
-            raise InvalidApply('Let this error raise above us')
-
     slider = BlockSlider(frame)
 
     mutated = False
@@ -529,20 +518,31 @@ def apply_frame_axis0(object frame, object f, object names,
             slider.move(starts[i], ends[i])
 
             item_cache.clear()  # ugh
+            chunk = slider.dummy
+            object.__setattr__(chunk, 'name', names[i])
 
-            object.__setattr__(slider.dummy, 'name', names[i])
-            piece = f(slider.dummy)
+            try:
+                piece = f(chunk)
+            except:
+                raise InvalidApply('Let this error raise above us')
 
-            # I'm paying the price for index-sharing, ugh
+            # Need to infer if low level index slider will cause segfaults
+            require_slow_apply = i == 0 and piece is chunk
             try:
-                if piece.index is slider.dummy.index:
+                if piece.index is chunk.index:
                     piece = piece.copy(deep='all')
                 else:
                     mutated = True
             except AttributeError:
                 pass
 
             results.append(piece)
+
+            # If the data was modified inplace we need to
+            # take the slow path to not risk segfaults
+            # we have already computed the first piece
+            if require_slow_apply:
+                break
     finally:
         slider.reset()
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -165,26 +165,45 @@ def apply(self, f, data, axis=0):
         mutated = self.mutated
         splitter = self._get_splitter(data, axis=axis)
         group_keys = self._get_group_keys()
+        result_values = None
 
         # oh boy
         f_name = com.get_callable_name(f)
         if (f_name not in base.plotting_methods and
                 hasattr(splitter, 'fast_apply') and axis == 0):
             try:
-                values, mutated = splitter.fast_apply(f, group_keys)
-                return group_keys, values, mutated
+                result_values, mutated = splitter.fast_apply(f, group_keys)
+
+                # If the fast apply path could be used we can return here.
+                # Otherwise we need to fall back to the slow implementation.
+                if len(result_values) == len(group_keys):
+                    return group_keys, result_values, mutated
+
             except reduction.InvalidApply:
-                # we detect a mutation of some kind
-                # so take slow path
+                # Cannot fast apply on MultiIndex (_has_complex_internals).
+                # This Exception is also raised if `f` triggers an exception
+                # but it is preferable to raise the exception in Python.
                 pass
             except Exception:
                 # raise this error to the caller
                 pass
 
-        result_values = []
         for key, (i, group) in zip(group_keys, splitter):
             object.__setattr__(group, 'name', key)
 
+            # result_values is None if fast apply path wasn't taken
+            # or fast apply aborted with an unexpected exception.
+            # In either case, initialize the result list and perform
+            # the slow iteration.
+            if result_values is None:
+                result_values = []
+
+            # If result_values is not None we're in the case that the
+            # fast apply loop was broken prematurely but we have
+            # already the result for the first group which we can reuse.
+            elif i == 0:
+                continue
+
             # group might be modified
             group_axes = _get_axes(group)
             res = f(group)
@@ -854,10 +873,7 @@ def fast_apply(self, f, names):
             return [], True
 
         sdata = self._get_sorted_data()
-        results, mutated = reduction.apply_frame_axis0(sdata, f, names,
-                                                       starts, ends)
-
-        return results, mutated
+        return reduction.apply_frame_axis0(sdata, f, names, starts, ends)
 
     def _chop(self, sdata, slice_obj):
         if self.axis == 0:
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -102,9 +102,80 @@ def f(g):
     group_keys = grouper._get_group_keys()
 
     values, mutated = splitter.fast_apply(f, group_keys)
+
     assert not mutated
 
 
+@pytest.mark.parametrize(
+    "df, group_names",
+    [
+        (DataFrame({"a": [1, 1, 1, 2, 3],
+                    "b": ["a", "a", "a", "b", "c"]}),
+         [1, 2, 3]),
+        (DataFrame({"a": [0, 0, 1, 1],
+                    "b": [0, 1, 0, 1]}),
+         [0, 1]),
+        (DataFrame({"a": [1]}),
+         [1]),
+        (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2],
+                    "b": range(8)}),
+         [1, 2]),
+        (DataFrame({"a": [1, 2, 3, 1, 2, 3],
+                    "two": [4, 5, 6, 7, 8, 9]}),
+         [1, 2, 3]),
+        (DataFrame({"a": list("aaabbbcccc"),
+                    "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4],
+                    "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8]}),
+         ["a", "b", "c"]),
+        (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]),
+         [1, 2]),
+    ], ids=['GH2936', 'GH7739 & GH10519', 'GH10519',
+            'GH2656', 'GH12155', 'GH20084', 'GH21417'])
+def test_group_apply_once_per_group(df, group_names):
+    # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417
+
+    # This test should ensure that a function is only evaluted
+    # once per group. Previously the function has been evaluated twice
+    # on the first group to check if the Cython index slider is safe to use
+    # This test ensures that the side effect (append to list) is only triggered
+    # once per group
+
+    names = []
+    # cannot parameterize over the functions since they need external
+    # `names` to detect side effects
+
+    def f_copy(group):
+        # this takes the fast apply path
+        names.append(group.name)
+        return group.copy()
+
+    def f_nocopy(group):
+        # this takes the slow apply path
+        names.append(group.name)
+        return group
+
+    def f_scalar(group):
+        # GH7739, GH2656
+        names.append(group.name)
+        return 0
+
+    def f_none(group):
+        # GH10519, GH12155, GH21417
+        names.append(group.name)
+        return None
+
+    def f_constant_df(group):
+        # GH2936, GH20084
+        names.append(group.name)
+        return DataFrame({"a": [1], "b": [1]})
+
+    for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]:
+        del names[:]
+
+        df.groupby("a").apply(func)
+        assert names == group_names
+
+
 def test_apply_with_mixed_dtype():
     # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
     df = DataFrame({'foo1': np.random.randn(6),
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -1381,11 +1381,9 @@ def test_group_name_available_in_inference_pass():
     def f(group):
         names.append(group.name)
         return group.copy()
-
     df.groupby('a', sort=False, group_keys=False).apply(f)
-    # we expect 2 zeros because we call ``f`` once to see if a faster route
-    # can be used.
-    expected_names = [0, 0, 1, 2]
+
+    expected_names = [0, 1, 2]
     assert names == expected_names