ENH: Only apply first group once in fast GroupBy.apply

fjetter · fjetter · commit fcf845501e32 · 2019-01-27T11:52:29.000+01:00
diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst
@@ -948,18 +948,38 @@ that is itself a series, and possibly upcast the result to a DataFrame:
 
 .. warning::
 
-    In the current implementation apply calls func twice on the
-    first group to decide whether it can take a fast or slow code
-    path. This can lead to unexpected behavior if func has
-    side-effects, as they will take effect twice for the first
-    group.
+    The current implementation uses a cythonized code path which requires
+    that the input data is not modified inplace. The heuristic assumes that
+    this might be happening if ``func(group) is group`` in which case we fall
+    back to a slow code path which evaluates func on the first group a second
+    time.
+    This can lead to unexpected behavior if func has side-effects,
+    as they will take effect twice for the first group.
+    This behavior is illustrated by the two functions below.
 
     .. ipython:: python
 
         d = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
-        def identity(df):
-            print(df)
-            return df
+
+        def func_fast_apply(group):
+            """
+            This func doesn't modify inplace and returns
+            a scalar which is safe to fast apply
+            """
+            print(group.name)
+            return len(group)
+
+        d.groupby("a").apply(func_fast_apply)
+
+        def identity(group):
+            """
+            This triggers the slow path because ``identity(group) is group``
+            If there is no inplace modification happening
+            this may be avoided by returning a shallow copy
+            i.e. return group.copy()
+            """
+            print(group.name)
+            return group
 
         d.groupby("a").apply(identity)
 
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -26,6 +26,70 @@ Other Enhancements
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+Fast GroupBy.apply on ``DataFrame`` evaluates first group only once
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+(:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`,
+:issue:`20084`, :issue:`21417`)
+
+The implementation of ``DataFrame.groupby.apply`` previously always evaluated
+``func`` twice on the first group to infer whether it is safe to use a fast
+code path. Particularly for functions with side effects, this was undesired
+behavior and may have led to surprises.
+
+The new behavior is that the first group is no longer evaluated twice if the
+fast path can be used.
+
+Previous behavior:
+
+.. code-block:: ipython
+
+    In [2]: df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
+
+    In [3]: side_effects = []
+
+    In [4]: def func_fast_apply(group):
+    ...:     side_effects.append(group.name)
+    ...:     return len(group)
+    ...:
+
+    In [5]: df.groupby("a").apply(func_fast_apply)
+
+    In [6]: assert side_effects == ["x", "x", "y"]
+
+New behavior:
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
+
+    side_effects = []
+    def func_fast_apply(group):
+        """
+        This func doesn't modify inplace and returns
+        a scalar which is safe to fast apply
+        """
+        side_effects.append(group.name)
+        return len(group)
+
+    df.groupby("a").apply(func_fast_apply)
+    side_effects
+
+    side_effects.clear()
+    def identity(group):
+        """
+        This triggers the slow path because ``identity(group) is group``
+        If there is no inplace modification happening
+        this may be avoided by returning a shallow copy
+        i.e. return group.copy()
+        """
+        side_effects.append(group.name)
+        return group
+
+    df.groupby("a").apply(identity)
+    side_effects
+
+
 .. _whatsnew_0250.api.other:
 
 Other API Changes
diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx
@@ -507,17 +507,6 @@ def apply_frame_axis0(object frame, object f, object names,
 
     results = []
 
-    # Need to infer if our low-level mucking is going to cause a segfault
-    if n > 0:
-        chunk = frame.iloc[starts[0]:ends[0]]
-        object.__setattr__(chunk, 'name', names[0])
-        try:
-            result = f(chunk)
-            if result is chunk:
-                raise InvalidApply('Function unsafe for fast apply')
-        except:
-            raise InvalidApply('Let this error raise above us')
-
     slider = BlockSlider(frame)
 
     mutated = False
@@ -527,13 +516,23 @@ def apply_frame_axis0(object frame, object f, object names,
             slider.move(starts[i], ends[i])
 
             item_cache.clear()  # ugh
-
-            object.__setattr__(slider.dummy, 'name', names[i])
-            piece = f(slider.dummy)
+            chunk = slider.dummy
+            object.__setattr__(chunk, 'name', names[i])
+
+            # Need to infer if our low-level mucking will cause a segfault
+            if i == 0:
+                try:
+                    piece = f(chunk)
+                    if piece is chunk:
+                        raise InvalidApply('Function unsafe for fast apply')
+                except:
+                    raise InvalidApply('Let this error raise above us')
+            else:
+                piece = f(chunk)
 
             # I'm paying the price for index-sharing, ugh
             try:
-                if piece.index is slider.dummy.index:
+                if piece.index is chunk.index:
                     piece = piece.copy(deep='all')
                 else:
                     mutated = True
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py
@@ -568,6 +568,39 @@ def test_apply_dup_names_multi_agg(self):
 
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize("axis, expected", [
+        (0, ['a', 'b']),
+        (1, [0, 1, 2, 3, 4, 5]),
+    ])
+    def test_apply_first_row_once(self, axis, expected):
+        # GH24748, GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
+        df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
+
+        rows = []
+
+        def f_fast(row):
+            # record the label of each row/column f_fast is called on
+            rows.append(row.name)
+            return 0
+        df.apply(f_fast, axis=axis)
+
+        # every row should appear once, i.e. apply is called once per row
+        assert rows == expected
+
+        rows_slow = []
+
+        def f_slow(row):
+            """
+            This function triggers a `function does not reduce`
+            exception and uses the slow path
+            """
+            rows_slow.append(row.name)
+            return row.copy()
+
+        df.apply(f_slow, axis=axis)
+        expected_first_row_twice = [expected[0]] + expected
+        assert rows_slow == expected_first_row_twice
+
 
 class TestInferOutputShape(object):
     # the user has supplied an opaque UDF where
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -105,6 +105,28 @@ def f(g):
     assert not mutated
 
 
+def test_group_apply_once_per_group():
+    # GH24748, GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
+    df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
+
+    names = []
+
+    def f_copy(group):
+        names.append(group.name)
+        return group.copy()
+    df.groupby("a").apply(f_copy)
+    assert names == [0, 1, 2]
+
+    def f_nocopy(group):
+        names.append(group.name)
+        return group
+    names.clear()
+    # this takes the slow apply path, i.e. we need to apply the
+    # function to the first group twice
+    df.groupby("a").apply(f_nocopy)
+    assert names == [0, 0, 1, 2]
+
+
 def test_apply_with_mixed_dtype():
     # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
     df = DataFrame({'foo1': np.random.randn(6),
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -1420,20 +1420,30 @@ def foo(x):
 
 def test_group_name_available_in_inference_pass():
     # gh-15062
+    # GH24748, GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
     df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
 
     names = []
 
-    def f(group):
+    def f_fast(group):
         names.append(group.name)
         return group.copy()
 
-    df.groupby('a', sort=False, group_keys=False).apply(f)
-    # we expect 2 zeros because we call ``f`` once to see if a faster route
-    # can be used.
-    expected_names = [0, 0, 1, 2]
+    df.groupby('a', sort=False, group_keys=False).apply(f_fast)
+
+    # every group should appear once, i.e. apply is called once per group
+    expected_names = [0, 1, 2]
     assert names == expected_names
 
+    names_slow = []
+
+    def f_slow(group):
+        names_slow.append(group.name)
+        return group
+
+    df.groupby('a', sort=False, group_keys=False).apply(f_slow)
+    assert names_slow == [0, 0, 1, 2]
+
 
 def test_no_dummy_key_names(df):
     # see gh-1291
diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py
@@ -7,14 +7,12 @@
 import numpy as np
 import pytest
 
-import pandas.compat as compat
-from pandas.compat import lrange
-
 import pandas as pd
+import pandas as pd
+import pandas.util.testing as tm
 from pandas import DataFrame, Index, Series, isna
+import pandas.compat as compat
 from pandas.conftest import _get_cython_table_params
-import pandas.util.testing as tm
-from pandas.util.testing import assert_frame_equal, assert_series_equal
 
 
 class TestSeriesApply():
@@ -665,3 +663,27 @@ def test_map_missing_mixed(self, vals, mapping, exp):
         result = s.map(mapping)
 
         tm.assert_series_equal(result, pd.Series(exp))
+
+    def test_apply_only_once(self):
+        # GH24748, GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417
+        ser = pd.Series([0, 0, 1, 1, 2, 2], name="series")
+        rows = []
+
+        def f(row):
+            rows.append(row)
+            return row
+        ser.apply(f)
+        # every element should appear once, i.e. f is called once per element
+        expected_names = [0, 0, 1, 1, 2, 2]
+        assert rows == expected_names
+
+        # The function should also be applied only once per element if the
+        # return shape is different
+        rows = []
+
+        def g(row):
+            rows.append(row)
+            return (row, row)
+        ser.apply(g)
+        expected_names = [0, 0, 1, 1, 2, 2]
+        assert rows == expected_names