From 5c9999779377b97491f518caff45cc1242208f21 Mon Sep 17 00:00:00 2001 From: fjetter Date: Sat, 12 Jan 2019 12:37:57 +0100 Subject: [PATCH 1/9] ENH: Only apply first group once in fast GroupBy.apply --- doc/source/user_guide/groupby.rst | 17 ---------- doc/source/whatsnew/v0.25.0.rst | 47 ++++++++++++++++++++++++++++ pandas/_libs/reduction.pyx | 31 +++++++++--------- pandas/core/groupby/ops.py | 17 +++++----- pandas/tests/groupby/test_apply.py | 24 +++++++++++++- pandas/tests/groupby/test_groupby.py | 20 +++++++++--- 6 files changed, 108 insertions(+), 48 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index e4dd82afcdf65..4f116a42253e5 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -946,23 +946,6 @@ that is itself a series, and possibly upcast the result to a DataFrame: So depending on the path taken, and exactly what you are grouping. Thus the grouped columns(s) may be included in the output as well as set the indices. -.. warning:: - - In the current implementation apply calls func twice on the - first group to decide whether it can take a fast or slow code - path. This can lead to unexpected behavior if func has - side-effects, as they will take effect twice for the first - group. - - .. ipython:: python - - d = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) - def identity(df): - print(df) - return df - - d.groupby("a").apply(identity) - Other useful features --------------------- diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 170e7f14da397..a5c47b1d17d77 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -63,6 +63,53 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] +.. _whatsnew_0250.api_breaking.groupby_apply_first_group_once: + +GroupBy.apply on ``DataFrame`` evaluates first group only once +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +(:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`, +:issue:`20084`, :issue:`21417`) + +The implementation of ``DataFrame.groupby.apply`` previously evaluated func +consistently twice on the first group to infer if it is safe to use a fast +code path. Particularly for functions with side effects, this was an undesired +behavior and may have led to surprises. + +Now every group is evaluated only a single time. + +Previous behavior: + +.. code-block:: ipython + + In [2]: df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) + + In [3]: side_effects = [] + + In [4]: def func_fast_apply(group): + ...: side_effects.append(group.name) + ...: return len(group) + ...: + + In [5]: df.groupby("a").apply(func_fast_apply) + + In [6]: assert side_effects == ["x", "x", "y"] + +New behavior: + +.. ipython:: python + + df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) + + side_effects = [] + def func(group): + side_effects.append(group.name) + return group + + df.groupby("a").apply(func) + assert side_effects == ["x", "y"] + + .. _whatsnew_0250.api.other: Other API Changes diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 507567cf480d7..9f5baff511033 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -507,33 +507,28 @@ def apply_frame_axis0(object frame, object f, object names, results = [] - # Need to infer if our low-level mucking is going to cause a segfault - if n > 0: - chunk = frame.iloc[starts[0]:ends[0]] - object.__setattr__(chunk, 'name', names[0]) - try: - result = f(chunk) - if result is chunk: - raise InvalidApply('Function unsafe for fast apply') - except: - raise InvalidApply('Let this error raise above us') - slider = BlockSlider(frame) mutated = False + status = 0 item_cache = slider.dummy._item_cache try: for i in range(n): slider.move(starts[i], ends[i]) item_cache.clear() # ugh + chunk = slider.dummy + object.__setattr__(chunk, 'name', names[i]) - object.__setattr__(slider.dummy, 'name', names[i]) - piece = f(slider.dummy) - - # I'm paying the price for index-sharing, ugh try: - if piece.index is slider.dummy.index: + piece = f(chunk) + except: + raise InvalidApply('Let this error raise above us') + # Need to infer if low level index slider will cause segfaults + if i == 0 and piece is chunk: + status = 1 + try: + if piece.index is chunk.index: piece = piece.copy(deep='all') else: mutated = True @@ -541,10 +536,12 @@ def apply_frame_axis0(object frame, object f, object names, pass results.append(piece) + if status > 0: + break finally: slider.reset() - return results, mutated + return results, mutated, status cdef class BlockSlider: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 78c9aa9187135..8f1d68924bcc2 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -165,14 +165,17 @@ def apply(self, f, data, axis=0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() - + status = 0 + result_values = [] # oh boy f_name = com.get_callable_name(f) if (f_name not in base.plotting_methods and hasattr(splitter, 'fast_apply') and axis == 0): try: - values, mutated = splitter.fast_apply(f, group_keys) - return group_keys, values, mutated + result = splitter.fast_apply(f, group_keys) + result_values, mutated, status = result + if status == 0: + return group_keys, result_values, mutated except reduction.InvalidApply: # we detect a mutation of some kind # so take slow path @@ -181,9 +184,10 @@ def apply(self, f, data, axis=0): # raise this error to the caller pass - result_values = [] for key, (i, group) in zip(group_keys, splitter): object.__setattr__(group, 'name', key) + if status > 0 and i == 0: + continue # group might be modified group_axes = _get_axes(group) @@ -854,10 +858,7 @@ def fast_apply(self, f, names): return [], True sdata = self._get_sorted_data() - results, mutated = reduction.apply_frame_axis0(sdata, f, names, - starts, ends) - - return results, mutated + return reduction.apply_frame_axis0(sdata, f, names, starts, ends) def _chop(self, sdata, slice_obj): if self.axis == 0: diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 659d1a9cf9813..5b920ce6b5f7d 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -101,10 +101,32 @@ def f(g): splitter = grouper._get_splitter(g._selected_obj, axis=g.axis) group_keys = grouper._get_group_keys() - values, mutated = splitter.fast_apply(f, group_keys) + values, mutated, status = splitter.fast_apply(f, group_keys) + assert status == 0 assert not mutated +def test_group_apply_once_per_group(): + # GH24748 ,GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417 + df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)}) + + names = [] + + def f_copy(group): + names.append(group.name) + return group.copy() + df.groupby("a").apply(f_copy) + assert names == [0, 1, 2] + + def f_nocopy(group): + names.append(group.name) + return group + names = [] + # this takes the slow apply path + df.groupby("a").apply(f_nocopy) + assert names == [0, 1, 2] + + def test_apply_with_mixed_dtype(): # GH3480, apply with mixed dtype on axis=1 breaks in 0.11 df = DataFrame({'foo1': np.random.randn(6), diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 12a5d494648fc..6fc47dd90e0ef 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1374,20 +1374,30 @@ def foo(x): def test_group_name_available_in_inference_pass(): # gh-15062 + # GH24748 ,GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417 df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)}) names = [] - def f(group): + def f_fast(group): names.append(group.name) return group.copy() - df.groupby('a', sort=False, group_keys=False).apply(f) - # we expect 2 zeros because we call ``f`` once to see if a faster route - # can be used. - expected_names = [0, 0, 1, 2] + df.groupby('a', sort=False, group_keys=False).apply(f_fast) + + # every group should appear once, i.e. apply is called once per group + expected_names = [0, 1, 2] assert names == expected_names + names_slow = [] + + def f_slow(group): + names_slow.append(group.name) + return group + + df.groupby('a', sort=False, group_keys=False).apply(f_slow) + assert names_slow == [0, 1, 2] + def test_no_dummy_key_names(df): # see gh-1291 From f2228f80c93876953de0d1816e37c1732c899991 Mon Sep 17 00:00:00 2001 From: fjetter Date: Sun, 27 Jan 2019 18:06:25 +0100 Subject: [PATCH 2/9] Address review comments --- doc/source/whatsnew/v0.25.0.rst | 46 ++++++++++++---------------- pandas/_libs/reduction.pyx | 8 ++--- pandas/core/groupby/ops.py | 25 +++++++++------ pandas/tests/groupby/test_apply.py | 19 +++++++++--- pandas/tests/groupby/test_groupby.py | 16 ++-------- 5 files changed, 57 insertions(+), 57 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index a5c47b1d17d77..3739a32879c8a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -68,46 +68,40 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`) GroupBy.apply on ``DataFrame`` evaluates first group only once ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`, -:issue:`20084`, :issue:`21417`) - -The implementation of ``DataFrame.groupby.apply`` previously evaluated func -consistently twice on the first group to infer if it is safe to use a fast -code path. Particularly for functions with side effects, this was an undesired -behavior and may have led to surprises. +The implementation of :meth:`DataFrameGroupBy.apply() ` +previously evaluated func consistently twice on the first group to infer if it +is safe to use a fast code path. Particularly for functions with side effects, +this was an undesired behavior and may have led to surprises. Now every group is evaluated only a single time. -Previous behavior: - -.. code-block:: ipython - - In [2]: df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) +:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`, +:issue:`20084`, :issue:`21417` - In [3]: side_effects = [] - - In [4]: def func_fast_apply(group): - ...: side_effects.append(group.name) - ...: return len(group) - ...: - - In [5]: df.groupby("a").apply(func_fast_apply) - - In [6]: assert side_effects == ["x", "x", "y"] - -New behavior: .. ipython:: python df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) + df side_effects = [] def func(group): side_effects.append(group.name) return group - df.groupby("a").apply(func) - assert side_effects == ["x", "y"] + +Previous behavior: + +.. code-block:: python + + side_effects + >>> ["x", "x", "y"] + +New behavior: + +.. ipython:: python + + side_effects .. _whatsnew_0250.api.other: diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 9f5baff511033..6b2b528a4ac66 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -510,7 +510,7 @@ def apply_frame_axis0(object frame, object f, object names, slider = BlockSlider(frame) mutated = False - status = 0 + successfull_fast_apply = True item_cache = slider.dummy._item_cache try: for i in range(n): @@ -526,7 +526,7 @@ def apply_frame_axis0(object frame, object f, object names, raise InvalidApply('Let this error raise above us') # Need to infer if low level index slider will cause segfaults if i == 0 and piece is chunk: - status = 1 + successfull_fast_apply = False try: if piece.index is chunk.index: piece = piece.copy(deep='all') @@ -536,12 +536,12 @@ def apply_frame_axis0(object frame, object f, object names, pass results.append(piece) - if status > 0: + if not successfull_fast_apply: break finally: slider.reset() - return results, mutated, status + return results, mutated, successfull_fast_apply cdef class BlockSlider: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8f1d68924bcc2..e848f69691ae4 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -165,7 +165,7 @@ def apply(self, f, data, axis=0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() - status = 0 + reuse_result = False result_values = [] # oh boy f_name = com.get_callable_name(f) @@ -173,12 +173,20 @@ def apply(self, f, data, axis=0): hasattr(splitter, 'fast_apply') and axis == 0): try: result = splitter.fast_apply(f, group_keys) - result_values, mutated, status = result - if status == 0: - return group_keys, result_values, mutated + fast_apply_result, mutated, successful_fast_apply = result + # If the fast apply path could be used we can return here. + # Otherwise we need to fall back to the slow implementation. + if successful_fast_apply: + return group_keys, fast_apply_result, mutated + else: + # The slow implementation can still reuse the result + # for the first group + result_values = fast_apply_result + reuse_result = True except reduction.InvalidApply: - # we detect a mutation of some kind - # so take slow path + # Cannot fast apply on MultiIndex (_has_complex_internals). + # This Exception is also raised if `f` triggers an exception but + # it is preferable if the exception is raised in Python. pass except Exception: # raise this error to the caller @@ -186,9 +194,8 @@ def apply(self, f, data, axis=0): for key, (i, group) in zip(group_keys, splitter): object.__setattr__(group, 'name', key) - if status > 0 and i == 0: + if reuse_result and i == 0: continue - # group might be modified group_axes = _get_axes(group) res = f(group) @@ -855,7 +862,7 @@ def fast_apply(self, f, names): starts, ends = lib.generate_slices(self.slabels, self.ngroups) except Exception: # fails when all -1 - return [], True + return [], True, False sdata = self._get_sorted_data() return reduction.apply_frame_axis0(sdata, f, names, starts, ends) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5b920ce6b5f7d..1fc911bcd2a9f 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -101,26 +101,37 @@ def f(g): splitter = grouper._get_splitter(g._selected_obj, axis=g.axis) group_keys = grouper._get_group_keys() - values, mutated, status = splitter.fast_apply(f, group_keys) - assert status == 0 + values, mutated, succsessful_apply = splitter.fast_apply(f, group_keys) + # The bool successful_apply signals whether or not the fast apply was + # successful on the entire data set. It is false for cases which need to + # fall back to a slow apply code path for safety reasons. + assert succsessful_apply assert not mutated def test_group_apply_once_per_group(): - # GH24748 ,GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417 - df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)}) + # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417 + + # This test should ensure that a function is only evaluted + # once per group. Previously the function has been evaluated twice + # on the first group to check if the Cython index slider is safe to use + # This test ensures that the side effect (append to list) is only triggered + # once per group + df = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)}) names = [] def f_copy(group): names.append(group.name) return group.copy() + df.groupby("a").apply(f_copy) assert names == [0, 1, 2] def f_nocopy(group): names.append(group.name) return group + names = [] # this takes the slow apply path df.groupby("a").apply(f_nocopy) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6fc47dd90e0ef..311c8ef447c54 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1374,30 +1374,18 @@ def foo(x): def test_group_name_available_in_inference_pass(): # gh-15062 - # GH24748 ,GH2936, GH2656, GH7739, GH10519, GH12155, GH20084, GH21417 df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)}) names = [] - def f_fast(group): + def f(group): names.append(group.name) return group.copy() + df.groupby('a', sort=False, group_keys=False).apply(f) - df.groupby('a', sort=False, group_keys=False).apply(f_fast) - - # every group should appear once, i.e. apply is called once per group expected_names = [0, 1, 2] assert names == expected_names - names_slow = [] - - def f_slow(group): - names_slow.append(group.name) - return group - - df.groupby('a', sort=False, group_keys=False).apply(f_slow) - assert names_slow == [0, 1, 2] - def test_no_dummy_key_names(df): # see gh-1291 From 5aae4c98050fc47bc3e304d35b6569fa449f58e3 Mon Sep 17 00:00:00 2001 From: fjetter Date: Thu, 21 Feb 2019 16:27:53 +0100 Subject: [PATCH 3/9] More PR comments --- doc/source/whatsnew/v0.25.0.rst | 17 ++++--- pandas/_libs/reduction.pyx | 12 +++-- pandas/core/groupby/ops.py | 24 +++++----- pandas/tests/groupby/test_apply.py | 75 ++++++++++++++++++++++++------ 4 files changed, 88 insertions(+), 40 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 3739a32879c8a..fdd39811c299a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -69,15 +69,14 @@ GroupBy.apply on ``DataFrame`` evaluates first group only once ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The implementation of :meth:`DataFrameGroupBy.apply() ` -previously evaluated func consistently twice on the first group to infer if it -is safe to use a fast code path. Particularly for functions with side effects, -this was an undesired behavior and may have led to surprises. +previously evaluated the supplied function consistently twice on the first group +to infer if it is safe to use a fast code path. Particularly for functions with +side effects, this was an undesired behavior and may have led to surprises. -Now every group is evaluated only a single time. - -:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`, -:issue:`20084`, :issue:`21417` +(:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`, +:issue:`20084`, :issue:`21417`) +Now every group is evaluated only a single time. .. ipython:: python @@ -90,14 +89,14 @@ Now every group is evaluated only a single time. return group df.groupby("a").apply(func) -Previous behavior: +Previous Behaviour: .. code-block:: python side_effects >>> ["x", "x", "y"] -New behavior: +New Behaviour: .. ipython:: python diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 6b2b528a4ac66..19b52fe34c936 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -510,7 +510,7 @@ def apply_frame_axis0(object frame, object f, object names, slider = BlockSlider(frame) mutated = False - successfull_fast_apply = True + require_slow_apply = True item_cache = slider.dummy._item_cache try: for i in range(n): @@ -524,9 +524,10 @@ def apply_frame_axis0(object frame, object f, object names, piece = f(chunk) except: raise InvalidApply('Let this error raise above us') + # Need to infer if low level index slider will cause segfaults if i == 0 and piece is chunk: - successfull_fast_apply = False + require_slow_apply = True try: if piece.index is chunk.index: piece = piece.copy(deep='all') @@ -536,12 +537,15 @@ def apply_frame_axis0(object frame, object f, object names, pass results.append(piece) - if not successfull_fast_apply: + + # If the data was modified inplace we need to + # take the slow path to not risk segfaults + if require_slow_apply: break finally: slider.reset() - return results, mutated, successfull_fast_apply + return results, mutated cdef class BlockSlider: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e848f69691ae4..0927ff5907d53 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -165,28 +165,23 @@ def apply(self, f, data, axis=0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() - reuse_result = False result_values = [] + # oh boy f_name = com.get_callable_name(f) if (f_name not in base.plotting_methods and hasattr(splitter, 'fast_apply') and axis == 0): try: - result = splitter.fast_apply(f, group_keys) - fast_apply_result, mutated, successful_fast_apply = result + result_values, mutated = splitter.fast_apply(f, group_keys) # If the fast apply path could be used we can return here. # Otherwise we need to fall back to the slow implementation. - if successful_fast_apply: - return group_keys, fast_apply_result, mutated - else: - # The slow implementation can still reuse the result - # for the first group - result_values = fast_apply_result - reuse_result = True + if len(result_values) == len(group_keys): + return group_keys, result_values, mutated + except reduction.InvalidApply: # Cannot fast apply on MultiIndex (_has_complex_internals). # This Exception is also raised if `f` triggers an exception but - # it is preferable if the exception is raised in Python. + # it is preferable to raise the exception in Python. pass except Exception: # raise this error to the caller @@ -194,7 +189,10 @@ def apply(self, f, data, axis=0): for key, (i, group) in zip(group_keys, splitter): object.__setattr__(group, 'name', key) - if reuse_result and i == 0: + + # If the fast apply failed we may still use the result + # for the first group + if len(result_values) == 1 and i == 0: continue # group might be modified group_axes = _get_axes(group) @@ -862,7 +860,7 @@ def fast_apply(self, f, names): starts, ends = lib.generate_slices(self.slabels, self.ngroups) except Exception: # fails when all -1 - return [], True, False + return [], True sdata = self._get_sorted_data() return reduction.apply_frame_axis0(sdata, f, names, starts, ends) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 1fc911bcd2a9f..994d17e945584 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -101,15 +101,46 @@ def f(g): splitter = grouper._get_splitter(g._selected_obj, axis=g.axis) group_keys = grouper._get_group_keys() - values, mutated, succsessful_apply = splitter.fast_apply(f, group_keys) - # The bool successful_apply signals whether or not the fast apply was - # successful on the entire data set. It is false for cases which need to - # fall back to a slow apply code path for safety reasons. - assert succsessful_apply + values, mutated = splitter.fast_apply(f, group_keys) + assert not mutated -def test_group_apply_once_per_group(): +@pytest.mark.parametrize( + "df,group_names", + [ + ( + DataFrame({"a": [1, 1, 1, 2, 3], + "b": ["a", "a", "a", "b", "c"]}), + [1, 2, 3], + ), # GH2936 + ( + DataFrame({"a": [0, 0, 1, 1], + "b": [0, 1, 0, 1]}), + [0, 1] + ), # GH7739, GH10519 + (DataFrame({"a": [1]}), [1]), # GH10519 + ( + DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], + "b": range(8)}), + [1, 2]), # GH2656 + ( + DataFrame({"a": [1, 2, 3, 1, 2, 3], "two": [4, 5, 6, 7, 8, 9]}), + [1, 2, 3], + ), # GH12155 + ( + DataFrame({"a": list("aaabbbcccc"), + "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4], + "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8]}), + ["a", "b", "c"], + ), # GH20084 + ( + DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), + [1, 2], + ), # GH21417 + ], +) +def test_group_apply_once_per_group(df, group_names): # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417 # This test should ensure that a function is only evaluted @@ -117,25 +148,41 @@ def test_group_apply_once_per_group(): # on the first group to check if the Cython index slider is safe to use # This test ensures that the side effect (append to list) is only triggered # once per group - df = pd.DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)}) names = [] + # cannot parameterize over the functions since they need external + # `names` to detect side effects def f_copy(group): + # this takes the fast apply path names.append(group.name) return group.copy() - df.groupby("a").apply(f_copy) - assert names == [0, 1, 2] - def f_nocopy(group): + # this takes the slow apply path names.append(group.name) return group - names = [] - # this takes the slow apply path - df.groupby("a").apply(f_nocopy) - assert names == [0, 1, 2] + def f_scalar(group): + # GH7739, GH2656 + names.append(group.name) + return 0 + + def f_none(group): + # GH10519, GH12155, GH21417 + names.append(group.name) + return None + + def f_constant_df(group): + # GH2936, GH20084 + names.append(group.name) + return DataFrame({"a": [1], "b": [1]}) + + for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]: + del names[:] + + df.groupby("a").apply(func) + assert names == group_names def test_apply_with_mixed_dtype(): From 6e52e67d6f2f6bb61eba2261f8cd510fd109c96b Mon Sep 17 00:00:00 2001 From: fjetter Date: Mon, 25 Feb 2019 13:54:36 +0100 Subject: [PATCH 4/9] PR comments --- pandas/core/groupby/ops.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0927ff5907d53..060e9b5ead787 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -165,7 +165,7 @@ def apply(self, f, data, axis=0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() - result_values = [] + result_values = None # oh boy f_name = com.get_callable_name(f) @@ -190,9 +190,16 @@ def apply(self, f, data, axis=0): for key, (i, group) in zip(group_keys, splitter): object.__setattr__(group, 'name', key) - # If the fast apply failed we may still use the result - # for the first group - if len(result_values) == 1 and i == 0: + # result_values is None if fast apply path wasn't taken + # or fast apply aborted with an unexpected exception. + # In either case, initialize the result list and perform + # the slow iteration. + if result_values is None: + result_values = [] + # If result_values is not None we're in the case that the + # fast apply loop was broken prematurely but we have + # already the result for the first group which we can reuse. + elif i == 0: continue # group might be modified group_axes = _get_axes(group) From 121d025e72c417b24936fcadc1a027bd226fe21b Mon Sep 17 00:00:00 2001 From: fjetter Date: Mon, 25 Feb 2019 15:56:50 +0100 Subject: [PATCH 5/9] fix docs --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/groupby/ops.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index fdd39811c299a..5a69115add344 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -87,6 +87,7 @@ Now every group is evaluated only a single time. def func(group): side_effects.append(group.name) return group + df.groupby("a").apply(func) Previous Behaviour: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 060e9b5ead787..42082518ee2c2 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -180,8 +180,8 @@ def apply(self, f, data, axis=0): except reduction.InvalidApply: # Cannot fast apply on MultiIndex (_has_complex_internals). - # This Exception is also raised if `f` triggers an exception but - # it is preferable to raise the exception in Python. + # This Exception is also raised if `f` triggers an exception + # but it is preferable to raise the exception in Python. pass except Exception: # raise this error to the caller From feb39eb26133fdb0f24a7e1ad14475ac42ce22e5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 25 Mar 2019 20:37:50 -0400 Subject: [PATCH 6/9] use printing in example --- doc/source/whatsnew/v0.25.0.rst | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 153ac1a7f0b9d..d271d0537c89c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -93,25 +93,28 @@ Now every group is evaluated only a single time. df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]}) df - side_effects = [] def func(group): - side_effects.append(group.name) + print(group.name) return group - df.groupby("a").apply(func) - Previous Behaviour: .. code-block:: python - side_effects - >>> ["x", "x", "y"] + In [3]: df.groupby('a').apply(func) + x + x + y + Out[3]: + a b + 0 x 1 + 1 y 2 New Behaviour: .. ipython:: python - side_effects + df.groupby("a").apply(func) Concatenating Sparse Values From 67a2909c3d0204115a0fd18dcdf07c4daaacba21 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 25 Mar 2019 20:40:22 -0400 Subject: [PATCH 7/9] fix whatsnew --- doc/source/whatsnew/v0.25.0.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d271d0537c89c..33aec3651730e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -97,7 +97,7 @@ Now every group is evaluated only a single time. print(group.name) return group -Previous Behaviour: +*Previous Behaviour*: .. code-block:: python @@ -110,7 +110,7 @@ Previous Behaviour: 0 x 1 1 y 2 -New Behaviour: +*New Behaviour*: .. ipython:: python @@ -127,14 +127,14 @@ Series or DataFrame with sparse values, rather than a ``SparseDataFrame`` (:issu df = pd.DataFrame({"A": pd.SparseArray([0, 1])}) -*Previous Behavior:* +*Previous Behavior*: .. code-block:: ipython In [2]: type(pd.concat([df, df])) pandas.core.sparse.frame.SparseDataFrame -*New Behavior:* +*New Behavior*: .. ipython:: python @@ -181,7 +181,6 @@ If installed, we now require: +-----------------+-----------------+----------+ | pytest (dev) | 4.0.2 | | +-----------------+-----------------+----------+ ->>>>>>> master .. _whatsnew_0250.api.other: From 2f36c300e0d42636168a1541053a139dc9ccdf4a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 25 Mar 2019 20:49:08 -0400 Subject: [PATCH 8/9] reformat tests --- pandas/tests/groupby/test_apply.py | 55 +++++++++++++----------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 6e91230b64279..d753556f9978d 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -107,39 +107,30 @@ def f(g): @pytest.mark.parametrize( - "df,group_names", + "df, group_names", [ - ( - DataFrame({"a": [1, 1, 1, 2, 3], - "b": ["a", "a", "a", "b", "c"]}), - [1, 2, 3], - ), # GH2936 - ( - DataFrame({"a": [0, 0, 1, 1], - "b": [0, 1, 0, 1]}), - [0, 1] - ), # GH7739, GH10519 - (DataFrame({"a": [1]}), [1]), # GH10519 - ( - DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], - "b": range(8)}), - [1, 2]), # GH2656 - ( - DataFrame({"a": [1, 2, 3, 1, 2, 3], "two": [4, 5, 6, 7, 8, 9]}), - [1, 2, 3], - ), # GH12155 - ( - DataFrame({"a": list("aaabbbcccc"), - "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4], - "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8]}), - ["a", "b", "c"], - ), # GH20084 - ( - DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), - [1, 2], - ), # GH21417 - ], -) + (DataFrame({"a": [1, 1, 1, 2, 3], + "b": ["a", "a", "a", "b", "c"]}), + [1, 2, 3]), + (DataFrame({"a": [0, 0, 1, 1], + "b": [0, 1, 0, 1]}), + [0, 1]), + (DataFrame({"a": [1]}), + [1]), + (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], + "b": range(8)}), + [1, 2]), + (DataFrame({"a": [1, 2, 3, 1, 2, 3], + "two": [4, 5, 6, 7, 8, 9]}), + [1, 2, 3]), + (DataFrame({"a": list("aaabbbcccc"), + "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4], + "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8]}), + ["a", "b", "c"]), + (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), + [1, 2]), + ], ids=['GH2936', 'GH7739 & GH10519', 'GH10519', + 'GH2656', 'GH12155', 'GH20084', 'GH21417']) def test_group_apply_once_per_group(df, group_names): # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417 From b52f2061d44ac528defd54efb7648ac149a585c1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 25 Mar 2019 21:00:09 -0400 Subject: [PATCH 9/9] small clean in reduction.pyx --- pandas/_libs/reduction.pyx | 5 ++--- pandas/core/groupby/ops.py | 3 +++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index afe264ca26c15..5df6e02d6a040 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -512,7 +512,6 @@ def apply_frame_axis0(object frame, object f, object names, slider = BlockSlider(frame) mutated = False - require_slow_apply = True item_cache = slider.dummy._item_cache try: for i in range(n): @@ -528,8 +527,7 @@ def apply_frame_axis0(object frame, object f, object names, raise InvalidApply('Let this error raise above us') # Need to infer if low level index slider will cause segfaults - if i == 0 and piece is chunk: - require_slow_apply = True + require_slow_apply = i == 0 and piece is chunk try: if piece.index is chunk.index: piece = piece.copy(deep='all') @@ -542,6 +540,7 @@ def apply_frame_axis0(object frame, object f, object names, # If the data was modified inplace we need to # take the slow path to not risk segfaults + # we have already computed the first piece if require_slow_apply: break finally: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7e0da649690f6..ec22548de6da3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -173,6 +173,7 @@ def apply(self, f, data, axis=0): hasattr(splitter, 'fast_apply') and axis == 0): try: result_values, mutated = splitter.fast_apply(f, group_keys) + # If the fast apply path could be used we can return here. # Otherwise we need to fall back to the slow implementation. if len(result_values) == len(group_keys): @@ -196,11 +197,13 @@ def apply(self, f, data, axis=0): # the slow iteration. if result_values is None: result_values = [] + # If result_values is not None we're in the case that the # fast apply loop was broken prematurely but we have # already the result for the first group which we can reuse. elif i == 0: continue + # group might be modified group_axes = _get_axes(group) res = f(group)