Skip to content

ENH: Only apply first group once in fast GroupBy.apply #24748

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 26, 2019
17 changes: 0 additions & 17 deletions doc/source/user_guide/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -946,23 +946,6 @@ that is itself a series, and possibly upcast the result to a DataFrame:
So depending on the path taken, and exactly what you are grouping. Thus the grouped columns(s) may be included in
the output as well as set the indices.

.. warning::

In the current implementation apply calls func twice on the
first group to decide whether it can take a fast or slow code
path. This can lead to unexpected behavior if func has
side-effects, as they will take effect twice for the first
group.

.. ipython:: python

d = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
def identity(df):
print(df)
return df

d.groupby("a").apply(identity)


Other useful features
---------------------
Expand Down
48 changes: 46 additions & 2 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,50 @@ is respected in indexing. (:issue:`24076`, :issue:`16785`)
df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific'))
df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00']

.. _whatsnew_0250.api_breaking.groupby_apply_first_group_once:

GroupBy.apply on ``DataFrame`` evaluates first group only once
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The implementation of :meth:`DataFrameGroupBy.apply() <pandas.core.groupby.DataFrameGroupBy.apply>`
previously evaluated the supplied function consistently twice on the first group
to infer if it is safe to use a fast code path. Particularly for functions with
side effects, this was an undesired behavior and may have led to surprises.

(:issue:`2936`, :issue:`2656`, :issue:`7739`, :issue:`10519`, :issue:`12155`,
:issue:`20084`, :issue:`21417`)

Now every group is evaluated only a single time.

.. ipython:: python

df = pd.DataFrame({"a": ["x", "y"], "b": [1, 2]})
df

def func(group):
print(group.name)
return group

*Previous Behaviour*:

.. code-block:: python

In [3]: df.groupby('a').apply(func)
x
x
y
Out[3]:
a b
0 x 1
1 y 2

*New Behaviour*:

.. ipython:: python

df.groupby("a").apply(func)


Concatenating Sparse Values
^^^^^^^^^^^^^^^^^^^^^^^^^^^

Expand All @@ -83,14 +127,14 @@ Series or DataFrame with sparse values, rather than a ``SparseDataFrame`` (:issu

df = pd.DataFrame({"A": pd.SparseArray([0, 1])})

*Previous Behavior:*
*Previous Behavior*:

.. code-block:: ipython

In [2]: type(pd.concat([df, df]))
pandas.core.sparse.frame.SparseDataFrame

*New Behavior:*
*New Behavior*:

.. ipython:: python

Expand Down
30 changes: 15 additions & 15 deletions pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -509,17 +509,6 @@ def apply_frame_axis0(object frame, object f, object names,

results = []

# Need to infer if our low-level mucking is going to cause a segfault
if n > 0:
chunk = frame.iloc[starts[0]:ends[0]]
object.__setattr__(chunk, 'name', names[0])
try:
result = f(chunk)
if result is chunk:
raise InvalidApply('Function unsafe for fast apply')
except:
raise InvalidApply('Let this error raise above us')

slider = BlockSlider(frame)

mutated = False
Expand All @@ -529,20 +518,31 @@ def apply_frame_axis0(object frame, object f, object names,
slider.move(starts[i], ends[i])

item_cache.clear() # ugh
chunk = slider.dummy
object.__setattr__(chunk, 'name', names[i])

object.__setattr__(slider.dummy, 'name', names[i])
piece = f(slider.dummy)
try:
piece = f(chunk)
except:
raise InvalidApply('Let this error raise above us')

# I'm paying the price for index-sharing, ugh
# Need to infer if low level index slider will cause segfaults
require_slow_apply = i == 0 and piece is chunk
try:
if piece.index is slider.dummy.index:
if piece.index is chunk.index:
piece = piece.copy(deep='all')
else:
mutated = True
except AttributeError:
pass

results.append(piece)

# If the data was modified inplace we need to
# take the slow path to not risk segfaults
# we have already computed the first piece
if require_slow_apply:
break
finally:
slider.reset()

Expand Down
34 changes: 25 additions & 9 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,26 +165,45 @@ def apply(self, f, data, axis=0):
mutated = self.mutated
splitter = self._get_splitter(data, axis=axis)
group_keys = self._get_group_keys()
result_values = None

# oh boy
f_name = com.get_callable_name(f)
if (f_name not in base.plotting_methods and
hasattr(splitter, 'fast_apply') and axis == 0):
try:
values, mutated = splitter.fast_apply(f, group_keys)
return group_keys, values, mutated
result_values, mutated = splitter.fast_apply(f, group_keys)

# If the fast apply path could be used we can return here.
# Otherwise we need to fall back to the slow implementation.
if len(result_values) == len(group_keys):
return group_keys, result_values, mutated

except reduction.InvalidApply:
# we detect a mutation of some kind
# so take slow path
# Cannot fast apply on MultiIndex (_has_complex_internals).
# This Exception is also raised if `f` triggers an exception
# but it is preferable to raise the exception in Python.
pass
except Exception:
# raise this error to the caller
pass

result_values = []
for key, (i, group) in zip(group_keys, splitter):
object.__setattr__(group, 'name', key)

# result_values is None if fast apply path wasn't taken
# or fast apply aborted with an unexpected exception.
# In either case, initialize the result list and perform
# the slow iteration.
if result_values is None:
result_values = []

# If result_values is not None we're in the case that the
# fast apply loop was broken prematurely but we have
# already the result for the first group which we can reuse.
elif i == 0:
continue

# group might be modified
group_axes = _get_axes(group)
res = f(group)
Expand Down Expand Up @@ -854,10 +873,7 @@ def fast_apply(self, f, names):
return [], True

sdata = self._get_sorted_data()
results, mutated = reduction.apply_frame_axis0(sdata, f, names,
starts, ends)

return results, mutated
return reduction.apply_frame_axis0(sdata, f, names, starts, ends)

def _chop(self, sdata, slice_obj):
if self.axis == 0:
Expand Down
71 changes: 71 additions & 0 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,80 @@ def f(g):
group_keys = grouper._get_group_keys()

values, mutated = splitter.fast_apply(f, group_keys)

assert not mutated


@pytest.mark.parametrize(
"df, group_names",
[
(DataFrame({"a": [1, 1, 1, 2, 3],
"b": ["a", "a", "a", "b", "c"]}),
[1, 2, 3]),
(DataFrame({"a": [0, 0, 1, 1],
"b": [0, 1, 0, 1]}),
[0, 1]),
(DataFrame({"a": [1]}),
[1]),
(DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2],
"b": range(8)}),
[1, 2]),
(DataFrame({"a": [1, 2, 3, 1, 2, 3],
"two": [4, 5, 6, 7, 8, 9]}),
[1, 2, 3]),
(DataFrame({"a": list("aaabbbcccc"),
"B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4],
"C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8]}),
["a", "b", "c"]),
(DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]),
[1, 2]),
], ids=['GH2936', 'GH7739 & GH10519', 'GH10519',
'GH2656', 'GH12155', 'GH20084', 'GH21417'])
def test_group_apply_once_per_group(df, group_names):
# GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417

# This test should ensure that a function is only evaluted
# once per group. Previously the function has been evaluated twice
# on the first group to check if the Cython index slider is safe to use
# This test ensures that the side effect (append to list) is only triggered
# once per group

names = []
# cannot parameterize over the functions since they need external
# `names` to detect side effects

def f_copy(group):
# this takes the fast apply path
names.append(group.name)
return group.copy()

def f_nocopy(group):
# this takes the slow apply path
names.append(group.name)
return group

def f_scalar(group):
# GH7739, GH2656
names.append(group.name)
return 0

def f_none(group):
# GH10519, GH12155, GH21417
names.append(group.name)
return None

def f_constant_df(group):
# GH2936, GH20084
names.append(group.name)
return DataFrame({"a": [1], "b": [1]})

for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]:
del names[:]

df.groupby("a").apply(func)
assert names == group_names


def test_apply_with_mixed_dtype():
# GH3480, apply with mixed dtype on axis=1 breaks in 0.11
df = DataFrame({'foo1': np.random.randn(6),
Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1381,11 +1381,9 @@ def test_group_name_available_in_inference_pass():
def f(group):
names.append(group.name)
return group.copy()

df.groupby('a', sort=False, group_keys=False).apply(f)
# we expect 2 zeros because we call ``f`` once to see if a faster route
# can be used.
expected_names = [0, 0, 1, 2]

expected_names = [0, 1, 2]
assert names == expected_names


Expand Down