Skip to content

Commit 41d930b

Browse files
committed
BUG: fix .groupby.apply on data with mixed data types, where the user-supplied function can fail on the grouping column
closes pandas-dev#20949
1 parent bd4332f commit 41d930b

File tree

3 files changed

+46
-13
lines changed

3 files changed

+46
-13
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1320,6 +1320,7 @@ Groupby/Resample/Rolling
13201320
- Bug in :func:`DataFrame.resample` that dropped timezone information (:issue:`13238`)
13211321
- Bug in :func:`DataFrame.groupby` where transformations using ``np.all`` and ``np.any`` were raising a ``ValueError`` (:issue:`20653`)
13221322
- Bug in :func:`DataFrame.resample` where ``ffill``, ``bfill``, ``pad``, ``backfill``, ``fillna``, ``interpolate``, and ``asfreq`` were ignoring ``loffset``. (:issue:`20744`)
1323+
- Bug in :func:`DataFrame.groupby` when applying a function to data with mixed data types, where the user-supplied function could fail on the grouping column (:issue:`20949`)
13231324

13241325
Sparse
13251326
^^^^^^

pandas/core/groupby/groupby.py

+32-13
Original file line numberDiff line numberDiff line change
@@ -696,8 +696,8 @@ def _reset_group_selection(self):
696696
each group regardless of whether a group selection was previously set.
697697
"""
698698
if self._group_selection is not None:
699-
self._group_selection = None
700699
# GH12839 clear cached selection too when changing group selection
700+
self._group_selection = None
701701
self._reset_cache('_selected_obj')
702702

703703
def _set_group_selection(self):
@@ -706,16 +706,20 @@ def _set_group_selection(self):
706706
directly but instead via a grouper.
707707
"""
708708
grp = self.grouper
709-
if self.as_index and getattr(grp, 'groupings', None) is not None and \
710-
self.obj.ndim > 1:
711-
ax = self.obj._info_axis
712-
groupers = [g.name for g in grp.groupings
713-
if g.level is None and g.in_axis]
709+
if not (self.as_index and
710+
getattr(grp, 'groupings', None) is not None and
711+
self.obj.ndim > 1 and
712+
self._group_selection is None):
713+
return
714+
715+
ax = self.obj._info_axis
716+
groupers = [g.name for g in grp.groupings
717+
if g.level is None and g.in_axis]
714718

715-
if len(groupers):
716-
self._group_selection = ax.difference(Index(groupers)).tolist()
717-
# GH12839 clear selected obj cache when group selection changes
718-
self._reset_cache('_selected_obj')
719+
if len(groupers):
720+
# GH12839 clear selected obj cache when group selection changes
721+
self._group_selection = ax.difference(Index(groupers)).tolist()
722+
self._reset_cache('_selected_obj')
719723

720724
def _set_result_index_ordered(self, result):
721725
# set the result index on the passed values object and
@@ -897,7 +901,23 @@ def f(g):
897901

898902
# ignore SettingWithCopy here in case the user mutates
899903
with option_context('mode.chained_assignment', None):
900-
return self._python_apply_general(f)
904+
try:
905+
result = self._python_apply_general(f)
906+
except Exception:
907+
908+
# gh-20949
909+
# try again, with .apply acting as a filtering
910+
# operation, by excluding the grouping column
911+
# This would normally not be triggered
912+
# except if the udf is trying an operation that
913+
# fails on *some* columns, e.g. a numeric operation
914+
# on a string grouper column
915+
916+
self._set_group_selection()
917+
result = self._python_apply_general(f)
918+
self._reset_group_selection()
919+
920+
return result
901921

902922
def _python_apply_general(self, f):
903923
keys, values, mutated = self.grouper.apply(f, self._selected_obj,
@@ -1453,7 +1473,6 @@ def ohlc(self):
14531473

14541474
@Appender(DataFrame.describe.__doc__)
14551475
def describe(self, **kwargs):
1456-
self._set_group_selection()
14571476
result = self.apply(lambda x: x.describe(**kwargs))
14581477
if self.axis == 1:
14591478
return result.T
@@ -3768,7 +3787,6 @@ def nunique(self, dropna=True):
37683787

37693788
@Appender(Series.describe.__doc__)
37703789
def describe(self, **kwargs):
3771-
self._set_group_selection()
37723790
result = self.apply(lambda x: x.describe(**kwargs))
37733791
if self.axis == 1:
37743792
return result.T
@@ -4411,6 +4429,7 @@ def transform(self, func, *args, **kwargs):
44114429
return self._transform_general(func, *args, **kwargs)
44124430

44134431
obj = self._obj_with_exclusions
4432+
44144433
# nuisance columns
44154434
if not result.columns.equals(obj.columns):
44164435
return self._transform_general(func, *args, **kwargs)

pandas/tests/groupby/test_apply.py

+13
Original file line numberDiff line numberDiff line change
@@ -515,3 +515,16 @@ def test_func(x):
515515
index=index2)
516516
tm.assert_frame_equal(result1, expected1)
517517
tm.assert_frame_equal(result2, expected2)
518+
519+
520+
def test_apply_with_mixed_types():
521+
# gh-20949
522+
df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1,2,3], 'C': [4, 6, 5]})
523+
g = df.groupby('A')
524+
525+
result = g.transform(lambda x: x / x.sum())
526+
expected = pd.DataFrame({'B': [1/3., 2/3., 1], 'C': [0.4, 0.6, 1.0]})
527+
tm.assert_frame_equal(result, expected)
528+
529+
result = g.apply(lambda x: x / x.sum())
530+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)