From 41d930b9516f826d4203d7d907be912291bc2237 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 5 May 2018 09:59:28 -0400 Subject: [PATCH 1/3] BUG in .groupby.apply when applying a function that has mixed data types and the user supplied function can fail on the grouping column closes #20949 --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/groupby/groupby.py | 45 +++++++++++++++++++++--------- pandas/tests/groupby/test_apply.py | 13 +++++++++ 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index eb6c212731822..a77620fe6b36b 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1320,6 +1320,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.resample` that dropped timezone information (:issue:`13238`) - Bug in :func:`DataFrame.groupby` where transformations using ``np.all`` and ``np.any`` were raising a ``ValueError`` (:issue:`20653`) - Bug in :func:`DataFrame.resample` where ``ffill``, ``bfill``, ``pad``, ``backfill``, ``fillna``, ``interpolate``, and ``asfreq`` were ignoring ``loffset``. (:issue:`20744`) +- Bug in :func:`DataFrame.groupby` when applying a function that has mixed data types and the user supplied function can fail on the grouping column (:issue:`20949`) Sparse ^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 164d1bebd2929..671142a98a6b3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -696,8 +696,8 @@ def _reset_group_selection(self): each group regardless of whether a group selection was previously set. """ if self._group_selection is not None: - self._group_selection = None # GH12839 clear cached selection too when changing group selection + self._group_selection = None self._reset_cache('_selected_obj') def _set_group_selection(self): @@ -706,16 +706,20 @@ def _set_group_selection(self): directly but instead via a grouper. """ grp = self.grouper - if self.as_index and getattr(grp, 'groupings', None) is not None and \ - self.obj.ndim > 1: - ax = self.obj._info_axis - groupers = [g.name for g in grp.groupings - if g.level is None and g.in_axis] + if not (self.as_index and + getattr(grp, 'groupings', None) is not None and + self.obj.ndim > 1 and + self._group_selection is None): + return + + ax = self.obj._info_axis + groupers = [g.name for g in grp.groupings + if g.level is None and g.in_axis] - if len(groupers): - self._group_selection = ax.difference(Index(groupers)).tolist() - # GH12839 clear selected obj cache when group selection changes - self._reset_cache('_selected_obj') + if len(groupers): + # GH12839 clear selected obj cache when group selection changes + self._group_selection = ax.difference(Index(groupers)).tolist() + self._reset_cache('_selected_obj') def _set_result_index_ordered(self, result): # set the result index on the passed values object and @@ -897,7 +901,23 @@ def f(g): # ignore SettingWithCopy here in case the user mutates with option_context('mode.chained_assignment', None): - return self._python_apply_general(f) + try: + result = self._python_apply_general(f) + except Exception: + + # gh-20949 + # try again, with .apply acting as a filtering + # operation, by excluding the grouping column + # This would normally not be triggered + # except if the udf is trying an operation that + # fails on *some* columns, e.g. a numeric operation + # on a string grouper column + + self._set_group_selection() + result = self._python_apply_general(f) + self._reset_group_selection() + + return result def _python_apply_general(self, f): keys, values, mutated = self.grouper.apply(f, self._selected_obj, @@ -1453,7 +1473,6 @@ def ohlc(self): @Appender(DataFrame.describe.__doc__) def describe(self, **kwargs): - self._set_group_selection() result = self.apply(lambda x: x.describe(**kwargs)) if self.axis == 1: return result.T @@ -3768,7 +3787,6 @@ def nunique(self, dropna=True): @Appender(Series.describe.__doc__) def describe(self, **kwargs): - self._set_group_selection() result = self.apply(lambda x: x.describe(**kwargs)) if self.axis == 1: return result.T @@ -4411,6 +4429,7 @@ def transform(self, func, *args, **kwargs): return self._transform_general(func, *args, **kwargs) obj = self._obj_with_exclusions + # nuiscance columns if not result.columns.equals(obj.columns): return self._transform_general(func, *args, **kwargs) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5ca10fe1af9d1..ecb4e2af53042 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -515,3 +515,16 @@ def test_func(x): index=index2) tm.assert_frame_equal(result1, expected1) tm.assert_frame_equal(result2, expected2) + + +def test_apply_with_mixed_types(): + # gh-20949 + df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1,2,3], 'C': [4, 6, 5]}) + g = df.groupby('A') + + result = g.transform(lambda x: x / x.sum()) + expected = pd.DataFrame({'B': [1/3., 2/3., 1], 'C': [0.4, 0.6, 1.0]}) + tm.assert_frame_equal(result, expected) + + result = g.apply(lambda x: x / x.sum()) + tm.assert_frame_equal(result, expected) From 881c7e1c74d0513cfa4b4ab78a8c80408e544aca Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 5 May 2018 10:15:15 -0400 Subject: [PATCH 2/3] use a context manager --- pandas/core/groupby/groupby.py | 60 +++++++++++++++++------------- pandas/tests/groupby/test_apply.py | 4 +- 2 files changed, 37 insertions(+), 27 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 671142a98a6b3..08ab621f5e3ce 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -6,6 +6,7 @@ import warnings import copy from textwrap import dedent +from contextlib import contextmanager from pandas.compat import ( zip, range, lzip, @@ -549,6 +550,16 @@ def f(self): return attr +@contextmanager +def _group_selection_context(groupby): + """ + set / reset the _group_selection_context + """ + groupby._set_group_selection() + yield groupby + groupby._reset_group_selection() + + class _GroupBy(PandasObject, SelectionMixin): _group_selection = None _apply_whitelist = frozenset([]) @@ -704,6 +715,8 @@ def _set_group_selection(self): """ Create group based selection. Used when selection is not passed directly but instead via a grouper. + + NOTE: this should be paired with a call to _reset_group_selection """ grp = self.grouper if not (self.as_index and @@ -785,10 +798,10 @@ def _make_wrapper(self, name): type(self).__name__)) raise AttributeError(msg) - # need to setup the selection - # as are not passed directly but in the grouper self._set_group_selection() + # need to setup the selection + # as are not passed directly but in the grouper f = getattr(self._selected_obj, name) if not isinstance(f, types.MethodType): return self.apply(lambda self: getattr(self, name)) @@ -913,9 +926,8 @@ def f(g): # fails on *some* columns, e.g. a numeric operation # on a string grouper column - self._set_group_selection() - result = self._python_apply_general(f) - self._reset_group_selection() + with _group_selection_context(self): + return self._python_apply_general(f) return result @@ -1295,9 +1307,9 @@ def mean(self, *args, **kwargs): except GroupByError: raise except Exception: # pragma: no cover - self._set_group_selection() - f = lambda x: x.mean(axis=self.axis, **kwargs) - return self._python_agg_general(f) + with _group_selection_context(self): + f = lambda x: x.mean(axis=self.axis, **kwargs) + return self._python_agg_general(f) @Substitution(name='groupby') @Appender(_doc_template) @@ -1313,13 +1325,12 @@ def median(self, **kwargs): raise except Exception: # pragma: no cover - self._set_group_selection() - def f(x): if isinstance(x, np.ndarray): x = Series(x) return x.median(axis=self.axis, **kwargs) - return self._python_agg_general(f) + with _group_selection_context(self): + return self._python_agg_general(f) @Substitution(name='groupby') @Appender(_doc_template) @@ -1356,9 +1367,9 @@ def var(self, ddof=1, *args, **kwargs): if ddof == 1: return self._cython_agg_general('var', **kwargs) else: - self._set_group_selection() f = lambda x: x.var(ddof=ddof, **kwargs) - return self._python_agg_general(f) + with _group_selection_context(self): + return self._python_agg_general(f) @Substitution(name='groupby') @Appender(_doc_template) @@ -1404,6 +1415,7 @@ def f(self, **kwargs): kwargs['numeric_only'] = numeric_only if 'min_count' not in kwargs: kwargs['min_count'] = min_count + self._set_group_selection() try: return self._cython_agg_general( @@ -1797,13 +1809,12 @@ def ngroup(self, ascending=True): .cumcount : Number the rows in each group. """ - self._set_group_selection() - - index = self._selected_obj.index - result = Series(self.grouper.group_info[0], index) - if not ascending: - result = self.ngroups - 1 - result - return result + with _group_selection_context(self): + index = self._selected_obj.index + result = Series(self.grouper.group_info[0], index) + if not ascending: + result = self.ngroups - 1 - result + return result @Substitution(name='groupby') def cumcount(self, ascending=True): @@ -1854,11 +1865,10 @@ def cumcount(self, ascending=True): .ngroup : Number the groups themselves. """ - self._set_group_selection() - - index = self._selected_obj.index - cumcounts = self._cumcount_array(ascending=ascending) - return Series(cumcounts, index) + with _group_selection_context(self): + index = self._selected_obj.index + cumcounts = self._cumcount_array(ascending=ascending) + return Series(cumcounts, index) @Substitution(name='groupby') @Appender(_doc_template) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index ecb4e2af53042..07eef2d87feb3 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -519,11 +519,11 @@ def test_func(x): def test_apply_with_mixed_types(): # gh-20949 - df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1,2,3], 'C': [4, 6, 5]}) + df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1, 2, 3], 'C': [4, 6, 5]}) g = df.groupby('A') result = g.transform(lambda x: x / x.sum()) - expected = pd.DataFrame({'B': [1/3., 2/3., 1], 'C': [0.4, 0.6, 1.0]}) + expected = pd.DataFrame({'B': [1 / 3., 2 / 3., 1], 'C': [0.4, 0.6, 1.0]}) tm.assert_frame_equal(result, expected) result = g.apply(lambda x: x / x.sum()) From a67f4d02a99ebcc4e54eec506e0239e9ced9c38e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 7 May 2018 06:03:48 -0400 Subject: [PATCH 3/3] typo --- pandas/core/groupby/groupby.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 08ab621f5e3ce..df7a5dc9dc173 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1485,10 +1485,11 @@ def ohlc(self): @Appender(DataFrame.describe.__doc__) def describe(self, **kwargs): - result = self.apply(lambda x: x.describe(**kwargs)) - if self.axis == 1: - return result.T - return result.unstack() + with _group_selection_context(self): + result = self.apply(lambda x: x.describe(**kwargs)) + if self.axis == 1: + return result.T + return result.unstack() @Substitution(name='groupby') @Appender(_doc_template)