From 41d930b9516f826d4203d7d907be912291bc2237 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Sat, 5 May 2018 09:59:28 -0400
Subject: [PATCH 1/3] BUG in .groupby.apply when applying a function that has
 mixed data types and the user supplied function can fail on the grouping
 column

closes #20949
---
 doc/source/whatsnew/v0.23.0.txt    |  1 +
 pandas/core/groupby/groupby.py     | 45 +++++++++++++++++++++---------
 pandas/tests/groupby/test_apply.py | 13 +++++++++
 3 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
index eb6c212731822..a77620fe6b36b 100644
--- a/doc/source/whatsnew/v0.23.0.txt
+++ b/doc/source/whatsnew/v0.23.0.txt
@@ -1320,6 +1320,7 @@ Groupby/Resample/Rolling
 - Bug in :func:`DataFrame.resample` that dropped timezone information (:issue:`13238`)
 - Bug in :func:`DataFrame.groupby` where transformations using ``np.all`` and ``np.any`` were raising a ``ValueError`` (:issue:`20653`)
 - Bug in :func:`DataFrame.resample` where ``ffill``, ``bfill``, ``pad``, ``backfill``, ``fillna``, ``interpolate``, and ``asfreq`` were ignoring ``loffset``. (:issue:`20744`)
+- Bug in :func:`DataFrame.groupby` when applying a function that has mixed data types and the user supplied function can fail on the grouping column (:issue:`20949`)
 
 Sparse
 ^^^^^^
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 164d1bebd2929..671142a98a6b3 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -696,8 +696,8 @@ def _reset_group_selection(self):
         each group regardless of whether a group selection was previously set.
         """
         if self._group_selection is not None:
-            self._group_selection = None
             # GH12839 clear cached selection too when changing group selection
+            self._group_selection = None
             self._reset_cache('_selected_obj')
 
     def _set_group_selection(self):
@@ -706,16 +706,20 @@ def _set_group_selection(self):
         directly but instead via a grouper.
         """
         grp = self.grouper
-        if self.as_index and getattr(grp, 'groupings', None) is not None and \
-           self.obj.ndim > 1:
-            ax = self.obj._info_axis
-            groupers = [g.name for g in grp.groupings
-                        if g.level is None and g.in_axis]
+        if not (self.as_index and
+                getattr(grp, 'groupings', None) is not None and
+                self.obj.ndim > 1 and
+                self._group_selection is None):
+            return
+
+        ax = self.obj._info_axis
+        groupers = [g.name for g in grp.groupings
+                    if g.level is None and g.in_axis]
 
-            if len(groupers):
-                self._group_selection = ax.difference(Index(groupers)).tolist()
-                # GH12839 clear selected obj cache when group selection changes
-                self._reset_cache('_selected_obj')
+        if len(groupers):
+            # GH12839 clear selected obj cache when group selection changes
+            self._group_selection = ax.difference(Index(groupers)).tolist()
+            self._reset_cache('_selected_obj')
 
     def _set_result_index_ordered(self, result):
         # set the result index on the passed values object and
@@ -897,7 +901,23 @@ def f(g):
 
         # ignore SettingWithCopy here in case the user mutates
         with option_context('mode.chained_assignment', None):
-            return self._python_apply_general(f)
+            try:
+                result = self._python_apply_general(f)
+            except Exception:
+
+                # gh-20949
+                # try again, with .apply acting as a filtering
+                # operation, by excluding the grouping column
+                # This would normally not be triggered
+                # except if the udf is trying an operation that
+                # fails on *some* columns, e.g. a numeric operation
+                # on a string grouper column
+
+                self._set_group_selection()
+                result = self._python_apply_general(f)
+                self._reset_group_selection()
+
+        return result
 
     def _python_apply_general(self, f):
         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
@@ -1453,7 +1473,6 @@ def ohlc(self):
 
     @Appender(DataFrame.describe.__doc__)
     def describe(self, **kwargs):
-        self._set_group_selection()
         result = self.apply(lambda x: x.describe(**kwargs))
         if self.axis == 1:
             return result.T
@@ -3768,7 +3787,6 @@ def nunique(self, dropna=True):
 
     @Appender(Series.describe.__doc__)
     def describe(self, **kwargs):
-        self._set_group_selection()
         result = self.apply(lambda x: x.describe(**kwargs))
         if self.axis == 1:
             return result.T
@@ -4411,6 +4429,7 @@ def transform(self, func, *args, **kwargs):
             return self._transform_general(func, *args, **kwargs)
 
         obj = self._obj_with_exclusions
+
         # nuiscance columns
         if not result.columns.equals(obj.columns):
             return self._transform_general(func, *args, **kwargs)
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 5ca10fe1af9d1..ecb4e2af53042 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -515,3 +515,16 @@ def test_func(x):
                           index=index2)
     tm.assert_frame_equal(result1, expected1)
     tm.assert_frame_equal(result2, expected2)
+
+
+def test_apply_with_mixed_types():
+    # gh-20949
+    df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1,2,3], 'C': [4, 6, 5]})
+    g = df.groupby('A')
+
+    result = g.transform(lambda x: x / x.sum())
+    expected = pd.DataFrame({'B': [1/3., 2/3., 1], 'C': [0.4, 0.6, 1.0]})
+    tm.assert_frame_equal(result, expected)
+
+    result = g.apply(lambda x: x / x.sum())
+    tm.assert_frame_equal(result, expected)

From 881c7e1c74d0513cfa4b4ab78a8c80408e544aca Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Sat, 5 May 2018 10:15:15 -0400
Subject: [PATCH 2/3] use a context manager

---
 pandas/core/groupby/groupby.py     | 60 +++++++++++++++++-------------
 pandas/tests/groupby/test_apply.py |  4 +-
 2 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 671142a98a6b3..08ab621f5e3ce 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -6,6 +6,7 @@
 import warnings
 import copy
 from textwrap import dedent
+from contextlib import contextmanager
 
 from pandas.compat import (
     zip, range, lzip,
@@ -549,6 +550,16 @@ def f(self):
         return attr
 
 
+@contextmanager
+def _group_selection_context(groupby):
+    """
+    set / reset the _group_selection_context
+    """
+    groupby._set_group_selection()
+    yield groupby
+    groupby._reset_group_selection()
+
+
 class _GroupBy(PandasObject, SelectionMixin):
     _group_selection = None
     _apply_whitelist = frozenset([])
@@ -704,6 +715,8 @@ def _set_group_selection(self):
         """
         Create group based selection. Used when selection is not passed
         directly but instead via a grouper.
+
+        NOTE: this should be paired with a call to _reset_group_selection
         """
         grp = self.grouper
         if not (self.as_index and
@@ -785,10 +798,10 @@ def _make_wrapper(self, name):
                                                      type(self).__name__))
             raise AttributeError(msg)
 
-        # need to setup the selection
-        # as are not passed directly but in the grouper
         self._set_group_selection()
 
+        # need to setup the selection
+        # as are not passed directly but in the grouper
         f = getattr(self._selected_obj, name)
         if not isinstance(f, types.MethodType):
             return self.apply(lambda self: getattr(self, name))
@@ -913,9 +926,8 @@ def f(g):
                 # fails on *some* columns, e.g. a numeric operation
                 # on a string grouper column
 
-                self._set_group_selection()
-                result = self._python_apply_general(f)
-                self._reset_group_selection()
+                with _group_selection_context(self):
+                    return self._python_apply_general(f)
 
         return result
 
@@ -1295,9 +1307,9 @@ def mean(self, *args, **kwargs):
         except GroupByError:
             raise
         except Exception:  # pragma: no cover
-            self._set_group_selection()
-            f = lambda x: x.mean(axis=self.axis, **kwargs)
-            return self._python_agg_general(f)
+            with _group_selection_context(self):
+                f = lambda x: x.mean(axis=self.axis, **kwargs)
+                return self._python_agg_general(f)
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
@@ -1313,13 +1325,12 @@ def median(self, **kwargs):
             raise
         except Exception:  # pragma: no cover
 
-            self._set_group_selection()
-
             def f(x):
                 if isinstance(x, np.ndarray):
                     x = Series(x)
                 return x.median(axis=self.axis, **kwargs)
-            return self._python_agg_general(f)
+            with _group_selection_context(self):
+                return self._python_agg_general(f)
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
@@ -1356,9 +1367,9 @@ def var(self, ddof=1, *args, **kwargs):
         if ddof == 1:
             return self._cython_agg_general('var', **kwargs)
         else:
-            self._set_group_selection()
             f = lambda x: x.var(ddof=ddof, **kwargs)
-            return self._python_agg_general(f)
+            with _group_selection_context(self):
+                return self._python_agg_general(f)
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
@@ -1404,6 +1415,7 @@ def f(self, **kwargs):
                     kwargs['numeric_only'] = numeric_only
                 if 'min_count' not in kwargs:
                     kwargs['min_count'] = min_count
+
                 self._set_group_selection()
                 try:
                     return self._cython_agg_general(
@@ -1797,13 +1809,12 @@ def ngroup(self, ascending=True):
         .cumcount : Number the rows in each group.
         """
 
-        self._set_group_selection()
-
-        index = self._selected_obj.index
-        result = Series(self.grouper.group_info[0], index)
-        if not ascending:
-            result = self.ngroups - 1 - result
-        return result
+        with _group_selection_context(self):
+            index = self._selected_obj.index
+            result = Series(self.grouper.group_info[0], index)
+            if not ascending:
+                result = self.ngroups - 1 - result
+            return result
 
     @Substitution(name='groupby')
     def cumcount(self, ascending=True):
@@ -1854,11 +1865,10 @@ def cumcount(self, ascending=True):
         .ngroup : Number the groups themselves.
         """
 
-        self._set_group_selection()
-
-        index = self._selected_obj.index
-        cumcounts = self._cumcount_array(ascending=ascending)
-        return Series(cumcounts, index)
+        with _group_selection_context(self):
+            index = self._selected_obj.index
+            cumcounts = self._cumcount_array(ascending=ascending)
+            return Series(cumcounts, index)
 
     @Substitution(name='groupby')
     @Appender(_doc_template)
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index ecb4e2af53042..07eef2d87feb3 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -519,11 +519,11 @@ def test_func(x):
 
 def test_apply_with_mixed_types():
     # gh-20949
-    df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1,2,3], 'C': [4, 6, 5]})
+    df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1, 2, 3], 'C': [4, 6, 5]})
     g = df.groupby('A')
 
     result = g.transform(lambda x: x / x.sum())
-    expected = pd.DataFrame({'B': [1/3., 2/3., 1], 'C': [0.4, 0.6, 1.0]})
+    expected = pd.DataFrame({'B': [1 / 3., 2 / 3., 1], 'C': [0.4, 0.6, 1.0]})
     tm.assert_frame_equal(result, expected)
 
     result = g.apply(lambda x: x / x.sum())

From a67f4d02a99ebcc4e54eec506e0239e9ced9c38e Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Mon, 7 May 2018 06:03:48 -0400
Subject: [PATCH 3/3] typo

---
 pandas/core/groupby/groupby.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 08ab621f5e3ce..df7a5dc9dc173 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1485,10 +1485,11 @@ def ohlc(self):
 
     @Appender(DataFrame.describe.__doc__)
     def describe(self, **kwargs):
-        result = self.apply(lambda x: x.describe(**kwargs))
-        if self.axis == 1:
-            return result.T
-        return result.unstack()
+        with _group_selection_context(self):
+            result = self.apply(lambda x: x.describe(**kwargs))
+            if self.axis == 1:
+                return result.T
+            return result.unstack()
 
     @Substitution(name='groupby')
     @Appender(_doc_template)