pandas-dev · christopherzimmerman · Oct 21, 2019 · Oct 21, 2019 · Oct 21, 2019 · Oct 23, 2019
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -195,6 +195,70 @@ New repr for :class:`pandas.core.arrays.IntervalArray`
    pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)])
 
 
+.. _whatsnew_1000.api_breaking.GroupBy.apply:
+
+``GroupBy.apply`` behaves consistently with ``as_index``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Previously, the result of :meth:`GroupBy.apply` sometimes contained the grouper column(s)
+  in both the index and the columns of the resulting ``DataFrame``. :meth:`GroupBy.apply`
+  now respects the ``as_index`` parameter, and only returns the grouper column(s) in
+  the result if ``as_index`` is set to ``False``. Other methods such as :meth:`GroupBy.resample`
+  exhibited similar behavior and now also respect the ``as_index`` parameter (:issue:`28549`).
+
+*Previous Behavior*
+
+.. code-block:: ipython
+
+    In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]})
+
+    In [2]: df.groupby("a").apply(lambda x: x.sum())
+    Out[2]:
+       a   b
+    a
+    1  2   3
+    2  4   7
+    3  6  11
+
+    In [3]: df.groupby("a").apply(lambda x: x.iloc[0])
+    Out[3]:
+       a  b
+    a
+    1  1  1
+    2  2  3
+    3  3  5
+
+    In [4]: idx = pd.date_range('1/1/2000', periods=4, freq='T')
+
+    In [5]: df = pd.DataFrame(data=4 * [range(2)],
+       ...:                   index=idx,
+       ...:                   columns=['a', 'b'])
+
+    In [6]: df.iloc[2, 0] = 5
+
+    In [7]: df.groupby('a').resample('M').sum()
+    Out[7]:
+                  a  b
+    a
+    0 2000-01-31  0  3
+    5 2000-01-31  5  1
+
+
+*Current Behavior*
+
+.. ipython:: python
+
+    df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]})
+    df.groupby("a").apply(lambda x: x.sum())
+    df.groupby("a").apply(lambda x: x.iloc[0])
+    idx = pd.date_range('1/1/2000', periods=4, freq='T')
+    df = pd.DataFrame(data=4 * [range(2)],
+                      index=idx,
+                      columns=['a', 'b'])
+    df.iloc[2, 0] = 5
+    df.groupby('a').resample('M').sum()
+
+
 All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -722,18 +722,17 @@ def f(g):
 
         # ignore SettingWithCopy here in case the user mutates
         with option_context("mode.chained_assignment", None):
-            try:
-                result = self._python_apply_general(f)
-            except TypeError:
-                # gh-20949
-                # try again, with .apply acting as a filtering
-                # operation, by excluding the grouping column
-                # This would normally not be triggered
-                # except if the udf is trying an operation that
-                # fails on *some* columns, e.g. a numeric operation
-                # on a string grouper column
-
-                with _group_selection_context(self):
+            with _group_selection_context(self):
+                try:
+                    result = self._python_apply_general(f)
+                except TypeError:
+                    # gh-20949
+                    # try again, with .apply acting as a filtering
+                    # operation, by excluding the grouping column
+                    # This would normally not be triggered
+                    # except if the udf is trying an operation that
+                    # fails on *some* columns, e.g. a numeric operation
+                    # on a string grouper column
                     return self._python_apply_general(f)
 
         return result
@@ -1480,7 +1479,7 @@ def resample(self, rule, *args, **kwargs):
         ...                   columns=['a', 'b'])
         >>> df.iloc[2, 0] = 5
         >>> df
-                            a  b
+                             a  b
         2000-01-01 00:00:00  0  1
         2000-01-01 00:01:00  0  1
         2000-01-01 00:02:00  5  1
@@ -1490,63 +1489,63 @@ def resample(self, rule, *args, **kwargs):
         the timestamps falling into a bin.
 
         >>> df.groupby('a').resample('3T').sum()
-                                 a  b
+                                 b
         a
-        0   2000-01-01 00:00:00  0  2
-            2000-01-01 00:03:00  0  1
-        5   2000-01-01 00:00:00  5  1
+        0   2000-01-01 00:00:00  2
+            2000-01-01 00:03:00  1
+        5   2000-01-01 00:00:00  1
 
         Upsample the series into 30 second bins.
 
         >>> df.groupby('a').resample('30S').sum()
-                            a  b
+                                 b
         a
-        0   2000-01-01 00:00:00  0  1
-            2000-01-01 00:00:30  0  0
-            2000-01-01 00:01:00  0  1
-            2000-01-01 00:01:30  0  0
-            2000-01-01 00:02:00  0  0
-            2000-01-01 00:02:30  0  0
-            2000-01-01 00:03:00  0  1
-        5   2000-01-01 00:02:00  5  1
+        0   2000-01-01 00:00:00  1
+            2000-01-01 00:00:30  0
+            2000-01-01 00:01:00  1
+            2000-01-01 00:01:30  0
+            2000-01-01 00:02:00  0
+            2000-01-01 00:02:30  0
+            2000-01-01 00:03:00  1
+        5   2000-01-01 00:02:00  1
 
         Resample by month. Values are assigned to the month of the period.
 
         >>> df.groupby('a').resample('M').sum()
-                    a  b
+                        b
         a
-        0   2000-01-31  0  3
-        5   2000-01-31  5  1
+        0   2000-01-31  3
+        5   2000-01-31  1
 
         Downsample the series into 3 minute bins as above, but close the right
         side of the bin interval.
 
         >>> df.groupby('a').resample('3T', closed='right').sum()
-                                 a  b
+                                 b
         a
-        0   1999-12-31 23:57:00  0  1
-            2000-01-01 00:00:00  0  2
-        5   2000-01-01 00:00:00  5  1
+        0   1999-12-31 23:57:00  1
+            2000-01-01 00:00:00  2
+        5   2000-01-01 00:00:00  1
 
         Downsample the series into 3 minute bins and close the right side of
         the bin interval, but label each bin using the right edge instead of
         the left.
 
         >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
-                                 a  b
+                                 b
         a
-        0   2000-01-01 00:00:00  0  1
-            2000-01-01 00:03:00  0  2
-        5   2000-01-01 00:03:00  5  1
+        0   2000-01-01 00:00:00  1
+            2000-01-01 00:03:00  2
+        5   2000-01-01 00:03:00  1
 
         Add an offset of twenty seconds.
 
         >>> df.groupby('a').resample('3T', loffset='20s').sum()
-                               a  b
+                                 b
         a
-        0   2000-01-01 00:00:20  0  2
-            2000-01-01 00:03:20  0  1
-        5   2000-01-01 00:00:20  5  1
+        0   2000-01-01 00:00:20  2
+            2000-01-01 00:03:20  1
+        5   2000-01-01 00:00:20  1
         """
         from pandas.core.resample import get_resampler_for_grouping
 

diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py
@@ -112,6 +112,20 @@ def reduction_func(request):
     return request.param
 
 
+@pytest.fixture(params=[True, False])
+def as_index(request):
+    """Yield each option for the `as_index` parameter (True, False), one per test.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def group_keys(request):
+    """Yield each option for the `group_keys` parameter (True, False), one per test.
+    """
+    return request.param
+
+
 @pytest.fixture(params=transformation_kernels)
 def transformation_func(request):
     """yields the string names of all groupby transformation functions."""

diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -363,12 +363,33 @@ def f(group):
         tm.assert_frame_equal(result.loc[key], f(group))
 
 
-def test_apply_chunk_view():
+@pytest.mark.parametrize(
+    "as_index, group_keys, expected_index",
+    [
+        (
+            True,
+            True,
+            MultiIndex.from_tuples(
+                [(1, 0), (1, 1), (2, 2), (2, 3)], names=["key", None]
+            ),
+        ),
+        (True, False, [0, 1, 2, 3]),
+        (False, True, MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2), (1, 3)])),
+        (False, False, [0, 1, 2, 3]),
+    ],
+)
+def test_apply_chunk_view(as_index, group_keys, expected_index):
     # Low level tinkering could be unsafe, make sure not
-    df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
+    df = DataFrame({"key": [1, 1, 2, 2], "value": range(4)})
+
+    result = df.groupby("key", as_index=as_index, group_keys=group_keys).apply(
+        lambda x: x[:2]
+    )
+    if as_index:
+        df.pop("key")
 
-    result = df.groupby("key", group_keys=False).apply(lambda x: x[:2])
-    expected = df.take([0, 1, 3, 4, 6, 7])
+    expected = df.copy()
+    expected.index = expected_index
     tm.assert_frame_equal(result, expected)
 
 
@@ -386,7 +407,7 @@ def test_apply_no_name_column_conflict():
     grouped.apply(lambda x: x.sort_values("value", inplace=True))
 
 
-def test_apply_typecast_fail():
+def test_apply_typecast_fail(as_index):
     df = DataFrame(
         {
             "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
@@ -400,7 +421,12 @@ def f(group):
         group["v2"] = (v - v.min()) / (v.max() - v.min())
         return group
 
-    result = df.groupby("d").apply(f)
+    result = df.groupby("d", as_index=as_index).apply(f)
+
+    # GH 28549
+    # key no longer included in reduction output
+    if as_index:
+        df.pop("d")
 
     expected = df.copy()
     expected["v2"] = np.tile([0.0, 0.5, 1], 2)
@@ -426,6 +452,10 @@ def f(group):
 
     result = df.groupby("d").apply(f)
 
+    # GH 28549
+    # key no longer included in reduction output
+    df.pop("d")
+
     expected = df.copy()
     expected["v2"] = np.tile([0.0, 0.5, 1], 2)
 
@@ -638,24 +668,36 @@ def test_func(x):
     tm.assert_frame_equal(result, expected)
 
 
-def test_groupby_apply_none_first():
+@pytest.mark.parametrize(
+    "groups, vars_, expected_vars, expected_groups",
+    [
+        ([1, 1, 1, 2], [0, 1, 2, 3], [0, 2], [1, 1]),
+        ([1, 2, 2, 2], [0, 1, 2, 3], [1, 3], [2, 2]),
+    ],
+)
+def test_groupby_apply_none_first(
+    groups, vars_, expected_vars, expected_groups, as_index
+):
     # GH 12824. Tests if apply returns None first.
-    test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]})
-    test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]})
+    test_df = DataFrame({"groups": groups, "vars": vars_})
 
     def test_func(x):
         if x.shape[0] < 2:
             return None
         return x.iloc[[0, -1]]
 
-    result1 = test_df1.groupby("groups").apply(test_func)
-    result2 = test_df2.groupby("groups").apply(test_func)
-    index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None])
-    index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None])
-    expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1)
-    expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2)
-    tm.assert_frame_equal(result1, expected1)
-    tm.assert_frame_equal(result2, expected2)
+    result = test_df.groupby("groups", as_index=as_index).apply(test_func)
+
+    # GH 28549 "groups" should not be in output of apply
+    # unless as_index=False
+    if not as_index:
+        expected = DataFrame(
+            {"groups": expected_groups, "vars": expected_vars}, index=result.index
+        )
+    else:
+        expected = DataFrame({"vars": expected_vars}, index=result.index)
+
+    tm.assert_frame_equal(result, expected)
 
 
 def test_groupby_apply_return_empty_chunk():

diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -94,9 +94,16 @@ def f(x):
         return x.drop_duplicates("person_name").iloc[0]
 
     result = g.apply(f)
-    expected = x.iloc[[0, 1]].copy()
+
+    # GH 28549
+    # grouper key should not be present after apply
+    # with as_index=True.
+    # TODO split this into multiple tests
+    dropped = x.drop("person_id", 1)
+
+    expected = dropped.iloc[[0, 1]].copy()
     expected.index = Index([1, 2], name="person_id")
-    expected["person_name"] = expected["person_name"].astype("object")
+    expected["person_name"] = expected["person_name"]
     tm.assert_frame_equal(result, expected)
 
     # GH 9921
@@ -1247,6 +1254,16 @@ def test_get_nonexistent_category():
     # Accessing a Category that is not in the dataframe
     df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)})
     with pytest.raises(KeyError, match="'vau'"):
+        df.groupby("var").apply(
+            lambda rows: pd.DataFrame({"val": [rows.iloc[-1]["vau"]]})
+        )
+
+
+def test_category_as_grouper_keys(as_index):
+    # Accessing a key that is not in the dataframe
+    df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)})
+    bad_key = "'var'" if as_index else "'vau'"
+    with pytest.raises(KeyError, match=bad_key):
         df.groupby("var").apply(
             lambda rows: pd.DataFrame(
                 {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]}