-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
[BUG] Fixed behavior of DataFrameGroupBy.apply to respect _group_selection_context #29131
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
78de38c
63677e7
1d99d9c
98bc673
fbf3202
947a5bd
8c3efb0
fa21e29
a0a9aa5
7070169
76815f1
6c49a16
8a4c1f8
ccf940d
b7d056d
cfacfc1
c384c09
91d1931
83be029
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -174,6 +174,43 @@ Backwards incompatible API changes | |
|
||
pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) | ||
|
||
.. _whatsnew_1000.api_breaking.GroupBy.apply: | ||
|
||
``GroupBy.apply`` behaves consistently with `as_index` | ||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
|
||
- The result of :meth:`GroupBy.apply` sometimes contained the grouper column(s), | ||
WillAyd marked this conversation as resolved.
Show resolved
Hide resolved
|
||
in both the index, and in the `DataFrame`. From pandas 1.0, :meth:`GroupBy.apply` | ||
christopherzimmerman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
will respect the `as_index` parameter, and only return the grouper column(s) in | ||
christopherzimmerman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
the result if `as_index` is set to `False`. | ||
|
||
*pandas 0.25.x* | ||
christopherzimmerman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
.. code-block:: ipython | ||
|
||
In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. show df here |
||
In [2]: df.groupby("a").apply(lambda x: x.sum()) | ||
Out[2]: | ||
a b | ||
a | ||
1 2 3 | ||
2 4 7 | ||
3 6 11 | ||
|
||
*pandas 1.0.0* | ||
|
||
.. code-block:: ipython | ||
|
||
In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]}) | ||
|
||
In [2]: df.groupby("a").apply(lambda x: x.sum()) | ||
Out[2]: | ||
b | ||
a | ||
1 3 | ||
2 7 | ||
3 11 | ||
|
||
.. _whatsnew_1000.api.other: | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -95,9 +95,16 @@ def f(x): | |
return x.drop_duplicates("person_name").iloc[0] | ||
|
||
result = g.apply(f) | ||
expected = x.iloc[[0, 1]].copy() | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so if the tests change a lot like this, make a new test |
||
# GH 28549 | ||
# grouper key should not be present after apply | ||
# with as_index=True. | ||
# TODO split this into multiple tests | ||
dropped = x.drop("person_id", 1) | ||
|
||
expected = dropped.iloc[[0, 1]].copy() | ||
expected.index = Index([1, 2], name="person_id") | ||
expected["person_name"] = expected["person_name"].astype("object") | ||
expected["person_name"] = expected["person_name"] | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# GH 9921 | ||
|
@@ -1218,7 +1225,10 @@ def test_get_nonexistent_category(): | |
# Accessing a Category that is not in the dataframe | ||
df = pd.DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)}) | ||
with pytest.raises(KeyError, match="'vau'"): | ||
df.groupby("var").apply( | ||
# GH 28549 This needs to use as_index=False so that | ||
christopherzimmerman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# var is still present when grouping or else another key error | ||
# will raise about var. | ||
df.groupby("var", as_index=False).apply( | ||
lambda rows: pd.DataFrame( | ||
{"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]} | ||
) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -87,10 +87,6 @@ def test_intercept_builtin_sum(): | |
tm.assert_series_equal(result2, expected) | ||
|
||
|
||
# @pytest.mark.parametrize("f", [max, min, sum]) | ||
# def test_builtins_apply(f): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is the kind of thing that can be done in a separate PR |
||
|
||
|
||
@pytest.mark.parametrize("f", [max, min, sum]) | ||
@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key | ||
def test_builtins_apply(keys, f): | ||
|
@@ -102,21 +98,33 @@ def test_builtins_apply(keys, f): | |
result = df.groupby(keys).apply(f) | ||
ngroups = len(df.drop_duplicates(subset=keys)) | ||
|
||
assert_msg = "invalid frame shape: {} (expected ({}, 3))".format( | ||
result.shape, ngroups | ||
# GH 28549 | ||
# grouping keys should not be included in output | ||
if isinstance(keys, list): | ||
result_shape = len(df.columns) - len(keys) | ||
else: | ||
result_shape = len(df.columns) - 1 | ||
|
||
assert_msg = "invalid frame shape: {} (expected ({}, {}))".format( | ||
result.shape, ngroups, result_shape | ||
) | ||
assert result.shape == (ngroups, 3), assert_msg | ||
assert result.shape == (ngroups, result_shape), assert_msg | ||
|
||
tm.assert_frame_equal( | ||
result, # numpy's equivalent function | ||
df.groupby(keys).apply(getattr(np, fname)), | ||
) | ||
|
||
if f != sum: | ||
expected = df.groupby(keys).agg(fname).reset_index() | ||
expected.set_index(keys, inplace=True, drop=False) | ||
# GH 28549 | ||
# No longer need to reset/set index here | ||
expected = df.groupby(keys).agg(fname) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you try not to use groupby here and just construct the expected result. the comment is also not very useful as a future reader won't have context. can you break this out to a new test |
||
tm.assert_frame_equal(result, expected, check_dtype=False) | ||
|
||
# GH 28549 | ||
# grouping keys should not be in output | ||
df = df.drop(keys, 1) | ||
|
||
tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)()) | ||
|
||
|
||
|
@@ -341,10 +349,13 @@ def test_cython_api2(): | |
tm.assert_frame_equal(result, expected) | ||
|
||
# GH 13994 | ||
result = df.groupby("A").cumsum(axis=1) | ||
# GH 28549 | ||
# Good representation of when as_index=False is now behaving | ||
# as expected | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. split to a separate tests and use as_index fixture There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was unsure what to do about this test. There is a comment in the original test that says "cumsum is a transformer and should ignore as_index", but with the updated behavior, it respects Previous Behavior In [21]: df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
In [22]: df.groupby("A").cumsum(axis=1)
Out[22]:
A B C
0 1.0 3.0 NaN
1 1.0 NaN 10.0
2 3.0 7.0 16.0 Behavior after change to In [4]: df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
In [5]: df.groupby("A").cumsum(axis=1)
Out[5]:
B C
0 2.0 NaN
1 NaN 9.0
2 4.0 13.0 |
||
result = df.groupby("A", as_index=False).cumsum(axis=1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you test as_indexer=True as well. this seems like a big api change here |
||
expected = df.cumsum(axis=1) | ||
tm.assert_frame_equal(result, expected) | ||
result = df.groupby("A").cumprod(axis=1) | ||
result = df.groupby("A", as_index=False).cumprod(axis=1) | ||
expected = df.cumprod(axis=1) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
|
@@ -1107,7 +1118,10 @@ def test_count(): | |
|
||
for key in ["1st", "2nd", ["1st", "2nd"]]: | ||
left = df.groupby(key).count() | ||
right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) | ||
|
||
# GH 28549 | ||
christopherzimmerman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# don't need to drop key here anymore | ||
right = df.groupby(key).apply(DataFrame.count) | ||
tm.assert_frame_equal(left, right) | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
need to move to 1.1