
[BUG] Fixed behavior of DataFrameGroupBy.apply to respect _group_selection_context #29131


Status: Closed. 19 commits.
78de38c: Modifed tests and fixed bug in groupby/apply (christopherzimmerman, Oct 21, 2019)
63677e7: fixed resample docstring (christopherzimmerman, Oct 21, 2019)
1d99d9c: Added fixture and cleaned up tests (christopherzimmerman, Oct 21, 2019)
98bc673: Whatsnew and added match to test (christopherzimmerman, Oct 23, 2019)
fbf3202: conftest docstring and whatsnew example (christopherzimmerman, Oct 23, 2019)
947a5bd: Changes to tests and whatsnew (christopherzimmerman, Oct 28, 2019)
8c3efb0: Merge branch 'master' into apply_context (christopherzimmerman, Oct 28, 2019)
fa21e29: Had to parameterize the test because of group keys (christopherzimmerman, Oct 28, 2019)
a0a9aa5: Merge branch 'apply_context' of https://github.com/christopherzimmerm… (christopherzimmerman, Oct 28, 2019)
7070169: Merge branch 'master' into apply_context (christopherzimmerman, Nov 1, 2019)
76815f1: Update test_transform.py (christopherzimmerman, Nov 1, 2019)
6c49a16: Update test_groupby.py (christopherzimmerman, Nov 1, 2019)
8a4c1f8: Updated syntax in ipython block (christopherzimmerman, Nov 1, 2019)
ccf940d: Merge branch 'apply_context' of https://github.com/christopherzimmerm… (christopherzimmerman, Nov 1, 2019)
b7d056d: Merged from master (christopherzimmerman, Nov 20, 2019)
cfacfc1: Indent (christopherzimmerman, Nov 20, 2019)
c384c09: Merge remote-tracking branch 'upstream/master' into apply_context (christopherzimmerman, Jan 2, 2020)
91d1931: Merged upstream (christopherzimmerman, Feb 3, 2020)
83be029: More tests changed with bad apply behavior (christopherzimmerman, Feb 4, 2020)
75 changes: 75 additions & 0 deletions doc/source/whatsnew/v1.0.0.rst
@@ -398,6 +398,81 @@ keywords.

df.rename(index={0: 1}, columns={0: 2})


.. _whatsnew_1000.api_breaking.GroupBy.apply:
[Reviewer comment (Contributor): need to move to 1.1]


``GroupBy.apply`` behaves consistently with ``as_index``
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

- Previously, the result of :meth:`GroupBy.apply` sometimes contained the grouper column(s)
  both in the index and in the ``DataFrame`` itself. :meth:`GroupBy.apply`
  now respects the ``as_index`` parameter and only returns the grouper column(s) in
  the result if ``as_index`` is set to ``False``. Other methods, such as :meth:`GroupBy.resample`,
  exhibited similar behavior and now also respect the ``as_index`` parameter.

*Previous Behavior*

.. code-block:: ipython

   In [1]: df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]})

[Reviewer comment (Contributor): show df here]

   In [2]: df.groupby("a").apply(lambda x: x.sum())
   Out[2]:
      a   b
   a
   1  2   3
   2  4   7
   3  6  11

   In [3]: df.groupby("a").apply(lambda x: x.iloc[0])
   Out[3]:
      a  b
   a
   1  1  1
   2  2  3
   3  3  5

   In [4]: idx = pd.date_range('1/1/2000', periods=4, freq='T')

   In [5]: df = pd.DataFrame(data=4 * [range(2)],
      ...:                   index=idx,
      ...:                   columns=['a', 'b'])

   In [6]: df.iloc[2, 0] = 5

[Reviewer comment (Contributor): show df]

   In [7]: df.groupby('a').resample('M').sum()
   Out[7]:
                   b
   a
   0 2000-01-31    3
   5 2000-01-31    1


*Current Behavior*

.. ipython:: python
[Reviewer comment (Contributor): break this up into 2 or more examples; it's just too hard to follow like this. Meaning: change 1 (before, new); change 2 (before, new).]


   df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]})
   df.groupby("a").apply(lambda x: x.sum())
   df.groupby("a").apply(lambda x: x.iloc[0])
   idx = pd.date_range('1/1/2000', periods=4, freq='T')
   df = pd.DataFrame(data=4 * [range(2)],
                     index=idx,
                     columns=['a', 'b'])
   df.iloc[2, 0] = 5
   df.groupby('a').resample('M').sum()
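As a quick, version-stable illustration of what ``as_index`` controls (shown here with a plain ``sum()`` aggregation rather than ``apply``, since the ``apply`` behavior is exactly what this PR changes):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 3, 4, 5, 6]})

# as_index=True (the default): the grouper "a" becomes the result's index.
grouped_default = df.groupby("a").sum()

# as_index=False: the grouper "a" stays as an ordinary column.
grouped_flat = df.groupby("a", as_index=False).sum()

print(grouped_default.index.name)   # "a"
print(list(grouped_flat.columns))   # ['a', 'b']
```

The PR's point is that ``apply`` should follow this same convention instead of sometimes duplicating the grouper in both the index and the columns.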


All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`).

- :meth:`SeriesGroupBy.count`
[Reviewer comment (Contributor): where are these tested? can you do this as a separate change?]

- :meth:`SeriesGroupBy.size`
- :meth:`SeriesGroupBy.nunique`
- :meth:`SeriesGroupBy.nth`


Extended verbose info output for :class:`~pandas.DataFrame`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

81 changes: 40 additions & 41 deletions pandas/core/groupby/groupby.py
@@ -731,18 +731,17 @@ def f(g):

# ignore SettingWithCopy here in case the user mutates
with option_context("mode.chained_assignment", None):
try:
result = self._python_apply_general(f)
except TypeError:
# gh-20949
# try again, with .apply acting as a filtering
# operation, by excluding the grouping column
# This would normally not be triggered
# except if the udf is trying an operation that
# fails on *some* columns, e.g. a numeric operation
# on a string grouper column

with _group_selection_context(self):
with _group_selection_context(self):
try:
result = self._python_apply_general(f)
except TypeError:
# gh-20949
# try again, with .apply acting as a filtering
# operation, by excluding the grouping column
# This would normally not be triggered
# except if the udf is trying an operation that
# fails on *some* columns, e.g. a numeric operation
# on a string grouper column
return self._python_apply_general(f)

return result
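The gh-20949 comment above concerns UDFs that fail on the grouper column; the group-selection context effectively lets ``apply`` run with the grouper excluded. A rough standalone sketch of that idea (column names are illustrative, and the grouper is excluded explicitly here rather than via the internal ``_group_selection_context`` manager):

```python
import pandas as pd

df = pd.DataFrame({"key": ["x", "x", "y"], "val": [1.0, 2.0, 3.0]})

def demean(g):
    # A numeric operation that would raise TypeError on the string
    # grouper column "key" if it were still part of the frame.
    return g - g.mean()

# Select only the non-grouper column(s) before applying the UDF,
# mirroring what the group-selection context achieves internally.
result = df.groupby("key")[["val"]].apply(demean)
print(sorted(result["val"]))  # [-0.5, 0.0, 0.5]
```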
@@ -1505,7 +1504,7 @@ def resample(self, rule, *args, **kwargs):
... columns=['a', 'b'])
>>> df.iloc[2, 0] = 5
>>> df
a b
a b
2000-01-01 00:00:00 0 1
2000-01-01 00:01:00 0 1
2000-01-01 00:02:00 5 1
@@ -1515,63 +1514,63 @@
the timestamps falling into a bin.

>>> df.groupby('a').resample('3T').sum()
a b
b
a
0 2000-01-01 00:00:00 0 2
2000-01-01 00:03:00 0 1
5 2000-01-01 00:00:00 5 1
0 2000-01-01 00:00:00 2
2000-01-01 00:03:00 1
5 2000-01-01 00:00:00 1

Upsample the series into 30 second bins.

>>> df.groupby('a').resample('30S').sum()
a b
b
a
0 2000-01-01 00:00:00 0 1
2000-01-01 00:00:30 0 0
2000-01-01 00:01:00 0 1
2000-01-01 00:01:30 0 0
2000-01-01 00:02:00 0 0
2000-01-01 00:02:30 0 0
2000-01-01 00:03:00 0 1
5 2000-01-01 00:02:00 5 1
0 2000-01-01 00:00:00 1
2000-01-01 00:00:30 0
2000-01-01 00:01:00 1
2000-01-01 00:01:30 0
2000-01-01 00:02:00 0
2000-01-01 00:02:30 0
2000-01-01 00:03:00 1
5 2000-01-01 00:02:00 1

Resample by month. Values are assigned to the month of the period.

>>> df.groupby('a').resample('M').sum()
a b
b
a
0 2000-01-31 0 3
5 2000-01-31 5 1
0 2000-01-31 3
5 2000-01-31 1

Downsample the series into 3 minute bins as above, but close the right
side of the bin interval.

>>> df.groupby('a').resample('3T', closed='right').sum()
a b
b
a
0 1999-12-31 23:57:00 0 1
2000-01-01 00:00:00 0 2
5 2000-01-01 00:00:00 5 1
0 1999-12-31 23:57:00 1
2000-01-01 00:00:00 2
5 2000-01-01 00:00:00 1

Downsample the series into 3 minute bins and close the right side of
the bin interval, but label each bin using the right edge instead of
the left.

>>> df.groupby('a').resample('3T', closed='right', label='right').sum()
a b
b
a
0 2000-01-01 00:00:00 0 1
2000-01-01 00:03:00 0 2
5 2000-01-01 00:03:00 5 1
0 2000-01-01 00:00:00 1
2000-01-01 00:03:00 2
5 2000-01-01 00:03:00 1

Add an offset of twenty seconds.

>>> df.groupby('a').resample('3T', loffset='20s').sum()
a b
b
a
0 2000-01-01 00:00:20 0 2
2000-01-01 00:03:20 0 1
5 2000-01-01 00:00:20 5 1
0 2000-01-01 00:00:20 2
2000-01-01 00:03:20 1
5 2000-01-01 00:00:20 1
"""
from pandas.core.resample import get_resampler_for_grouping

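The docstring updates above remove the grouper column ``a`` from the ``resample`` outputs. A small sketch of the same setup, asserting only the ``b`` sums, which are stable across pandas versions (``'min'`` is used in place of the older ``'T'`` frequency alias):

```python
import pandas as pd

idx = pd.date_range("2000-01-01", periods=4, freq="min")
df = pd.DataFrame(data=4 * [range(2)], index=idx, columns=["a", "b"])
df.iloc[2, 0] = 5

# Group by "a", then downsample each group into 3-minute bins.
res = df.groupby("a").resample("3min").sum()

# Rows with a == 0 fall at 00:00, 00:01 and 00:03; the row with a == 5
# falls at 00:02, so it lands alone in that group's 00:00 bin.
print(res["b"].tolist())  # [2, 1, 1]
```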
14 changes: 14 additions & 0 deletions pandas/tests/groupby/conftest.py
@@ -112,6 +112,20 @@ def reduction_func(request):
return request.param


@pytest.fixture(params=[True, False])
def as_index(request):
    """Yield each possible option for the `as_index` parameter, one at a time."""
    return request.param


@pytest.fixture(params=[True, False])
def group_keys(request):
    """Yield each possible option for the `group_keys` parameter, one at a time."""
    return request.param


@pytest.fixture(params=sorted(transformation_kernels))
def transformation_func(request):
"""yields the string names of all groupby transformation functions."""
82 changes: 62 additions & 20 deletions pandas/tests/groupby/test_apply.py
@@ -363,12 +363,33 @@ def f(group):
tm.assert_frame_equal(result.loc[key], f(group))


def test_apply_chunk_view():
@pytest.mark.parametrize(
"as_index, group_keys, expected_index",
[
(
True,
True,
MultiIndex.from_tuples(
[(1, 0), (1, 1), (2, 2), (2, 3)], names=["key", None]
),
),
(True, False, [0, 1, 2, 3]),
(False, True, MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2), (1, 3)])),
(False, False, [0, 1, 2, 3]),
],
)
def test_apply_chunk_view(as_index, group_keys, expected_index):
# Low level tinkering could be unsafe, make sure not
df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
df = DataFrame({"key": [1, 1, 2, 2], "value": range(4)})

result = df.groupby("key", as_index=as_index, group_keys=group_keys).apply(
lambda x: x[:2]
)
if as_index:
df.pop("key")

result = df.groupby("key", group_keys=False).apply(lambda x: x[:2])
expected = df.take([0, 1, 3, 4, 6, 7])
expected = df.copy()
expected.index = expected_index
tm.assert_frame_equal(result, expected)
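The expected indexes in the parametrization above can be sanity-checked in isolation; for instance, the (as_index=True, group_keys=True) case pairs each group key with the original row position:

```python
import pandas as pd

# Expected index for as_index=True, group_keys=True: the first level is
# the group key, the second the original positional index of each row.
idx = pd.MultiIndex.from_tuples(
    [(1, 0), (1, 1), (2, 2), (2, 3)], names=["key", None]
)

print(idx.get_level_values("key").tolist())  # [1, 1, 2, 2]
print(idx.get_level_values(1).tolist())      # [0, 1, 2, 3]
```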


Expand All @@ -386,7 +407,7 @@ def test_apply_no_name_column_conflict():
grouped.apply(lambda x: x.sort_values("value", inplace=True))


def test_apply_typecast_fail():
def test_apply_typecast_fail(as_index):
df = DataFrame(
{
"d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
Expand All @@ -400,7 +421,12 @@ def f(group):
group["v2"] = (v - v.min()) / (v.max() - v.min())
return group

result = df.groupby("d").apply(f)
result = df.groupby("d", as_index=as_index).apply(f)

# GH 28549
# key no longer included in reduction output
if as_index:
df.pop("d")

expected = df.copy()
expected["v2"] = np.tile([0.0, 0.5, 1], 2)
@@ -426,6 +452,10 @@ def f(group):

result = df.groupby("d").apply(f)

# GH 28549
# key no longer included in reduction output
df.pop("d")

expected = df.copy()
expected["v2"] = np.tile([0.0, 0.5, 1], 2)

@@ -482,7 +512,7 @@ def test_apply_with_duplicated_non_sorted_axis(test_series):
expected = ser.sort_index()
tm.assert_series_equal(result, expected)
else:
result = df.groupby("Y").apply(lambda x: x)
result = df.groupby("Y", as_index=False).apply(lambda x: x)

# not expecting the order to remain the same for duplicated axis
result = result.sort_values("Y")
@@ -661,24 +691,36 @@ def test_func(x):
tm.assert_frame_equal(result, expected)


def test_groupby_apply_none_first():
@pytest.mark.parametrize(
"groups, vars_, expected_vars, expected_groups",
[
([1, 1, 1, 2], [0, 1, 2, 3], [0, 2], [1, 1]),
([1, 2, 2, 2], [0, 1, 2, 3], [1, 3], [2, 2]),
],
)
def test_groupby_apply_none_first(
groups, vars_, expected_vars, expected_groups, as_index
):
# GH 12824. Tests if apply returns None first.
test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]})
test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]})
test_df = DataFrame({"groups": groups, "vars": vars_})

def test_func(x):
if x.shape[0] < 2:
return None
return x.iloc[[0, -1]]

result1 = test_df1.groupby("groups").apply(test_func)
result2 = test_df2.groupby("groups").apply(test_func)
index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=["groups", None])
index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=["groups", None])
expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1)
expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2)
tm.assert_frame_equal(result1, expected1)
tm.assert_frame_equal(result2, expected2)
result = test_df.groupby("groups", as_index=as_index).apply(test_func)

# GH 28549 "groups" should not be in output of apply
# unless as_index=False
if not as_index:
expected = DataFrame(
{"groups": expected_groups, "vars": expected_vars}, index=result.index
)
else:
expected = DataFrame({"vars": expected_vars}, index=result.index)

tm.assert_frame_equal(result, expected)


def test_groupby_apply_return_empty_chunk():
@@ -805,7 +847,7 @@ def test_groupby_apply_datetime_result_dtypes():
],
columns=["observation", "color", "mood", "intensity", "score"],
)
result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes
result = data.groupby("color", as_index=False).apply(lambda g: g.iloc[0]).dtypes
expected = Series(
[np.dtype("datetime64[ns]"), np.object, np.object, np.int64, np.object],
index=["observation", "color", "mood", "intensity", "score"],
@@ -825,7 +867,7 @@
def test_apply_index_has_complex_internals(index):
# GH 31248
df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
result = df.groupby("group").apply(lambda x: x)
result = df.groupby("group", as_index=False).apply(lambda x: x)
tm.assert_frame_equal(result, df)

