BUG: groupby.describe with as_index=False incorrect (pandas-dev#49643)

rhshadrach · mliu08 · commit 5d3fafa50686 · 2022-11-26T22:09:07.000-08:00
* BUG: groupby.describe with as_index=False incorrect

* Add test for two groupings

* Simplify logic
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -718,7 +718,9 @@ Groupby/resample/rolling
 - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would not include unobserved categories in result when grouping by categorical indexes (:issue:`49354`)
 - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` would change result order depending on the input index when grouping by categoricals (:issue:`49223`)
 - Bug in :class:`.DataFrameGroupBy` and :class:`.SeriesGroupBy` when grouping on categorical data would sort result values even when used with ``sort=False`` (:issue:`42482`)
--
+- Bug in :meth:`.DataFrameGroupBy.apply` and :class:`SeriesGroupBy.apply` with ``as_index=False`` would not attempt the computation without using the grouping keys when using them failed with a ``TypeError`` (:issue:`49256`)
+- Bug in :meth:`.DataFrameGroupBy.describe` would describe the group keys (:issue:`49256`)
+- Bug in :meth:`.SeriesGroupBy.describe` with ``as_index=False`` would have the incorrect shape (:issue:`49256`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1061,8 +1061,7 @@ def _set_group_selection(self) -> None:
         # This is a no-op for SeriesGroupBy
         grp = self.grouper
         if not (
-            self.as_index
-            and grp.groupings is not None
+            grp.groupings is not None
             and self.obj.ndim > 1
             and self._group_selection is None
         ):
@@ -2640,7 +2639,14 @@ def describe(self, **kwargs):
             )
             if self.axis == 1:
                 return result.T
-            return result.unstack()
+
+            # GH#49256 - properly handle the grouping column(s)
+            if self._selected_obj.ndim != 1 or self.as_index:
+                result = result.unstack()
+                if not self.as_index:
+                    self._insert_inaxis_grouper_inplace(result)
+
+            return result
 
     @final
     def resample(self, rule, *args, **kwargs):
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -974,15 +974,21 @@ def test_apply_function_index_return(function):
 
 
 def test_apply_function_with_indexing_return_column():
-    # GH#7002, GH#41480
+    # GH#7002, GH#41480, GH#49256
     df = DataFrame(
         {
             "foo1": ["one", "two", "two", "three", "one", "two"],
             "foo2": [1, 2, 4, 4, 5, 6],
         }
     )
-    with pytest.raises(TypeError, match="Could not convert"):
-        df.groupby("foo1", as_index=False).apply(lambda x: x.mean())
+    result = df.groupby("foo1", as_index=False).apply(lambda x: x.mean())
+    expected = DataFrame(
+        {
+            "foo1": ["one", "three", "two"],
+            "foo2": [3.0, 4.0, 4.0],
+        }
+    )
+    tm.assert_frame_equal(result, expected)
 
 
 @pytest.mark.parametrize(
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -336,13 +336,7 @@ def test_describe(self, df, gb, gni):
         result = gb.describe()
         tm.assert_frame_equal(result, expected)
 
-        expected = pd.concat(
-            [
-                df[df.A == 1].describe().unstack().to_frame().T,
-                df[df.A == 3].describe().unstack().to_frame().T,
-            ]
-        )
-        expected.index = Index([0, 1])
+        expected = expected.reset_index()
         result = gni.describe()
         tm.assert_frame_equal(result, expected)
 
@@ -1093,6 +1087,38 @@ def test_series_describe_single():
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
+def test_series_describe_as_index(as_index, keys):
+    # GH#49256
+    df = DataFrame(
+        {
+            "key1": ["one", "two", "two", "three", "two"],
+            "key2": ["one", "two", "two", "three", "two"],
+            "foo2": [1, 2, 4, 4, 6],
+        }
+    )
+    gb = df.groupby(keys, as_index=as_index)["foo2"]
+    result = gb.describe()
+    expected = DataFrame(
+        {
+            "key1": ["one", "three", "two"],
+            "count": [1.0, 1.0, 3.0],
+            "mean": [1.0, 4.0, 4.0],
+            "std": [np.nan, np.nan, 2.0],
+            "min": [1.0, 4.0, 2.0],
+            "25%": [1.0, 4.0, 3.0],
+            "50%": [1.0, 4.0, 4.0],
+            "75%": [1.0, 4.0, 5.0],
+            "max": [1.0, 4.0, 6.0],
+        }
+    )
+    if len(keys) == 2:
+        expected.insert(1, "key2", expected["key1"])
+    if as_index:
+        expected = expected.set_index(keys)
+    tm.assert_frame_equal(result, expected)
+
+
 def test_series_index_name(df):
     grouped = df.loc[:, ["C"]].groupby(df["A"])
     result = grouped.agg(lambda x: x.mean())
@@ -1177,29 +1203,25 @@ def test_frame_describe_unstacked_format():
     "pandas.errors.PerformanceWarning"
 )
 @pytest.mark.parametrize("as_index", [True, False])
-def test_describe_with_duplicate_output_column_names(as_index):
+@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
+def test_describe_with_duplicate_output_column_names(as_index, keys):
     # GH 35314
     df = DataFrame(
         {
-            "a": [99, 99, 99, 88, 88, 88],
+            "a1": [99, 99, 99, 88, 88, 88],
+            "a2": [99, 99, 99, 88, 88, 88],
             "b": [1, 2, 3, 4, 5, 6],
             "c": [10, 20, 30, 40, 50, 60],
         },
-        columns=["a", "b", "b"],
+        columns=["a1", "a2", "b", "b"],
         copy=False,
     )
+    if keys == ["a1"]:
+        df = df.drop(columns="a2")
 
     expected = (
         DataFrame.from_records(
             [
-                ("a", "count", 3.0, 3.0),
-                ("a", "mean", 88.0, 99.0),
-                ("a", "std", 0.0, 0.0),
-                ("a", "min", 88.0, 99.0),
-                ("a", "25%", 88.0, 99.0),
-                ("a", "50%", 88.0, 99.0),
-                ("a", "75%", 88.0, 99.0),
-                ("a", "max", 88.0, 99.0),
                 ("b", "count", 3.0, 3.0),
                 ("b", "mean", 5.0, 2.0),
                 ("b", "std", 1.0, 1.0),
@@ -1222,14 +1244,17 @@ def test_describe_with_duplicate_output_column_names(as_index):
         .T
     )
     expected.columns.names = [None, None]
-    expected.index = Index([88, 99], name="a")
-
-    if as_index:
-        expected = expected.drop(columns=["a"], level=0)
+    if len(keys) == 2:
+        expected.index = MultiIndex(
+            levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
+        )
     else:
-        expected = expected.reset_index(drop=True)
+        expected.index = Index([88, 99], name="a1")
+
+    if not as_index:
+        expected = expected.reset_index()
 
-    result = df.groupby("a", as_index=as_index).describe()
+    result = df.groupby(keys, as_index=as_index).describe()
 
     tm.assert_frame_equal(result, expected)