pandas-dev · jreback · Aug 7, 2020 · Jul 14, 2020 · Jul 15, 2020 · Jul 15, 2020
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -1084,6 +1084,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`)
 - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`)
 - Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`)
+- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` and :meth:`core.groupby.DataFrameGroupBy.describe` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`)
 
 Reshaping
 ^^^^^^^^^

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -829,6 +829,8 @@ def __iter__(self):
     )
     def apply(self, func, *args, **kwargs):
 
+        self._reset_group_selection()
+
         func = self._is_builtin_func(func)
 
         # this is needed so we don't try and wrap strings. If we could
@@ -1623,11 +1625,10 @@ def ohlc(self) -> DataFrame:
 
     @doc(DataFrame.describe)
     def describe(self, **kwargs):
-        with _group_selection_context(self):
-            result = self.apply(lambda x: x.describe(**kwargs))
-            if self.axis == 1:
-                return result.T
-            return result.unstack()
+        result = self.apply(lambda x: x.describe(**kwargs))
+        if self.axis == 1:
+            return result.T
+        return result.unstack()
 
     def resample(self, rule, *args, **kwargs):
         """

diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
@@ -486,13 +486,13 @@ def test_agg_timezone_round_trip():
     assert ts == grouped.first()["B"].iloc[0]
 
     # GH#27110 applying iloc should return a DataFrame
-    assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0]
+    assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]
 
     ts = df["B"].iloc[2]
     assert ts == grouped.last()["B"].iloc[0]
 
     # GH#27110 applying iloc should return a DataFrame
-    assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0]
+    assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
 
 
 def test_sum_uint64_overflow():

diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -1010,3 +1010,32 @@ def test_apply_with_timezones_aware():
     result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy())
 
     tm.assert_frame_equal(result1, result2)
+
+
+def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func):
+    # GH 34656
+    # GH 34271
+    df = DataFrame(
+        {
+            "a": [99, 99, 99, 88, 88, 88],
+            "b": [1, 2, 3, 4, 5, 6],
+            "c": [10, 20, 30, 40, 50, 60],
+        }
+    )
+
+    expected = pd.DataFrame(
+        {"a": [264, 297], "b": [15, 6], "c": [150, 60]},
+        index=pd.Index([88, 99], name="a"),
+    )
+
+    # Check output when no other methods are called before .apply()
+    grp = df.groupby(by="a")
+    result = grp.apply(sum)
+    tm.assert_frame_equal(result, expected)
+
+    # Check output when another method is called before .apply()
+    grp = df.groupby(by="a")
+    args = {"nth": [0], "corrwith": [df]}.get(reduction_func, [])
+    _ = getattr(grp, reduction_func)(*args)
+    result = grp.apply(sum)
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -286,14 +286,47 @@ def test_non_cython_api():
 
     # describe
     expected_index = pd.Index([1, 3], name="A")
-    expected_col = pd.MultiIndex(
-        levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
-        codes=[[0] * 8, list(range(8))],
+    expected_col = pd.MultiIndex.from_product(
+        [["A", "B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]]
     )
     expected = pd.DataFrame(
         [
-            [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
-            [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
+            [
+                2.0,
+                1.0,
+                0.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                2.0,
+                np.nan,
+                2.0,
+                2.0,
+                2.0,
+                2.0,
+                2.0,
+            ],
+            [
+                1.0,
+                3.0,
+                np.nan,
+                3.0,
+                3.0,
+                3.0,
+                3.0,
+                3.0,
+                0.0,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+            ],
         ],
         index=expected_index,
         columns=expected_col,
@@ -974,6 +1007,50 @@ def test_frame_describe_unstacked_format():
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize("by_col_dtype", [int, float, str])
+def test_describe_results_includes_non_nuisance_columns(by_col_dtype):
+    # GH 34656
+    # GH 34271
+    df = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 3, 3], "b": [1, 2, 3, 4, 5, 6, 7, 8, 9]})
+    df = df.astype({"a": by_col_dtype})
+
+    expected = (
+        DataFrame.from_records(
+            [
+                ("a", "count", 3.0, 3.0, 3.0),
+                ("a", "mean", 1.0, 2.0, 3.0),
+                ("a", "std", 0.0, 0.0, 0.0),
+                ("a", "min", 1.0, 2.0, 3.0),
+                ("a", "25%", 1.0, 2.0, 3.0),
+                ("a", "50%", 1.0, 2.0, 3.0),
+                ("a", "75%", 1.0, 2.0, 3.0),
+                ("a", "max", 1.0, 2.0, 3.0),
+                ("b", "count", 3.0, 3.0, 3.0),
+                ("b", "mean", 2.0, 5.0, 8.0),
+                ("b", "std", 1.0, 1.0, 1.0),
+                ("b", "min", 1.0, 4.0, 7.0),
+                ("b", "25%", 1.5, 4.5, 7.5),
+                ("b", "50%", 2.0, 5.0, 8.0),
+                ("b", "75%", 2.5, 5.5, 8.5),
+                ("b", "max", 3.0, 6.0, 9.0),
+            ],
+            columns=["col", "func", 1, 2, 3],
+        )
+        .set_index(["col", "func"])
+        .T
+    )
+    expected.columns.names = [None, None]
+    expected.index = pd.Index(expected.index.astype(by_col_dtype), name="a")
+
+    if by_col_dtype is str:
+        # If the grouping column is a nuisance column (i.e. std() or
+        # quantile() cannot be applied to it), it does not appear in the output
+        expected = expected.drop(columns="a")
+
+    result = df.groupby("a").describe()
+    tm.assert_frame_equal(result, expected)
+
+
 def test_groupby_mean_no_overflow():
     # Regression test for (#22487)
     df = pd.DataFrame(

diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
@@ -191,13 +191,15 @@ def test_grouper_creation_bug(self):
         result = g.sum()
         tm.assert_frame_equal(result, expected)
 
-        result = g.apply(lambda x: x.sum())
-        tm.assert_frame_equal(result, expected)
-
         g = df.groupby(pd.Grouper(key="A", axis=0))
         result = g.sum()
         tm.assert_frame_equal(result, expected)
 
+        result = g.apply(lambda x: x.sum())
+        expected["A"] = [0, 2, 4]
+        expected = expected.loc[:, ["A", "B"]]
+        tm.assert_frame_equal(result, expected)
+
         # GH14334
         # pd.Grouper(key=...) may be passed in a list
         df = DataFrame(

diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
@@ -743,6 +743,9 @@ def test_cython_transform_frame(op, args, targop):
             else:
                 expected = gb.apply(targop)
 
+            if op == "shift" and type(gb_target.get("by")) is str:
+                expected = expected.drop(columns=gb_target.get("by"))
+
             expected = expected.sort_index(axis=1)
             tm.assert_frame_equal(expected, gb.transform(op, *args).sort_index(axis=1))
             tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1))

diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py
@@ -224,7 +224,7 @@ def test_to_latex_multiindex(self):
 
         assert result == expected
 
-        result = df.groupby("a").describe().to_latex()
+        result = df.groupby("a").describe().drop(columns="a").to_latex()
         expected = r"""\begin{tabular}{lrrrrrrrr}
 \toprule
 {} & \multicolumn{8}{l}{c} \\