BUG: DataFrame.groupby with as_index=False shouldn't modify grouping columns

rhshadrach · rhshadrach · commit c1c6e4089146 · 2020-05-05T17:29:15.000-04:00
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1257,7 +1257,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
             v = values[0]
 
-            if isinstance(v, (np.ndarray, Index, Series)):
+            if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index:
                 if isinstance(v, Series):
                     applied_index = self._selected_obj._get_axis(self.axis)
                     all_indexed_same = all_indexes_same([x.index for x in values])
@@ -1333,6 +1333,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                     result = DataFrame(
                         stacked_values.T, index=v.index, columns=key_index
                     )
+                elif not self.as_index:
+                    # We add grouping column below, so create a frame here
+                    result = DataFrame(
+                        values, index=key_index, columns=[self._selection]
+                    )
                 else:
                     # GH#1738: values is list of arrays of unequal lengths
                     #  fall through to the outer else clause
@@ -1348,6 +1353,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                 else:
                     result = result._convert(datetime=True)
 
+                if not self.as_index:
+                    self._insert_inaxis_grouper_inplace(result)
+
                 return self._reindex_output(result)
 
             # values are not series or array-like but scalars
@@ -1684,9 +1692,11 @@ def _insert_inaxis_grouper_inplace(self, result):
                 ),
             )
         )
-
+        columns = result.columns
         for name, lev, in_axis in izip:
-            if in_axis:
+            # GH #28549
+            # When using .apply(...), name is already among the result's columns
+            if in_axis and name not in columns:
                 result.insert(0, name, lev)
 
     def _wrap_aggregated_output(
@@ -1851,7 +1861,7 @@ def nunique(self, dropna: bool = True):
         4   ham       5      x
         5   ham       5      y
         """
-        obj = self._selected_obj
+        obj = self._obj_with_exclusions
 
         def groupby_series(obj, col=None):
             return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique(
@@ -1882,6 +1892,9 @@ def groupby_series(obj, col=None):
 
         if not self.as_index:
             results.index = ibase.default_index(len(results))
+            if results.ndim == 1:
+                results = results.to_frame()
+            self._insert_inaxis_grouper_inplace(results)
         return results
 
     boxplot = boxplot_frame_groupby
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -715,11 +715,11 @@ def _make_wrapper(self, name):
 
         # need to setup the selection
         # as are not passed directly but in the grouper
-        f = getattr(self._selected_obj, name)
+        f = getattr(self._obj_with_exclusions, name)
         if not isinstance(f, types.MethodType):
             return self.apply(lambda self: getattr(self, name))
 
-        f = getattr(type(self._selected_obj), name)
+        f = getattr(type(self._obj_with_exclusions), name)
         sig = inspect.signature(f)
 
         def wrapper(*args, **kwargs):
@@ -742,7 +742,7 @@ def curried(x):
                 return self.apply(curried)
 
             try:
-                return self.apply(curried)
+                return self._python_apply_general(curried, self._obj_with_exclusions)
             except TypeError as err:
                 if not re.search(
                     "reduction operation '.*' not allowed for this dtype", str(err)
@@ -833,7 +833,7 @@ def f(g):
         # ignore SettingWithCopy here in case the user mutates
         with option_context("mode.chained_assignment", None):
             try:
-                result = self._python_apply_general(f)
+                result = self._python_apply_general(f, self._selected_obj)
             except TypeError:
                 # gh-20949
                 # try again, with .apply acting as a filtering
@@ -844,12 +844,12 @@ def f(g):
                 # on a string grouper column
 
                 with _group_selection_context(self):
-                    return self._python_apply_general(f)
+                    return self._python_apply_general(f, self._selected_obj)
 
         return result
 
-    def _python_apply_general(self, f):
-        keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
+    def _python_apply_general(self, f, obj):
+        keys, values, mutated = self.grouper.apply(f, obj, self.axis)
 
         return self._wrap_applied_output(
             keys, values, not_indexed_same=mutated or self.mutated
@@ -1016,7 +1016,7 @@ def _python_agg_general(
             output[key] = maybe_cast_result(result, obj, numeric_only=True)
 
         if len(output) == 0:
-            return self._python_apply_general(f)
+            return self._python_apply_general(f, self._selected_obj)
 
         if self.grouper._filter_empty_groups:
 
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -291,7 +291,7 @@ def test_non_cython_api():
     result = g.mad()
     tm.assert_frame_equal(result, expected)
 
-    expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1])
+    expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
     result = gni.mad()
     tm.assert_frame_equal(result, expected)
 
@@ -584,6 +584,31 @@ def test_ops_general(op, targop):
     tm.assert_frame_equal(result, expected)
 
 
+def test_ops_not_as_index(reduction_func):
+    # Using as_index=False should not modify the grouping column
+
+    if reduction_func in ("corrwith",):
+        pytest.skip("Test not applicable")
+
+    if reduction_func in ("nth", "ngroup", "size",):
+        pytest.skip("Skip until behavior is determined (GH #5755)")
+
+    if reduction_func in ("sem", "std"):
+        pytest.skip("Function incorrectly modifies keys (GH #10355)")
+
+    df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
+    expected = getattr(df.groupby("a"), reduction_func)().reset_index()
+
+    result = getattr(df.groupby("a", as_index=False), reduction_func)()
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby("a", as_index=False).agg(reduction_func)
+    tm.assert_frame_equal(result, expected)
+
+    result = getattr(df.groupby("a", as_index=False)["b"], reduction_func)()
+    tm.assert_frame_equal(result, expected)
+
+
 def test_max_nan_bug():
     raw = """,Date,app,File
 -04-23,2013-04-23 00:00:00,,log080001.log
@@ -1004,7 +1029,11 @@ def check_nunique(df, keys, as_index=True):
         if not as_index:
             right = right.reset_index(drop=True)
 
-        tm.assert_series_equal(left, right, check_names=False)
+        if not as_index:
+            # as_index=False inserts the keys as columns, so the result is a frame
+            tm.assert_frame_equal(left, right, check_names=False)
+        else:
+            tm.assert_series_equal(left, right, check_names=False)
         tm.assert_frame_equal(df, original_df)
 
     days = date_range("2015-08-23", periods=10)
@@ -1035,13 +1064,14 @@ def check_nunique(df, keys, as_index=True):
 def test_nunique():
     df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})
 
-    expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]})
+    expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 1], "C": [1, 1, 2]})
     result = df.groupby("A", as_index=False).nunique()
     tm.assert_frame_equal(result, expected)
 
     # as_index
     expected.index = list("abc")
     expected.index.name = "A"
+    expected = expected.drop(columns="A")
     result = df.groupby("A").nunique()
     tm.assert_frame_equal(result, expected)
 
@@ -1050,7 +1080,7 @@ def test_nunique():
     tm.assert_frame_equal(result, expected)
 
     # dropna
-    expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc"))
+    expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc"))
     expected.index.name = "A"
     result = df.replace({"x": None}).groupby("A").nunique()
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py
@@ -406,7 +406,7 @@ def test_all_methods_categorized(mframe):
     if new_names:
         msg = f"""
 There are uncatgeorized methods defined on the Grouper class:
-{names}.
+{new_names}.
 
 Was a new method recently added?