BUG: DataFrame.groupby with as_index=False shouldn't modify grouping columns

rhshadrach · rhshadrach · commit 5eb636d029f7 · 2020-05-13T17:06:03.000-04:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -559,6 +559,34 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns
    df[['a', 'c']] = 1
    df
 
+.. _whatsnew_110.api_breaking.groupby_as_index_false:
+
+Using groupby with ``as_index=False``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxmax``, ``idxmin``, ``mad``, ``nunique``, or ``skew`` would modify the grouping column. Now, the grouping column remains unchanged. (:issue:`21090`)
+
+.. ipython:: python
+
+   df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]})
+   df
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+   In [3]: df.groupby("a", as_index=False).nunique()
+   Out[4]:
+      a  b
+   0  1  1
+   1  1  2
+
+*New behavior*:
+
+.. ipython:: python
+
+   df.groupby("a", as_index=False).nunique()
+
 .. _whatsnew_110.deprecations:
 
 Deprecations
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1265,7 +1265,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
             v = values[0]
 
-            if isinstance(v, (np.ndarray, Index, Series)):
+            if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index:
                 if isinstance(v, Series):
                     applied_index = self._selected_obj._get_axis(self.axis)
                     all_indexed_same = all_indexes_same([x.index for x in values])
@@ -1341,6 +1341,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                     result = self.obj._constructor(
                         stacked_values.T, index=v.index, columns=key_index
                     )
+                elif not self.as_index:
+                    # We add grouping column below, so create a frame here
+                    result = DataFrame(
+                        values, index=key_index, columns=[self._selection]
+                    )
                 else:
                     # GH#1738: values is list of arrays of unequal lengths
                     #  fall through to the outer else clause
@@ -1358,6 +1363,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                 else:
                     result = result._convert(datetime=True)
 
+                if not self.as_index:
+                    self._insert_inaxis_grouper_inplace(result)
+
                 return self._reindex_output(result)
 
             # values are not series or array-like but scalars
@@ -1700,9 +1708,11 @@ def _insert_inaxis_grouper_inplace(self, result):
                 ),
             )
         )
-
+        columns = result.columns
         for name, lev, in_axis in izip:
-            if in_axis:
+            # GH #28549
+            # When using .apply(-), name will be in columns already
+            if in_axis and name not in columns:
                 result.insert(0, name, lev)
 
     def _wrap_aggregated_output(
@@ -1852,11 +1862,11 @@ def nunique(self, dropna: bool = True):
         5   ham       5      y
 
         >>> df.groupby('id').nunique()
-            id  value1  value2
+              value1  value2
         id
-        egg    1       1       1
-        ham    1       1       2
-        spam   1       2       1
+        egg        1       1
+        ham        1       2
+        spam       2       1
 
         Check for rows with the same id but conflicting values:
 
@@ -1867,7 +1877,7 @@ def nunique(self, dropna: bool = True):
         4   ham       5      x
         5   ham       5      y
         """
-        obj = self._selected_obj
+        obj = self._obj_with_exclusions
 
         def groupby_series(obj, col=None):
             return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique(
@@ -1898,6 +1908,9 @@ def groupby_series(obj, col=None):
 
         if not self.as_index:
             results.index = ibase.default_index(len(results))
+            if results.ndim == 1:
+                results = results.to_frame()
+            self._insert_inaxis_grouper_inplace(results)
         return results
 
     boxplot = boxplot_frame_groupby
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -718,11 +718,11 @@ def _make_wrapper(self, name):
 
         # need to setup the selection
         # as are not passed directly but in the grouper
-        f = getattr(self._selected_obj, name)
+        f = getattr(self._obj_with_exclusions, name)
         if not isinstance(f, types.MethodType):
             return self.apply(lambda self: getattr(self, name))
 
-        f = getattr(type(self._selected_obj), name)
+        f = getattr(type(self._obj_with_exclusions), name)
         sig = inspect.signature(f)
 
         def wrapper(*args, **kwargs):
@@ -745,7 +745,7 @@ def curried(x):
                 return self.apply(curried)
 
             try:
-                return self.apply(curried)
+                return self._python_apply_general(curried, self._obj_with_exclusions)
             except TypeError as err:
                 if not re.search(
                     "reduction operation '.*' not allowed for this dtype", str(err)
@@ -836,7 +836,7 @@ def f(g):
         # ignore SettingWithCopy here in case the user mutates
         with option_context("mode.chained_assignment", None):
             try:
-                result = self._python_apply_general(f)
+                result = self._python_apply_general(f, self._selected_obj)
             except TypeError:
                 # gh-20949
                 # try again, with .apply acting as a filtering
@@ -847,12 +847,27 @@ def f(g):
                 # on a string grouper column
 
                 with _group_selection_context(self):
-                    return self._python_apply_general(f)
+                    return self._python_apply_general(f, self._selected_obj)
 
         return result
 
-    def _python_apply_general(self, f):
-        keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
+    def _python_apply_general(self, f, data):
+        """
+        Apply function f in python space
+
+        Parameters
+        ----------
+        f : callable
+            Function to apply
+        data : Series or DataFrame
+            Data to apply f to
+
+        Returns
+        -------
+        Series or DataFrame
+            data after applying f
+        """
+        keys, values, mutated = self.grouper.apply(f, data, self.axis)
 
         return self._wrap_applied_output(
             keys, values, not_indexed_same=mutated or self.mutated
@@ -1019,7 +1034,7 @@ def _python_agg_general(
             output[key] = maybe_cast_result(result, obj, numeric_only=True)
 
         if len(output) == 0:
-            return self._python_apply_general(f)
+            return self._python_apply_general(f, self._selected_obj)
 
         if self.grouper._filter_empty_groups:
 
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -280,7 +280,7 @@ def test_non_cython_api():
     result = g.mad()
     tm.assert_frame_equal(result, expected)
 
-    expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1])
+    expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
     result = gni.mad()
     tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -658,6 +658,37 @@ def test_groupby_as_index_agg(df):
         tm.assert_frame_equal(left, right)
 
 
+def test_ops_not_as_index(reduction_func):
+    # GH 21090
+    # Using as_index=False should not modify grouped column
+
+    if reduction_func in ("corrwith",):
+        pytest.skip("Test not applicable")
+
+    if reduction_func in ("nth", "ngroup", "size",):
+        pytest.skip("Skip until behavior is determined (GH #5755)")
+
+    if reduction_func in ("sem", "std"):
+        pytest.skip("Function incorrectly modifies keys (GH #10355)")
+
+    df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
+    expected = getattr(df.groupby("a"), reduction_func)().reset_index()
+
+    g = df.groupby("a", as_index=False)
+
+    result = getattr(g, reduction_func)()
+    tm.assert_frame_equal(result, expected)
+
+    result = g.agg(reduction_func)
+    tm.assert_frame_equal(result, expected)
+
+    result = getattr(g["b"], reduction_func)()
+    tm.assert_frame_equal(result, expected)
+
+    result = g["b"].agg(reduction_func)
+    tm.assert_frame_equal(result, expected)
+
+
 def test_as_index_series_return_frame(df):
     grouped = df.groupby("A", as_index=False)
     grouped2 = df.groupby(["A", "B"], as_index=False)
diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py
@@ -25,7 +25,10 @@ def check_nunique(df, keys, as_index=True):
         if not as_index:
             right = right.reset_index(drop=True)
 
-        tm.assert_series_equal(left, right, check_names=False)
+        if as_index:
+            tm.assert_series_equal(left, right, check_names=False)
+        else:
+            tm.assert_frame_equal(left, right, check_names=False)
         tm.assert_frame_equal(df, original_df)
 
     days = date_range("2015-08-23", periods=10)
@@ -56,13 +59,14 @@ def check_nunique(df, keys, as_index=True):
 def test_nunique():
     df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})
 
-    expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]})
+    expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]})
     result = df.groupby("A", as_index=False).nunique()
     tm.assert_frame_equal(result, expected)
 
     # as_index
     expected.index = list("abc")
     expected.index.name = "A"
+    expected = expected.drop(columns="A")
     result = df.groupby("A").nunique()
     tm.assert_frame_equal(result, expected)
 
@@ -71,7 +75,7 @@ def test_nunique():
     tm.assert_frame_equal(result, expected)
 
     # dropna
-    expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc"))
+    expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc"))
     expected.index.name = "A"
     result = df.replace({"x": None}).groupby("A").nunique()
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py
@@ -406,7 +406,7 @@ def test_all_methods_categorized(mframe):
     if new_names:
         msg = f"""
 There are uncatgeorized methods defined on the Grouper class:
-{names}.
+{new_names}.
 
 Was a new method recently added?