BUG: groupby with as_index=False shouldn't modify grouping columns (#34012)

rhshadrach · web-flow · commit 333db4b765f8 · 2020-05-27T08:43:33.000-04:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -583,6 +583,53 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns
    df[['a', 'c']] = 1
    df
 
+.. _whatsnew_110.api_breaking.groupby_consistency:
+
+Consistency across groupby reductions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Using :meth:`DataFrame.groupby` with ``as_index=True`` and the aggregation ``nunique`` would include the grouping column(s) in the columns of the result as well as in the index. Now the grouping column(s) only appear in the index, consistent with other reductions. (:issue:`32579`)
+
+.. ipython:: python
+
+   df = pd.DataFrame({"a": ["x", "x", "y", "y"], "b": [1, 1, 2, 3]})
+   df
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+   In [3]: df.groupby("a", as_index=True).nunique()
+   Out[3]:
+      a  b
+   a
+   x  1  1
+   y  1  2
+
+*New behavior*:
+
+.. ipython:: python
+
+   df.groupby("a", as_index=True).nunique()
+
+Using :meth:`DataFrame.groupby` with ``as_index=False`` and any of the functions ``idxmax``, ``idxmin``, ``mad``, ``nunique``, ``sem``, ``skew``, or ``std`` would modify the grouping column(s). Now the grouping column(s) remain unchanged, consistent with other reductions. (:issue:`21090`, :issue:`10355`)
+
+*Previous behavior*:
+
+.. code-block:: ipython
+
+   In [3]: df.groupby("a", as_index=False).nunique()
+   Out[3]:
+      a  b
+   0  1  1
+   1  1  2
+
+*New behavior*:
+
+.. ipython:: python
+
+   df.groupby("a", as_index=False).nunique()
+
 .. _whatsnew_110.deprecations:
 
 Deprecations
@@ -855,7 +902,6 @@ Groupby/resample/rolling
 - Bug in :meth:`Series.groupby` would raise ``ValueError`` when grouping by :class:`PeriodIndex` level (:issue:`34010`)
 - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`)
 - Bug in :meth:`GroupBy.rolling.apply` ignores args and kwargs parameters (:issue:`33433`)
-- Bug in :meth:`DataFrameGroupby.std` and :meth:`DataFrameGroupby.sem` would modify grouped-by columns when ``as_index=False`` (:issue:`10355`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1265,7 +1265,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
 
             v = values[0]
 
-            if isinstance(v, (np.ndarray, Index, Series)):
+            if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index:
                 if isinstance(v, Series):
                     applied_index = self._selected_obj._get_axis(self.axis)
                     all_indexed_same = all_indexes_same([x.index for x in values])
@@ -1341,6 +1341,11 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                     result = self.obj._constructor(
                         stacked_values.T, index=v.index, columns=key_index
                     )
+                elif not self.as_index:
+                    # We add grouping column below, so create a frame here
+                    result = DataFrame(
+                        values, index=key_index, columns=[self._selection]
+                    )
                 else:
                     # GH#1738: values is list of arrays of unequal lengths
                     #  fall through to the outer else clause
@@ -1358,6 +1363,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                 else:
                     result = result._convert(datetime=True)
 
+                if not self.as_index:
+                    self._insert_inaxis_grouper_inplace(result)
+
                 return self._reindex_output(result)
 
             # values are not series or array-like but scalars
@@ -1700,9 +1708,11 @@ def _insert_inaxis_grouper_inplace(self, result):
                 ),
             )
         )
-
+        columns = result.columns
         for name, lev, in_axis in izip:
-            if in_axis:
+            # GH #28549
+            # With .apply(...), name may already be in columns; don't insert it twice
+            if in_axis and name not in columns:
                 result.insert(0, name, lev)
 
     def _wrap_aggregated_output(
@@ -1852,11 +1862,11 @@ def nunique(self, dropna: bool = True):
         5   ham       5      y
 
         >>> df.groupby('id').nunique()
-            id  value1  value2
+              value1  value2
         id
-        egg    1       1       1
-        ham    1       1       2
-        spam   1       2       1
+        egg        1       1
+        ham        1       2
+        spam       2       1
 
         Check for rows with the same id but conflicting values:
 
@@ -1867,37 +1877,37 @@ def nunique(self, dropna: bool = True):
         4   ham       5      x
         5   ham       5      y
         """
-        obj = self._selected_obj
+        from pandas.core.reshape.concat import concat
 
-        def groupby_series(obj, col=None):
-            return SeriesGroupBy(obj, selection=col, grouper=self.grouper).nunique(
-                dropna=dropna
-            )
+        # TODO: this is duplicative of how GroupBy naturally works
+        # Try to consolidate with normal wrapping functions
 
-        if isinstance(obj, Series):
-            results = groupby_series(obj)
+        obj = self._obj_with_exclusions
+        axis_number = obj._get_axis_number(self.axis)
+        other_axis = int(not axis_number)
+        if axis_number == 0:
+            iter_func = obj.items
         else:
-            # TODO: this is duplicative of how GroupBy naturally works
-            # Try to consolidate with normal wrapping functions
-            from pandas.core.reshape.concat import concat
-
-            axis_number = obj._get_axis_number(self.axis)
-            other_axis = int(not axis_number)
-            if axis_number == 0:
-                iter_func = obj.items
-            else:
-                iter_func = obj.iterrows
+            iter_func = obj.iterrows
 
-            results = [groupby_series(content, label) for label, content in iter_func()]
-            results = concat(results, axis=1)
+        results = concat(
+            [
+                SeriesGroupBy(content, selection=label, grouper=self.grouper).nunique(
+                    dropna
+                )
+                for label, content in iter_func()
+            ],
+            axis=1,
+        )
 
-            if axis_number == 1:
-                results = results.T
+        if axis_number == 1:
+            results = results.T
 
-            results._get_axis(other_axis).names = obj._get_axis(other_axis).names
+        results._get_axis(other_axis).names = obj._get_axis(other_axis).names
 
         if not self.as_index:
             results.index = ibase.default_index(len(results))
+            self._insert_inaxis_grouper_inplace(results)
         return results
 
     boxplot = boxplot_frame_groupby
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -35,7 +35,7 @@ class providing the base-class of operations.
 
 from pandas._libs import Timestamp
 import pandas._libs.groupby as libgroupby
-from pandas._typing import FrameOrSeries, Scalar
+from pandas._typing import F, FrameOrSeries, FrameOrSeriesUnion, Scalar
 from pandas.compat.numpy import function as nv
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import Appender, Substitution, cache_readonly, doc
@@ -735,11 +735,11 @@ def _make_wrapper(self, name):
 
         # need to setup the selection
         # as are not passed directly but in the grouper
-        f = getattr(self._selected_obj, name)
+        f = getattr(self._obj_with_exclusions, name)
         if not isinstance(f, types.MethodType):
             return self.apply(lambda self: getattr(self, name))
 
-        f = getattr(type(self._selected_obj), name)
+        f = getattr(type(self._obj_with_exclusions), name)
         sig = inspect.signature(f)
 
         def wrapper(*args, **kwargs):
@@ -762,7 +762,7 @@ def curried(x):
                 return self.apply(curried)
 
             try:
-                return self.apply(curried)
+                return self._python_apply_general(curried, self._obj_with_exclusions)
             except TypeError as err:
                 if not re.search(
                     "reduction operation '.*' not allowed for this dtype", str(err)
@@ -853,7 +853,7 @@ def f(g):
         # ignore SettingWithCopy here in case the user mutates
         with option_context("mode.chained_assignment", None):
             try:
-                result = self._python_apply_general(f)
+                result = self._python_apply_general(f, self._selected_obj)
             except TypeError:
                 # gh-20949
                 # try again, with .apply acting as a filtering
@@ -864,12 +864,29 @@ def f(g):
                 # on a string grouper column
 
                 with _group_selection_context(self):
-                    return self._python_apply_general(f)
+                    return self._python_apply_general(f, self._selected_obj)
 
         return result
 
-    def _python_apply_general(self, f):
-        keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
+    def _python_apply_general(
+        self, f: F, data: FrameOrSeriesUnion
+    ) -> FrameOrSeriesUnion:
+        """
+        Apply function f in python space
+
+        Parameters
+        ----------
+        f : callable
+            Function to apply
+        data : Series or DataFrame
+            Data to apply f to
+
+        Returns
+        -------
+        Series or DataFrame
+            data after applying f
+        """
+        keys, values, mutated = self.grouper.apply(f, data, self.axis)
 
         return self._wrap_applied_output(
             keys, values, not_indexed_same=mutated or self.mutated
@@ -1067,7 +1084,7 @@ def _python_agg_general(
             output[key] = maybe_cast_result(result, obj, numeric_only=True)
 
         if len(output) == 0:
-            return self._python_apply_general(f)
+            return self._python_apply_general(f, self._selected_obj)
 
         if self.grouper._filter_empty_groups:
 
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
@@ -280,7 +280,7 @@ def test_non_cython_api():
     result = g.mad()
     tm.assert_frame_equal(result, expected)
 
-    expected = DataFrame([[0.0, 0.0], [0, np.nan]], columns=["A", "B"], index=[0, 1])
+    expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"], index=[0, 1])
     result = gni.mad()
     tm.assert_frame_equal(result, expected)
 
@@ -573,28 +573,6 @@ def test_ops_general(op, targop):
     tm.assert_frame_equal(result, expected)
 
 
-def test_ops_not_as_index(reduction_func):
-    # GH 10355
-    # Using as_index=False should not modify grouped column
-
-    if reduction_func in ("nth", "ngroup", "size",):
-        pytest.skip("Skip until behavior is determined (GH #5755)")
-
-    if reduction_func in ("corrwith", "idxmax", "idxmin", "mad", "nunique", "skew",):
-        pytest.xfail(
-            "_GroupBy._python_apply_general incorrectly modifies grouping columns"
-        )
-
-    df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
-    expected = getattr(df.groupby("a"), reduction_func)().reset_index()
-
-    result = getattr(df.groupby("a", as_index=False), reduction_func)()
-    tm.assert_frame_equal(result, expected)
-
-    result = getattr(df.groupby("a", as_index=False)["b"], reduction_func)()
-    tm.assert_frame_equal(result, expected)
-
-
 def test_max_nan_bug():
     raw = """,Date,app,File
 -04-23,2013-04-23 00:00:00,,log080001.log
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -661,6 +661,34 @@ def test_groupby_as_index_agg(df):
         tm.assert_frame_equal(left, right)
 
 
+def test_ops_not_as_index(reduction_func):
+    # GH 10355, 21090
+    # Using as_index=False should not modify grouped column
+
+    if reduction_func in ("corrwith",):
+        pytest.skip("Test not applicable")
+
+    if reduction_func in ("nth", "ngroup", "size",):
+        pytest.skip("Skip until behavior is determined (GH #5755)")
+
+    df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
+    expected = getattr(df.groupby("a"), reduction_func)().reset_index()
+
+    g = df.groupby("a", as_index=False)
+
+    result = getattr(g, reduction_func)()
+    tm.assert_frame_equal(result, expected)
+
+    result = g.agg(reduction_func)
+    tm.assert_frame_equal(result, expected)
+
+    result = getattr(g["b"], reduction_func)()
+    tm.assert_frame_equal(result, expected)
+
+    result = g["b"].agg(reduction_func)
+    tm.assert_frame_equal(result, expected)
+
+
 def test_as_index_series_return_frame(df):
     grouped = df.groupby("A", as_index=False)
     grouped2 = df.groupby(["A", "B"], as_index=False)
diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py
@@ -25,7 +25,10 @@ def check_nunique(df, keys, as_index=True):
         if not as_index:
             right = right.reset_index(drop=True)
 
-        tm.assert_series_equal(left, right, check_names=False)
+        if as_index:
+            tm.assert_series_equal(left, right, check_names=False)
+        else:
+            tm.assert_frame_equal(left, right, check_names=False)
         tm.assert_frame_equal(df, original_df)
 
     days = date_range("2015-08-23", periods=10)
@@ -56,13 +59,14 @@ def check_nunique(df, keys, as_index=True):
 def test_nunique():
     df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})
 
-    expected = DataFrame({"A": [1] * 3, "B": [1, 2, 1], "C": [1, 1, 2]})
+    expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]})
     result = df.groupby("A", as_index=False).nunique()
     tm.assert_frame_equal(result, expected)
 
     # as_index
     expected.index = list("abc")
     expected.index.name = "A"
+    expected = expected.drop(columns="A")
     result = df.groupby("A").nunique()
     tm.assert_frame_equal(result, expected)
 
@@ -71,7 +75,7 @@ def test_nunique():
     tm.assert_frame_equal(result, expected)
 
     # dropna
-    expected = DataFrame({"A": [1] * 3, "B": [1] * 3, "C": [1] * 3}, index=list("abc"))
+    expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc"))
     expected.index.name = "A"
     result = df.replace({"x": None}).groupby("A").nunique()
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py
@@ -406,7 +406,7 @@ def test_all_methods_categorized(mframe):
     if new_names:
         msg = f"""
 There are uncatgeorized methods defined on the Grouper class:
-{names}.
+{new_names}.
 
 Was a new method recently added?