BUG: Remove error raise to allow same input function on the same column in named aggregation (#28428)

charlesdong1991 · jreback · commit facd756fb0e6 · 2019-12-08T13:01:21.000-05:00
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -803,6 +803,8 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
 - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`)
 - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`)
+- Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby`. Previously an error was raised if the same function was applied to the same column; this is now allowed as long as the newly assigned names are different. (:issue:`28426`)
+- :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue:`28479`)
 - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`)
 - Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`)
 - Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`)
@@ -846,7 +848,6 @@ Other
 - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`)
 - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`)
 - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`)
-- :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue:`28479`)
 - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`)
 - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
 - Fix :class:`AbstractHolidayCalendar` to return correct results for
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -302,10 +302,6 @@ def _aggregate_multiple_funcs(self, arg):
         results = OrderedDict()
         for name, func in arg:
             obj = self
-            if name in results:
-                raise SpecificationError(
-                    f"Function names must be unique, found multiple named {name}"
-                )
 
             # reset the cache so that we
             # only include the named selection
@@ -915,6 +911,14 @@ def aggregate(self, func=None, *args, **kwargs):
             func, columns, order = _normalize_keyword_aggregation(kwargs)
 
             kwargs = {}
+        elif isinstance(func, list) and len(func) > len(set(func)):
+
+            # GH 28426 will raise error if duplicated function names are used and
+            # there is no reassigned name
+            raise SpecificationError(
+                "Function names must be unique if there is no new column "
+                "names assigned"
+            )
         elif func is None:
             # nicer error message
             raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).")
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -353,6 +353,14 @@ def test_uint64_type_handling(dtype, how):
     tm.assert_frame_equal(result, expected, check_exact=True)
 
 
+def test_func_duplicates_raises():
+    # GH28426
+    msg = "Function names"
+    df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
+    with pytest.raises(SpecificationError, match=msg):
+        df.groupby("A").agg(["min", "min"])
+
+
 class TestNamedAggregationSeries:
     def test_series_named_agg(self):
         df = pd.Series([1, 2, 3, 4])
@@ -377,12 +385,12 @@ def test_no_args_raises(self):
         expected = pd.DataFrame()
         tm.assert_frame_equal(result, expected)
 
-    def test_series_named_agg_duplicates_raises(self):
-        # This is a limitation of the named agg implementation reusing
-        # aggregate_multiple_funcs. It could maybe be lifted in the future.
+    def test_series_named_agg_duplicates_no_raises(self):
+        # GH28426
         gr = pd.Series([1, 2, 3]).groupby([0, 0, 1])
-        with pytest.raises(SpecificationError):
-            gr.agg(a="sum", b="sum")
+        grouped = gr.agg(a="sum", b="sum")
+        expected = pd.DataFrame({"a": [3, 3], "b": [3, 3]})
+        tm.assert_frame_equal(expected, grouped)
 
     def test_mangled(self):
         gr = pd.Series([1, 2, 3]).groupby([0, 0, 1])
@@ -439,12 +447,34 @@ def test_agg_relabel_non_identifier(self):
         )
         tm.assert_frame_equal(result, expected)
 
-    def test_duplicate_raises(self):
-        # TODO: we currently raise on multiple lambdas. We could *maybe*
-        # update com.get_callable_name to append `_i` to each lambda.
+    def test_duplicate_no_raises(self):
+        # GH 28426, if use same input function on same column,
+        # no error should raise
         df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
-        with pytest.raises(SpecificationError, match="Function names"):
-            df.groupby("A").agg(a=("A", "min"), b=("A", "min"))
+
+        grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min"))
+        expected = pd.DataFrame(
+            {"a": [1, 3], "b": [1, 3]}, index=pd.Index([0, 1], name="A")
+        )
+        tm.assert_frame_equal(grouped, expected)
+
+        quant50 = functools.partial(np.percentile, q=50)
+        quant70 = functools.partial(np.percentile, q=70)
+        quant50.__name__ = "quant50"
+        quant70.__name__ = "quant70"
+
+        test = pd.DataFrame(
+            {"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]}
+        )
+
+        grouped = test.groupby("col1").agg(
+            quantile_50=("col2", quant50), quantile_70=("col2", quant70)
+        )
+        expected = pd.DataFrame(
+            {"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]},
+            index=pd.Index(["a", "b"], name="col1"),
+        )
+        tm.assert_frame_equal(grouped, expected)
 
     def test_agg_relabel_with_level(self):
         df = pd.DataFrame(
@@ -557,15 +587,21 @@ def test_agg_relabel_multiindex_raises_not_exist():
         df.groupby(("x", "group")).agg(a=(("Y", "a"), "max"))
 
 
-def test_agg_relabel_multiindex_raises_duplicate():
+def test_agg_relabel_multiindex_duplicates():
     # GH29422, add test for raises senario when getting duplicates
+    # GH28426, after this change, duplicates should also work if the relabelling is
+    # different
     df = DataFrame(
         {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
     )
     df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
 
-    with pytest.raises(SpecificationError, match="Function names"):
-        df.groupby(("x", "group")).agg(a=(("y", "A"), "min"), b=(("y", "A"), "min"))
+    result = df.groupby(("x", "group")).agg(
+        a=(("y", "A"), "min"), b=(("y", "A"), "min")
+    )
+    idx = pd.Index(["a", "b"], name=("x", "group"))
+    expected = DataFrame({"a": [0, 2], "b": [0, 2]}, index=idx)
+    tm.assert_frame_equal(result, expected)
 
 
 def myfunc(s):