BUG: Multiple lambdas in named aggregation (#27921)

charlesdong1991 · TomAugspurger · commit 7b25463abeee · 2019-08-30T09:47:16.000-05:00
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -178,6 +178,7 @@ Groupby/resample/rolling
 -
 -
 - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
+- Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -268,7 +268,9 @@ def aggregate(self, func, *args, **kwargs):
             result.index = np.arange(len(result))
 
         if relabeling:
-            result = result[order]
+
+            # used reordered index of columns
+            result = result.iloc[:, order]
             result.columns = columns
 
         return result._convert(datetime=True)
@@ -1731,8 +1733,8 @@ def _normalize_keyword_aggregation(kwargs):
         The transformed kwargs.
     columns : List[str]
         The user-provided keys.
-    order : List[Tuple[str, str]]
-        Pairs of the input and output column names.
+    col_idx_order : List[int]
+        List of columns indices.
 
     Examples
     --------
@@ -1759,7 +1761,39 @@ def _normalize_keyword_aggregation(kwargs):
         else:
             aggspec[column] = [aggfunc]
         order.append((column, com.get_callable_name(aggfunc) or aggfunc))
-    return aggspec, columns, order
+
+    # uniquify aggfunc name if duplicated in order list
+    uniquified_order = _make_unique(order)
+
+    # GH 25719, due to aggspec will change the order of assigned columns in aggregation
+    # uniquified_aggspec will store uniquified order list and will compare it with order
+    # based on index
+    aggspec_order = [
+        (column, com.get_callable_name(aggfunc) or aggfunc)
+        for column, aggfuncs in aggspec.items()
+        for aggfunc in aggfuncs
+    ]
+    uniquified_aggspec = _make_unique(aggspec_order)
+
+    # get the new indice of columns by comparison
+    col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order)
+    return aggspec, columns, col_idx_order
+
+
+def _make_unique(seq):
+    """Uniquify aggfunc name of the pairs in the order list
+
+    Examples:
+    --------
+    >>> _make_unique([('a', '<lambda>'), ('a', '<lambda>'), ('b', '<lambda>')])
+    [('a', '<lambda>_0'), ('a', '<lambda>_1'), ('b', '<lambda>')]
+    """
+    return [
+        (pair[0], "_".join([pair[1], str(seq[:i].count(pair))]))
+        if seq.count(pair) > 1
+        else pair
+        for i, pair in enumerate(seq)
+    ]
 
 
 # TODO: Can't use, because mypy doesn't like us setting __name__
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -10,7 +10,7 @@
 import pandas as pd
 from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
 from pandas.core.base import SpecificationError
-from pandas.core.groupby.generic import _maybe_mangle_lambdas
+from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas
 from pandas.core.groupby.grouper import Grouping
 import pandas.util.testing as tm
 
@@ -560,3 +560,150 @@ def test_with_kwargs(self):
         result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
         expected = pd.DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
         tm.assert_frame_equal(result, expected)
+
+    def test_agg_with_one_lambda(self):
+        # GH 25719, write tests for DataFrameGroupby.agg with only one lambda
+        df = pd.DataFrame(
+            {
+                "kind": ["cat", "dog", "cat", "dog"],
+                "height": [9.1, 6.0, 9.5, 34.0],
+                "weight": [7.9, 7.5, 9.9, 198.0],
+            }
+        )
+
+        # sort for 35 and earlier
+        columns = ["height_sqr_min", "height_max", "weight_max"]
+        if compat.PY35:
+            columns = ["height_max", "height_sqr_min", "weight_max"]
+        expected = pd.DataFrame(
+            {
+                "height_sqr_min": [82.81, 36.00],
+                "height_max": [9.5, 34.0],
+                "weight_max": [9.9, 198.0],
+            },
+            index=pd.Index(["cat", "dog"], name="kind"),
+            columns=columns,
+        )
+
+        # check pd.NameAgg case
+        result1 = df.groupby(by="kind").agg(
+            height_sqr_min=pd.NamedAgg(
+                column="height", aggfunc=lambda x: np.min(x ** 2)
+            ),
+            height_max=pd.NamedAgg(column="height", aggfunc="max"),
+            weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
+        )
+        tm.assert_frame_equal(result1, expected)
+
+        # check agg(key=(col, aggfunc)) case
+        result2 = df.groupby(by="kind").agg(
+            height_sqr_min=("height", lambda x: np.min(x ** 2)),
+            height_max=("height", "max"),
+            weight_max=("weight", "max"),
+        )
+        tm.assert_frame_equal(result2, expected)
+
+    def test_agg_multiple_lambda(self):
+        # GH25719, test for DataFrameGroupby.agg with multiple lambdas
+        # with mixed aggfunc
+        df = pd.DataFrame(
+            {
+                "kind": ["cat", "dog", "cat", "dog"],
+                "height": [9.1, 6.0, 9.5, 34.0],
+                "weight": [7.9, 7.5, 9.9, 198.0],
+            }
+        )
+        # sort for 35 and earlier
+        columns = [
+            "height_sqr_min",
+            "height_max",
+            "weight_max",
+            "height_max_2",
+            "weight_min",
+        ]
+        if compat.PY35:
+            columns = [
+                "height_max",
+                "height_max_2",
+                "height_sqr_min",
+                "weight_max",
+                "weight_min",
+            ]
+        expected = pd.DataFrame(
+            {
+                "height_sqr_min": [82.81, 36.00],
+                "height_max": [9.5, 34.0],
+                "weight_max": [9.9, 198.0],
+                "height_max_2": [9.5, 34.0],
+                "weight_min": [7.9, 7.5],
+            },
+            index=pd.Index(["cat", "dog"], name="kind"),
+            columns=columns,
+        )
+
+        # check agg(key=(col, aggfunc)) case
+        result1 = df.groupby(by="kind").agg(
+            height_sqr_min=("height", lambda x: np.min(x ** 2)),
+            height_max=("height", "max"),
+            weight_max=("weight", "max"),
+            height_max_2=("height", lambda x: np.max(x)),
+            weight_min=("weight", lambda x: np.min(x)),
+        )
+        tm.assert_frame_equal(result1, expected)
+
+        # check pd.NamedAgg case
+        result2 = df.groupby(by="kind").agg(
+            height_sqr_min=pd.NamedAgg(
+                column="height", aggfunc=lambda x: np.min(x ** 2)
+            ),
+            height_max=pd.NamedAgg(column="height", aggfunc="max"),
+            weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
+            height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)),
+            weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)),
+        )
+        tm.assert_frame_equal(result2, expected)
+
+    @pytest.mark.parametrize(
+        "order, expected_reorder",
+        [
+            (
+                [
+                    ("height", "<lambda>"),
+                    ("height", "max"),
+                    ("weight", "max"),
+                    ("height", "<lambda>"),
+                    ("weight", "<lambda>"),
+                ],
+                [
+                    ("height", "<lambda>_0"),
+                    ("height", "max"),
+                    ("weight", "max"),
+                    ("height", "<lambda>_1"),
+                    ("weight", "<lambda>"),
+                ],
+            ),
+            (
+                [
+                    ("col2", "min"),
+                    ("col1", "<lambda>"),
+                    ("col1", "<lambda>"),
+                    ("col1", "<lambda>"),
+                ],
+                [
+                    ("col2", "min"),
+                    ("col1", "<lambda>_0"),
+                    ("col1", "<lambda>_1"),
+                    ("col1", "<lambda>_2"),
+                ],
+            ),
+            (
+                [("col", "<lambda>"), ("col", "<lambda>"), ("col", "<lambda>")],
+                [("col", "<lambda>_0"), ("col", "<lambda>_1"), ("col", "<lambda>_2")],
+            ),
+        ],
+    )
+    def test_make_unique(self, order, expected_reorder):
+        # GH 27519, test if make_unique function reorders correctly
+        result = _make_unique(order)
+
+        assert result == expected_reorder

Original file line number	Diff line number	Diff line change
`@@ -178,6 +178,7 @@ Groupby/resample/rolling`
`178`	`178`	`-`
`179`	`179`	`-`
`180`	`180`	- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
	`181`	+- Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`)
`181`	`182`
`182`	`183`	`Reshaping`
`183`	`184`	`^^^^^^^^^`