Skip to content

BUG: Lambda function returns KeyError in DataFrameGroupBy.agg #27921

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Aug 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7e461a1
remove \n from docstring
charlesdong1991 Dec 3, 2018
1314059
fix conflicts
charlesdong1991 Jan 19, 2019
8bcb313
Merge remote-tracking branch 'upstream/master'
charlesdong1991 Jul 30, 2019
e313083
fix issue 27519
charlesdong1991 Aug 14, 2019
197c879
Correct tests and add comments
charlesdong1991 Aug 14, 2019
b518b2f
Merge remote-tracking branch 'upstream/master' into fix_issue_27519
charlesdong1991 Aug 14, 2019
c817df2
Add whatsnew note
charlesdong1991 Aug 14, 2019
74d4684
fix test failure
charlesdong1991 Aug 15, 2019
7df87cb
Code change based on review
charlesdong1991 Aug 15, 2019
d5e52cb
Fix case for py35
charlesdong1991 Aug 15, 2019
5be9c54
More robust solution
charlesdong1991 Aug 15, 2019
29d8348
Simplify the code
charlesdong1991 Aug 15, 2019
275a039
Optimize the code
charlesdong1991 Aug 15, 2019
5dd61da
Simplify the code
charlesdong1991 Aug 16, 2019
b5b44e9
Simplify code
charlesdong1991 Aug 16, 2019
473800f
Add more complicated case to test result
charlesdong1991 Aug 16, 2019
bad1d72
Rename to make variable name meaningful
charlesdong1991 Aug 16, 2019
60e426a
Fix linting
charlesdong1991 Aug 16, 2019
943437a
Self review on code quality
charlesdong1991 Aug 16, 2019
a3ba061
Modify commet
charlesdong1991 Aug 16, 2019
a6719f1
Merge remote-tracking branch 'upstream/master' into fix_issue_27519
charlesdong1991 Aug 16, 2019
aabfcd2
Update doc
charlesdong1991 Aug 19, 2019
0950bc4
Better python
charlesdong1991 Aug 19, 2019
c992fec
Merge remote-tracking branch 'upstream/master' into fix_issue_27519
charlesdong1991 Aug 26, 2019
fe33469
Add test for make_unique
charlesdong1991 Aug 26, 2019
ace9035
fix linting
charlesdong1991 Aug 26, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ Groupby/resample/rolling
-
-
- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
- Bug in :meth:`DataFrameGroupBy.agg` not able to use lambda functions with named aggregation (:issue:`27519`)

Reshaping
^^^^^^^^^
Expand Down
42 changes: 38 additions & 4 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,9 @@ def aggregate(self, func, *args, **kwargs):
result.index = np.arange(len(result))

if relabeling:
result = result[order]

# use the reordered positional indices of the columns
result = result.iloc[:, order]
result.columns = columns

return result._convert(datetime=True)
Expand Down Expand Up @@ -1731,8 +1733,8 @@ def _normalize_keyword_aggregation(kwargs):
The transformed kwargs.
columns : List[str]
The user-provided keys.
order : List[Tuple[str, str]]
Pairs of the input and output column names.
col_idx_order : List[int]
List of column indices.

Examples
--------
Expand All @@ -1759,7 +1761,39 @@ def _normalize_keyword_aggregation(kwargs):
else:
aggspec[column] = [aggfunc]
order.append((column, com.get_callable_name(aggfunc) or aggfunc))
return aggspec, columns, order

# uniquify aggfunc name if duplicated in order list
uniquified_order = _make_unique(order)

# GH 27519: aggspec may change the order of the assigned columns in aggregation,
# so build the uniquified (column, aggfunc) list in aggspec order and compare
# it with uniquified_order by position
aggspec_order = [
(column, com.get_callable_name(aggfunc) or aggfunc)
for column, aggfuncs in aggspec.items()
for aggfunc in aggfuncs
]
uniquified_aggspec = _make_unique(aggspec_order)

# get the new indices of the columns by comparison
col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order)
return aggspec, columns, col_idx_order


def _make_unique(seq):
"""Uniquify aggfunc name of the pairs in the order list

Examples:
--------
>>> _make_unique([('a', '<lambda>'), ('a', '<lambda>'), ('b', '<lambda>')])
[('a', '<lambda>_0'), ('a', '<lambda>_1'), ('b', '<lambda>')]
"""
return [
(pair[0], "_".join([pair[1], str(seq[:i].count(pair))]))
if seq.count(pair) > 1
else pair
for i, pair in enumerate(seq)
]


# TODO: Can't use, because mypy doesn't like us setting __name__
Expand Down
149 changes: 148 additions & 1 deletion pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
from pandas.core.base import SpecificationError
from pandas.core.groupby.generic import _maybe_mangle_lambdas
from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas
from pandas.core.groupby.grouper import Grouping
import pandas.util.testing as tm

Expand Down Expand Up @@ -560,3 +560,150 @@ def test_with_kwargs(self):
result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
expected = pd.DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
tm.assert_frame_equal(result, expected)

def test_agg_with_one_lambda(self):
# GH 27519, test DataFrameGroupBy.agg named aggregation with only one lambda
df = pd.DataFrame(
{
"kind": ["cat", "dog", "cat", "dog"],
"height": [9.1, 6.0, 9.5, 34.0],
"weight": [7.9, 7.5, 9.9, 198.0],
}
)

# on Python 3.5 the expected columns are listed in sorted order rather than
# in the order the keywords are passed to agg
columns = ["height_sqr_min", "height_max", "weight_max"]
if compat.PY35:
columns = ["height_max", "height_sqr_min", "weight_max"]
expected = pd.DataFrame(
{
"height_sqr_min": [82.81, 36.00],
"height_max": [9.5, 34.0],
"weight_max": [9.9, 198.0],
},
index=pd.Index(["cat", "dog"], name="kind"),
columns=columns,
)

# check pd.NamedAgg case
result1 = df.groupby(by="kind").agg(
height_sqr_min=pd.NamedAgg(
column="height", aggfunc=lambda x: np.min(x ** 2)
),
height_max=pd.NamedAgg(column="height", aggfunc="max"),
weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
)
tm.assert_frame_equal(result1, expected)

# check agg(key=(col, aggfunc)) case
result2 = df.groupby(by="kind").agg(
height_sqr_min=("height", lambda x: np.min(x ** 2)),
height_max=("height", "max"),
weight_max=("weight", "max"),
)
tm.assert_frame_equal(result2, expected)

def test_agg_multiple_lambda(self):
# GH 27519, test DataFrameGroupBy.agg named aggregation with multiple
# lambdas mixed with string aggfuncs
df = pd.DataFrame(
{
"kind": ["cat", "dog", "cat", "dog"],
"height": [9.1, 6.0, 9.5, 34.0],
"weight": [7.9, 7.5, 9.9, 198.0],
}
)
# on Python 3.5 the expected columns are listed in sorted order rather than
# in the order the keywords are passed to agg
columns = [
"height_sqr_min",
"height_max",
"weight_max",
"height_max_2",
"weight_min",
]
if compat.PY35:
columns = [
"height_max",
"height_max_2",
"height_sqr_min",
"weight_max",
"weight_min",
]
expected = pd.DataFrame(
{
"height_sqr_min": [82.81, 36.00],
"height_max": [9.5, 34.0],
"weight_max": [9.9, 198.0],
"height_max_2": [9.5, 34.0],
"weight_min": [7.9, 7.5],
},
index=pd.Index(["cat", "dog"], name="kind"),
columns=columns,
)

# check agg(key=(col, aggfunc)) case
result1 = df.groupby(by="kind").agg(
height_sqr_min=("height", lambda x: np.min(x ** 2)),
height_max=("height", "max"),
weight_max=("weight", "max"),
height_max_2=("height", lambda x: np.max(x)),
weight_min=("weight", lambda x: np.min(x)),
)
tm.assert_frame_equal(result1, expected)

# check pd.NamedAgg case
result2 = df.groupby(by="kind").agg(
height_sqr_min=pd.NamedAgg(
column="height", aggfunc=lambda x: np.min(x ** 2)
),
height_max=pd.NamedAgg(column="height", aggfunc="max"),
weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)),
weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)),
)
tm.assert_frame_equal(result2, expected)

@pytest.mark.parametrize(
"order, expected_reorder",
[
(
[
("height", "<lambda>"),
("height", "max"),
("weight", "max"),
("height", "<lambda>"),
("weight", "<lambda>"),
],
[
("height", "<lambda>_0"),
("height", "max"),
("weight", "max"),
("height", "<lambda>_1"),
("weight", "<lambda>"),
],
),
(
[
("col2", "min"),
("col1", "<lambda>"),
("col1", "<lambda>"),
("col1", "<lambda>"),
],
[
("col2", "min"),
("col1", "<lambda>_0"),
("col1", "<lambda>_1"),
("col1", "<lambda>_2"),
],
),
(
[("col", "<lambda>"), ("col", "<lambda>"), ("col", "<lambda>")],
[("col", "<lambda>_0"), ("col", "<lambda>_1"), ("col", "<lambda>_2")],
),
],
)
def test_make_unique(self, order, expected_reorder):
# GH 27519, test that _make_unique suffixes duplicated pairs with a running
# count while leaving unique pairs and the overall order unchanged
result = _make_unique(order)

assert result == expected_reorder