Skip to content

BUG: Lambda function returns KeyError in DataFrameGroupBy.agg #27921

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Aug 30, 2019
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7e461a1
remove \n from docstring
charlesdong1991 Dec 3, 2018
1314059
fix conflicts
charlesdong1991 Jan 19, 2019
8bcb313
Merge remote-tracking branch 'upstream/master'
charlesdong1991 Jul 30, 2019
e313083
fix issue 27519
charlesdong1991 Aug 14, 2019
197c879
Correct tests and add comments
charlesdong1991 Aug 14, 2019
b518b2f
Merge remote-tracking branch 'upstream/master' into fix_issue_27519
charlesdong1991 Aug 14, 2019
c817df2
Add whatsnew note
charlesdong1991 Aug 14, 2019
74d4684
fix test failure
charlesdong1991 Aug 15, 2019
7df87cb
Code change based on review
charlesdong1991 Aug 15, 2019
d5e52cb
Fix case for py35
charlesdong1991 Aug 15, 2019
5be9c54
More robust solution
charlesdong1991 Aug 15, 2019
29d8348
Simplify the code
charlesdong1991 Aug 15, 2019
275a039
Optimize the code
charlesdong1991 Aug 15, 2019
5dd61da
Simplify the code
charlesdong1991 Aug 16, 2019
b5b44e9
Simplify code
charlesdong1991 Aug 16, 2019
473800f
Add more complicated case to test result
charlesdong1991 Aug 16, 2019
bad1d72
Rename to make variable name meaningful
charlesdong1991 Aug 16, 2019
60e426a
Fix linting
charlesdong1991 Aug 16, 2019
943437a
Self review on code quality
charlesdong1991 Aug 16, 2019
a3ba061
Modify commet
charlesdong1991 Aug 16, 2019
a6719f1
Merge remote-tracking branch 'upstream/master' into fix_issue_27519
charlesdong1991 Aug 16, 2019
aabfcd2
Update doc
charlesdong1991 Aug 19, 2019
0950bc4
Better python
charlesdong1991 Aug 19, 2019
c992fec
Merge remote-tracking branch 'upstream/master' into fix_issue_27519
charlesdong1991 Aug 26, 2019
fe33469
Add test for make_unique
charlesdong1991 Aug 26, 2019
ace9035
fix linting
charlesdong1991 Aug 26, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ Groupby/resample/rolling
-
-
- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
- Bug in :meth:`DataFrameGroupBy.agg` not being able to use a lambda function with named aggregation (:issue:`27519`)

Reshaping
^^^^^^^^^
Expand Down
42 changes: 38 additions & 4 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,9 @@ def aggregate(self, func, *args, **kwargs):
result.index = np.arange(len(result))

if relabeling:
result = result[order]

# use the reordered column indices
result = result.iloc[:, order]
result.columns = columns

return result._convert(datetime=True)
Expand Down Expand Up @@ -1731,8 +1733,8 @@ def _normalize_keyword_aggregation(kwargs):
The transformed kwargs.
columns : List[str]
The user-provided keys.
order : List[Tuple[str, str]]
Pairs of the input and output column names.
col_idx_order : List[int]
List of column indices.

Examples
--------
Expand All @@ -1759,7 +1761,39 @@ def _normalize_keyword_aggregation(kwargs):
else:
aggspec[column] = [aggfunc]
order.append((column, com.get_callable_name(aggfunc) or aggfunc))
return aggspec, columns, order

# uniquify aggfunc name if duplicated in order list
uniquified_order = _uniquify_aggfunc(order)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you verify that the output example in the docstring still passes?

Can you also add a docstring example where this new code is hit in https://github.com/pandas-dev/pandas/pull/27921/files#diff-bfee1ba9e7cb79839776fac1a57ed940R1742?

Copy link
Member Author

@charlesdong1991 charlesdong1991 Aug 26, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, verified. And added tests for _make_unique function, all tests pass locally @TomAugspurger


# GH 27519: aggspec can change the order of the assigned columns in the aggregation,
# so uniquified_aggspec stores the uniquified aggspec order and is compared with
# uniquified_order by index
aggspec_order = [
(column, com.get_callable_name(aggfunc) or aggfunc)
for column, aggfuncs in aggspec.items()
for aggfunc in aggfuncs
]
uniquified_aggspec = _uniquify_aggfunc(aggspec_order)

# get the new indices of columns by comparison
col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order)
return aggspec, columns, col_idx_order


def _uniquify_aggfunc(seq):
"""Uniquify aggfunc name of the pairs in the order list

Examples:
--------
>>> _uniquify_aggfunc([('a', '<lambda>'), ('a', '<lambda>'), ('b', '<lambda>')])
[('a', '<lambda>_0'), ('a', '<lambda>_1'), ('b', '<lambda>')]
"""
return [
(pair[0], "_".join([pair[1], str(seq[:i].count(pair))]))
if seq.count(pair) > 1
else pair
for i, pair in enumerate(seq)
]


# TODO: Can't use, because mypy doesn't like us setting __name__
Expand Down
102 changes: 102 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,3 +560,105 @@ def test_with_kwargs(self):
result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
expected = pd.DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
tm.assert_frame_equal(result, expected)

def test_agg_with_one_lambda(self):
    """
    GH 27519: DataFrameGroupBy.agg with named aggregation and a single lambda.

    The ``pd.NamedAgg`` spelling and the plain ``(column, aggfunc)`` tuple
    spelling are both checked against the same expected frame.
    """
    frame = pd.DataFrame(
        {
            "kind": ["cat", "dog", "cat", "dog"],
            "height": [9.1, 6.0, 9.5, 34.0],
            "weight": [7.9, 7.5, 9.9, 198.0],
        }
    )

    # columns are sorted for py35 and earlier
    expected_columns = (
        ["height_max", "height_sqr_min", "weight_max"]
        if compat.PY35
        else ["height_sqr_min", "height_max", "weight_max"]
    )
    expected = pd.DataFrame(
        {
            "height_sqr_min": [82.81, 36.00],
            "height_max": [9.5, 34.0],
            "weight_max": [9.9, 198.0],
        },
        index=pd.Index(["cat", "dog"], name="kind"),
        columns=expected_columns,
    )

    # pd.NamedAgg case
    named_agg_spec = {
        "height_sqr_min": pd.NamedAgg(
            column="height", aggfunc=lambda x: np.min(x ** 2)
        ),
        "height_max": pd.NamedAgg(column="height", aggfunc="max"),
        "weight_max": pd.NamedAgg(column="weight", aggfunc="max"),
    }
    # agg(key=(col, aggfunc)) case
    tuple_spec = {
        "height_sqr_min": ("height", lambda x: np.min(x ** 2)),
        "height_max": ("height", "max"),
        "weight_max": ("weight", "max"),
    }

    for spec in (named_agg_spec, tuple_spec):
        result = frame.groupby(by="kind").agg(**spec)
        tm.assert_frame_equal(result, expected)

def test_agg_multiple_lambda(self):
    """
    GH 27519: DataFrameGroupBy.agg with several lambdas mixed with string aggfuncs.

    The plain ``(column, aggfunc)`` tuple spelling and the ``pd.NamedAgg``
    spelling are both checked against the same expected frame.
    """
    frame = pd.DataFrame(
        {
            "kind": ["cat", "dog", "cat", "dog"],
            "height": [9.1, 6.0, 9.5, 34.0],
            "weight": [7.9, 7.5, 9.9, 198.0],
        }
    )

    # columns are sorted for py35 and earlier
    if compat.PY35:
        expected_columns = [
            "height_max",
            "height_max_2",
            "height_sqr_min",
            "weight_max",
            "weight_min",
        ]
    else:
        expected_columns = [
            "height_sqr_min",
            "height_max",
            "weight_max",
            "height_max_2",
            "weight_min",
        ]
    expected = pd.DataFrame(
        {
            "height_sqr_min": [82.81, 36.00],
            "height_max": [9.5, 34.0],
            "weight_max": [9.9, 198.0],
            "height_max_2": [9.5, 34.0],
            "weight_min": [7.9, 7.5],
        },
        index=pd.Index(["cat", "dog"], name="kind"),
        columns=expected_columns,
    )

    # agg(key=(col, aggfunc)) case
    tuple_spec = {
        "height_sqr_min": ("height", lambda x: np.min(x ** 2)),
        "height_max": ("height", "max"),
        "weight_max": ("weight", "max"),
        "height_max_2": ("height", lambda x: np.max(x)),
        "weight_min": ("weight", lambda x: np.min(x)),
    }
    # pd.NamedAgg case
    named_agg_spec = {
        "height_sqr_min": pd.NamedAgg(
            column="height", aggfunc=lambda x: np.min(x ** 2)
        ),
        "height_max": pd.NamedAgg(column="height", aggfunc="max"),
        "weight_max": pd.NamedAgg(column="weight", aggfunc="max"),
        "height_max_2": pd.NamedAgg(
            column="height", aggfunc=lambda x: np.max(x)
        ),
        "weight_min": pd.NamedAgg(
            column="weight", aggfunc=lambda x: np.min(x)
        ),
    }

    for spec in (tuple_spec, named_agg_spec):
        result = frame.groupby(by="kind").agg(**spec)
        tm.assert_frame_equal(result, expected)