Skip to content

BUG: Lambda function returns KeyError in DataFrameGroupBy.agg #27921

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 26 commits into from
Aug 30, 2019
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7e461a1
remove \n from docstring
charlesdong1991 Dec 3, 2018
1314059
fix conflicts
charlesdong1991 Jan 19, 2019
8bcb313
Merge remote-tracking branch 'upstream/master'
charlesdong1991 Jul 30, 2019
e313083
fix issue 27519
charlesdong1991 Aug 14, 2019
197c879
Correct tests and add comments
charlesdong1991 Aug 14, 2019
b518b2f
Merge remote-tracking branch 'upstream/master' into fix_issue_27519
charlesdong1991 Aug 14, 2019
c817df2
Add whatsnew note
charlesdong1991 Aug 14, 2019
74d4684
fix test failure
charlesdong1991 Aug 15, 2019
7df87cb
Code change based on review
charlesdong1991 Aug 15, 2019
d5e52cb
Fix case for py35
charlesdong1991 Aug 15, 2019
5be9c54
More robust solution
charlesdong1991 Aug 15, 2019
29d8348
Simplify the code
charlesdong1991 Aug 15, 2019
275a039
Optimize the code
charlesdong1991 Aug 15, 2019
5dd61da
Simplify the code
charlesdong1991 Aug 16, 2019
b5b44e9
Simplify code
charlesdong1991 Aug 16, 2019
473800f
Add more complicated case to test result
charlesdong1991 Aug 16, 2019
bad1d72
Rename to make variable name meaningful
charlesdong1991 Aug 16, 2019
60e426a
Fix linting
charlesdong1991 Aug 16, 2019
943437a
Self review on code quality
charlesdong1991 Aug 16, 2019
a3ba061
Modify commet
charlesdong1991 Aug 16, 2019
a6719f1
Merge remote-tracking branch 'upstream/master' into fix_issue_27519
charlesdong1991 Aug 16, 2019
aabfcd2
Update doc
charlesdong1991 Aug 19, 2019
0950bc4
Better python
charlesdong1991 Aug 19, 2019
c992fec
Merge remote-tracking branch 'upstream/master' into fix_issue_27519
charlesdong1991 Aug 26, 2019
fe33469
Add test for make_unique
charlesdong1991 Aug 26, 2019
ace9035
fix linting
charlesdong1991 Aug 26, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ Groupby/resample/rolling
-
-
- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
- Bug in :meth:`DataFrameGroupBy.agg` not being able to use a lambda function for the ``aggfunc`` argument (:issue:`27519`)

Reshaping
^^^^^^^^^
Expand Down
76 changes: 71 additions & 5 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,9 @@ def aggregate(self, func, *args, **kwargs):
result.index = np.arange(len(result))

if relabeling:
result = result[order]

# used reordered index of columns
result = result.iloc[:, order]
result.columns = columns

return result._convert(datetime=True)
Expand Down Expand Up @@ -1731,8 +1733,8 @@ def _normalize_keyword_aggregation(kwargs):
The transformed kwargs.
columns : List[str]
The user-provided keys.
order : List[Tuple[str, str]]
Pairs of the input and output column names.
order : List[int]
List of columns indices.

Examples
--------
Expand All @@ -1752,14 +1754,78 @@ def _normalize_keyword_aggregation(kwargs):
aggspec = OrderedDict()
order = []
columns, pairs = list(zip(*kwargs.items()))
reordered_pairs = []

def _append_order_list(order, aggfunc, column, column_dict):
    """
    Record one (column, aggfunc name) pair in ``order``, disambiguating
    repeats with a numeric suffix.

    Parameters
    ----------
    order : list of tuple
        Pairs collected so far; the new pair is appended to this list
        in place.
    aggfunc : object
        Aggregation whose display name is obtained through
        ``_get_aggfunc_name``.
    column : object
        Column the aggregation is applied to.
    column_dict : dict
        Maps each un-suffixed pair to the number of times it has been
        repeated so far; updated in place.

    Returns
    -------
    tuple
        The same ``order`` list and ``column_dict`` objects that were
        passed in, after the update.
    """
    base_pair = (column, _get_aggfunc_name(aggfunc))

    if base_pair in order:
        # Seen before: bump the repeat counter and store the pair under a
        # name that carries that counter as its suffix.
        column_dict[base_pair] += 1
        repeat = column_dict[base_pair]
        order.append((column, _get_aggfunc_name(aggfunc, repeat)))
    else:
        # First occurrence: store the plain pair and start its counter at 0.
        order.append(base_pair)
        column_dict[base_pair] = 0

    return order, column_dict

column_dict = {}
for name, (column, aggfunc) in zip(columns, pairs):
if column in aggspec:
aggspec[column].append(aggfunc)
else:
aggspec[column] = [aggfunc]
order.append((column, com.get_callable_name(aggfunc) or aggfunc))
return aggspec, columns, order

order, column_dict = _append_order_list(order, aggfunc, column, column_dict)

# GH 25719, due to aggspec will change the order of assigned columns in aggregation
# reordered_pairs will store this reorder and will compare it with order
# based on index, it will obtain new order in index
column_dict = {}
for column, aggfuncs in aggspec.items():
for aggfunc in aggfuncs:
reordered_pairs, column_dict = _append_order_list(
reordered_pairs, aggfunc, column, column_dict
)

# get the new indice of columns by comparison
col_idx_order = [reordered_pairs.index(o) for o in order]
return aggspec, columns, col_idx_order


def _get_aggfunc_name(aggfunc, repeat_num=0):
"""
Return aggfunc name given repeat_num. If aggfunc appears before, then repeat_num
will be given different value, and output aggfunc name will be different

Parameters:
----------
aggfunc: aggfunc
repeat_num: int
How many time the aggfunc used to the same column,
default is 0

Returns:
-------
aggfunc name in string

"""
if repeat_num == 0:
return com.get_callable_name(aggfunc) or aggfunc
else:
suffix = "_{}".format(repeat_num)
if com.get_callable_name(aggfunc):
return com.get_callable_name(aggfunc) + suffix
return aggfunc + suffix


# TODO: Can't use, because mypy doesn't like us setting __name__
Expand Down
63 changes: 63 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,3 +560,66 @@ def test_with_kwargs(self):
result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10)
expected = pd.DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
tm.assert_frame_equal(result, expected)

def test_agg_one_lambda(self):
    # GH 27519: DataFrameGroupBy.agg with named aggregation where exactly
    # one of the aggregations is a lambda.
    frame = pd.DataFrame(
        {
            "kind": ["cat", "dog", "cat", "dog"],
            "height": [9.1, 6.0, 9.5, 34.0],
            "weight": [7.9, 7.5, 9.9, 198.0],
        }
    )

    # On Python 3.5 the expected columns are in sorted order; otherwise they
    # follow the order in which the keywords are written.
    if compat.PY35:
        column_order = ["height_max", "height_sqr_min", "weight_max"]
    else:
        column_order = ["height_sqr_min", "height_max", "weight_max"]

    expected = pd.DataFrame(
        {
            "height_sqr_min": [82.81, 36.00],
            "height_max": [9.5, 34.0],
            "weight_max": [9.9, 198.0],
        },
        index=pd.Index(["cat", "dog"], name="kind"),
        columns=column_order,
    )

    # Same aggregation spelled two ways: pd.NamedAgg first, then plain
    # (column, aggfunc) tuples.
    named_agg_spec = {
        "height_sqr_min": pd.NamedAgg(
            column="height", aggfunc=lambda x: np.min(x ** 2)
        ),
        "height_max": pd.NamedAgg(column="height", aggfunc="max"),
        "weight_max": pd.NamedAgg(column="weight", aggfunc="max"),
    }
    tuple_spec = {
        "height_sqr_min": ("height", lambda x: np.min(x ** 2)),
        "height_max": ("height", "max"),
        "weight_max": ("weight", "max"),
    }

    for spec in (named_agg_spec, tuple_spec):
        result = frame.groupby(by="kind").agg(**spec)
        tm.assert_frame_equal(result, expected)

def test_agg_multiple_lambda(self):
    # GH 27519: DataFrameGroupBy.agg with named aggregation where several
    # lambdas target the same column.
    frame = pd.DataFrame({"A": [1, 2]})

    # The expected dict is written in a different key order on Python 3.5.
    if compat.PY35:
        expected_data = {"bar": [1], "foo": [2]}
    else:
        expected_data = {"foo": [2], "bar": [1]}
    expected = pd.DataFrame(expected_data, index=pd.Index([1]))

    # Same aggregation spelled two ways: (column, aggfunc) tuples first,
    # then pd.NamedAgg.
    tuple_spec = {
        "foo": ("A", lambda x: x.max()),
        "bar": ("A", lambda x: x.min()),
    }
    named_agg_spec = {
        "foo": pd.NamedAgg(column="A", aggfunc=lambda x: x.max()),
        "bar": pd.NamedAgg(column="A", aggfunc=lambda x: x.min()),
    }

    for spec in (tuple_spec, named_agg_spec):
        result = frame.groupby([1, 1]).agg(**spec)
        tm.assert_frame_equal(result, expected)