-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
ENH: Implement Keyword Aggregation for DataFrame.agg and Series.agg #29116
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 137 commits
7e461a1
1314059
8bcb313
7bc368d
cf5c6c3
5298331
4fb74b5
97209be
ca273ff
1d2ab15
3ca193c
c8f80ed
8c738e9
d4d9ea4
2a6de27
21e09f9
058a8e9
0da68d8
15e3659
438398d
d47b790
832b8d9
5a3b690
4fb86f0
a1369bf
ef981a3
82c8960
c610391
679ba59
2ee2628
31f7033
2acb244
dfbd67a
ff5e60f
7c6c891
3e55fcb
6d74b29
532337e
400ff3e
05af2de
6206fa4
34199ad
15d099c
c56f05f
d3f0620
20ecfda
89b8e6b
8aa1cc9
091ca75
c2d5104
50ebdaf
0484f5e
425c802
d5c2c6c
8bb9714
2607c5d
0545231
0a27889
a66053e
7311ef0
da2ff37
bcc5bc3
0825027
d3c35f5
cef2b50
b96a942
3123284
7bb3bd0
3da2e2a
3ce91fc
1426ee2
5893a0e
cc85db4
0f55073
90d52ba
381a697
238b4cc
f8e1891
66e9b38
f4d8a4f
c3e34a0
0c0dbad
61f6201
88c7751
e2b957a
99f75b2
30b7296
baea583
04bffe6
42091c3
fc13e19
1403426
3d9655e
d78c57c
0487928
7435ac5
f1cd16c
469691c
1bb35b5
7a6f496
3730f7d
cd8d00f
6dddd55
96dc3ed
075b85b
a44471c
0e2eae4
2fb4b27
5e04185
56d0f89
7f4839e
65d578b
3e6a06c
8651447
a7439fe
449d40f
9fd8ec5
736bea2
d20be20
35b2b17
74d6169
0546224
f5f0e68
54ff962
ac57023
484e42c
47e6598
f28b452
81b4186
1fd4b5b
47dc5fe
9190f7f
89de59e
8493383
9a9dd7f
c75c882
fa61db7
00a1ccf
26b380a
165ea83
d6923f2
f6a5cc1
a747ab6
44405e8
faea906
7e30a61
3d20524
05921af
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -5,12 +5,107 @@ | |||||
|
||||||
from collections import defaultdict | ||||||
from functools import partial | ||||||
from typing import Any, Callable, DefaultDict, List, Sequence, Tuple, Union | ||||||
from typing import ( | ||||||
Any, | ||||||
Callable, | ||||||
DefaultDict, | ||||||
Dict, | ||||||
List, | ||||||
Optional, | ||||||
Sequence, | ||||||
Tuple, | ||||||
Union, | ||||||
) | ||||||
|
||||||
from pandas._typing import Label | ||||||
|
||||||
from pandas.core.dtypes.common import is_dict_like, is_list_like | ||||||
|
||||||
from pandas.core.base import SpecificationError | ||||||
import pandas.core.common as com | ||||||
from pandas.core.indexes.api import Index | ||||||
from pandas.core.series import FrameOrSeriesUnion, Series | ||||||
|
||||||
|
||||||
def reconstruct_func(
    func: Optional[
        Union[
            Callable,
            str,
            List[Union[Callable, str]],
            Dict[Label, Union[Callable, str, List[Union[Callable, str]]]],
        ]
    ],
    **kwargs,
) -> Tuple[
    bool,
    Optional[
        Union[
            Callable,
            str,
            List[Union[Callable, str]],
            Dict[Label, Union[Callable, str, List[Union[Callable, str]]]],
        ]
    ],
    Optional[List[str]],
    Optional[List[int]],
]:
    """
    Reconstruct ``func`` depending on whether relabeling is applied, and
    normalize the keywords to get the new order of columns.

    If named aggregation is applied, `func` will be None, and kwargs contains the
    column and aggregation function information to be parsed;
    If named aggregation is not applied, `func` is either string (e.g. 'min') or
    Callable, or list of them (e.g. ['min', np.max]), or the dictionary of column
    name and str/Callable/list of them (e.g. {'A': 'min'}, or
    {'A': [np.min, lambda x: x]}).

    If relabeling is True, will return relabeling, reconstructed func, column
    names, and the reconstructed order of columns.
    If relabeling is False, the columns and order will be None.

    Parameters
    ----------
    func : agg function (e.g. 'min' or Callable) or list of agg functions
        (e.g. ['min', np.max]) or dictionary (e.g. {'A': ['min', np.max]}).
    **kwargs : dict
        Keyword arguments forwarded to ``is_multi_agg_with_relabel`` and
        ``normalize_keyword_aggregation`` for relabelling.

    Returns
    -------
    relabelling : bool
        Whether there is relabelling or not.
    func : normalized and mangled func
    columns : list of column names, or None when there is no relabelling
    order : list of column indices, or None when there is no relabelling

    Raises
    ------
    SpecificationError
        If `func` is a list containing duplicated entries and no new column
        names were assigned.
    TypeError
        If `func` is None and the keyword arguments do not describe a named
        aggregation.

    Examples
    --------
    >>> reconstruct_func(None, **{"foo": ("col", "min")})
    (True, defaultdict(None, {'col': ['min']}), ('foo',), array([0]))

    >>> reconstruct_func("min")
    (False, 'min', None, None)
    """
    # Relabeling only applies when no positional ``func`` was given; the
    # ``func is None`` check short-circuits the keyword inspection otherwise.
    relabeling = func is None and is_multi_agg_with_relabel(**kwargs)
    columns: Optional[List[str]] = None
    order: Optional[List[int]] = None

    if not relabeling:
        if isinstance(func, list) and len(func) > len(set(func)):
            # GH 28426 will raise error if duplicated function names are used and
            # there is no reassigned name
            raise SpecificationError(
                "Function names must be unique if there is no new column names "
                "assigned"
            )
        elif func is None:
            # nicer error message
            raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).")

    if relabeling:
        func, columns, order = normalize_keyword_aggregation(kwargs)
    # Applied on both the relabeling and the plain path.
    func = maybe_mangle_lambdas(func)

    return relabeling, func, columns, order
|
||||||
|
||||||
def is_multi_agg_with_relabel(**kwargs) -> bool: | ||||||
|
@@ -198,6 +293,79 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: | |||||
return mangled_aggspec | ||||||
|
||||||
|
||||||
def _relabel_result(
    result: FrameOrSeriesUnion,
    func: Dict[str, List[Union[Callable, str]]],
    columns: Tuple,
    order: List[int],
) -> Dict[Label, Series]:
    """
    Reorder the aggregation result when relabelling is used in ``DataFrame.agg``.

    Parameters
    ----------
    result : Series or DataFrame
        Result from aggregation.
    func : dict
        Mapping of column name to the list of functions applied to it.
    columns : tuple
        New (user-provided) names for relabelling.
    order : list of int
        New order for relabelling; ``columns`` is sorted by these values.

    Returns
    -------
    dict
        Mapping of each key of `func` to a Series reindexed by `columns`.

    Notes
    -----
    NOTE(review): a reviewer asked for this helper to be made public (no
    leading underscore) because it is called from a related module. The name
    is kept unchanged here so existing callers keep working.

    Examples
    --------
    >>> result = DataFrame(
    ...     {"A": [2, np.nan, np.nan], "C": [6, np.nan, np.nan],
    ...      "B": [np.nan, 4, 2.5]},
    ...     index=["max", "mean", "min"],
    ... )  # doctest: +SKIP
    >>> funcs = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]}
    >>> columns = ("foo", "aab", "bar", "dat")
    >>> order = [0, 1, 2, 3]
    >>> _relabel_result(result, funcs, columns, order)  # doctest: +SKIP
    dict(A=Series([2.0, NaN, NaN, NaN], index=["foo", "aab", "bar", "dat"]),
         C=Series([NaN, 6.0, NaN, NaN], index=["foo", "aab", "bar", "dat"]),
         B=Series([NaN, NaN, 4.0, 2.5], index=["foo", "aab", "bar", "dat"]))
    """
    # New names, sorted by the position given in ``order``.
    reordered_indexes = [
        pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1])
    ]
    reordered_result_in_dict: Dict[Label, Series] = {}
    idx = 0

    reorder_mask = not isinstance(result, Series) and len(result.columns) > 1
    for col, fun in func.items():
        s = result[col].dropna()
        n_funcs = len(fun)

        # In the `_aggregate`, the callable names are obtained and used in
        # `result`, and these names are ordered alphabetically. e.g.
        #           C2   C1
        # <lambda>   1  NaN
        # amax     NaN  4.0
        # max      NaN  4.0
        # sum     18.0  6.0
        # Therefore, the order of functions for each column could be shuffled
        # accordingly so need to get the callable name if it is not parsed
        # names, and reorder the aggregated result for each column.
        # e.g. if df.agg(c1=("C2", sum), c2=("C2", lambda x: min(x))), correct
        # order is [sum, <lambda>], but in `result`, it will be
        # [<lambda>, sum], and we need to reorder so that aggregated values map
        # to their functions regarding the order.

        # However if there is only one column being used for aggregation, no
        # need to reorder since the index is not sorted, and keep as is in
        # `func`, e.g.
        #         A
        # min   1.0
        # mean  1.5
        # mean  1.5
        if reorder_mask:
            fun_names = [
                com.get_callable_name(f) if not isinstance(f, str) else f for f in fun
            ]
            col_idx_order = Index(s.index).get_indexer(fun_names)
            # ``get_indexer`` returns integer positions; index positionally
            # and explicitly rather than relying on ``Series.__getitem__``
            # falling back to positions for a label-based index.
            s = s.iloc[col_idx_order]

        # assign the new user-provided "named aggregation" as index names, and
        # reindex it based on the whole user-provided names.
        s.index = reordered_indexes[idx : idx + n_funcs]
        reordered_result_in_dict[col] = s.reindex(columns, copy=False)
        idx = idx + n_funcs
    return reordered_result_in_dict
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
|
||||||
def validate_func_kwargs( | ||||||
kwargs: dict, | ||||||
) -> Tuple[List[str], List[Union[str, Callable[..., Any]]]]: | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
import numpy as np | ||
import pytest | ||
|
||
import pandas as pd | ||
import pandas._testing as tm | ||
|
||
|
||
class TestDataFrameNamedAggregate:
    """Tests for keyword ("named") aggregation through ``DataFrame.agg``."""

    def test_agg_relabel(self):
        """Relabel one column with a single function, then with two functions."""
        # GH 26513
        df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})

        # simplest case with one column, one func
        result = df.agg(foo=("B", "sum"))
        expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"]))
        tm.assert_frame_equal(result, expected)

        # test on same column with different methods
        result = df.agg(foo=("B", "sum"), bar=("B", "min"))
        expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"]))

        tm.assert_frame_equal(result, expected)

    def test_agg_relabel_multi_columns_multi_methods(self):
        """Relabel several columns, each aggregated with several methods."""
        # GH 26513, test on multiple columns with multiple methods
        df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
        result = df.agg(
            foo=("A", "sum"),
            bar=("B", "mean"),
            cat=("A", "min"),
            dat=("B", "max"),
            f=("A", "max"),
            g=("C", "min"),
        )
        expected = pd.DataFrame(
            {
                "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan],
                "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan],
                "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0],
            },
            index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]),
        )
        tm.assert_frame_equal(result, expected)

    def test_agg_relabel_partial_functions(self):
        """Relabel with a mix of builtins, numpy callables, strings and lambdas."""
        # GH 26513, test on partial, functools or more complex cases
        df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
        result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min))
        expected = pd.DataFrame(
            {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"])
        )
        tm.assert_frame_equal(result, expected)

        result = df.agg(
            foo=("A", min),
            bar=("A", np.min),
            cat=("B", max),
            dat=("C", "min"),
            f=("B", np.sum),
            kk=("B", lambda x: min(x)),
        )
        expected = pd.DataFrame(
            {
                "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan],
                "B": [np.nan, np.nan, 4.0, np.nan, 10.0, 1.0],
                "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan],
            },
            index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]),
        )
        tm.assert_frame_equal(result, expected)

    def test_agg_namedtuple(self):
        """Relabel using ``pd.NamedAgg`` given positionally and by keyword."""
        # GH 26513
        df = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
        result = df.agg(
            foo=pd.NamedAgg("B", "sum"),
            bar=pd.NamedAgg("B", min),
            cat=pd.NamedAgg(column="B", aggfunc="count"),
            fft=pd.NamedAgg("B", aggfunc="max"),
        )

        expected = pd.DataFrame(
            {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"])
        )
        tm.assert_frame_equal(result, expected)

        result = df.agg(
            foo=pd.NamedAgg("A", "min"),
            bar=pd.NamedAgg(column="B", aggfunc="max"),
            cat=pd.NamedAgg(column="A", aggfunc="max"),
        )
        expected = pd.DataFrame(
            {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]},
            index=pd.Index(["foo", "bar", "cat"]),
        )
        tm.assert_frame_equal(result, expected)

    def test_agg_raises(self):
        """Calling ``agg`` with neither ``func`` nor keywords raises TypeError."""
        # GH 26513
        df = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
        msg = "Must provide"

        with pytest.raises(TypeError, match=msg):
            df.agg()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Crazy signatures — can you create an alias to make this easier to read? Maybe some of this is `Label`?