Skip to content

ENH: Named aggregations with multiple columns #33306

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ Other enhancements
- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`).
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
- :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the objects inside are unsortable; pass `sort=False` to suppress this warning (:issue:`33015`)
-
- :meth:`DataFrameGroupBy.aggregate` will now support named aggregations with multiple columns (:issue:`29268`)

.. ---------------------------------------------------------------------------

Expand Down
37 changes: 32 additions & 5 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ def _aggregate(self, arg, *args, **kwargs):
None if not required
"""
is_aggregator = lambda x: isinstance(x, (list, tuple, dict))
deserialized_keys = {}

_axis = kwargs.pop("_axis", None)
if _axis is None:
Expand Down Expand Up @@ -339,8 +340,22 @@ def _aggregate(self, arg, *args, **kwargs):
raise SpecificationError("nested renamer is not supported")
elif isinstance(obj, ABCSeries):
raise SpecificationError("nested renamer is not supported")
elif isinstance(obj, ABCDataFrame) and k not in obj.columns:
raise KeyError(f"Column '{k}' does not exist!")
elif isinstance(obj, ABCDataFrame):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you do a precursor PR to move the current code to pandas/core/aggregation.py (so that you just call a function here)? This is too complicated.

# GH 29268
if k not in obj.columns:
# Check if list thingy
try:
keys = np.frombuffer(k, dtype=np.dtype("<U1"))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't want to do things like this; use is_list_like instead.

except (AttributeError, TypeError):
raise KeyError(f"Column '{k}' does not exist!")

# Check keys
for key in keys:
if key not in obj.columns:
raise KeyError(f"Column '{key}' does not exist!")

# Memorize operation
deserialized_keys[k] = keys
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we keeping state?


arg = new_arg

Expand Down Expand Up @@ -374,14 +389,28 @@ def _agg_2dim(how):
colg = self._gotitem(self._selection, ndim=2, subset=obj)
return colg.aggregate(how)

# GH 29268
def _agg_multi_dim(name, how, keys):
    """
    Run aggregations that need several columns at once (GH 29268).

    Every label in ``keys`` is selected through ``self._gotitem`` and the
    selections are gathered in a dict keyed by that label. Each callable
    in ``how`` is then called with the dict, and its result is stored
    under the name reported by ``com.get_callable_name``. ``name`` is
    accepted but not used in this helper.
    """
    from pandas.core.frame import DataFrame

    selected = {}
    for col in keys:
        selected[col] = self._gotitem(col, ndim=1, subset=None)

    outputs = {}
    for agg in how:
        label = com.get_callable_name(agg)
        outputs[label] = agg(selected)

    return DataFrame(outputs, columns=outputs.keys())

def _agg(arg, func):
"""
run the aggregations over the arg with func
return a dict
"""
result = {}
for fname, agg_how in arg.items():
result[fname] = func(fname, agg_how)
# GH 29268
if fname in deserialized_keys:
keys = deserialized_keys[fname]
result[fname] = _agg_multi_dim(fname, agg_how, keys)
else:
result[fname] = func(fname, agg_how)

return result

# set the final keys
Expand Down Expand Up @@ -412,11 +441,9 @@ def _agg(arg, func):

# no selection
else:

try:
result = _agg(arg, _agg_1dim)
except SpecificationError:

# we are aggregating expecting all 1d-returns
# but we have 2d
result = _agg(arg, _agg_2dim)
Expand Down
29 changes: 21 additions & 8 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,23 +879,30 @@ class DataFrameGroupBy(GroupBy[DataFrame]):
1 1 2 0.590716
2 3 4 0.704907

To control the output names with different aggregations per column,
To control the output names with different aggregations,
pandas supports "named aggregation"

>>> df.groupby("A").agg(
... b_min=pd.NamedAgg(column="B", aggfunc="min"),
... c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
b_min c_sum
... c_sum=pd.NamedAgg(column="C", aggfunc="sum"),
... cb_sum_diff=pd.NamedAgg(
... column=["B", "C"],
... aggfunc=lambda x: x["C"].sum() - x["B"].sum()
... )
... )
b_min c_sum cb_sum_diff
A
1 1 -1.956929
2 3 -0.322183
1 1 1.449287 -1.550713
2 3 0.110498 -6.889502

- The keywords are the *output* column names
- The values are tuples whose first element is the column to select
- The values are tuples whose first element is the column(s) to select
and the second element is the aggregation to apply to that column.
Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
``['column', 'aggfunc']`` to make it clearer what the arguments are.
As usual, the aggregation can be a callable or a string alias.
- When performing named aggregations with multiple columns, the second
element has to be a lambda that returns a 1-dimensional DataFrame.

See :ref:`groupby.aggregate.named` for more.
"""
Expand All @@ -910,11 +917,17 @@ class DataFrameGroupBy(GroupBy[DataFrame]):
)
@Appender(_shared_docs["aggregate"])
def aggregate(self, func=None, *args, **kwargs):

relabeling = func is None and is_multi_agg_with_relabel(**kwargs)
if relabeling:
func, columns, order = normalize_keyword_aggregation(kwargs)
# GH 29268
from types import LambdaType

for k, v in list(kwargs.items()):
if isinstance(v[0], list) & isinstance(v[1], LambdaType):
serialized_key = np.sort(np.array(v[0]))
kwargs[k] = (serialized_key.tobytes(),) + v[1:]

func, columns, order = normalize_keyword_aggregation(kwargs)
kwargs = {}
elif isinstance(func, list) and len(func) > len(set(func)):

Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,34 @@ def test_mangled(self):
)
tm.assert_frame_equal(result, expected)

def test_agg_multiple_columns(self):
    # GH 29268: a list of columns can be paired with a lambda that is
    # handed all of the selected columns.
    frame = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
    grouped = frame.groupby("A")

    result = grouped.agg(
        add=(["B", "C"], lambda x: x["B"].max() + x["C"].min()),
        minus=(["C", "B"], lambda x: x["B"].max() - x["C"].min()),
    )

    group_index = pd.Index([0, 1], name="A")
    expected = pd.DataFrame({"add": [5, 9], "minus": [-1, -1]}, index=group_index)
    tm.assert_frame_equal(result, expected)

def test_agg_multi_missing_column_raises(self):
    # GH 29268: listing a column that is not in the frame must raise.
    frame = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
    grouped = frame.groupby("A")

    with pytest.raises(KeyError, match="Column 'D' does not exist"):
        grouped.agg(
            minus=(["D", "C"], lambda x: x["D"].max() - x["C"].min()),
        )

def test_agg_multi_missing_key_raises(self):
    # GH 29268: the lambda may only use the columns listed in the spec;
    # "D" exists in the frame but is not requested, so using it must raise.
    data = {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6], "D": [0, 0, 1, 1]}
    frame = pd.DataFrame(data)
    grouped = frame.groupby("A")

    with pytest.raises(KeyError, match="D"):
        grouped.agg(
            minus=(["B", "C"], lambda x: x["D"].max() - x["D"].min()),
        )


@pytest.mark.parametrize(
"agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3",
Expand Down