Skip to content

Commit facd756

Browse files
charlesdong1991jreback
authored andcommitted
BUG: Remove error raise to allow same input function on the same column in named aggregation (#28428)
1 parent 102dc61 commit facd756

File tree

3 files changed

+59
-18
lines changed

3 files changed

+59
-18
lines changed

doc/source/whatsnew/v1.0.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,8 @@ Groupby/resample/rolling
803803
- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`)
804804
- Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`)
805805
- Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`)
806+
- Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby`. Previously an error was raised if the same function was applied on the same column; now it is allowed if the newly assigned names are different. (:issue:`28426`)
807+
- :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue:`28479`)
806808
- Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`)
807809
- Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`)
808810
- Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`)
@@ -846,7 +848,6 @@ Other
846848
- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`)
847849
- Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`)
848850
- :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`)
849-
- :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue:`28479`)
850851
- Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`)
851852
- Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
852853
- Fix :class:`AbstractHolidayCalendar` to return correct results for

pandas/core/groupby/generic.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -302,10 +302,6 @@ def _aggregate_multiple_funcs(self, arg):
302302
results = OrderedDict()
303303
for name, func in arg:
304304
obj = self
305-
if name in results:
306-
raise SpecificationError(
307-
f"Function names must be unique, found multiple named {name}"
308-
)
309305

310306
# reset the cache so that we
311307
# only include the named selection
@@ -915,6 +911,14 @@ def aggregate(self, func=None, *args, **kwargs):
915911
func, columns, order = _normalize_keyword_aggregation(kwargs)
916912

917913
kwargs = {}
914+
elif isinstance(func, list) and len(func) > len(set(func)):
915+
916+
# GH 28426: raise an error if duplicated function names are used and
917+
# no new names are assigned
918+
raise SpecificationError(
919+
"Function names must be unique if there is no new column "
920+
"names assigned"
921+
)
918922
elif func is None:
919923
# nicer error message
920924
raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).")

pandas/tests/groupby/aggregate/test_aggregate.py

+49-13
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,14 @@ def test_uint64_type_handling(dtype, how):
353353
tm.assert_frame_equal(result, expected, check_exact=True)
354354

355355

356+
def test_func_duplicates_raises():
357+
# GH28426
358+
msg = "Function names"
359+
df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
360+
with pytest.raises(SpecificationError, match=msg):
361+
df.groupby("A").agg(["min", "min"])
362+
363+
356364
class TestNamedAggregationSeries:
357365
def test_series_named_agg(self):
358366
df = pd.Series([1, 2, 3, 4])
@@ -377,12 +385,12 @@ def test_no_args_raises(self):
377385
expected = pd.DataFrame()
378386
tm.assert_frame_equal(result, expected)
379387

380-
def test_series_named_agg_duplicates_raises(self):
381-
# This is a limitation of the named agg implementation reusing
382-
# aggregate_multiple_funcs. It could maybe be lifted in the future.
388+
def test_series_named_agg_duplicates_no_raises(self):
389+
# GH28426
383390
gr = pd.Series([1, 2, 3]).groupby([0, 0, 1])
384-
with pytest.raises(SpecificationError):
385-
gr.agg(a="sum", b="sum")
391+
grouped = gr.agg(a="sum", b="sum")
392+
expected = pd.DataFrame({"a": [3, 3], "b": [3, 3]})
393+
tm.assert_frame_equal(expected, grouped)
386394

387395
def test_mangled(self):
388396
gr = pd.Series([1, 2, 3]).groupby([0, 0, 1])
@@ -439,12 +447,34 @@ def test_agg_relabel_non_identifier(self):
439447
)
440448
tm.assert_frame_equal(result, expected)
441449

442-
def test_duplicate_raises(self):
443-
# TODO: we currently raise on multiple lambdas. We could *maybe*
444-
# update com.get_callable_name to append `_i` to each lambda.
450+
def test_duplicate_no_raises(self):
451+
# GH 28426, if the same input function is used on the same column,
452+
# no error should be raised
445453
df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
446-
with pytest.raises(SpecificationError, match="Function names"):
447-
df.groupby("A").agg(a=("A", "min"), b=("A", "min"))
454+
455+
grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min"))
456+
expected = pd.DataFrame(
457+
{"a": [1, 3], "b": [1, 3]}, index=pd.Index([0, 1], name="A")
458+
)
459+
tm.assert_frame_equal(grouped, expected)
460+
461+
quant50 = functools.partial(np.percentile, q=50)
462+
quant70 = functools.partial(np.percentile, q=70)
463+
quant50.__name__ = "quant50"
464+
quant70.__name__ = "quant70"
465+
466+
test = pd.DataFrame(
467+
{"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]}
468+
)
469+
470+
grouped = test.groupby("col1").agg(
471+
quantile_50=("col2", quant50), quantile_70=("col2", quant70)
472+
)
473+
expected = pd.DataFrame(
474+
{"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]},
475+
index=pd.Index(["a", "b"], name="col1"),
476+
)
477+
tm.assert_frame_equal(grouped, expected)
448478

449479
def test_agg_relabel_with_level(self):
450480
df = pd.DataFrame(
@@ -557,15 +587,21 @@ def test_agg_relabel_multiindex_raises_not_exist():
557587
df.groupby(("x", "group")).agg(a=(("Y", "a"), "max"))
558588

559589

560-
def test_agg_relabel_multiindex_raises_duplicate():
590+
def test_agg_relabel_multiindex_duplicates():
561591
# GH29422, add test for raises scenario when getting duplicates
592+
# GH28426, after this change, duplicates should also work if the relabelling is
593+
# different
562594
df = DataFrame(
563595
{"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
564596
)
565597
df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")])
566598

567-
with pytest.raises(SpecificationError, match="Function names"):
568-
df.groupby(("x", "group")).agg(a=(("y", "A"), "min"), b=(("y", "A"), "min"))
599+
result = df.groupby(("x", "group")).agg(
600+
a=(("y", "A"), "min"), b=(("y", "A"), "min")
601+
)
602+
idx = pd.Index(["a", "b"], name=("x", "group"))
603+
expected = DataFrame({"a": [0, 2], "b": [0, 2]}, index=idx)
604+
tm.assert_frame_equal(result, expected)
569605

570606

571607
def myfunc(s):

0 commit comments

Comments
 (0)