diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 470209a7f4a33..38e3f1316302d 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -752,6 +752,8 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) +- Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby`. Previously an error was raised if the same function was applied to the same column; now it is allowed if the newly assigned names are different. (:issue:`28426`) +- :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue:`28479`) - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) - Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) - Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`) @@ -795,7 +797,6 @@ Other - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. 
(:issue:`26023`) - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`) - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`) -- :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue:`28479`) - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`) - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`) - Fix :class:`AbstractHolidayCalendar` to return correct results for diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4726cdfb05a70..75bd919ab7c1d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -299,10 +299,6 @@ def _aggregate_multiple_funcs(self, arg): results = OrderedDict() for name, func in arg: obj = self - if name in results: - raise SpecificationError( - f"Function names must be unique, found multiple named {name}" - ) # reset the cache so that we # only include the named selection @@ -912,6 +908,14 @@ def aggregate(self, func=None, *args, **kwargs): func, columns, order = _normalize_keyword_aggregation(kwargs) kwargs = {} + elif isinstance(func, list) and len(func) > len(set(func)): + + # GH 28426 will raise error if duplicated function names are used and + # there is no reassigned name + raise SpecificationError( + "Function names must be unique if there is no new column " + "names assigned" + ) elif func is None: # nicer error message raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 9543c9d5b59de..0d8379407fef7 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -353,6 +353,14 @@ def 
test_uint64_type_handling(dtype, how): tm.assert_frame_equal(result, expected, check_exact=True) +def test_func_duplicates_raises(): + # GH28426 + msg = "Function names" + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + with pytest.raises(SpecificationError, match=msg): + df.groupby("A").agg(["min", "min"]) + + class TestNamedAggregationSeries: def test_series_named_agg(self): df = pd.Series([1, 2, 3, 4]) @@ -377,12 +385,12 @@ def test_no_args_raises(self): expected = pd.DataFrame() tm.assert_frame_equal(result, expected) - def test_series_named_agg_duplicates_raises(self): - # This is a limitation of the named agg implementation reusing - # aggregate_multiple_funcs. It could maybe be lifted in the future. + def test_series_named_agg_duplicates_no_raises(self): + # GH28426 gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) - with pytest.raises(SpecificationError): - gr.agg(a="sum", b="sum") + grouped = gr.agg(a="sum", b="sum") + expected = pd.DataFrame({"a": [3, 3], "b": [3, 3]}) + tm.assert_frame_equal(expected, grouped) def test_mangled(self): gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) @@ -439,12 +447,34 @@ def test_agg_relabel_non_identifier(self): ) tm.assert_frame_equal(result, expected) - def test_duplicate_raises(self): - # TODO: we currently raise on multiple lambdas. We could *maybe* - # update com.get_callable_name to append `_i` to each lambda. 
+ def test_duplicate_no_raises(self): + # GH 28426, if use same input function on same column, + # no error should raise df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) - with pytest.raises(SpecificationError, match="Function names"): - df.groupby("A").agg(a=("A", "min"), b=("A", "min")) + + grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min")) + expected = pd.DataFrame( + {"a": [1, 3], "b": [1, 3]}, index=pd.Index([0, 1], name="A") + ) + tm.assert_frame_equal(grouped, expected) + + quant50 = functools.partial(np.percentile, q=50) + quant70 = functools.partial(np.percentile, q=70) + quant50.__name__ = "quant50" + quant70.__name__ = "quant70" + + test = pd.DataFrame( + {"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]} + ) + + grouped = test.groupby("col1").agg( + quantile_50=("col2", quant50), quantile_70=("col2", quant70) + ) + expected = pd.DataFrame( + {"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]}, + index=pd.Index(["a", "b"], name="col1"), + ) + tm.assert_frame_equal(grouped, expected) def test_agg_relabel_with_level(self): df = pd.DataFrame( @@ -557,15 +587,21 @@ def test_agg_relabel_multiindex_raises_not_exist(): df.groupby(("x", "group")).agg(a=(("Y", "a"), "max")) -def test_agg_relabel_multiindex_raises_duplicate(): +def test_agg_relabel_multiindex_duplicates(): # GH29422, add test for raises senario when getting duplicates + # GH28426, after this change, duplicates should also work if the relabelling is + # different df = DataFrame( {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} ) df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) - with pytest.raises(SpecificationError, match="Function names"): - df.groupby(("x", "group")).agg(a=(("y", "A"), "min"), b=(("y", "A"), "min")) + result = df.groupby(("x", "group")).agg( + a=(("y", "A"), "min"), b=(("y", "A"), "min") + ) + idx = pd.Index(["a", "b"], name=("x", "group")) + expected = DataFrame({"a": [0, 2], "b": [0, 2]}, 
index=idx) + tm.assert_frame_equal(result, expected) def myfunc(s):