From 7e461a18d9f6928132afec6f48ce968b3e989ba6 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Mon, 3 Dec 2018 17:43:52 +0100 Subject: [PATCH 01/26] remove \n from docstring --- pandas/core/arrays/datetimes.py | 26 +++++++++++++------------- pandas/core/arrays/timedeltas.py | 16 ++++++++-------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index cfe3afcf3730a..b3df505d56d78 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -82,7 +82,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -1072,19 +1072,19 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") - year = _field_accessor('year', 'Y', "\n The year of the datetime\n") + year = _field_accessor('year', 'Y', "The year of the datetime") month = _field_accessor('month', 'M', - "\n The month as January=1, December=12 \n") - day = _field_accessor('day', 'D', "\nThe days of the datetime\n") - hour = _field_accessor('hour', 'h', "\nThe hours of the datetime\n") - minute = _field_accessor('minute', 'm', "\nThe minutes of the datetime\n") - second = _field_accessor('second', 's', "\nThe seconds of the datetime\n") + "The month as January=1, December=12") + day = _field_accessor('day', 'D', "The days of the datetime") + hour = _field_accessor('hour', 'h', "The hours of the datetime") + minute = _field_accessor('minute', 'm', "The minutes of the datetime") + second = _field_accessor('second', 's', "The seconds of the datetime") microsecond = _field_accessor('microsecond', 'us', - "\nThe microseconds of the datetime\n") + "The microseconds of the datetime") nanosecond = _field_accessor('nanosecond', 'ns', - "\nThe nanoseconds of the datetime\n") + "The nanoseconds of the datetime") weekofyear = _field_accessor('weekofyear', 'woy', - "\nThe week ordinal of the year\n") + "The week ordinal of the year") week = weekofyear _dayofweek_doc = """ The day of the week with Monday=0, Sunday=6. @@ -1129,12 +1129,12 @@ def date(self): "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0") dayofyear = _field_accessor('dayofyear', 'doy', - "\nThe ordinal day of the year\n") - quarter = _field_accessor('quarter', 'q', "\nThe quarter of the date\n") + "The ordinal day of the year") + quarter = _field_accessor('quarter', 'q', "The quarter of the date") days_in_month = _field_accessor( 'days_in_month', 'dim', - "\nThe number of days in the month\n") + "The number of days in the month") daysinmonth = days_in_month _is_month_doc = """ Indicates whether the date is the {first_or_last} day of the month. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 830283d31a929..4afc9f5483c2a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -59,7 +59,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -684,16 +684,16 @@ def to_pytimedelta(self): return tslibs.ints_to_pytimedelta(self.asi8) days = _field_accessor("days", "days", - "\nNumber of days for each element.\n") + "Number of days for each element.") seconds = _field_accessor("seconds", "seconds", - "\nNumber of seconds (>= 0 and less than 1 day) " - "for each element.\n") + "Number of seconds (>= 0 and less than 1 day) " + "for each element.") microseconds = _field_accessor("microseconds", "microseconds", - "\nNumber of microseconds (>= 0 and less " - "than 1 second) for each element.\n") + "Number of microseconds (>= 0 and less " + "than 1 second) for each element.") nanoseconds = _field_accessor("nanoseconds", "nanoseconds", - "\nNumber of nanoseconds (>= 0 and less " - "than 1 microsecond) for each element.\n") + "Number of nanoseconds (>= 0 and less " + "than 1 microsecond) for each element.") @property def components(self): From 495f4cacdc6e345630ef39bd3eb648e82ec829b1 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 13 Sep 2019 17:56:41 +0200 Subject: [PATCH 02/26] Remove error raising --- pandas/core/groupby/generic.py | 5 ----- .../tests/groupby/aggregate/test_aggregate.py | 17 +++++++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e731cffea0671..6bb6321e1d7c7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -932,11 +932,6 @@ def _aggregate_multiple_funcs(self, arg, _level): results = OrderedDict() for name, func in arg: obj = self - if name in results: - raise SpecificationError( - "Function names must be unique, found multiple named " - "{}".format(name) - ) # reset the cache so that we # only include the named selection diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index aa80c461a00e7..b695bbd158971 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -440,12 +440,17 @@ def test_agg_relabel_non_identifier(self): ) tm.assert_frame_equal(result, expected) - def test_duplicate_raises(self): - # TODO: we currently raise on multiple lambdas. We could *maybe* - # update com.get_callable_name to append `_i` to each lambda. - df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) - with pytest.raises(SpecificationError, match="Function names"): - df.groupby("A").agg(a=("A", "min"), b=("A", "min")) + def test_duplicate_no_raises(self): + # GH 28426, if use input function on same column, no raise should raise + quant50 = functools.partial(np.percentile, q=50) + quant70 = functools.partial(np.percentile, q=70) + + df = pd.DataFrame({"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]}) + + grouped = df.groupby("col1").agg( + quantile_50=("col2", quant50), quantile_70=("col2", quant70) + ) + assert grouped.columns == ["quantile_50", "quantile_70"] def test_agg_relabel_with_level(self): df = pd.DataFrame( From 65f3a3487920af49b681c86aa90739ed0eb9260c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 13 Sep 2019 18:05:27 +0200 Subject: [PATCH 03/26] better test case --- pandas/tests/groupby/aggregate/test_aggregate.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index b695bbd158971..81d0bb01c48ef 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -442,15 +442,13 @@ def test_agg_relabel_non_identifier(self): def test_duplicate_no_raises(self): # GH 28426, if use input function on same column, no raise should raise - quant50 = functools.partial(np.percentile, q=50) - quant70 = functools.partial(np.percentile, q=70) - - df = pd.DataFrame({"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]}) + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) - grouped = df.groupby("col1").agg( - quantile_50=("col2", quant50), quantile_70=("col2", quant70) + grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min")) + expected = pd.DataFrame( + {"a": [1, 3], "b": [1, 3]}, index=pd.Index([0, 1], name="A") ) - assert grouped.columns == ["quantile_50", "quantile_70"] + tm.assert_frame_equal(grouped, expected) def test_agg_relabel_with_level(self): df = pd.DataFrame( From b9850559538e746f529adcf8cb791103812fa230 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 13 Sep 2019 18:47:17 +0200 Subject: [PATCH 04/26] Fix test for series --- pandas/tests/groupby/aggregate/test_aggregate.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 81d0bb01c48ef..581ba581892a1 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -376,12 +376,12 @@ def test_no_args_raises(self): expected = pd.DataFrame() tm.assert_frame_equal(result, expected) - def test_series_named_agg_duplicates_raises(self): - # This is a limitation of the named agg implementation reusing - # aggregate_multiple_funcs. It could maybe be lifted in the future. + def test_series_named_agg_duplicates_no_raises(self): + # GH28426 gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) - with pytest.raises(SpecificationError): - gr.agg(a="sum", b="sum") + grouped = gr.agg(a="sum", b="sum") + expected = pd.DataFrame({"a": [3, 3], "b": [3, 3]}) + tm.assert_frame_equal(expected, grouped) def test_mangled(self): gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) From fb43351add8db43dabdec8b47413e36859a14aea Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 13 Sep 2019 18:52:34 +0200 Subject: [PATCH 05/26] Add whatsnew --- doc/source/whatsnew/v0.25.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index de411ef63680a..35e95ed4a68fc 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -100,7 +100,7 @@ Other ^^^^^ - Compatibility with Python 3.8 in :meth:`DataFrame.query` (:issue:`27261`) -- +- Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`28426`) .. _whatsnew_0.252.contributors: From 2bc05c346c7fcbd122a7a69113c38d2efb177be1 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 13 Sep 2019 18:56:58 +0200 Subject: [PATCH 06/26] better english --- pandas/tests/groupby/aggregate/test_aggregate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 581ba581892a1..16b607805609b 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -441,7 +441,7 @@ def test_agg_relabel_non_identifier(self): tm.assert_frame_equal(result, expected) def test_duplicate_no_raises(self): - # GH 28426, if use input function on same column, no raise should raise + # GH 28426, if use lambds on same column, no error should raise df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min")) From 39d3aeec5fddd43a823bc80bd17298e8de32a6b5 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 13 Sep 2019 19:06:44 +0200 Subject: [PATCH 07/26] more accurate explanation --- pandas/tests/groupby/aggregate/test_aggregate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 16b607805609b..e622dd48fe371 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -441,7 +441,8 @@ def test_agg_relabel_non_identifier(self): tm.assert_frame_equal(result, expected) def test_duplicate_no_raises(self): - # GH 28426, if use lambds on same column, no error should raise + # GH 28426, if use same input function on same column, + # no error should raise df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min")) From df51bf9ae02afddf2ce2cb6fb59668332c87c54e Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 13 Sep 2019 19:38:14 +0200 Subject: [PATCH 08/26] Remove unused imports --- pandas/core/groupby/generic.py | 2 +- pandas/tests/groupby/aggregate/test_aggregate.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6bb6321e1d7c7..93defd26d3dc9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -43,7 +43,7 @@ from pandas._typing import FrameOrSeries import pandas.core.algorithms as algorithms -from pandas.core.base import DataError, SpecificationError +from pandas.core.base import DataError import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame, _shared_docs diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e622dd48fe371..d9e999ea87ea1 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,7 +9,6 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, compat, concat -from pandas.core.base import SpecificationError from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm From d162b3e06e77bb2f5656512f99101e7c99f024e5 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 14 Sep 2019 18:12:04 +0200 Subject: [PATCH 09/26] code change based on review --- pandas/core/groupby/generic.py | 15 ++++++++++++++- pandas/tests/groupby/aggregate/test_aggregate.py | 13 +++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 93defd26d3dc9..bba8348842ce1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -43,7 +43,7 @@ from pandas._typing import FrameOrSeries import pandas.core.algorithms as algorithms -from pandas.core.base import DataError +from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import ABCDataFrame, ABCSeries, NDFrame, _shared_docs @@ -230,6 +230,13 @@ def aggregate(self, func, *args, **kwargs): elif func is None: # nicer error message raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") + elif isinstance(func, list) and len(func) > len(set(func)): + + # GH 28426 will raise error if duplicated function names are used and + raise SpecificationError( + "Function names must be unique if there is no new column " + "names assigned" + ) func = _maybe_mangle_lambdas(func) @@ -857,6 +864,12 @@ def aggregate(self, func=None, *args, **kwargs): kwargs = {} if not columns: raise TypeError(no_arg_message) + elif isinstance(func, list) and len(func) > len(set(func)): + # GH 28426 will raise error if duplicated function names are used and + raise SpecificationError( + "Function names must be unique if there is no new column " + "names assigned" + ) if isinstance(func, str): return getattr(self, func)(*args, **kwargs) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index d9e999ea87ea1..e5fc9dacf17bd 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -9,6 +9,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, compat, concat +from pandas.core.base import SpecificationError from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -349,6 +350,18 @@ def test_uint64_type_handling(dtype, how): tm.assert_frame_equal(result, expected, check_exact=True) +def test_func_duplicates_raises(): + # GH28426 + gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) + msg = "Function names" + with pytest.raises(SpecificationError, match=msg): + gr.agg(['sum', 'sum']) + + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) + with pytest.raises(SpecificationError, match=msg): + df.groupby("A").agg(['min', 'min']) + + class TestNamedAggregationSeries: def test_series_named_agg(self): df = pd.Series([1, 2, 3, 4]) From 31e8079470882b439e036653f0292c89512d0882 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 14 Sep 2019 19:21:51 +0200 Subject: [PATCH 10/26] fix test --- pandas/core/groupby/generic.py | 6 ------ pandas/tests/groupby/aggregate/test_aggregate.py | 3 --- 2 files changed, 9 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bba8348842ce1..ddae291c23b39 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -864,12 +864,6 @@ def aggregate(self, func=None, *args, **kwargs): kwargs = {} if not columns: raise TypeError(no_arg_message) - elif isinstance(func, list) and len(func) > len(set(func)): - # GH 28426 will raise error if duplicated function names are used and - raise SpecificationError( - "Function names must be unique if there is no new column " - "names assigned" - ) if isinstance(func, str): return getattr(self, func)(*args, **kwargs) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e5fc9dacf17bd..77a51d4d5edd1 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -352,10 +352,7 @@ def test_uint64_type_handling(dtype, how): def test_func_duplicates_raises(): # GH28426 - gr = pd.Series([1, 2, 3]).groupby([0, 0, 1]) msg = "Function names" - with pytest.raises(SpecificationError, match=msg): - gr.agg(['sum', 'sum']) df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) with pytest.raises(SpecificationError, match=msg): From 7e87435ae8f0097a46043c3d104d8d116fcc2dc8 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 15 Sep 2019 11:11:28 +0200 Subject: [PATCH 11/26] code change and reformat --- pandas/core/groupby/generic.py | 1 + pandas/tests/groupby/aggregate/test_aggregate.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ddae291c23b39..8cb55861e408d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -233,6 +233,7 @@ def aggregate(self, func, *args, **kwargs): elif isinstance(func, list) and len(func) > len(set(func)): # GH 28426 will raise error if duplicated function names are used and + # there is no reassigned name raise SpecificationError( "Function names must be unique if there is no new column " "names assigned" diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 77a51d4d5edd1..80d716f5f823d 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -353,10 +353,9 @@ def test_uint64_type_handling(dtype, how): def test_func_duplicates_raises(): # GH28426 msg = "Function names" - df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) with pytest.raises(SpecificationError, match=msg): - df.groupby("A").agg(['min', 'min']) + df.groupby("A").agg(["min", "min"]) class TestNamedAggregationSeries: From 1361236a2a74404f85136cb90d7dad9edad7430c Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 16 Sep 2019 18:48:11 +0200 Subject: [PATCH 12/26] move whatsnew to 1.0.0 --- doc/source/whatsnew/v0.25.2.rst | 1 - doc/source/whatsnew/v1.0.0.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 35e95ed4a68fc..46e2119aa4783 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -100,7 +100,6 @@ Other ^^^^^ - Compatibility with Python 3.8 in :meth:`DataFrame.query` (:issue:`27261`) -- Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`28426`) .. _whatsnew_0.252.contributors: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2913d88fcd15f..c09782231c6ba 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -242,6 +242,7 @@ Other - Using :meth:`DataFrame.replace` with overlapping keys in a nested dictionary will no longer raise, now matching the behavior of a flat dictionary (:issue:`27660`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`) - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`) +- Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`28426`) .. _whatsnew_1000.contributors: From 3815507fca2cf15daab8f7de019ea9fed75a6f14 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 17 Sep 2019 09:48:51 +0200 Subject: [PATCH 13/26] add test --- .../tests/groupby/aggregate/test_aggregate.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 80d716f5f823d..be3398fef500d 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -459,6 +459,25 @@ def test_duplicate_no_raises(self): ) tm.assert_frame_equal(grouped, expected) + # Example in GH28426 + # TODO: this result is not correct, should be solved in Series.agg to + # TODO: allow named aggregation + quant50 = functools.partial(np.percentile, q=50) + quant70 = functools.partial(np.percentile, q=70) + + test = pd.DataFrame( + {"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]} + ) + + grouped = test.groupby("col1").agg( + quantile_50=("col2", quant50), quantile_70=("col2", quant70) + ) + expected = pd.DataFrame( + {"quantile_50": [1.7, 4.4], "quantile_70": [1.7, 4.4]}, + index=pd.Index(["a", "b"], name="col1"), + ) + tm.assert_frame_equal(grouped, expected) + def test_agg_relabel_with_level(self): df = pd.DataFrame( {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}, From 8a9c41bd0870a4cbc72c1181210530dd8a304ee5 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 17 Sep 2019 21:31:09 +0200 Subject: [PATCH 14/26] bold change --- pandas/core/common.py | 5 ++++- pandas/core/groupby/generic.py | 4 ++-- pandas/tests/groupby/aggregate/test_aggregate.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index cf113c8aecbfe..4093b28b70f58 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -319,12 +319,15 @@ def is_full_slice(obj, l): ) -def get_callable_name(obj): +def get_callable_name(obj, incl_keywords=False): # typical case has name if hasattr(obj, "__name__"): return getattr(obj, "__name__") # some objects don't; could recurse if isinstance(obj, partial): + if incl_keywords is True and obj.keywords is not None: + kw = "_".join("{}{}".format(*kw) for kw in obj.keywords.items()) + return get_callable_name(obj.func) + "_" + kw return get_callable_name(obj.func) # fall back to class name if hasattr(obj, "__call__"): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8cb55861e408d..540a690fc214d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -873,6 +873,7 @@ def aggregate(self, func=None, *args, **kwargs): # Catch instances of lists / tuples # but not the class list / tuple itself. func = _maybe_mangle_lambdas(func) + print(func) ret = self._aggregate_multiple_funcs(func, (_level or 0) + 1) if relabeling: ret.columns = columns @@ -933,8 +934,7 @@ def _aggregate_multiple_funcs(self, arg, _level): # list of functions / function names columns = [] for f in arg: - columns.append(com.get_callable_name(f) or f) - + columns.append(com.get_callable_name(f, incl_keywords=True) or f) arg = zip(columns, arg) results = OrderedDict() diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index be3398fef500d..daab4a85b9493 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -473,7 +473,7 @@ def test_duplicate_no_raises(self): quantile_50=("col2", quant50), quantile_70=("col2", quant70) ) expected = pd.DataFrame( - {"quantile_50": [1.7, 4.4], "quantile_70": [1.7, 4.4]}, + {"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]}, index=pd.Index(["a", "b"], name="col1"), ) tm.assert_frame_equal(grouped, expected) From 6eeb978ce4acab427c91cf09ddfd562cee5c94b3 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 17 Sep 2019 21:34:11 +0200 Subject: [PATCH 15/26] remove unused comment --- pandas/tests/groupby/aggregate/test_aggregate.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index daab4a85b9493..0bbf723a078d2 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -460,8 +460,6 @@ def test_duplicate_no_raises(self): tm.assert_frame_equal(grouped, expected) # Example in GH28426 - # TODO: this result is not correct, should be solved in Series.agg to - # TODO: allow named aggregation quant50 = functools.partial(np.percentile, q=50) quant70 = functools.partial(np.percentile, q=70) From 2ae946eecd6789a58b3518d5435ad2e1da41c6cc Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 18 Sep 2019 20:14:45 +0200 Subject: [PATCH 16/26] remove print --- pandas/core/groupby/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 540a690fc214d..5905b4ec9c4d0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -873,7 +873,6 @@ def aggregate(self, func=None, *args, **kwargs): # Catch instances of lists / tuples # but not the class list / tuple itself. func = _maybe_mangle_lambdas(func) - print(func) ret = self._aggregate_multiple_funcs(func, (_level or 0) + 1) if relabeling: ret.columns = columns From 2e5fd6e3ead42557b0d3bf0c46de623be9d13636 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 18 Sep 2019 20:16:21 +0200 Subject: [PATCH 17/26] revert blank line change --- doc/source/whatsnew/v0.25.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 46e2119aa4783..de411ef63680a 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -100,6 +100,7 @@ Other ^^^^^ - Compatibility with Python 3.8 in :meth:`DataFrame.query` (:issue:`27261`) +- .. _whatsnew_0.252.contributors: From b10b4b80a23f1c97184f9f82bc6d71767b959913 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 22 Sep 2019 22:11:52 +0200 Subject: [PATCH 18/26] revert previous change --- pandas/core/common.py | 5 +---- pandas/core/groupby/generic.py | 2 +- pandas/tests/groupby/aggregate/test_aggregate.py | 6 ++++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 4093b28b70f58..cf113c8aecbfe 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -319,15 +319,12 @@ def is_full_slice(obj, l): ) -def get_callable_name(obj, incl_keywords=False): +def get_callable_name(obj): # typical case has name if hasattr(obj, "__name__"): return getattr(obj, "__name__") # some objects don't; could recurse if isinstance(obj, partial): - if incl_keywords is True and obj.keywords is not None: - kw = "_".join("{}{}".format(*kw) for kw in obj.keywords.items()) - return get_callable_name(obj.func) + "_" + kw return get_callable_name(obj.func) # fall back to class name if hasattr(obj, "__call__"): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5905b4ec9c4d0..76f54f77695a3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -933,7 +933,7 @@ def _aggregate_multiple_funcs(self, arg, _level): # list of functions / function names columns = [] for f in arg: - columns.append(com.get_callable_name(f, incl_keywords=True) or f) + columns.append(com.get_callable_name(f) or f) arg = zip(columns, arg) results = OrderedDict() diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0bbf723a078d2..f2139549b2caf 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -459,7 +459,9 @@ def test_duplicate_no_raises(self): ) tm.assert_frame_equal(grouped, expected) - # Example in GH28426 + # Example in GH28426, this output is wrong, because aggregate cannot + # mangle partial function on the same column and should be solved by + # GH28570 quant50 = functools.partial(np.percentile, q=50) quant70 = functools.partial(np.percentile, q=70) @@ -471,7 +473,7 @@ def test_duplicate_no_raises(self): quantile_50=("col2", quant50), quantile_70=("col2", quant70) ) expected = pd.DataFrame( - {"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]}, + {"quantile_50": [1.7, 4.4], "quantile_70": [1.7, 4.4]}, index=pd.Index(["a", "b"], name="col1"), ) tm.assert_frame_equal(grouped, expected) From 314eb282998dc53430d73737b139c2146b744840 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sun, 22 Sep 2019 22:13:12 +0200 Subject: [PATCH 19/26] revert unnecessary white space removal --- pandas/core/groupby/generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 76f54f77695a3..8cb55861e408d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -934,6 +934,7 @@ def _aggregate_multiple_funcs(self, arg, _level): columns = [] for f in arg: columns.append(com.get_callable_name(f) or f) + arg = zip(columns, arg) results = OrderedDict() From fd485d7865c03efb931a4ce733eac281b3f2dec4 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 19 Oct 2019 16:36:29 +0200 Subject: [PATCH 20/26] fix tests --- pandas/core/groupby/generic.py | 16 ++++++++-------- pandas/tests/groupby/aggregate/test_aggregate.py | 7 +++---- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bc9e3d84a543b..d1406f2180547 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -251,14 +251,6 @@ def aggregate(self, func=None, *args, **kwargs): ret = self._aggregate_multiple_funcs(func, (_level or 0) + 1) if relabeling: ret.columns = columns - # elif isinstance(func, list) and len(func) > len(set(func)): - # - # # GH 28426 will raise error if duplicated function names are used and - # # there is no reassigned name - # raise SpecificationError( - # "Function names must be unique if there is no new column " - # "names assigned" - # ) else: cyfunc = self._get_cython_func(func) if cyfunc and not args and not kwargs: @@ -868,6 +860,14 @@ def aggregate(self, func=None, *args, **kwargs): func, columns, order = _normalize_keyword_aggregation(kwargs) kwargs = {} + elif isinstance(func, abc.Iterable) and len(func) > len(set(func)): + + # GH 28426 will raise error if duplicated function names are used and + # there is no reassigned name + raise SpecificationError( + "Function names must be unique if there is no new column " + "names assigned" + ) elif func is None: # nicer error message raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index f2139549b2caf..021f9811085db 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -459,11 +459,10 @@ def test_duplicate_no_raises(self): ) tm.assert_frame_equal(grouped, expected) - # Example in GH28426, this output is wrong, because aggregate cannot - # mangle partial function on the same column and should be solved by - # GH28570 quant50 = functools.partial(np.percentile, q=50) quant70 = functools.partial(np.percentile, q=70) + quant50.__name__ = "quant50" + quant70.__name__ = "quant70" test = pd.DataFrame( {"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]} @@ -473,7 +472,7 @@ def test_duplicate_no_raises(self): quantile_50=("col2", quant50), quantile_70=("col2", quant70) ) expected = pd.DataFrame( - {"quantile_50": [1.7, 4.4], "quantile_70": [1.7, 4.4]}, + {"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]}, index=pd.Index(["a", "b"], name="col1"), ) tm.assert_frame_equal(grouped, expected) From 5817f5e8de796ee8d98292532a8b2e9cf9b55993 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Sat, 19 Oct 2019 17:12:51 +0200 Subject: [PATCH 21/26] fix test --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d1406f2180547..d4aa6bc3edd57 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -860,7 +860,7 @@ def aggregate(self, func=None, *args, **kwargs): func, columns, order = _normalize_keyword_aggregation(kwargs) kwargs = {} - elif isinstance(func, abc.Iterable) and len(func) > len(set(func)): + elif isinstance(func, list) and len(func) > len(set(func)): # GH 28426 will raise error if duplicated function names are used and # there is no reassigned name From 0e249fc35f35bf52214a6e03075331c129eda5c7 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 8 Nov 2019 16:45:08 +0100 Subject: [PATCH 22/26] move to groupby section --- doc/source/whatsnew/v1.0.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ce3af7cd25e47..4a7278bda7625 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -424,6 +424,8 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) +- Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`28426`) +- :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue: 28479) Reshaping ^^^^^^^^^ @@ -456,8 +458,6 @@ Other - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`) - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`) - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`) -- Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`28426`) -- :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue: 28479) - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`) - Fix :class:`AbstractHolidayCalendar` to return correct results for years after 2030 (now goes up to 2200) (:issue:`27790`) From f7a219c36c3b3cba3be81c01e47ab1de97b97f32 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 20 Nov 2019 16:29:51 +0100 Subject: [PATCH 23/26] fix test for multiindex --- pandas/tests/groupby/aggregate/test_aggregate.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index ec784a7694979..d67376fda6483 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -583,15 +583,19 @@ def test_agg_relabel_multiindex_raises_not_exist(): df.groupby(("x", "group")).agg(a=(("Y", "a"), "max")) -def test_agg_relabel_multiindex_raises_duplicate(): +def test_agg_relabel_multiindex_duplicates(): # GH29422, add test for raises senario when getting duplicates + # GH28426, after this change, duplicates should also work if the relabelling is + # different df = DataFrame( {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]} ) df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) - with pytest.raises(SpecificationError, match="Function names"): - df.groupby(("x", "group")).agg(a=(("y", "A"), "min"), b=(("y", "A"), "min")) + result = df.groupby(("x", "group")).agg(a=(("y", "A"), "min"), b=(("y", "A"), "min")) + idx = pd.Index(["a", "b"], name=("x", "group")) + expected = DataFrame({"a": [0, 2], "b": [0, 2]}, index=idx) + tm.assert_frame_equal(result, expected) def myfunc(s): From cef6d3a38678fcf757d870dfc33c8eeda94613ed Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 20 Nov 2019 16:30:22 +0100 Subject: [PATCH 24/26] reformatting --- pandas/tests/groupby/aggregate/test_aggregate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index d67376fda6483..687a1c744e1cd 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -592,7 +592,9 @@ def test_agg_relabel_multiindex_duplicates(): ) df.columns = pd.MultiIndex.from_tuples([("x", "group"), ("y", "A"), ("y", "B")]) - result = df.groupby(("x", "group")).agg(a=(("y", "A"), "min"), b=(("y", "A"), "min")) + result = df.groupby(("x", "group")).agg( + a=(("y", "A"), "min"), b=(("y", "A"), "min") + ) idx = pd.Index(["a", "b"], name=("x", "group")) expected = DataFrame({"a": [0, 2], "b": [0, 2]}, index=idx) tm.assert_frame_equal(result, expected) From 5172f46f00f30c34da171f7fe4ecf8e98d1d7138 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Fri, 22 Nov 2019 13:55:15 +0100 Subject: [PATCH 25/26] solve conflict --- pandas/core/groupby/generic.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8a07a13eb0381..e76412b1a4418 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -299,10 +299,6 @@ def _aggregate_multiple_funcs(self, arg): results = OrderedDict() for name, func in arg: obj = self - if name in results: - raise SpecificationError( - f"Function names must be unique, found multiple named {name}" - ) # reset the cache so that we # only include the named selection From 90ba1b478fc5d1fb340add5ded6f43e8c22f5815 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 2 Dec 2019 14:04:34 +0100 Subject: [PATCH 26/26] remove redundant and expand whatsnew note --- doc/source/whatsnew/v1.0.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 02f00ba1c8478..77d6619610d74 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -699,7 +699,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) -- Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby` (:issue:`28426`) +- Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby`. Previously error will be raised if the same function is applied on the same column and now it is allowed if new assigned names are different. (:issue:`28426`) - :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue: 28479) - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) - Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) @@ -744,7 +744,6 @@ Other - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`) - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`) - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`) -- :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue:`28479`) - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`) - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`) - Fix :class:`AbstractHolidayCalendar` to return correct results for