From 9285b6ca4c005762c6442bce6a879b855c27cd21 Mon Sep 17 00:00:00 2001 From: Mason-98 Date: Sun, 5 Apr 2020 14:59:20 -0400 Subject: [PATCH 01/12] Finished implementation of aggregation feature --- pandas/core/base.py | 28 +++++++++++++++++++++++----- pandas/core/groupby/generic.py | 10 ++++++++-- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index f55d9f905945d..ebdacf83ada73 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -352,8 +352,20 @@ def _aggregate(self, arg, *args, **kwargs): raise SpecificationError("nested renamer is not supported") elif isinstance(obj, ABCSeries): raise SpecificationError("nested renamer is not supported") - elif isinstance(obj, ABCDataFrame) and k not in obj.columns: - raise KeyError(f"Column '{k}' does not exist!") + elif isinstance(obj, ABCDataFrame): + + # GH 29268 + # Original check + if (k not in obj.columns): + # Check if list thingy + try: + keys = np.frombuffer(k, dtype=np.dtype(' len(set(func)): From 37522f02927faaa19c01587e2909f77258bd42fb Mon Sep 17 00:00:00 2001 From: fpunny Date: Mon, 6 Apr 2020 18:42:17 -0400 Subject: [PATCH 02/12] Fixed attributeError catch --- pandas/core/base.py | 2 +- pandas/core/groupby/generic.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index e8bf5a97e54aa..1c1cdbd0c8d67 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -351,7 +351,7 @@ def _aggregate(self, arg, *args, **kwargs): # Check keys if (key not in obj.columns): raise KeyError(f"Column '{key}' does not exist!") - except TypeError: + except AttributeError: raise KeyError(f"Column '{k}' does not exist!") arg = new_arg diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 87412ae278795..d1a501ca05a44 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -918,7 +918,7 @@ def aggregate(self, func=None, *args, **kwargs): if 
isinstance(v[0], list) & isinstance(v[1], LambdaType): # v[0] is the first parameter given (the column(s) to group) # v[1] is the 2nd parameter given and the opperation to be done to the column(s) - kwargs[k] = (np.array(v[0]).tobytes(),) + v[1:] + kwargs[k] = (np.array(v[0]).sort().tobytes(),) + v[1:] func, columns, order = normalize_keyword_aggregation(kwargs) kwargs = {} From 96346f998b69eb33650365d87865de521222664f Mon Sep 17 00:00:00 2001 From: fpunny Date: Mon, 6 Apr 2020 19:16:28 -0400 Subject: [PATCH 03/12] Updated documentation to reflect multi column named aggregation --- pandas/core/base.py | 4 ++-- pandas/core/groupby/generic.py | 23 ++++++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 1c1cdbd0c8d67..e2055ff266ece 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -399,8 +399,8 @@ def _agg(arg, func): _obj = {} for item in items: _obj[item] = self._gotitem(item, ndim=1, subset=None) - result[fname] = agg_how[0](_obj) - except TypeError: + result[fname] = [agg(_obj) for agg in agg_how] + except AttributeError: result[fname] = func(fname, agg_how) return result diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d1a501ca05a44..21eba932f6ffa 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -879,23 +879,30 @@ class DataFrameGroupBy(GroupBy[DataFrame]): 1 1 2 0.590716 2 3 4 0.704907 - To control the output names with different aggregations per column, + To control the output names with different aggregations, pandas supports "named aggregation" >>> df.groupby("A").agg( ... b_min=pd.NamedAgg(column="B", aggfunc="min"), - ... c_sum=pd.NamedAgg(column="C", aggfunc="sum")) - b_min c_sum + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum"), + ... cb_sum_diff=pd.NamedAgg( + ... column=["B", "C"], + ... aggfunc=lambda x: x["C"].sum() - x["B"].sum() + ... ) + ... 
) + b_min c_sum cb_sum_diff A - 1 1 -1.956929 - 2 3 -0.322183 + 1 1 1.449287 -1.550713 + 2 3 0.110498 -6.889502 - The keywords are the *output* column names - - The values are tuples whose first element is the column to select + - The values are tuples whose first element is the column(s) to select and the second element is the aggregation to apply to that column. Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields ``['column', 'aggfunc']`` to make it clearer what the arguments are. As usual, the aggregation can be a callable or a string alias. + - When performing named aggregations with multiple columns, the second + element has to be a lambda and returns a 1 dimension DataFrame. See :ref:`groupby.aggregate.named` for more. """ @@ -918,7 +925,9 @@ def aggregate(self, func=None, *args, **kwargs): if isinstance(v[0], list) & isinstance(v[1], LambdaType): # v[0] is the first parameter given (the column(s) to group) # v[1] is the 2nd parameter given and the opperation to be done to the column(s) - kwargs[k] = (np.array(v[0]).sort().tobytes(),) + v[1:] + serialized_key = np.array(v[0]) + serialized_key.sort() + kwargs[k] = (serialized_key.tobytes(),) + v[1:] func, columns, order = normalize_keyword_aggregation(kwargs) kwargs = {} From ccbc40378418f9227e4546a94b28ad6e3f0c7d27 Mon Sep 17 00:00:00 2001 From: fpunny Date: Mon, 6 Apr 2020 19:49:51 -0400 Subject: [PATCH 04/12] Changes to documentation --- pandas/core/base.py | 29 ++++++++++++++++------------- pandas/core/groupby/generic.py | 5 +---- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index e2055ff266ece..110db326a80ff 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -299,6 +299,7 @@ def _aggregate(self, arg, *args, **kwargs): None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) + deserialized_keys = {} _axis = kwargs.pop("_axis", None) if _axis is None: @@ -340,20 +341,22 @@ def _aggregate(self, 
arg, *args, **kwargs): elif isinstance(obj, ABCSeries): raise SpecificationError("nested renamer is not supported") elif isinstance(obj, ABCDataFrame): - # GH 29268 - # Original check if (k not in obj.columns): # Check if list thingy try: keys = np.frombuffer(k, dtype=np.dtype(' Date: Mon, 6 Apr 2020 20:34:52 -0400 Subject: [PATCH 05/12] Fixed lint issue --- pandas/core/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 110db326a80ff..45a8077296320 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -346,7 +346,7 @@ def _aggregate(self, arg, *args, **kwargs): # Check if list thingy try: keys = np.frombuffer(k, dtype=np.dtype(' Date: Mon, 6 Apr 2020 21:18:14 -0400 Subject: [PATCH 06/12] fixed zip --- pandas/core/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 45a8077296320..bf85cbf1da8ae 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -399,9 +399,10 @@ def _agg(arg, func): # GH 29268 if fname in deserialized_keys: keys = deserialized_keys[fname] - _obj = zip(keys, [ - self._gotitem(k, ndim=1, subset=None) for k in keys - ]) + _obj = {} + + for k in keys: + _obj[k] = self._gotitem(k, ndim=1, subset=None) result[fname] = [agg(_obj) for agg in agg_how] else: result[fname] = func(fname, agg_how) From 6817d2657a5aba1a510ded084d9f74cad571296d Mon Sep 17 00:00:00 2001 From: Jaden Wang Date: Mon, 6 Apr 2020 21:25:12 -0400 Subject: [PATCH 07/12] Add test cases to groupby aggregate --- .../tests/groupby/aggregate/test_aggregate.py | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e860ea1a3d052..892f9a428489d 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -636,6 +636,31 @@ def test_mangled(self): ) 
tm.assert_frame_equal(result, expected) + def test_agg_multiple_columns(self): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.groupby("A").agg( + add=(["B", "C"], lambda x: x["B"].max() + x["C"].min()), + minus=(["C", "B"], lambda x: x["B"].max() - x["C"].min()) + ) + expected = pd.DataFrame( + {"add": [5, 9], "minus": [-1, -1]}, index=pd.Index([0, 1], name="A") + ) + tm.assert_frame_equal(result, expected) + + def test_agg_multi_missing_column_raises(self): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + with pytest.raises(KeyError, match="Column 'D' does not exist"): + df.groupby("A").agg( + minus=(["D", "C"], lambda x: x["D"].max() - x["C"].min()), + ) + + def test_agg_multi_missing_key_raises(self): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + with pytest.raises(KeyError, match="D"): + df.groupby("A").agg( + minus=(["B", "C"], lambda x: x["D"].max() - x["D"].min()), + ) + @pytest.mark.parametrize( "agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3", From 35d9129c91aca76797ffc63d18f7d47d5d3155e1 Mon Sep 17 00:00:00 2001 From: fpunny Date: Mon, 6 Apr 2020 22:15:49 -0400 Subject: [PATCH 08/12] Fixed implementation --- pandas/core/base.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index bf85cbf1da8ae..bbe5fb3f157aa 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -388,6 +388,13 @@ def _agg_2dim(how): """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how) + + # GH 29268 + def _agg_multi_dim(name, how, keys): + from pandas.core.frame import DataFrame + _obj = {k: self._gotitem(k, ndim=1, subset=None) for k in keys} + result = {com.get_callable_name(agg): agg(_obj) for agg in how} + return DataFrame(result, columns=result.keys()) def _agg(arg, func): """ @@ -399,13 +406,10 @@ def _agg(arg, func): # GH 29268 if 
fname in deserialized_keys: keys = deserialized_keys[fname] - _obj = {} - - for k in keys: - _obj[k] = self._gotitem(k, ndim=1, subset=None) - result[fname] = [agg(_obj) for agg in agg_how] + result[fname] = _agg_multi_dim(fname, agg_how, keys) else: result[fname] = func(fname, agg_how) + return result # set the final keys From 7016e678daa8d374d4f9a69d5291684ad16c734a Mon Sep 17 00:00:00 2001 From: fpunny Date: Tue, 7 Apr 2020 01:49:16 -0400 Subject: [PATCH 09/12] Run linter --- pandas/core/base.py | 9 +++++---- pandas/core/groupby/generic.py | 1 + pandas/tests/groupby/aggregate/test_aggregate.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index bbe5fb3f157aa..9cd2330c4a12b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -342,16 +342,16 @@ def _aggregate(self, arg, *args, **kwargs): raise SpecificationError("nested renamer is not supported") elif isinstance(obj, ABCDataFrame): # GH 29268 - if (k not in obj.columns): + if k not in obj.columns: # Check if list thingy try: - keys = np.frombuffer(k, dtype=np.dtype(' Date: Tue, 7 Apr 2020 01:58:19 -0400 Subject: [PATCH 10/12] Added whatsnew entry --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index cbfc6d63e8ea3..8d399dc9051a0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -88,7 +88,7 @@ Other enhancements - :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). 
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) -- +- :meth:`DataFrameGroupBy.aggregate` will now support named aggregations with multiple columns (:issue:`29268`) .. --------------------------------------------------------------------------- From f599eb5f525b8aa65dcb18633a40912de6348f16 Mon Sep 17 00:00:00 2001 From: William Granados Date: Tue, 7 Apr 2020 21:54:38 -0400 Subject: [PATCH 11/12] changed missing key test --- pandas/tests/groupby/aggregate/test_aggregate.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index cb085f0ea9292..8e0f4f39651eb 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -655,7 +655,8 @@ def test_agg_multi_missing_column_raises(self): ) def test_agg_multi_missing_key_raises(self): - df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6], "D": [0, 0, 1, 1]}) + # shouldn't be able to get aggregrations on columns not specified with pytest.raises(KeyError, match="D"): df.groupby("A").agg( minus=(["B", "C"], lambda x: x["D"].max() - x["D"].min()), From 2b08514c4e2786075b239835efae81c0cf8630e1 Mon Sep 17 00:00:00 2001 From: William Granados Date: Tue, 7 Apr 2020 22:25:14 -0400 Subject: [PATCH 12/12] fixed failing pip8 error --- pandas/tests/groupby/aggregate/test_aggregate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 8e0f4f39651eb..b2c3b324f21bb 100644 ---
a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -655,7 +655,9 @@ def test_agg_multi_missing_column_raises(self): ) def test_agg_multi_missing_key_raises(self): - df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6], "D": [0, 0, 1, 1]}) + df = pd.DataFrame( + {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6], "D": [0, 0, 1, 1]} + ) # shouldn't be able to get aggregrations on columns not specified with pytest.raises(KeyError, match="D"): df.groupby("A").agg(