diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000000000..d8b3681291fce --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,81 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " a b c d group\n", + "0 0.855664 0.237612 0.660391 0.896628 0\n", + "1 0.695109 0.026930 0.315360 0.784887 0\n", + "2 0.807515 0.301360 0.400504 0.055916 1\n", + "3 0.077397 0.571981 0.429654 0.180142 1\n", + "\n", + "group\n", + "0 0.618052\n", + "1 0.235534\n", + "dtype: float64\n", + " a_sum a_mean b_mean c_sum d_range diff_a_b\n", + "group \n", + "0 1.550773 0.775387 0.132271 0.975751 0.111741 0.618052\n", + "1 0.884912 0.442456 0.436670 0.830158 0.124226 0.235534\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "df = pd.DataFrame(np.random.rand(4,4), columns=list('abcd'))\n", + "df['group'] = [0, 0, 1, 1]\n", + "\n", + "print(df)\n", + "\n", + "print(df.groupby('group'))\n", + "\n", + "print(df.groupby('group')['a'].max() - df.groupby('group')['b'].max())\n", + "\n", + "print(df.groupby('group').agg(\n", + " diff_a_b=(['a', 'b'], lambda x: x['a'].max() - x['b'].max()),\n", + " a_sum=('a', 'sum'),\n", + " a_mean=('a', 'mean'),\n", + " b_mean=('b', 'mean'),\n", + " c_sum=('c', 'sum'),\n", + " d_range=('d', lambda x: x.max() - x.min()),\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git 
a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 8cd229070e365..1215f0d65984c 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -680,6 +680,16 @@ must be either implemented on GroupBy or available via :ref:`dispatching .. _groupby.aggregate.cython: + +Aggregating multiple columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + grouped.agg(diff_c_d=(['C', 'D'], lambda x: x['C'].max() - x['D'].max())) + + + Cython-optimized aggregation functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 57c53f73962dc..cd3fc3aef9ed3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -316,6 +316,11 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) +Groupby Aggregations +^^^^^^^^^^^^^^^^^^^^ + +- Added functionality to perform aggregations on multiple columns + Reshaping ^^^^^^^^^ diff --git a/pandas/core/base.py b/pandas/core/base.py index f55d9f905945d..a4f50d25db8da 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -352,8 +352,20 @@ def _aggregate(self, arg, *args, **kwargs): raise SpecificationError("nested renamer is not supported") elif isinstance(obj, ABCSeries): raise SpecificationError("nested renamer is not supported") - elif isinstance(obj, ABCDataFrame) and k not in obj.columns: - raise KeyError(f"Column '{k}' does not exist!") + elif isinstance(obj, ABCDataFrame): + + # OWO CHANGES + # Original check + if (k not in obj.columns): + # Check if list thingy + try: + keys = np.frombuffer(k, dtype=np.dtype(' len(set(func)): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py 
b/pandas/tests/groupby/aggregate/test_aggregate.py index 48f8de7e51ae4..d63706b3eb2b7 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -610,6 +610,30 @@ def test_mangled(self): ) tm.assert_frame_equal(result, expected) + def test_agg_multiple_columns(self): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.groupby("A").agg( + add=(["B", "C"], lambda x: x["B"].max() + x["C"].min()), + minus=(["C", "B"], lambda x: x["B"].max() - x["C"].min()) + ) + expected = pd.DataFrame( + {"add": [5, 9], "minus": [-1, -1]}, index=pd.Index([0, 1], name="A") + ) + tm.assert_frame_equal(result, expected) + + def test_agg_multi_missing_column_raises(self): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + with pytest.raises(KeyError, match="Column 'D' does not exist"): + df.groupby("A").agg( + minus=(["D", "C"], lambda x: x["D"].max() - x["C"].min()), + ) + + def test_agg_multi_missing_key_raises(self): + df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + with pytest.raises(KeyError, match="D"): + df.groupby("A").agg( + minus=(["B", "C"], lambda x: x["D"].max() - x["D"].min()), + ) @pytest.mark.parametrize( "agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3",