From 6b026fbdf3bae60bb914b1c47b1019682640ac56 Mon Sep 17 00:00:00 2001 From: phofl Date: Thu, 2 Apr 2020 23:25:00 +0200 Subject: [PATCH 1/6] BUG: Fix droped result column in groupby with as_index False --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/groupby/generic.py | 8 ++--- .../tests/groupby/aggregate/test_aggregate.py | 35 +++++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 20415bba99476..1a6e6355cf079 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -403,6 +403,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) +- Bug in :meth:`DataFrame.groupby` lost results, when ``as_index`` option was set to ``False``. The result values were replaced with the index values (:issue:`32240`). 
Reshaping diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b7c071a8dfbbf..e560031741e20 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -963,16 +963,16 @@ def aggregate(self, func=None, *args, **kwargs): [self._selected_obj.columns.name] * result.columns.nlevels ).droplevel(-1) - if not self.as_index: - self._insert_inaxis_grouper_inplace(result) - result.index = np.arange(len(result)) - if relabeling: # used reordered index of columns result = result.iloc[:, order] result.columns = columns + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) + result.index = np.arange(len(result)) + return result._convert(datetime=True) agg = aggregate diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e860ea1a3d052..d6e67455451a9 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -822,6 +822,41 @@ def test_groupby_aggregate_period_frame(func): tm.assert_frame_equal(result, expected) +def test_grouby_agg_loses_results_with_as_index_false_relabel(): + # GH 32240: When the aggregate function relabels column names and + # as_index=False is specified, the results are dropped. + + df = pd.DataFrame( + {"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]} + ) + + grouped = df.groupby("key", as_index=False) + result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + expected = pd.DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]}) + tm.assert_frame_equal(result, expected) + + +def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex(): + # GH 32240: When the aggregate function relabels column names and + # as_index=False is specified, the results are dropped. 
Check if + # multiindex is returned in the right order + + df = pd.DataFrame( + { + "key": ["x", "y", "x", "y", "x", "x"], + "key1": ["a", "b", "c", "b", "a", "c"], + "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75], + } + ) + + grouped = df.groupby(["key", "key1"], as_index=False) + result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + expected = pd.DataFrame( + {"key": ["x", "x", "y"], "key1": ["a", "c", "b"], "min_val": [1.0, 0.75, 0.8]} + ) + tm.assert_frame_equal(result, expected) + + class TestLambdaMangling: def test_basic(self): df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) From f54cfebcd1184d71610893f51ef3db569af53895 Mon Sep 17 00:00:00 2001 From: phofl Date: Fri, 3 Apr 2020 09:03:31 +0200 Subject: [PATCH 2/6] Add in Code comment --- pandas/core/groupby/generic.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e560031741e20..ae10c4f24f811 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -969,6 +969,11 @@ def aggregate(self, func=None, *args, **kwargs): result = result.iloc[:, order] result.columns = columns + # GH 32240: The groupby function lost the result values if the as_index=False + # option was set and the relabeling flag was true, because the order and + # columns variables do not consider the columns previously set as index. + # Changing the order of relabeling and reseting the index solves this. 
+ if not self.as_index: self._insert_inaxis_grouper_inplace(result) result.index = np.arange(len(result)) From 0605011d9e3596f74eb9a322c9ae8b2e4acaed54 Mon Sep 17 00:00:00 2001 From: phofl Date: Sat, 4 Apr 2020 01:19:48 +0200 Subject: [PATCH 3/6] Add changes requested per review --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/groupby/generic.py | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 1a6e6355cf079..fced649bfbff1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -403,7 +403,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) -- Bug in :meth:`DataFrame.groupby` lost results, when ``as_index`` option was set to ``False``. The result values were replaced with the index values (:issue:`32240`). +- Bug in :meth:`DataFrameGroupby.agg` lost results, when ``as_index`` option was set to ``False`` and the result columns were relabeled. The result values were replaced with the index values (:issue:`32240`). 
Reshaping diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ae10c4f24f811..e560031741e20 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -969,11 +969,6 @@ def aggregate(self, func=None, *args, **kwargs): result = result.iloc[:, order] result.columns = columns - # GH 32240: The groupby function lost the result values if the as_index=False - # option was set and the relabeling flag was true, because the order and - # columns variables do not consider the columns previously set as index. - # Changing the order of relabeling and reseting the index solves this. - if not self.as_index: self._insert_inaxis_grouper_inplace(result) result.index = np.arange(len(result)) From 7aa8d559bb472b56e542dd053ab5a6b75f743d03 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 19 Apr 2020 23:59:54 +0200 Subject: [PATCH 4/6] Add previous/new section and relocate tests --- doc/source/whatsnew/v1.1.0.rst | 35 ++++++++++ .../tests/groupby/aggregate/test_aggregate.py | 70 +++++++++---------- 2 files changed, 70 insertions(+), 35 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 619f66f30d5c4..f4c299d18152c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -246,6 +246,41 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns df[['a', 'c']] = 1 df +.. _whatsnew_110.api_breaking.groupby_results_los_as_index_false: + +:meth:`DataFrameGroupby.agg` lost results with ``as_index`` ``False`` when relabeling columns +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously :meth:`DataFrameGroupby.agg` lost the result columns, when the ``as_index`` option was +set to ``False`` and the result columns were relabeled. In this case he result values were replaced with +the previous index (:issue:`32240`). + +.. 
ipython:: python + + df = pd.DataFrame({"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [2]: grouped = df.groupby("key", as_index=False) + In [3]: result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + In [4]: result + Out[4]: + min_val + 0 x + 1 y + 2 z + +*New behavior*: + +.. ipython:: python + + grouped = df.groupby("key", as_index=False) + result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + result + .. _whatsnew_110.deprecations: Deprecations diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index d6e67455451a9..aff44a1806ca2 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -717,6 +717,41 @@ def test_agg_relabel_multiindex_duplicates(): tm.assert_frame_equal(result, expected) +def test_grouby_agg_loses_results_with_as_index_false_relabel(): + # GH 32240: When the aggregate function relabels column names and + # as_index=False is specified, the results are dropped. + + df = pd.DataFrame( + {"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]} + ) + + grouped = df.groupby("key", as_index=False) + result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + expected = pd.DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]}) + tm.assert_frame_equal(result, expected) + + +def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex(): + # GH 32240: When the aggregate function relabels column names and + # as_index=False is specified, the results are dropped. 
Check if + # multiindex is returned in the right order + + df = pd.DataFrame( + { + "key": ["x", "y", "x", "y", "x", "x"], + "key1": ["a", "b", "c", "b", "a", "c"], + "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75], + } + ) + + grouped = df.groupby(["key", "key1"], as_index=False) + result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + expected = pd.DataFrame( + {"key": ["x", "x", "y"], "key1": ["a", "c", "b"], "min_val": [1.0, 0.75, 0.8]} + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)] ) @@ -822,41 +857,6 @@ def test_groupby_aggregate_period_frame(func): tm.assert_frame_equal(result, expected) -def test_grouby_agg_loses_results_with_as_index_false_relabel(): - # GH 32240: When the aggregate function relabels column names and - # as_index=False is specified, the results are dropped. - - df = pd.DataFrame( - {"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]} - ) - - grouped = df.groupby("key", as_index=False) - result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) - expected = pd.DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]}) - tm.assert_frame_equal(result, expected) - - -def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex(): - # GH 32240: When the aggregate function relabels column names and - # as_index=False is specified, the results are dropped. 
Check if - # multiindex is returned in the right order - - df = pd.DataFrame( - { - "key": ["x", "y", "x", "y", "x", "x"], - "key1": ["a", "b", "c", "b", "a", "c"], - "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75], - } - ) - - grouped = df.groupby(["key", "key1"], as_index=False) - result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) - expected = pd.DataFrame( - {"key": ["x", "x", "y"], "key1": ["a", "c", "b"], "min_val": [1.0, 0.75, 0.8]} - ) - tm.assert_frame_equal(result, expected) - - class TestLambdaMangling: def test_basic(self): df = pd.DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]}) From ea7517113258909ad2c16b3e5b2a073ea1af5205 Mon Sep 17 00:00:00 2001 From: phofl Date: Mon, 25 May 2020 21:01:41 +0200 Subject: [PATCH 5/6] Fix linter issue --- doc/source/whatsnew/v1.1.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index ab0d5a389200c..95517ebc95838 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -575,7 +575,8 @@ the previous index (:issue:`32240`). .. ipython:: python - df = pd.DataFrame({"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]}) + df = pd.DataFrame({"key": ["x", "y", "z", "x", "y", "z"], + "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]}) df *Previous behavior*: From e7351c6c92ead7e417a46e167a7a93de37195c50 Mon Sep 17 00:00:00 2001 From: phofl Date: Sun, 14 Jun 2020 23:34:38 +0200 Subject: [PATCH 6/6] Fix whats new entry --- doc/source/whatsnew/v1.1.0.rst | 76 +++++++++++++++++----------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index fccbef41ec89a..62af097d720ef 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -676,6 +676,44 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxma df.groupby("a", as_index=False).nunique() + +.. 
_whatsnew_110.api_breaking.groupby_results_lost_as_index_false: + +:meth:`DataFrameGroupby.agg` lost results with ``as_index`` ``False`` when relabeling columns +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously :meth:`DataFrameGroupby.agg` lost the result columns, when the ``as_index`` option was +set to ``False`` and the result columns were relabeled. In this case the result values were replaced with +the previous index (:issue:`32240`). + +.. ipython:: python + + df = pd.DataFrame({"key": ["x", "y", "z", "x", "y", "z"], + "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [2]: grouped = df.groupby("key", as_index=False) + In [3]: result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + In [4]: result + Out[4]: + min_val + 0 x + 1 y + 2 z + +*New behavior*: + +.. ipython:: python + + grouped = df.groupby("key", as_index=False) + result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + result + + .. _whatsnew_110.api_breaking.apply_applymap_first_once: apply and applymap on ``DataFrame`` evaluates first row/column only once @@ -715,43 +753,6 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once df.apply(func, axis=1) -.. _whatsnew_110.api_breaking.groupby_results_los_as_index_false: - -:meth:`DataFrameGroupby.agg` lost results with ``as_index`` ``False`` when relabeling columns -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Previously :meth:`DataFrameGroupby.agg` lost the result columns, when the ``as_index`` option was -set to ``False`` and the result columns were relabeled. In this case he result values were replaced with -the previous index (:issue:`32240`). - -.. ipython:: python - - df = pd.DataFrame({"key": ["x", "y", "z", "x", "y", "z"], - "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]}) - df - -*Previous behavior*: - -.. 
code-block:: ipython - - In [2]: grouped = df.groupby("key", as_index=False) - In [3]: result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) - In [4]: result - Out[4]: - min_val - 0 x - 1 y - 2 z - -*New behavior*: - -.. ipython:: python - - grouped = df.groupby("key", as_index=False) - result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) - result - - .. _whatsnew_110.deprecations: Deprecations @@ -1029,7 +1030,6 @@ Groupby/resample/rolling The behaviour now is consistent, independent of internal heuristics. (:issue:`31612`, :issue:`14927`, :issue:`13056`) - Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of ``SeriesGroupBy`` previously. The behaviour now allows only ``str`` and callables else would raise ``TypeError``. (:issue:`34422`) - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`) -- Bug in :meth:`DataFrameGroupby.agg` lost results, when ``as_index`` option was set to ``False`` and the result columns were relabeled. The result values were replaced with the index values (:issue:`32240`). Reshaping ^^^^^^^^^