From debb02ae1830dc511e237ad2e354bc0256aaeabc Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Mon, 24 Dec 2018 16:02:43 +0000 Subject: [PATCH 01/14] GH23970 Added test-case that causes bug 23970 --- pandas/tests/groupby/test_groupby.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6d9f60df45ec8..e96d4a170aecc 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1694,3 +1694,15 @@ def test_groupby_agg_ohlc_non_first(): result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc']) tm.assert_frame_equal(result, expected) + + +def test_groupby_agg_observed_lost_err(): + # GH-23970 + df = pd.DataFrame({ + 'a': pd.Series([1, 1, 2], dtype='category'), + 'b': [1, 2, 2], 'x': [1, 2, 3]}) + + expected = df + result = df.groupby(['a', 'b'], as_index=False, observed=True)['x'].sum() + + tm.assert_frame_equal(result, expected) From f450dc27abe1ef42f6d75ac77d3a928cdbd60f08 Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Mon, 24 Dec 2018 16:12:51 +0000 Subject: [PATCH 02/14] GH 23970 Fixed source of the bug Bug would have impacted any groupby function that relied on `observed` if it were `True` --- pandas/core/groupby/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 33a41ab1cabc4..c5142a4ee98cc 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1336,7 +1336,8 @@ def _gotitem(self, key, ndim, subset=None): return DataFrameGroupBy(subset, self.grouper, selection=key, grouper=self.grouper, exclusions=self.exclusions, - as_index=self.as_index) + as_index=self.as_index, + observed=self.observed) elif ndim == 1: if subset is None: subset = self.obj[key] From 3b50900ddf83937b717d74a8a0a6fec125de6955 Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Mon, 24 Dec 2018 16:15:36 +0000 Subject: [PATCH 03/14] GH23970 Added relevant docstring to whatsnew --- doc/source/whatsnew/v0.24.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a2abda019812a..ec26c9a8a060c 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1560,6 +1560,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ``ZeroDivisionError`` (:issue:`23666`). - Calling :meth:`pandas.core.groupby.GroupBy.rank` with empty groups and ``pct=True`` was raising a ``ZeroDivisionError`` (:issue:`22519`) - Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). +- Bug in :meth:`DataFrameGroupBy._gotitem` where `observed` parameter was not passed properly, causing subsequent function calls that rely on `observed` to misbehave if called with ``observed=True`` (:issue:`23970`) Reshaping ^^^^^^^^^ From e6e989444637dfd69be538f1200619e9aac4ef74 Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Mon, 24 Dec 2018 23:08:16 +0000 Subject: [PATCH 04/14] Made requested change to docstring --- doc/source/whatsnew/v0.24.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index ec26c9a8a060c..768181a46ba08 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1560,7 +1560,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ``ZeroDivisionError`` (:issue:`23666`). - Calling :meth:`pandas.core.groupby.GroupBy.rank` with empty groups and ``pct=True`` was raising a ``ZeroDivisionError`` (:issue:`22519`) - Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). -- Bug in :meth:`DataFrameGroupBy._gotitem` where `observed` parameter was not passed properly, causing subsequent function calls that rely on `observed` to misbehave if called with ``observed=True`` (:issue:`23970`) +- Bug in :meth:`DataFrame.groupby` did not work correctly with ``observed=True`` when aggregating a specified column (:issue:`23970`) Reshaping ^^^^^^^^^ From 51816390d6a27148e6ca1b43da42e18a3e56a7d7 Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Mon, 24 Dec 2018 23:12:15 +0000 Subject: [PATCH 05/14] Made requested changes to tests --- pandas/tests/groupby/test_categorical.py | 11 +++++++++++ pandas/tests/groupby/test_groupby.py | 12 ------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 264f2567e45c1..45b558ace3bc5 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -860,3 +860,14 @@ def test_groupby_multiindex_categorical_datetime(): expected = pd.DataFrame( {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx) assert_frame_equal(result, expected) + + +def test_groupby_agg_observed_true_single_column(): + # GH-23970 + expected = pd.DataFrame({ + 'a': pd.Series([1, 1, 2], dtype='category'), + 'b': [1, 2, 2], 'x': [1, 2, 3]}) + + result = df.groupby(['a', 'b'], as_index=False, observed=True)['x'].sum() + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e96d4a170aecc..6d9f60df45ec8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1694,15 +1694,3 @@ def test_groupby_agg_ohlc_non_first(): result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc']) tm.assert_frame_equal(result, expected) - - -def test_groupby_agg_observed_lost_err(): - # GH-23970 - df = pd.DataFrame({ - 'a': pd.Series([1, 1, 2], dtype='category'), - 'b': [1, 2, 2], 'x': [1, 2, 3]}) - - expected = df - result = df.groupby(['a', 'b'], as_index=False, observed=True)['x'].sum() - - tm.assert_frame_equal(result, expected) From 4c68d417891180bc6d8cf4e4967275a88c316c93 Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Tue, 25 Dec 2018 00:36:44 +0000 Subject: [PATCH 06/14] Fixed typo from incomplete find/replace --- pandas/tests/groupby/test_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 45b558ace3bc5..c0c3662d4768a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -868,6 +868,6 @@ def test_groupby_agg_observed_true_single_column(): 'a': pd.Series([1, 1, 2], dtype='category'), 'b': [1, 2, 2], 'x': [1, 2, 3]}) - result = df.groupby(['a', 'b'], as_index=False, observed=True)['x'].sum() + result = expected.groupby(['a', 'b'], as_index=False, observed=True)['x'].sum() tm.assert_frame_equal(result, expected) From 77cf3c6bb9579742209ce574b17b464e5685b0cc Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Tue, 25 Dec 2018 00:38:17 +0000 Subject: [PATCH 07/14] Fixed pep8 violation --- pandas/tests/groupby/test_categorical.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index c0c3662d4768a..d7588c80a9279 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -868,6 +868,7 @@ def test_groupby_agg_observed_true_single_column(): 'a': pd.Series([1, 1, 2], dtype='category'), 'b': [1, 2, 2], 'x': [1, 2, 3]}) - result = expected.groupby(['a', 'b'], as_index=False, observed=True)['x'].sum() + result = expected.groupby(['a', 'b'], + as_index=False, observed=True)['x'].sum() tm.assert_frame_equal(result, expected) From 572d9c31e952ab9a29d21845b1d9502fd7e0d1ff Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Tue, 25 Dec 2018 02:07:49 +0000 Subject: [PATCH 08/14] Fixed pep's E128 violation --- pandas/tests/groupby/test_categorical.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index d7588c80a9279..a1b45519c577c 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -866,7 +866,9 @@ def test_groupby_agg_observed_true_single_column(): # GH-23970 expected = pd.DataFrame({ 'a': pd.Series([1, 1, 2], dtype='category'), - 'b': [1, 2, 2], 'x': [1, 2, 3]}) + 'b': [1, 2, 2], + 'x': [1, 2, 3] + }) result = expected.groupby(['a', 'b'], as_index=False, observed=True)['x'].sum() From 8793995e72f63e8d1f40240b70b66688678b76a1 Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Tue, 25 Dec 2018 02:43:11 +0000 Subject: [PATCH 09/14] Fixed another possible pep violation --- pandas/tests/groupby/test_categorical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index a1b45519c577c..a3f483a8bd2ca 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -870,7 +870,7 @@ def test_groupby_agg_observed_true_single_column(): 'x': [1, 2, 3] }) - result = expected.groupby(['a', 'b'], - as_index=False, observed=True)['x'].sum() + result = expected.groupby( + ['a', 'b'], as_index=False, observed=True)['x'].sum() tm.assert_frame_equal(result, expected) From b75bc7bde9c3b3f79067691437c2f086bbd7cb9b Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Wed, 26 Dec 2018 03:42:21 +0000 Subject: [PATCH 10/14] Fixed whatsnew entryto meet in the middle --- doc/source/whatsnew/v0.24.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 768181a46ba08..790cc5399b36f 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1560,7 +1560,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ``ZeroDivisionError`` (:issue:`23666`). - Calling :meth:`pandas.core.groupby.GroupBy.rank` with empty groups and ``pct=True`` was raising a ``ZeroDivisionError`` (:issue:`22519`) - Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). -- Bug in :meth:`DataFrame.groupby` did not work correctly with ``observed=True`` when aggregating a specified column (:issue:`23970`) +- Bug in :meth:`DataFrame.groupby` did not respect the `observed` argument when selecting a column and instead always used ``observed=False`` (:issue:`23970`) Reshaping ^^^^^^^^^ From d87e159e77a515b4ae8d39634eefaaea9e6101df Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Fri, 28 Dec 2018 03:16:45 +0000 Subject: [PATCH 11/14] Test has been modified to have parameterised as_index --- pandas/tests/groupby/test_categorical.py | 26 +++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index a3f483a8bd2ca..a5ced921f850b 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -862,15 +862,31 @@ def test_groupby_multiindex_categorical_datetime(): assert_frame_equal(result, expected) -def test_groupby_agg_observed_true_single_column(): +@pytest.mark.parametrize("as_index, expected", [ + (True, pd.Series( + index=pd.MultiIndex.from_arrays( + [pd.Series([1, 1, 2], dtype='category'), + [1, 2, 2]], names=['a', 'b']), + data=[1, 2, 3], name='x' + )), + (False, pd.DataFrame({ + 'a': pd.Series([1, 1, 2], dtype='category'), + 'b': [1, 2, 2], + 'x': [1, 2, 3] + })) +]) +def test_groupby_agg_observed_true_single_column(as_index, expected): # GH-23970 - expected = pd.DataFrame({ + df = pd.DataFrame({ 'a': pd.Series([1, 1, 2], dtype='category'), 'b': [1, 2, 2], 'x': [1, 2, 3] }) - result = expected.groupby( - ['a', 'b'], as_index=False, observed=True)['x'].sum() + result = df.groupby( + ['a', 'b'], as_index=as_index, observed=True)['x'].sum() - tm.assert_frame_equal(result, expected) + if isinstance(result, pd.DataFrame): + tm.assert_frame_equal(result, expected) + else: + tm.assert_series_equal(result, expected) From b01d809efe95aa33be7c474c3640221f731f7a44 Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Fri, 28 Dec 2018 03:19:28 +0000 Subject: [PATCH 12/14] Fixed pep8 issues in test file --- pandas/tests/groupby/test_categorical.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index a5ced921f850b..60f12ae13bef5 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -865,8 +865,9 @@ def test_groupby_multiindex_categorical_datetime(): @pytest.mark.parametrize("as_index, expected", [ (True, pd.Series( index=pd.MultiIndex.from_arrays( - [pd.Series([1, 1, 2], dtype='category'), - [1, 2, 2]], names=['a', 'b']), + [pd.Series([1, 1, 2], dtype='category'), + [1, 2, 2]], names=['a', 'b'] + ), data=[1, 2, 3], name='x' )), (False, pd.DataFrame({ From 7a6898f1ff9935435d99fcbd8f8e73ddd2970e2b Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Fri, 28 Dec 2018 03:20:30 +0000 Subject: [PATCH 13/14] Made recommended change to whatsnew --- doc/source/whatsnew/v0.24.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 790cc5399b36f..bac7746879eda 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1560,7 +1560,7 @@ Groupby/Resample/Rolling - Bug in :meth:`pandas.core.groupby.GroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ``ZeroDivisionError`` (:issue:`23666`). - Calling :meth:`pandas.core.groupby.GroupBy.rank` with empty groups and ``pct=True`` was raising a ``ZeroDivisionError`` (:issue:`22519`) - Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). -- Bug in :meth:`DataFrame.groupby` did not respect the `observed` argument when selecting a column and instead always used ``observed=False`` (:issue:`23970`) +- Bug in :meth:`DataFrame.groupby` did not respect the ``observed`` argument when selecting a column and instead always used ``observed=False`` (:issue:`23970`) Reshaping ^^^^^^^^^ From 3e9178adf96421a03a2a20945e94670f5e334c81 Mon Sep 17 00:00:00 2001 From: Koustav-Samaddar Date: Mon, 31 Dec 2018 10:39:33 +0000 Subject: [PATCH 14/14] Changed test to use assert_equal --- pandas/tests/groupby/test_categorical.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 54df021dd7b0d..a39600d114b89 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -888,10 +888,7 @@ def test_groupby_agg_observed_true_single_column(as_index, expected): result = df.groupby( ['a', 'b'], as_index=as_index, observed=True)['x'].sum() - if isinstance(result, pd.DataFrame): - tm.assert_frame_equal(result, expected) - else: - tm.assert_series_equal(result, expected) + assert_equal(result, expected) @pytest.mark.parametrize('fill_value', [None, np.nan, pd.NaT])