From 354344939e97c62cfbe551a3cb2dee6ea94ff3a4 Mon Sep 17 00:00:00 2001 From: Jeremy Lopez Date: Tue, 10 Jul 2018 19:46:54 -0400 Subject: [PATCH 1/7] BUG: Fix groupby bug #21624. Fixes bug where operations such as transform('sum') raise errors when only a single null group exists. --- pandas/core/groupby/groupby.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5dc1d518d1c2d..a86c8daf739c7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2350,6 +2350,10 @@ def size(self): out = np.bincount(ids[ids != -1], minlength=ngroup) else: out = ids + # Covers the edge case where only a null group exists + if self.result_index.shape[0] == 0: + out = [] + return Series(out, index=self.result_index, dtype='int64') From eddcbd177b1231457ca57d093f6ea40c98f82d34 Mon Sep 17 00:00:00 2001 From: Jeremy Lopez Date: Tue, 10 Jul 2018 20:22:39 -0400 Subject: [PATCH 2/7] Improving fix to bug #21624. --- pandas/core/groupby/groupby.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a86c8daf739c7..dc0c3b28acec8 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2349,11 +2349,7 @@ def size(self): if ngroup: out = np.bincount(ids[ids != -1], minlength=ngroup) else: - out = ids - # Covers the edge case where only a null group exists - if self.result_index.shape[0] == 0: out = [] - return Series(out, index=self.result_index, dtype='int64') From 59b3c453d1b6c55415b3f1f9ae3c7c296c128676 Mon Sep 17 00:00:00 2001 From: Jeremy Lopez Date: Wed, 11 Jul 2018 09:47:12 -0400 Subject: [PATCH 3/7] Adding test to check that the issue is resolved. 
--- pandas/tests/groupby/test_transform.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 7fccf1f57a886..f69380e51790a 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -782,3 +782,14 @@ def test_any_all_np_func(func): res = df.groupby('key')['val'].transform(func) tm.assert_series_equal(res, exp) + + +def test_transform_with_all_nan(): + # GH 21624 + df = DataFrame({'groups': [np.nan, np.nan, np.nan], + 'values': [1, 2, 3]}) + + grouped = df.groupby('groups') + summed = grouped['values'].transform('sum') + expected = Series([np.nan, np.nan, np.nan], name='values') + tm.assert_series_equal(summed, expected) From d47beea336c61563db52e5758a26c34f769e5b6a Mon Sep 17 00:00:00 2001 From: Jeremy Lopez Date: Wed, 11 Jul 2018 12:54:28 -0400 Subject: [PATCH 4/7] Changing variable name to match established practices --- pandas/tests/groupby/test_transform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index f69380e51790a..d087cb6449e54 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -790,6 +790,6 @@ def test_transform_with_all_nan(): 'values': [1, 2, 3]}) grouped = df.groupby('groups') - summed = grouped['values'].transform('sum') + result = grouped['values'].transform('sum') expected = Series([np.nan, np.nan, np.nan], name='values') - tm.assert_series_equal(summed, expected) + tm.assert_series_equal(result, expected) From 6d03bc9e192d8dd495e1f5c56ef39118d7336d02 Mon Sep 17 00:00:00 2001 From: Jeremy Lopez Date: Wed, 11 Jul 2018 23:33:34 -0400 Subject: [PATCH 5/7] Update to deal with refactor --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 
65b9144c0ddc9..f54539a4945b8 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -235,7 +235,7 @@ def size(self): if ngroup: out = np.bincount(ids[ids != -1], minlength=ngroup) else: - out = ids + out = [] return Series(out, index=self.result_index, dtype='int64') From 2fd0e7303830a425f26a792b2160209624ed8820 Mon Sep 17 00:00:00 2001 From: Jeremy Lopez Date: Fri, 13 Jul 2018 10:30:53 -0400 Subject: [PATCH 6/7] Adding test to make sure we didn't break the behavior in non-edge cases --- pandas/tests/groupby/test_transform.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index d087cb6449e54..50edaf704fdaa 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -793,3 +793,12 @@ def test_transform_with_all_nan(): result = grouped['values'].transform('sum') expected = Series([np.nan, np.nan, np.nan], name='values') tm.assert_series_equal(result, expected) + + # Make sure the standard case still works too + df = DataFrame({'groups': [np.nan, 'A', 'A', 'B', 'B'], + 'values': range(5)}) + + grouped = df.groupby('groups') + result = grouped['values'].transform('sum') + expected = Series([np.nan, 3, 3, 7, 7], name='values') + tm.assert_series_equal(result, expected) From 6d2855c2273c6d43349c1eb4444f5c7eeabc03f4 Mon Sep 17 00:00:00 2001 From: Jeremy Lopez Date: Fri, 13 Jul 2018 13:06:19 -0400 Subject: [PATCH 7/7] Changed test to use parametrize decorator. 
--- pandas/tests/groupby/test_transform.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index 50edaf704fdaa..758f215c00fa5 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -784,21 +784,14 @@ def test_any_all_np_func(func): tm.assert_series_equal(res, exp) -def test_transform_with_all_nan(): +@pytest.mark.parametrize("input_df, expected", [ + (DataFrame({'groups': [np.nan, np.nan, np.nan], 'values': [1, 2, 3]}), + Series([np.nan, np.nan, np.nan], name='values')), + (DataFrame({'groups': [np.nan, 'A', 'A', 'B', 'B'], 'values': range(5)}), + Series([np.nan, 3, 3, 7, 7], name='values')) +]) +def test_transform_with_all_nan(input_df, expected): # GH 21624 - df = DataFrame({'groups': [np.nan, np.nan, np.nan], - 'values': [1, 2, 3]}) - - grouped = df.groupby('groups') - result = grouped['values'].transform('sum') - expected = Series([np.nan, np.nan, np.nan], name='values') - tm.assert_series_equal(result, expected) - - # Make sure the standard case still works too - df = DataFrame({'groups': [np.nan, 'A', 'A', 'B', 'B'], - 'values': range(5)}) - - grouped = df.groupby('groups') + grouped = input_df.groupby('groups') result = grouped['values'].transform('sum') - expected = Series([np.nan, 3, 3, 7, 7], name='values') tm.assert_series_equal(result, expected)