From 372198783e3d3d7645d75c5ec23ce23c0347bcbf Mon Sep 17 00:00:00 2001 From: Koustav Samaddar Date: Thu, 22 Nov 2018 18:10:37 -0500 Subject: [PATCH 01/11] BUG: Fixed groupby.rank w/ method="dense", pct=True if any grp count is 1 For some reason the grp_size is stored as 0 instead of 1 in this example causing the error. --- pandas/_libs/groupby_helper.pxi.in | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 523d43f893aad..ffab9b69e1047 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -588,7 +588,16 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, if out[i, 0] != out[i, 0] or out[i, 0] == NAN: out[i, 0] = NAN else: - out[i, 0] = out[i, 0] / grp_sizes[i, 0] + with gil: + try: + out[i, 0] = out[i, 0] / grp_sizes[i, 0] + except ZeroDivisionError: + # BUG 23666 - tie-breaking for dense causes the + # group sizes for groups of count 1 to become 0 + # causing a divide by zero error. The correct + # behaviour should be divide by 1 a.k.a no change + # to out[i, 0] + pass {{endif}} {{endfor}} From a1ba61ee208d7f3110e4cd048591f70685a7ac70 Mon Sep 17 00:00:00 2001 From: Koustav Samaddar Date: Thu, 22 Nov 2018 20:18:33 -0500 Subject: [PATCH 02/11] BUG: Fixed groupby.rank w/ method="dense", pct=True Added change to the what's new file. --- doc/source/whatsnew/v0.24.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 4ff3cc728f7f7..6777dcc448355 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1420,6 +1420,7 @@ Groupby/Resample/Rolling - Bug in :meth:`DataFrame.expanding` in which the ``axis`` argument was not being respected during aggregations (:issue:`23372`) - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` which caused missing values when the input function can accept a :class:`DataFrame` but renames it (:issue:`23455`). - Bug in :func:`pandas.core.groupby.GroupBy.nth` where column order was not always preserved (:issue:`20760`) +- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ZeroDivisionError (:issue:`23666`). Reshaping ^^^^^^^^^ From b8d343cea6c35961b8833da5cfd64a636a77869b Mon Sep 17 00:00:00 2001 From: Koustav Samaddar Date: Thu, 22 Nov 2018 21:17:53 -0500 Subject: [PATCH 03/11] BUG 23666 Moved my test case from local file to approporiate test file in the repo. --- pandas/tests/groupby/test_rank.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index e7e91572c56d1..820e2f37d6f3d 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -290,3 +290,16 @@ def test_rank_empty_group(): result = df.groupby(column).rank(pct=True) expected = DataFrame({"B": [0.5, np.nan, 1.0]}) tm.assert_frame_equal(result, expected) + + +def test_rank_zero_div(): + # GH 23666 + df = pd.DataFrame({ + "A": [1, 1, 1, 2, 2, 2], + "B": [1, 1, 1, 1, 2, 2], + "C": [1, 2, 1, 1, 1, 2]}) + + try: + df.groupby(["A", "B"])["C"].rank(pct=True, method="dense") + except ZeroDivisionError: + pytest.fail("Unexpected zero division error with groupby rank") From 1be7457925ccecfcc129041d3c6339b22111516b Mon Sep 17 00:00:00 2001 From: Koustav Samaddar Date: Thu, 22 Nov 2018 21:41:01 -0500 Subject: [PATCH 04/11] Fixed pep8 in the tests file --- pandas/tests/groupby/test_rank.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 820e2f37d6f3d..b83fc423b1b96 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -295,9 +295,10 @@ def test_rank_empty_group(): def test_rank_zero_div(): # GH 23666 df = pd.DataFrame({ - "A": [1, 1, 1, 2, 2, 2], - "B": [1, 1, 1, 1, 2, 2], - "C": [1, 2, 1, 1, 1, 2]}) + "A": [1, 1, 1, 2, 2, 2], + "B": [1, 1, 1, 1, 2, 2], + "C": [1, 2, 1, 1, 1, 2] + }) try: df.groupby(["A", "B"])["C"].rank(pct=True, method="dense") From f1db09c1ab899474a1439a9a20d0c371e4b16671 Mon Sep 17 00:00:00 2001 From: Koustav Samaddar Date: Thu, 22 Nov 2018 22:22:13 -0500 Subject: [PATCH 05/11] Made the requested modifications --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/_libs/groupby_helper.pxi.in | 17 +++++++---------- pandas/tests/groupby/test_rank.py | 10 +++------- 3 files changed, 11 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 6777dcc448355..d36ddc075b91e 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1420,7 +1420,7 @@ Groupby/Resample/Rolling - Bug in :meth:`DataFrame.expanding` in which the ``axis`` argument was not being respected during aggregations (:issue:`23372`) - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` which caused missing values when the input function can accept a :class:`DataFrame` but renames it (:issue:`23455`). - Bug in :func:`pandas.core.groupby.GroupBy.nth` where column order was not always preserved (:issue:`20760`) -- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ZeroDivisionError (:issue:`23666`). +- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ``ZeroDivisionError`` (:issue:`23666`). Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index ffab9b69e1047..0b7e2035787c7 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -588,16 +588,13 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, if out[i, 0] != out[i, 0] or out[i, 0] == NAN: out[i, 0] = NAN else: - with gil: - try: - out[i, 0] = out[i, 0] / grp_sizes[i, 0] - except ZeroDivisionError: - # BUG 23666 - tie-breaking for dense causes the - # group sizes for groups of count 1 to become 0 - # causing a divide by zero error. The correct - # behaviour should be divide by 1 a.k.a no change - # to out[i, 0] - pass + if grp_sizes[i, 0] != 0: + # BUG 23666 - tie-breaking for dense causes the + # group sizes for groups of count 1 to become 0 + # causing a divide by zero error. The correct + # behaviour should be divide by 1 a.k.a no change + # to out[i, 0] + out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endif}} {{endfor}} diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index b83fc423b1b96..5328edf081b28 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -295,12 +295,8 @@ def test_rank_empty_group(): def test_rank_zero_div(): # GH 23666 df = pd.DataFrame({ - "A": [1, 1, 1, 2, 2, 2], - "B": [1, 1, 1, 1, 2, 2], - "C": [1, 2, 1, 1, 1, 2] + "A": [1, 2], + "B": [1, 1] }) - try: - df.groupby(["A", "B"])["C"].rank(pct=True, method="dense") - except ZeroDivisionError: - pytest.fail("Unexpected zero division error with groupby rank") + df.groupby("A")["B"].rank(pct=True, method="dense") From 7ce206cd8e224ae57b14fe92ee86da94c32a0cd6 Mon Sep 17 00:00:00 2001 From: Koustav Samaddar Date: Fri, 23 Nov 2018 11:16:41 -0500 Subject: [PATCH 06/11] Updated test case to assert expected result --- pandas/tests/groupby/test_rank.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 5328edf081b28..d6285679ccb50 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -294,9 +294,11 @@ def test_rank_empty_group(): def test_rank_zero_div(): # GH 23666 - df = pd.DataFrame({ + df = DataFrame({ "A": [1, 2], "B": [1, 1] }) - df.groupby("A")["B"].rank(pct=True, method="dense") + result = df.groupby("A").rank(pct=True, method="dense") + expected = DataFrame({"B": [1.0, 1.0]}) + tm.assert_frame_equal(result, expected) \ No newline at end of file From 976ecccec26369b615c6670ce55dbf3d473f368c Mon Sep 17 00:00:00 2001 From: Koustav Samaddar Date: Fri, 23 Nov 2018 13:06:32 -0500 Subject: [PATCH 07/11] Moved & shrunk comment block outside if block --- pandas/_libs/groupby_helper.pxi.in | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 0b7e2035787c7..98008ebab0ae9 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -588,12 +588,9 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, if out[i, 0] != out[i, 0] or out[i, 0] == NAN: out[i, 0] = NAN else: + # BUG 23666 - grp_sizes[i, 0] should never be 0 + # If it's zero, it means to be 1 a.k.a. no change if grp_sizes[i, 0] != 0: - # BUG 23666 - tie-breaking for dense causes the - # group sizes for groups of count 1 to become 0 - # causing a divide by zero error. The correct - # behaviour should be divide by 1 a.k.a no change - # to out[i, 0] out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endif}} {{endfor}} From 87ee207470cbf277ccb272d9952d14307a0b55c9 Mon Sep 17 00:00:00 2001 From: Koustav Samaddar Date: Sat, 24 Nov 2018 16:20:04 -0500 Subject: [PATCH 08/11] Fixed pep8 formatting to have empty line at end of file --- pandas/tests/groupby/test_rank.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index d6285679ccb50..868166815d62b 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -301,4 +301,4 @@ def test_rank_zero_div(): result = df.groupby("A").rank(pct=True, method="dense") expected = DataFrame({"B": [1.0, 1.0]}) - tm.assert_frame_equal(result, expected) \ No newline at end of file + tm.assert_frame_equal(result, expected) From 34047dac5c7eae2c252cde120ba8fa14aa3324c4 Mon Sep 17 00:00:00 2001 From: Koustav Samaddar Date: Sat, 24 Nov 2018 16:22:10 -0500 Subject: [PATCH 09/11] Updated to have more test cases for better coverage --- pandas/tests/groupby/test_rank.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 868166815d62b..3caf48dc19461 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -292,13 +292,16 @@ def test_rank_empty_group(): tm.assert_frame_equal(result, expected) -def test_rank_zero_div(): +@pytest.mark.parametrize("df_dicts", [ + ({"A": [1, 2], "B": [1, 1]}, {"B": [1.0, 1.0]}), + ({"A": [1, 1, 2, 2], "B": [1, 2, 1, 2]}, {"B": [0.5, 1.0, 0.5, 1.0]}), + ({"A": [1, 1, 2, 2], "B": [1, 2, 1, np.nan]}, {"B": [0.5, 1.0, 1.0, np.nan]}), + ({"A": [1, 1, 2], "B": [1, 2, np.nan]}, {"B": [0.5, 1.0, np.nan]}) +]) +def test_rank_zero_div(df_dicts): # GH 23666 - df = DataFrame({ - "A": [1, 2], - "B": [1, 1] - }) + df = DataFrame(df_dicts[0]) - result = df.groupby("A").rank(pct=True, method="dense") - expected = DataFrame({"B": [1.0, 1.0]}) + result = df.groupby("A").rank(method="dense", pct=True) + expected = DataFrame(df_dicts[1]) tm.assert_frame_equal(result, expected) From 88f5cb46af13af7aeb1c9143196491d41e467b83 Mon Sep 17 00:00:00 2001 From: Koustav Samaddar Date: Sat, 1 Dec 2018 15:51:22 -0500 Subject: [PATCH 10/11] Made stylistic changes to code and test according to request --- pandas/_libs/groupby_helper.pxi.in | 9 ++++----- pandas/tests/groupby/test_rank.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 98008ebab0ae9..20c1a2646cab1 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -587,11 +587,10 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # rankings, so we assign them percentages of NaN. if out[i, 0] != out[i, 0] or out[i, 0] == NAN: out[i, 0] = NAN - else: - # BUG 23666 - grp_sizes[i, 0] should never be 0 - # If it's zero, it means to be 1 a.k.a. no change - if grp_sizes[i, 0] != 0: - out[i, 0] = out[i, 0] / grp_sizes[i, 0] + # BUG 23666 - grp_sizes[i, 0] should never be 0 + # If it's zero, it means to be 1 a.k.a. no change + elif grp_sizes[i, 0] != 0: + out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endif}} {{endfor}} diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 3caf48dc19461..aaac614761083 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -292,16 +292,16 @@ def test_rank_empty_group(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("df_dicts", [ - ({"A": [1, 2], "B": [1, 1]}, {"B": [1.0, 1.0]}), - ({"A": [1, 1, 2, 2], "B": [1, 2, 1, 2]}, {"B": [0.5, 1.0, 0.5, 1.0]}), - ({"A": [1, 1, 2, 2], "B": [1, 2, 1, np.nan]}, {"B": [0.5, 1.0, 1.0, np.nan]}), - ({"A": [1, 1, 2], "B": [1, 2, np.nan]}, {"B": [0.5, 1.0, np.nan]}) +@pytest.mark.parametrize("input_key,input_value,output_value", [ + ([1, 2], [1, 1], [1.0, 1.0]), + ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]), + ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]), + ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]) ]) -def test_rank_zero_div(df_dicts): +def test_rank_zero_div(input_key, input_value, output_value): # GH 23666 - df = DataFrame(df_dicts[0]) + df = DataFrame({"A": input_key, "B": input_value}) result = df.groupby("A").rank(method="dense", pct=True) - expected = DataFrame(df_dicts[1]) + expected = DataFrame({"B": output_value}) tm.assert_frame_equal(result, expected) From 74f9e0b9283700e546fc8899b8329a8bc88e981c Mon Sep 17 00:00:00 2001 From: Koustav Samaddar Date: Sat, 1 Dec 2018 15:53:48 -0500 Subject: [PATCH 11/11] Removed comment describing bug --- pandas/_libs/groupby_helper.pxi.in | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 20c1a2646cab1..abac9f147848e 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -587,8 +587,6 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # rankings, so we assign them percentages of NaN. if out[i, 0] != out[i, 0] or out[i, 0] == NAN: out[i, 0] = NAN - # BUG 23666 - grp_sizes[i, 0] should never be 0 - # If it's zero, it means to be 1 a.k.a. no change elif grp_sizes[i, 0] != 0: out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endif}}