From cd9d8728643f0456e1cc75ea3390d81d30809ba7 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Oct 2019 15:32:00 -0700 Subject: [PATCH 1/4] Tests for NA handling --- pandas/tests/groupby/test_function.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 571e710ba8928..83f2969a1a67f 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1373,6 +1373,27 @@ def test_quantile_out_of_bounds_q_raises(): g.quantile(-1) +def test_quantile_missing_group_values_no_segfaults(): + # GH 28662 + data = np.array([1., np.nan, 1.]) + df = pd.DataFrame(dict(key=data, val=range(3))) + + # Random segfaults; would have been guaranteed in loop + grp = df.groupby("key") + for _ in range(100): + grp.quantile() + + +def test_quantile_missing_group_values_correct_results(): + # GH 28662 + data = np.array([1., np.nan, 3., np.nan]) + df = pd.DataFrame(dict(key=data, val=range(4))) + + result = df.groupby("key") + expected = pd.DataFrame([1., 3.], index=pd.Index([1., 3.], name="key")) + tm.assert_frame_equal(result, expected) + + # pipe # -------------------------------- From ff6fe6a652edf64996612c08be8874359a08a787 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Oct 2019 15:35:28 -0700 Subject: [PATCH 2/4] Impl and test fixes --- pandas/_libs/groupby.pyx | 3 +++ pandas/tests/groupby/test_function.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index b2ffbb3ecb4f2..1a8694a0f4746 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -763,6 +763,9 @@ def group_quantile(ndarray[float64_t] out, with nogil: for i in range(N): lab = labels[i] + if lab == -1: # NA group label + continue + counts[lab] += 1 if not mask[i]: non_na_counts[lab] += 1 diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 83f2969a1a67f..a8b98e2507b71 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1389,8 +1389,8 @@ def test_quantile_missing_group_values_correct_results(): data = np.array([1., np.nan, 3., np.nan]) df = pd.DataFrame(dict(key=data, val=range(4))) - result = df.groupby("key") - expected = pd.DataFrame([1., 3.], index=pd.Index([1., 3.], name="key")) + result = df.groupby("key").quantile() + expected = pd.DataFrame([1., 3.], index=pd.Index([1., 3.], name="key"), columns=["val"]) tm.assert_frame_equal(result, expected) From 50e70912d098e48531ff2d7d656918ffc99bfa97 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Oct 2019 15:37:20 -0700 Subject: [PATCH 3/4] Whatsnew --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 01a102f269886..0013a33c474da 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -379,6 +379,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) +- Bug in :meth:`DataFrameGroupBy.quantile` where NA values in the grouping could cause segfaults or incorrect results (:issue:`28882`) Reshaping ^^^^^^^^^ From 69ca2b490eb644c69761b330d23b437656066740 Mon Sep 17 00:00:00 2001 From: Will Ayd Date: Tue, 22 Oct 2019 15:37:41 -0700 Subject: [PATCH 4/4] Black --- pandas/tests/groupby/test_function.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index a8b98e2507b71..2d7dfe49dc038 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1375,7 +1375,7 @@ def test_quantile_out_of_bounds_q_raises(): def test_quantile_missing_group_values_no_segfaults(): # GH 28662 - data = np.array([1., np.nan, 1.]) + data = np.array([1.0, np.nan, 1.0]) df = pd.DataFrame(dict(key=data, val=range(3))) # Random segfaults; would have been guaranteed in loop @@ -1386,11 +1386,13 @@ def test_quantile_missing_group_values_no_segfaults(): def test_quantile_missing_group_values_correct_results(): # GH 28662 - data = np.array([1., np.nan, 3., np.nan]) + data = np.array([1.0, np.nan, 3.0, np.nan]) df = pd.DataFrame(dict(key=data, val=range(4))) result = df.groupby("key").quantile() - expected = pd.DataFrame([1., 3.], index=pd.Index([1., 3.], name="key"), columns=["val"]) + expected = pd.DataFrame( + [1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"] + ) tm.assert_frame_equal(result, expected)