From 7778129e16c1b6979f95dd49b9bd0a5361107958 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 15 Apr 2020 13:47:29 -0500 Subject: [PATCH 1/6] BUG: Debug grouped quantile with NA values --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/groupby.pyx | 2 +- pandas/tests/groupby/test_function.py | 31 ++++++++++++++++++++++----- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 82c43811c0444..c1cd582be6b56 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -551,6 +551,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.agg` with dictionary input losing ``ExtensionArray`` dtypes (:issue:`32194`) - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) +- Bug in :meth:`DataFrameGroupBy.quantile` where incorrect values would be returned when missing group keys were present (:issue:`33569`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e7ac3b8442c6d..2439c7f9c27af 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -779,7 +779,7 @@ def group_quantile(ndarray[float64_t] out, non_na_counts[lab] += 1 # Get an index of values sorted by labels and then values - order = (values, labels) + order = (values, np.where(labels == -1, labels.max() + 1, labels)) sort_arr = np.lexsort(order).astype(np.int64, copy=False) with nogil: diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 346de55f551df..2a50262362da2 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1507,14 +1507,35 @@ def test_quantile_missing_group_values_no_segfaults(): grp.quantile() -def test_quantile_missing_group_values_correct_results(): +@pytest.mark.parametrize( + "key, value", + [ + (["a"] * 4 + ["b"] * 4, np.array([1.0, 2.0, 3.0, np.nan] * 2)), + (["a"] * 4 + ["b"] * 3 + [np.nan], np.array([1.0, 2.0, 3.0, np.nan] * 2)), + (["a"] * 4 + [np.nan] + ["b"] * 3, np.array([np.nan, 1.0, 2.0, 3.0] * 2)), + ( + ["a"] * 3 + [np.nan] + ["b"] * 3 + [np.nan], + np.array([1.0, 2.0, 3.0, np.nan] * 2), + ), + ( + ["a"] * 3 + [np.nan] + ["b"] * 3 + [np.nan], + np.array([1.0, 2.0, 3.0, 4.0] * 2), + ), + ], +) +@pytest.mark.parametrize( + "quantile, expected_value", [(0.0, 1.0), (0.5, 2.0), (1.0, 3.0)] +) +def test_quantile_missing_group_values_correct_results( + key, value, quantile, expected_value +): # GH 28662 - data = np.array([1.0, np.nan, 3.0, np.nan]) - df = pd.DataFrame(dict(key=data, val=range(4))) + # https://github.com/pandas-dev/pandas/issues/33569 + df = pd.DataFrame({"key": key, "value": value}) - result = df.groupby("key").quantile() + result = df.groupby("key").quantile(quantile) expected = pd.DataFrame( - [1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"] + [expected_value] * 2, index=pd.Index(["a", "b"], name="key"), columns=["value"] ) tm.assert_frame_equal(result, expected) From 1297438895af5f35ee14647ab172c8301d41494a Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 15 Apr 2020 16:55:10 -0500 Subject: [PATCH 2/6] Comment --- pandas/_libs/groupby.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 2439c7f9c27af..25fde87690310 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -778,7 +778,8 @@ def group_quantile(ndarray[float64_t] out, if not mask[i]: non_na_counts[lab] += 1 - # Get an index of values sorted by labels and then values + # Get an index of values sorted by labels and then values, + # make sure missing labels sort to the back of the array order = (values, np.where(labels == -1, labels.max() + 1, labels)) sort_arr = np.lexsort(order).astype(np.int64, copy=False) From db39b0a2065aa0b324b290782fb118f2a5d67127 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Wed, 15 Apr 2020 17:21:41 -0500 Subject: [PATCH 3/6] Fixup --- pandas/_libs/groupby.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 25fde87690310..9ea780e0b9259 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -780,7 +780,11 @@ def group_quantile(ndarray[float64_t] out, # Get an index of values sorted by labels and then values, # make sure missing labels sort to the back of the array - order = (values, np.where(labels == -1, labels.max() + 1, labels)) + if labels.size: + labels_for_lexsort = np.where(labels == -1, labels.max() + 1, labels) + else: + labels_for_lexsort = labels + order = (values, labels_for_lexsort) sort_arr = np.lexsort(order).astype(np.int64, copy=False) with nogil: From f10e9dd84954536d4ebf559a8884c5922bf29441 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 16 Apr 2020 20:01:45 -0500 Subject: [PATCH 4/6] Better test --- pandas/tests/groupby/test_function.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 2a50262362da2..40702f5e88577 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1508,31 +1508,23 @@ def test_quantile_missing_group_values_no_segfaults(): @pytest.mark.parametrize( - "key, value", + "key", [ - (["a"] * 4 + ["b"] * 4, np.array([1.0, 2.0, 3.0, np.nan] * 2)), - (["a"] * 4 + ["b"] * 3 + [np.nan], np.array([1.0, 2.0, 3.0, np.nan] * 2)), - (["a"] * 4 + [np.nan] + ["b"] * 3, np.array([np.nan, 1.0, 2.0, 3.0] * 2)), - ( - ["a"] * 3 + [np.nan] + ["b"] * 3 + [np.nan], - np.array([1.0, 2.0, 3.0, np.nan] * 2), - ), - ( - ["a"] * 3 + [np.nan] + ["b"] * 3 + [np.nan], - np.array([1.0, 2.0, 3.0, 4.0] * 2), - ), + ["a"] * 4 + ["b"] * 3 + [np.nan], + ["a"] * 3 + [np.nan] + ["b"] * 4, + ["a"] * 3 + [np.nan] + ["b"] * 3 + [np.nan], ], ) @pytest.mark.parametrize( "quantile, expected_value", [(0.0, 1.0), (0.5, 2.0), (1.0, 3.0)] ) def test_quantile_missing_group_values_correct_results( - key, value, quantile, expected_value + key, quantile, expected_value ): # GH 28662 # https://github.com/pandas-dev/pandas/issues/33569 + value = np.array([1.0, 2.0, 3.0, np.nan] * 2) df = pd.DataFrame({"key": key, "value": value}) - result = df.groupby("key").quantile(quantile) expected = pd.DataFrame( [expected_value] * 2, index=pd.Index(["a", "b"], name="key"), columns=["value"] From 0257f86adbe3189e25731dd1b5f664ab8cdb54b9 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 16 Apr 2020 20:08:09 -0500 Subject: [PATCH 5/6] Remove --- pandas/_libs/groupby.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9ea780e0b9259..5e464cfd70539 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -784,8 +784,8 @@ def group_quantile(ndarray[float64_t] out, labels_for_lexsort = np.where(labels == -1, labels.max() + 1, labels) else: labels_for_lexsort = labels - order = (values, labels_for_lexsort) - sort_arr = np.lexsort(order).astype(np.int64, copy=False) + + sort_arr = np.lexsort((values, labels_for_lexsort)).astype(np.int64, copy=False) with nogil: for i in range(ngroups): From e921701043b9d6a939d03ed578733a3bff05e66f Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Thu, 16 Apr 2020 20:50:07 -0500 Subject: [PATCH 6/6] Black --- pandas/tests/groupby/test_function.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 40702f5e88577..1bc236f459992 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1518,9 +1518,7 @@ def test_quantile_missing_group_values_no_segfaults(): @pytest.mark.parametrize( "quantile, expected_value", [(0.0, 1.0), (0.5, 2.0), (1.0, 3.0)] ) -def test_quantile_missing_group_values_correct_results( - key, quantile, expected_value -): +def test_quantile_missing_group_values_correct_results(key, quantile, expected_value): # GH 28662 # https://github.com/pandas-dev/pandas/issues/33569 value = np.array([1.0, 2.0, 3.0, np.nan] * 2)