diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2a641a37b46d8..65cc8df30cdce 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -598,6 +598,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.agg` with dictionary input losing ``ExtensionArray`` dtypes (:issue:`32194`) - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) +- Bug in :meth:`DataFrameGroupBy.quantile` where incorrect values would be returned when the group keys contained missing values (``NaN``) (:issue:`33569`) - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) Reshaping diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 53e66c4b8723d..e7ef13303646e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -778,9 +778,14 @@ def group_quantile(ndarray[float64_t] out, if not mask[i]: non_na_counts[lab] += 1 - # Get an index of values sorted by labels and then values - order = (values, labels) - sort_arr = np.lexsort(order).astype(np.int64, copy=False) + # Get an index of values sorted by labels and then values, + # make sure missing labels sort to the back of the array + if labels.size: + labels_for_lexsort = np.where(labels == -1, labels.max() + 1, labels) + else: + labels_for_lexsort = labels + + sort_arr = np.lexsort((values, labels_for_lexsort)).astype(np.int64, copy=False) with nogil: for i in range(ngroups): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 346de55f551df..1bc236f459992 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ 
-1507,14 +1507,25 @@ def test_quantile_missing_group_values_no_segfaults(): grp.quantile() -def test_quantile_missing_group_values_correct_results(): +@pytest.mark.parametrize( + "key", + [ + ["a"] * 4 + ["b"] * 3 + [np.nan], + ["a"] * 3 + [np.nan] + ["b"] * 4, + ["a"] * 3 + [np.nan] + ["b"] * 3 + [np.nan], + ], +) +@pytest.mark.parametrize( + "quantile, expected_value", [(0.0, 1.0), (0.5, 2.0), (1.0, 3.0)] +) +def test_quantile_missing_group_values_correct_results(key, quantile, expected_value): # GH 28662 - data = np.array([1.0, np.nan, 3.0, np.nan]) - df = pd.DataFrame(dict(key=data, val=range(4))) - - result = df.groupby("key").quantile() + # https://github.com/pandas-dev/pandas/issues/33569 + value = np.array([1.0, 2.0, 3.0, np.nan] * 2) + df = pd.DataFrame({"key": key, "value": value}) + result = df.groupby("key").quantile(quantile) expected = pd.DataFrame( - [1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"] + [expected_value] * 2, index=pd.Index(["a", "b"], name="key"), columns=["value"] ) tm.assert_frame_equal(result, expected)