diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4605c14643fa2..5cbc01a4ba67f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -815,6 +815,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) +- Bug in :meth:`GroupBy.quantile` where the quantiles would be shifted when the ``by`` axis contained ``NaN`` (:issue:`33200`, :issue:`33569`) - Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`) - Bug in :meth:`Series.groupby` would raise ``ValueError`` when grouping by :class:`PeriodIndex` level (:issue:`34010`) - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d5d706650bb34..4e792da31e1d5 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -777,7 +777,13 @@ def group_quantile(ndarray[float64_t] out, non_na_counts[lab] += 1 # Get an index of values sorted by labels and then values - order = (values, labels) + if labels.any(): + # Put '-1' (NaN) labels as the last group so it does not interfere + # with the calculations. 
+ labels_for_lexsort = np.where(labels == -1, labels.max() + 1, labels) + else: + labels_for_lexsort = labels + order = (values, labels_for_lexsort) sort_arr = np.lexsort(order).astype(np.int64, copy=False) with nogil: diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 87347fe1293ef..8cfd8035502c3 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -181,15 +181,32 @@ def test_quantile_missing_group_values_no_segfaults(): grp.quantile() -def test_quantile_missing_group_values_correct_results(): - # GH 28662 - data = np.array([1.0, np.nan, 3.0, np.nan]) - df = pd.DataFrame(dict(key=data, val=range(4))) +@pytest.mark.parametrize( + "key, val, expected_key, expected_val", + [ + ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]), + ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]), + (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]), + ([0], [42], [0], [42.0]), + ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")), + ], +) +def test_quantile_missing_group_values_correct_results( + key, val, expected_key, expected_val +): + # GH 28662, GH 33200, GH 33569 + df = pd.DataFrame({"key": key, "val": val}) - result = df.groupby("key").quantile() expected = pd.DataFrame( - [1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"] + expected_val, index=pd.Index(expected_key, name="key"), columns=["val"] ) + + grp = df.groupby("key") + + result = grp.quantile(0.5) + tm.assert_frame_equal(result, expected) + + result = grp.quantile() tm.assert_frame_equal(result, expected)