Skip to content

Commit c15b8b8

Browse files
committed
BUG: Maintain the order of the bins in group_quantile. Updated tests #33200
1 parent c8db9b9 commit c15b8b8

File tree

3 files changed

+18
-8
lines changed

3 files changed

+18
-8
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -603,6 +603,7 @@ Groupby/resample/rolling
603603
- Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`)
604604
- Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`)
605605
- Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`)
606+
- Bug in :meth:`SeriesGroupBy.quantile` where the quantiles are shifted when the ``by`` axis contains ``NaN`` (:issue:`33200`)
606607

607608
Reshaping
608609
^^^^^^^^^

pandas/_libs/groupby.pyx

+4-3
Original file line numberDiff line numberDiff line change
@@ -779,9 +779,10 @@ def group_quantile(ndarray[float64_t] out,
779779
non_na_counts[lab] += 1
780780

781781
# Get an index of values sorted by labels and then values
782-
order = (values, labels)
783-
sort_arr = np.lexsort(order).astype(np.int64, copy=False)
784-
782+
sort_arr = np.arange(len(labels), dtype=np.int64)
783+
mask = labels != -1
784+
order = (np.asarray(values)[mask], labels[mask])
785+
sort_arr[mask] = np.lexsort(order).astype(np.int64, copy=False)
785786
with nogil:
786787
for i in range(ngroups):
787788
# Figure out how many group elements there are

pandas/tests/groupby/test_function.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -1507,14 +1507,22 @@ def test_quantile_missing_group_values_no_segfaults():
15071507
grp.quantile()
15081508

15091509

1510-
def test_quantile_missing_group_values_correct_results():
1511-
# GH 28662
1512-
data = np.array([1.0, np.nan, 3.0, np.nan])
1513-
df = pd.DataFrame(dict(key=data, val=range(4)))
1510+
@pytest.mark.parametrize(
1511+
"key, val, expected_key, expected_val",
1512+
[
1513+
([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 1.0]),
1514+
(["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
1515+
],
1516+
)
1517+
def test_quantile_missing_group_values_correct_results(
1518+
key, val, expected_key, expected_val
1519+
):
1520+
# GH 28662, GH 33200
1521+
df = pd.DataFrame({"key": key, "val": val})
15141522

15151523
result = df.groupby("key").quantile()
15161524
expected = pd.DataFrame(
1517-
[1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"]
1525+
expected_val, index=pd.Index(expected_key, name="key"), columns=["val"]
15181526
)
15191527
tm.assert_frame_equal(result, expected)
15201528

0 commit comments

Comments
 (0)