Skip to content

Commit 1d2c2ea

Browse files
mabelvj authored and simonjayhawkins committed
Backport PR pandas-dev#33644 on branch 1.0.x (BUG: Groupby quantiles incorrect bins)
1 parent d7a3932 commit 1d2c2ea

File tree

3 files changed

+31
-7
lines changed

3 files changed

+31
-7
lines changed

doc/source/whatsnew/v1.0.4.rst

+1
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@ Bug fixes
4141
- Bug in :meth:`~DataFrame.to_csv` was silently failing when writing to an invalid s3 bucket. (:issue:`32486`)
4242
- Bug in :meth:`read_parquet` was raising a ``FileNotFoundError`` when passed an s3 directory path. (:issue:`26388`)
4343
- Bug in :meth:`~DataFrame.to_parquet` was throwing an ``AttributeError`` when writing a partitioned parquet file to s3 (:issue:`27596`)
44+
- Bug in :meth:`GroupBy.quantile` was causing the quantiles to be shifted when the ``by`` axis contains ``NaN`` (:issue:`33200`, :issue:`33569`)
4445
-
4546

4647
Contributors

pandas/_libs/groupby.pyx

+7-1
Original file line number | Diff line number | Diff line change
@@ -780,7 +780,13 @@ def group_quantile(ndarray[float64_t] out,
780780
non_na_counts[lab] += 1
781781

782782
# Get an index of values sorted by labels and then values
783-
order = (values, labels)
783+
if labels.any():
784+
# Put '-1' (NaN) labels as the last group so it does not interfere
785+
# with the calculations.
786+
labels_for_lexsort = np.where(labels == -1, labels.max() + 1, labels)
787+
else:
788+
labels_for_lexsort = labels
789+
order = (values, labels_for_lexsort)
784790
sort_arr = np.lexsort(order).astype(np.int64, copy=False)
785791

786792
with nogil:

pandas/tests/groupby/test_function.py

+23-6
Original file line number | Diff line number | Diff line change
@@ -1473,15 +1473,32 @@ def test_quantile_missing_group_values_no_segfaults():
14731473
grp.quantile()
14741474

14751475

1476-
def test_quantile_missing_group_values_correct_results():
1477-
# GH 28662
1478-
data = np.array([1.0, np.nan, 3.0, np.nan])
1479-
df = pd.DataFrame(dict(key=data, val=range(4)))
1476+
@pytest.mark.parametrize(
1477+
"key, val, expected_key, expected_val",
1478+
[
1479+
([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
1480+
([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
1481+
(["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
1482+
([0], [42], [0], [42.0]),
1483+
([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
1484+
],
1485+
)
1486+
def test_quantile_missing_group_values_correct_results(
1487+
key, val, expected_key, expected_val
1488+
):
1489+
# GH 28662, GH 33200, GH 33569
1490+
df = pd.DataFrame({"key": key, "val": val})
14801491

1481-
result = df.groupby("key").quantile()
14821492
expected = pd.DataFrame(
1483-
[1.0, 3.0], index=pd.Index([1.0, 3.0], name="key"), columns=["val"]
1493+
expected_val, index=pd.Index(expected_key, name="key"), columns=["val"]
14841494
)
1495+
1496+
grp = df.groupby("key")
1497+
1498+
result = grp.quantile(0.5)
1499+
tm.assert_frame_equal(result, expected)
1500+
1501+
result = grp.quantile()
14851502
tm.assert_frame_equal(result, expected)
14861503

14871504

0 commit comments

Comments
 (0)