Skip to content

Commit c831ccd

Browse files
rob-sil and mroeschke authored
BUG: Handle floating point boundaries in qcut (#59409)
* Handle floating point boundaries
* Mypy errors don't go away just because I didn't check them

---------

Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 5591ef3 commit c831ccd

File tree

4 files changed

+33
-11
lines changed

4 files changed

+33
-11
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -618,6 +618,7 @@ Groupby/resample/rolling
618618

619619
Reshaping
620620
^^^^^^^^^
621+
- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
621622
- Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
622623
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
623624
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)

pandas/core/array_algos/quantile.py

+10-10
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,9 @@ def quantile_with_mask(
9494
flat = np.array([fill_value] * len(qs))
9595
result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
9696
else:
97-
result = _nanpercentile(
97+
result = _nanquantile(
9898
values,
99-
qs * 100.0,
99+
qs,
100100
na_value=fill_value,
101101
mask=mask,
102102
interpolation=interpolation,
@@ -108,15 +108,15 @@ def quantile_with_mask(
108108
return result
109109

110110

111-
def _nanpercentile_1d(
111+
def _nanquantile_1d(
112112
values: np.ndarray,
113113
mask: npt.NDArray[np.bool_],
114114
qs: npt.NDArray[np.float64],
115115
na_value: Scalar,
116116
interpolation: str,
117117
) -> Scalar | np.ndarray:
118118
"""
119-
Wrapper for np.percentile that skips missing values, specialized to
119+
Wrapper for np.quantile that skips missing values, specialized to
120120
1-dimensional case.
121121
122122
Parameters
@@ -142,7 +142,7 @@ def _nanpercentile_1d(
142142
# equiv: 'np.array([na_value] * len(qs))' but much faster
143143
return np.full(len(qs), na_value)
144144

145-
return np.percentile(
145+
return np.quantile(
146146
values,
147147
qs,
148148
# error: No overload variant of "quantile" matches argument
@@ -152,7 +152,7 @@ def _nanpercentile_1d(
152152
)
153153

154154

155-
def _nanpercentile(
155+
def _nanquantile(
156156
values: np.ndarray,
157157
qs: npt.NDArray[np.float64],
158158
*,
@@ -161,7 +161,7 @@ def _nanpercentile(
161161
interpolation: str,
162162
):
163163
"""
164-
Wrapper for np.percentile that skips missing values.
164+
Wrapper for np.quantile that skips missing values.
165165
166166
Parameters
167167
----------
@@ -180,7 +180,7 @@ def _nanpercentile(
180180

181181
if values.dtype.kind in "mM":
182182
# need to cast to integer to avoid rounding errors in numpy
183-
result = _nanpercentile(
183+
result = _nanquantile(
184184
values.view("i8"),
185185
qs=qs,
186186
na_value=na_value.view("i8"),
@@ -196,7 +196,7 @@ def _nanpercentile(
196196
# Caller is responsible for ensuring mask shape match
197197
assert mask.shape == values.shape
198198
result = [
199-
_nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
199+
_nanquantile_1d(val, m, qs, na_value, interpolation=interpolation)
200200
for (val, m) in zip(list(values), list(mask))
201201
]
202202
if values.dtype.kind == "f":
@@ -215,7 +215,7 @@ def _nanpercentile(
215215
result = result.astype(values.dtype, copy=False)
216216
return result
217217
else:
218-
return np.percentile(
218+
return np.quantile(
219219
values,
220220
qs,
221221
axis=1,

pandas/core/reshape/tile.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,16 @@ def qcut(
358358
x_idx = _preprocess_for_cut(x)
359359
x_idx, _ = _coerce_to_type(x_idx)
360360

361-
quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
361+
if is_integer(q):
362+
quantiles = np.linspace(0, 1, q + 1)
363+
# Round up rather than to nearest if not representable in base 2
364+
np.putmask(
365+
quantiles,
366+
q * quantiles != np.arange(q + 1),
367+
np.nextafter(quantiles, 1),
368+
)
369+
else:
370+
quantiles = q
362371

363372
bins = x_idx.to_series().dropna().quantile(quantiles)
364373

pandas/tests/reshape/test_qcut.py

+12
Original file line numberDiff line numberDiff line change
@@ -307,3 +307,15 @@ def test_qcut_nullable_integer(q, any_numeric_ea_dtype):
307307
expected = qcut(arr.astype(float), q)
308308

309309
tm.assert_categorical_equal(result, expected)
310+
311+
312+
@pytest.mark.parametrize("scale", [1.0, 1 / 3, 17.0])
313+
@pytest.mark.parametrize("q", [3, 7, 9])
314+
@pytest.mark.parametrize("precision", [1, 3, 16])
315+
def test_qcut_contains(scale, q, precision):
316+
# GH-59355
317+
arr = (scale * np.arange(q + 1)).round(precision)
318+
result = qcut(arr, q, precision=precision)
319+
320+
for value, bucket in zip(arr, result):
321+
assert value in bucket

0 commit comments

Comments
 (0)