From 9d40d0572426a1c13416972e9fd192b57941b682 Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Sun, 4 Aug 2024 09:30:29 -0700 Subject: [PATCH 1/2] Handle floating point boundaries --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/array_algos/quantile.py | 35 +++++++++-------------------- pandas/core/reshape/tile.py | 11 ++++++++- pandas/tests/reshape/test_qcut.py | 12 ++++++++++ 4 files changed, 33 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e3c4e69db7cbd..df6aa3232ad1c 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -615,6 +615,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ +- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`) - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 5c933294fb944..9be478d6baed8 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -94,9 +94,9 @@ def quantile_with_mask( flat = np.array([fill_value] * len(qs)) result = np.repeat(flat, len(values)).reshape(len(values), len(qs)) else: - result = _nanpercentile( + result = _nanquantile( values, - qs * 100.0, + qs, na_value=fill_value, mask=mask, interpolation=interpolation, @@ -108,7 +108,7 @@ def quantile_with_mask( return result -def _nanpercentile_1d( +def _nanquantile_1d( values: np.ndarray, mask: npt.NDArray[np.bool_], qs: npt.NDArray[np.float64], @@ -116,7 +116,7 @@ def _nanpercentile_1d( interpolation: str, ) -> Scalar | np.ndarray: """ - Wrapper 
for np.percentile that skips missing values, specialized to + Wrapper for np.quantile that skips missing values, specialized to 1-dimensional case. Parameters @@ -142,17 +142,10 @@ def _nanpercentile_1d( # equiv: 'np.array([na_value] * len(qs))' but much faster return np.full(len(qs), na_value) - return np.percentile( - values, - qs, - # error: No overload variant of "percentile" matches argument - # types "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]" - # , "Dict[str, str]" [call-overload] - method=interpolation, # type: ignore[call-overload] - ) + return np.quantile(values, qs, method=interpolation) -def _nanpercentile( +def _nanquantile( values: np.ndarray, qs: npt.NDArray[np.float64], *, @@ -161,7 +154,7 @@ def _nanpercentile( interpolation: str, ): """ - Wrapper for np.percentile that skips missing values. + Wrapper for np.quantile that skips missing values. Parameters ---------- @@ -180,7 +173,7 @@ def _nanpercentile( if values.dtype.kind in "mM": # need to cast to integer to avoid rounding errors in numpy - result = _nanpercentile( + result = _nanquantile( values.view("i8"), qs=qs, na_value=na_value.view("i8"), @@ -196,7 +189,7 @@ def _nanpercentile( # Caller is responsible for ensuring mask shape match assert mask.shape == values.shape result = [ - _nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation) + _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation) for (val, m) in zip(list(values), list(mask)) ] if values.dtype.kind == "f": @@ -215,12 +208,4 @@ def _nanpercentile( result = result.astype(values.dtype, copy=False) return result else: - return np.percentile( - values, - qs, - axis=1, - # error: No overload variant of "percentile" matches argument types - # "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]", - # "int", "Dict[str, str]" [call-overload] - method=interpolation, # type: ignore[call-overload] - ) + return np.quantile(values, qs, axis=1, method=interpolation) diff --git 
a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 18517199f073c..b3f946f289891 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -358,7 +358,16 @@ def qcut( x_idx = _preprocess_for_cut(x) x_idx, _ = _coerce_to_type(x_idx) - quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q + if is_integer(q): + quantiles = np.linspace(0, 1, q + 1) + # Round up rather than to nearest if not representable in base 2 + np.putmask( + quantiles, + q * quantiles != np.arange(q + 1), + np.nextafter(quantiles, 1), + ) + else: + quantiles = q bins = x_idx.to_series().dropna().quantile(quantiles) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 5f769db7f8acf..b2e9f26e1c407 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -307,3 +307,15 @@ def test_qcut_nullable_integer(q, any_numeric_ea_dtype): expected = qcut(arr.astype(float), q) tm.assert_categorical_equal(result, expected) + + +@pytest.mark.parametrize("scale", [1.0, 1 / 3, 17.0]) +@pytest.mark.parametrize("q", [3, 7, 9]) +@pytest.mark.parametrize("precision", [1, 3, 16]) +def test_qcut_contains(scale, q, precision): + # GH-59355 + arr = (scale * np.arange(q + 1)).round(precision) + result = qcut(arr, q, precision=precision) + + for value, bucket in zip(arr, result): + assert value in bucket From eef3366aebd82e58971df34b613f6e3ea57bed1e Mon Sep 17 00:00:00 2001 From: Rob <124158982+rob-sil@users.noreply.github.com> Date: Thu, 8 Aug 2024 19:49:50 -0700 Subject: [PATCH 2/2] Mypy errors don't go away just because I didn't check them --- pandas/core/array_algos/quantile.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 9be478d6baed8..b2f78182b9bf0 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -142,7 +142,14 @@ def _nanquantile_1d( # equiv: 
'np.array([na_value] * len(qs))' but much faster return np.full(len(qs), na_value) - return np.quantile(values, qs, method=interpolation) + return np.quantile( + values, + qs, + # error: No overload variant of "quantile" matches argument + # types "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]" + # , "Dict[str, str]" [call-overload] + method=interpolation, # type: ignore[call-overload] + ) def _nanquantile( @@ -208,4 +215,12 @@ def _nanquantile( result = result.astype(values.dtype, copy=False) return result else: - return np.quantile(values, qs, axis=1, method=interpolation) + return np.quantile( + values, + qs, + axis=1, + # error: No overload variant of "quantile" matches argument types + # "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]", + # "int", "Dict[str, str]" [call-overload] + method=interpolation, # type: ignore[call-overload] + )