diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2211722ad006d..bd22fdea6b0e1 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -265,6 +265,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ - :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`) +- Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) - Sparse diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 7db30dc1ba9b9..c5d06bcef72a4 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -384,7 +384,7 @@ def qcut( def _bins_to_cuts( x, - bins, + bins: np.ndarray, right: bool = True, labels=None, precision: int = 3, @@ -421,7 +421,7 @@ def _bins_to_cuts( ids = ensure_platform_int(bins.searchsorted(x, side=side)) if include_lowest: - ids[x == bins[0]] = 1 + ids[np.asarray(x) == bins[0]] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 944205c66c3e6..127be504e82d5 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -691,3 +691,48 @@ def test_cut_no_warnings(): labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)] with tm.assert_produces_warning(False): df["group"] = cut(df.value, range(0, 105, 10), right=False, labels=labels) + + +def test_cut_with_duplicated_index_lowest_included(): + # GH 42185 + expected = Series( + [Interval(-0.001, 2, closed="right")] * 3 + + [Interval(2, 4, closed="right"), Interval(-0.001, 2, closed="right")], + index=[0, 1, 2, 3, 0], + dtype="category", + ).cat.as_ordered() + + s = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0]) + result = cut(s, bins=[0, 2, 4], include_lowest=True) + tm.assert_series_equal(result, expected) + + +def test_cut_with_nonexact_categorical_indices(): + # GH 42424 + + ser = Series(range(0, 100)) + ser1 = cut(ser, 10).value_counts().head(5) + ser2 = cut(ser, 10).value_counts().tail(5) + result = DataFrame({"1": ser1, "2": ser2}) + + index = pd.CategoricalIndex( + [ + Interval(-0.099, 9.9, closed="right"), + Interval(9.9, 19.8, closed="right"), + Interval(19.8, 29.7, closed="right"), + Interval(29.7, 39.6, closed="right"), + Interval(39.6, 49.5, closed="right"), + Interval(49.5, 59.4, closed="right"), + Interval(59.4, 69.3, closed="right"), + Interval(69.3, 79.2, closed="right"), + Interval(79.2, 89.1, closed="right"), + Interval(89.1, 99, closed="right"), + ], + ordered=True, + ) + + expected = DataFrame( + {"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index + ) + + tm.assert_frame_equal(expected, result)