Skip to content

Commit 60f61ae

Browse files
debnathshoham authored and CGe0516 committed
BUG: Issue with pd.cut on Series with duplicate index (pandas-dev#42448)
1 parent f9058fa commit 60f61ae

File tree

3 files changed

+48
-2
lines changed

3 files changed

+48
-2
lines changed

doc/source/whatsnew/v1.4.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -265,6 +265,7 @@ Groupby/resample/rolling
265265
Reshaping
266266
^^^^^^^^^
267267
- :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`)
268+
- Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`)
268269
-
269270

270271
Sparse

pandas/core/reshape/tile.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -384,7 +384,7 @@ def qcut(
384384

385385
def _bins_to_cuts(
386386
x,
387-
bins,
387+
bins: np.ndarray,
388388
right: bool = True,
389389
labels=None,
390390
precision: int = 3,
@@ -421,7 +421,7 @@ def _bins_to_cuts(
421421
ids = ensure_platform_int(bins.searchsorted(x, side=side))
422422

423423
if include_lowest:
424-
ids[x == bins[0]] = 1
424+
ids[np.asarray(x) == bins[0]] = 1
425425

426426
na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
427427
has_nas = na_mask.any()

pandas/tests/reshape/test_cut.py

+45
Original file line number | Diff line number | Diff line change
@@ -691,3 +691,48 @@ def test_cut_no_warnings():
691691
labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)]
692692
with tm.assert_produces_warning(False):
693693
df["group"] = cut(df.value, range(0, 105, 10), right=False, labels=labels)
694+
695+
696+
def test_cut_with_duplicated_index_lowest_included():
697+
# GH 42185
698+
expected = Series(
699+
[Interval(-0.001, 2, closed="right")] * 3
700+
+ [Interval(2, 4, closed="right"), Interval(-0.001, 2, closed="right")],
701+
index=[0, 1, 2, 3, 0],
702+
dtype="category",
703+
).cat.as_ordered()
704+
705+
s = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0])
706+
result = cut(s, bins=[0, 2, 4], include_lowest=True)
707+
tm.assert_series_equal(result, expected)
708+
709+
710+
def test_cut_with_nonexact_categorical_indices():
711+
# GH 42424
712+
713+
ser = Series(range(0, 100))
714+
ser1 = cut(ser, 10).value_counts().head(5)
715+
ser2 = cut(ser, 10).value_counts().tail(5)
716+
result = DataFrame({"1": ser1, "2": ser2})
717+
718+
index = pd.CategoricalIndex(
719+
[
720+
Interval(-0.099, 9.9, closed="right"),
721+
Interval(9.9, 19.8, closed="right"),
722+
Interval(19.8, 29.7, closed="right"),
723+
Interval(29.7, 39.6, closed="right"),
724+
Interval(39.6, 49.5, closed="right"),
725+
Interval(49.5, 59.4, closed="right"),
726+
Interval(59.4, 69.3, closed="right"),
727+
Interval(69.3, 79.2, closed="right"),
728+
Interval(79.2, 89.1, closed="right"),
729+
Interval(89.1, 99, closed="right"),
730+
],
731+
ordered=True,
732+
)
733+
734+
expected = DataFrame(
735+
{"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index
736+
)
737+
738+
tm.assert_frame_equal(expected, result)

0 commit comments

Comments
 (0)