Skip to content

Commit ecbb0ef

Browse files
Batalex authored and jreback committed
BUG: prevent overflowing diffs raising error in cut (pandas-dev#26045) (pandas-dev#26063)
1 parent c18c8be commit ecbb0ef

File tree

3 files changed

+33
-1
lines changed

3 files changed

+33
-1
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -400,6 +400,7 @@ Reshaping
400400
- Bug in :func:`concat` where the resulting ``freq`` of two :class:`DatetimeIndex` with the same ``freq`` would be dropped (:issue:`3232`).
401401
- Bug in :func:`merge` where merging with equivalent Categorical dtypes was raising an error (:issue:`22501`)
402402
- Bug in :class:`DataFrame` constructor when passing non-empty tuples would cause a segmentation fault (:issue:`25691`)
403+
- Bug in :func:`pandas.cut` where large bins could incorrectly raise an error due to an integer overflow (:issue:`26045`)
403404

404405
Sparse
405406
^^^^^^

pandas/core/reshape/tile.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -230,7 +230,9 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,
230230
else:
231231
bins = np.asarray(bins)
232232
bins = _convert_bin_to_numeric_type(bins, dtype)
233-
if (np.diff(bins) < 0).any():
233+
234+
# GH 26045: cast to float64 to avoid an overflow
235+
if (np.diff(bins.astype('float64')) < 0).any():
234236
raise ValueError('bins must increase monotonically.')
235237

236238
fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels,

pandas/tests/reshape/test_cut.py

+29
Original file line number | Diff line number | Diff line change
@@ -112,6 +112,35 @@ def test_bins_not_monotonic():
112112
cut(data, [0.1, 1.5, 1, 10])
113113

114114

115+
@pytest.mark.parametrize("x, bins, expected", [
116+
(date_range("2017-12-31", periods=3),
117+
[Timestamp.min, Timestamp('2018-01-01'), Timestamp.max],
118+
IntervalIndex.from_tuples([
119+
(Timestamp.min, Timestamp('2018-01-01')),
120+
(Timestamp('2018-01-01'), Timestamp.max)])),
121+
122+
([-1, 0, 1],
123+
np.array([np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max],
124+
dtype="int64"),
125+
IntervalIndex.from_tuples([
126+
(np.iinfo(np.int64).min, 0),
127+
(0, np.iinfo(np.int64).max)])),
128+
129+
([np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)],
130+
np.array([
131+
np.timedelta64(-np.iinfo(np.int64).max),
132+
np.timedelta64(0),
133+
np.timedelta64(np.iinfo(np.int64).max)]),
134+
IntervalIndex.from_tuples([
135+
(np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0)),
136+
(np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max))])),
137+
])
138+
def test_bins_monotonic_not_overflowing(x, bins, expected):
139+
# GH 26045
140+
result = cut(x, bins)
141+
tm.assert_index_equal(result.categories, expected)
142+
143+
115144
def test_wrong_num_labels():
116145
msg = "Bin labels must be one fewer than the number of bin edges"
117146
data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]

0 commit comments

Comments
 (0)