diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1953132c826ba..68cfaf539c3f1 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -399,6 +399,7 @@ Reshaping - Bug in :func:`concat` where the resulting ``freq`` of two :class:`DatetimeIndex` with the same ``freq`` would be dropped (:issue:`3232`). - Bug in :func:`merge` where merging with equivalent Categorical dtypes was raising an error (:issue:`22501`) - Bug in :class:`DataFrame` constructor when passing non-empty tuples would cause a segmentation fault (:issue:`25691`) +- Bug in :func:`pandas.cut` where large bins could incorrectly raise an error due to an integer overflow (:issue:`26045`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index f99fd9004bb31..8c29bdc2a974c 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -230,7 +230,9 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, else: bins = np.asarray(bins) bins = _convert_bin_to_numeric_type(bins, dtype) - if (np.diff(bins) < 0).any(): + + # GH 26045: cast to float64 to avoid an overflow + if (np.diff(bins.astype('float64')) < 0).any(): raise ValueError('bins must increase monotonically.') fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 6833460fa515b..f71730fb4a313 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -112,6 +112,35 @@ def test_bins_not_monotonic(): cut(data, [0.1, 1.5, 1, 10]) +@pytest.mark.parametrize("x, bins, expected", [ + (date_range("2017-12-31", periods=3), + [Timestamp.min, Timestamp('2018-01-01'), Timestamp.max], + IntervalIndex.from_tuples([ + (Timestamp.min, Timestamp('2018-01-01')), + (Timestamp('2018-01-01'), Timestamp.max)])), + + ([-1, 0, 1], + np.array([np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], + dtype="int64"), + IntervalIndex.from_tuples([ + (np.iinfo(np.int64).min, 0), + (0, np.iinfo(np.int64).max)])), + + ([np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)], + np.array([ + np.timedelta64(-np.iinfo(np.int64).max), + np.timedelta64(0), + np.timedelta64(np.iinfo(np.int64).max)]), + IntervalIndex.from_tuples([ + (np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0)), + (np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max))])), +]) +def test_bins_monotonic_not_overflowing(x, bins, expected): + # GH 26045 + result = cut(x, bins) + tm.assert_index_equal(result.categories, expected) + + def test_wrong_num_labels(): msg = "Bin labels must be one fewer than the number of bin edges" data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]