From 9f76e407dbca88e8c4ec56ec1573e6211176b9c1 Mon Sep 17 00:00:00 2001 From: Alexandre Batisse Date: Fri, 12 Apr 2019 15:16:45 +0200 Subject: [PATCH 1/5] BUG: prevent overflowing diffs raising error in cut (#26045) --- pandas/core/reshape/tile.py | 2 +- pandas/tests/reshape/test_cut.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index f99fd9004bb31..a9271404be005 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -230,7 +230,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, else: bins = np.asarray(bins) bins = _convert_bin_to_numeric_type(bins, dtype) - if (np.diff(bins) < 0).any(): + if (np.diff(bins.astype('float64')) < 0).any(): raise ValueError('bins must increase monotonically.') fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 6833460fa515b..1d63ce49a1bae 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -112,6 +112,14 @@ def test_bins_not_monotonic(): cut(data, [0.1, 1.5, 1, 10]) +def test_bins_monotic_not_overflowing(): + data = date_range("2017-12-31", periods=3) + + result = cut(data, [Timestamp.min, Timestamp('2018-01-01'), Timestamp.max]) + tm.assert_numpy_array_equal(result.codes, + np.array([0, 0, 1], dtype="int8")) + + def test_wrong_num_labels(): msg = "Bin labels must be one fewer than the number of bin edges" data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] From bf0d2ac81a75e62d8300bfb817557a107486e2de Mon Sep 17 00:00:00 2001 From: Alexandre Batisse Date: Fri, 12 Apr 2019 15:39:10 +0200 Subject: [PATCH 2/5] add whatsnew entry --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index c441244b4415d..52b1bc32b554c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -395,6 +395,7 @@ Reshaping - Bug in :func:`concat` where the resulting ``freq`` of two :class:`DatetimeIndex` with the same ``freq`` would be dropped (:issue:`3232`). - Bug in :func:`merge` where merging with equivalent Categorical dtypes was raising an error (:issue:`22501`) - Bug in :class:`DataFrame` constructor when passing non-empty tuples would cause a segmentation fault (:issue:`25691`) +- Bug in :func:`pandas.cut` where large bins could raise an error due to an overflow (:issue:`26045`) Sparse ^^^^^^ From 3bd565aeb979a6076e4bbb9a5c0a4db05faf1e5f Mon Sep 17 00:00:00 2001 From: Alexandre Batisse Date: Mon, 15 Apr 2019 14:03:54 +0200 Subject: [PATCH 3/5] address PR remarks --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/reshape/tile.py | 2 ++ pandas/tests/reshape/test_cut.py | 23 ++++++++++++++++++----- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 52b1bc32b554c..e270bfe0b7670 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -395,7 +395,7 @@ Reshaping - Bug in :func:`concat` where the resulting ``freq`` of two :class:`DatetimeIndex` with the same ``freq`` would be dropped (:issue:`3232`). - Bug in :func:`merge` where merging with equivalent Categorical dtypes was raising an error (:issue:`22501`) - Bug in :class:`DataFrame` constructor when passing non-empty tuples would cause a segmentation fault (:issue:`25691`) -- Bug in :func:`pandas.cut` where large bins could raise an error due to an overflow (:issue:`26045`) +- Bug in :func:`pandas.cut` where large bins could incorrectly raise an error due to an overflow (:issue:`26045`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index a9271404be005..8c29bdc2a974c 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -230,6 +230,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, else: bins = np.asarray(bins) bins = _convert_bin_to_numeric_type(bins, dtype) + + # GH 26045: cast to float64 to avoid an overflow if (np.diff(bins.astype('float64')) < 0).any(): raise ValueError('bins must increase monotonically.') diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 1d63ce49a1bae..4fef27d76802c 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -112,12 +112,25 @@ def test_bins_not_monotonic(): cut(data, [0.1, 1.5, 1, 10]) -def test_bins_monotic_not_overflowing(): - data = date_range("2017-12-31", periods=3) +@pytest.mark.parametrize("x, bins, expected", [ + (date_range("2017-12-31", periods=3), + [Timestamp.min, Timestamp('2018-01-01'), Timestamp.max], + np.array([0, 0, 1], dtype="int8")), + ([-1, 0, 1], + np.array([np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], + dtype="int64"), + np.array([0, 0, 1], dtype="int8")), + ([np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)], + np.array([ + np.timedelta64(-np.iinfo(np.int64).max), + np.timedelta64(0), + np.timedelta64(np.iinfo(np.int64).max)]), + np.array([0, 0, 1], dtype="int8")), - result = cut(data, [Timestamp.min, Timestamp('2018-01-01'), Timestamp.max]) - tm.assert_numpy_array_equal(result.codes, - np.array([0, 0, 1], dtype="int8")) +]) +def test_bins_monotic_not_overflowing(x, bins, expected): + result = cut(x, bins) + tm.assert_numpy_array_equal(result.codes, expected) def test_wrong_num_labels(): From 689b4401f31cd3499f1aed4af38b5a59eada315c Mon Sep 17 00:00:00 2001 From: Alexandre Batisse Date: Mon, 15 Apr 2019 15:06:00 +0200 Subject: [PATCH 4/5] test on index rather than codes --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/tests/reshape/test_cut.py | 16 +++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index e270bfe0b7670..98f163d978284 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -395,7 +395,7 @@ Reshaping - Bug in :func:`concat` where the resulting ``freq`` of two :class:`DatetimeIndex` with the same ``freq`` would be dropped (:issue:`3232`). - Bug in :func:`merge` where merging with equivalent Categorical dtypes was raising an error (:issue:`22501`) - Bug in :class:`DataFrame` constructor when passing non-empty tuples would cause a segmentation fault (:issue:`25691`) -- Bug in :func:`pandas.cut` where large bins could incorrectly raise an error due to an overflow (:issue:`26045`) +- Bug in :func:`pandas.cut` where large bins could incorrectly raise an error due to an integer overflow (:issue:`26045`) Sparse ^^^^^^ diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 4fef27d76802c..ba6aeb571698b 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -115,22 +115,28 @@ def test_bins_not_monotonic(): @pytest.mark.parametrize("x, bins, expected", [ (date_range("2017-12-31", periods=3), [Timestamp.min, Timestamp('2018-01-01'), Timestamp.max], - np.array([0, 0, 1], dtype="int8")), + IntervalIndex.from_tuples([ + (Timestamp.min, Timestamp('2018-01-01')), + (Timestamp('2018-01-01'), Timestamp.max)])), ([-1, 0, 1], np.array([np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], dtype="int64"), - np.array([0, 0, 1], dtype="int8")), + IntervalIndex.from_tuples([ + (np.iinfo(np.int64).min, 0), + (0, np.iinfo(np.int64).max)])), ([np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)], np.array([ np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max)]), - np.array([0, 0, 1], dtype="int8")), - + IntervalIndex.from_tuples([ + (np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0)), + (np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max))])), ]) def test_bins_monotic_not_overflowing(x, bins, expected): + # GH 26045 result = cut(x, bins) - tm.assert_numpy_array_equal(result.codes, expected) + tm.assert_index_equal(result.categories, expected) def test_wrong_num_labels(): From 6381dd160a754257a2e36c7f87a2f44116bbd326 Mon Sep 17 00:00:00 2001 From: Alexandre Batisse Date: Wed, 17 Apr 2019 15:15:16 +0200 Subject: [PATCH 5/5] fix typo --- pandas/tests/reshape/test_cut.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index ba6aeb571698b..f71730fb4a313 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -118,12 +118,14 @@ def test_bins_not_monotonic(): IntervalIndex.from_tuples([ (Timestamp.min, Timestamp('2018-01-01')), (Timestamp('2018-01-01'), Timestamp.max)])), + ([-1, 0, 1], np.array([np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], dtype="int64"), IntervalIndex.from_tuples([ (np.iinfo(np.int64).min, 0), (0, np.iinfo(np.int64).max)])), + ([np.timedelta64(-1), np.timedelta64(0), np.timedelta64(1)], np.array([ np.timedelta64(-np.iinfo(np.int64).max), @@ -133,7 +135,7 @@ def test_bins_not_monotonic(): (np.timedelta64(-np.iinfo(np.int64).max), np.timedelta64(0)), (np.timedelta64(0), np.timedelta64(np.iinfo(np.int64).max))])), ]) -def test_bins_monotic_not_overflowing(x, bins, expected): +def test_bins_monotonic_not_overflowing(x, bins, expected): # GH 26045 result = cut(x, bins) tm.assert_index_equal(result.categories, expected)