From db4da3bfbb3841f489b2a278c2cb4e7c3e03c7a3 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 12 Dec 2018 13:14:38 -0800 Subject: [PATCH 1/8] Fix overflow bugs in date_Range --- pandas/core/arrays/datetimes.py | 98 ++++++++++++++++++- .../indexes/datetimes/test_date_range.py | 25 +++++ 2 files changed, 121 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 4849ee1e3e665..6ca1bcdbfedbd 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1724,7 +1724,8 @@ def _generate_regular_range(cls, start, end, periods, freq): return data -def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): +def _generate_range_overflow_safe(endpoint, periods, stride, + side='start'): """ Calculate the second endpoint for passing to np.arange, checking to avoid an integer overflow. Catch OverflowError and re-raise @@ -1747,12 +1748,78 @@ def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): """ # GH#14187 raise instead of incorrectly wrapping around assert side in ['start', 'end'] + + i64max = np.iinfo(np.int64).max + msg = ('Cannot generate range with {side}={endpoint} and ' + 'periods={periods}' + .format(side=side, endpoint=endpoint, periods=periods)) + + with np.errstate(over="raise"): + # if periods * strides cannot be multiplied within the *uint64* bounds, + # we cannot salvage the operation by recursing, so raise + try: + addend = np.uint64(periods) * np.uint64(np.abs(stride)) + except FloatingPointError: + raise tslib.OutOfBoundsDatetime(msg) + + if np.abs(addend) <= i64max: + # relatively easy case without casting concerns + return _generate_range_overflow_safe_signed( + endpoint, periods, stride, side) + + elif ((endpoint > 0 and side == 'start') or + (endpoint < 0 and side == 'end')): + # no chance of not-overflowing + raise tslib.OutOfBoundsDatetime(msg) + + elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max): + # in _generate_regular_range we added `stride` thereby overflowing + # the bounds. Adjust to fix this. + return _generate_range_overflow_safe(endpoint - stride, + periods - 1, stride, side) + + # split into smaller pieces + return _generate_range_recurse(endpoint, periods, stride, side) + + +def _generate_range_overflow_safe_signed(endpoint, periods, stride, side): + """ + A special case for _generate_range_overflow_safe where `periods * stride` + can be calculated without overflowing int64 bounds. + """ + assert side in ['start', 'end'] if side == 'end': stride *= -1 + with np.errstate(over="raise"): + addend = np.int64(periods) * np.int64(stride) + try: + # easy case with no overflows + return np.int64(endpoint) + addend + except (FloatingPointError, OverflowError): + # with endpoint negative and addend positive we risk + # FloatingPointError; with reversed signed we risk OverflowError + pass + + if stride > 0: + # watch out for very special case in which we just slightly + # exceed implementation bounds, but when passing the result to + # np.arange will get a result slightly within the bounds + if endpoint >= 0: + result = np.uint64(endpoint) + np.uint64(addend) + i64max = np.uint64(np.iinfo(np.int64).max) + if result <= i64max + np.uint64(stride): + return result + else: + return _generate_range_recurse(endpoint, periods, + np.abs(stride), side) + elif stride < 0 and endpoint > 0: + return _generate_range_recurse(np.uint64(endpoint), periods, + np.abs(stride), side) + try: other_end = checked_add_with_arr(np.int64(endpoint), - np.int64(periods) * stride) + addend) except OverflowError: raise tslib.OutOfBoundsDatetime('Cannot generate range with ' '{side}={endpoint} and ' @@ -1762,6 +1829,33 @@ def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): return other_end +def _generate_range_recurse(endpoint, periods, stride, side): + """ + Avoid problems in int64/uint64 mismatch by splitting range generation into + smaller pieces. + + Parameters + ---------- + endpoint : int + periods : int + stride : int + side : {'start', 'end'} + + Returns + ------- + other_end : int + """ + # split into smaller pieces + mid_periods = periods // 2 + remaining = periods - mid_periods + assert 0 < remaining < periods, (remaining, periods, endpoint, stride) + print(periods, mid_periods, endpoint, stride, side) + + midpoint = _generate_range_overflow_safe(endpoint, mid_periods, + stride, side) + return _generate_range_overflow_safe(midpoint, remaining, stride, side) + + # ------------------------------------------------------------------- # Validation and Inference diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 11cefec4f34cf..f0997f3b6cd57 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -80,6 +80,31 @@ def test_date_range_timestamp_equiv_preserve_frequency(self): class TestDateRanges(TestData): + def test_date_range_multiplication_overflow(self): + # check that overflows in calculating `addend = periods * stride` + # are caught + with tm.assert_produces_warning(None): + # we should _not_ be seeing a overflow RuntimeWarning + dti = date_range(start='1677-09-22', periods=213503, freq='D') + + assert dti[0] == Timestamp('1677-09-22') + assert len(dti) == 213503 + + msg = "Cannot generate range with" + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range('1969-05-04', periods=200000000, freq='30000D') + + def test_date_range_unsigned_overflow_handling(self): + # case where `addend = periods * stride` overflows int64 bounds + # but not uint64 bounds + dti = date_range(start='1677-09-22', end='2262-04-11', freq='D') + + dti2 = date_range(start=dti[0], periods=len(dti), freq='D') + assert dti2.equals(dti) + + dti3 = date_range(end=dti[-1], periods=len(dti), freq='D') + assert dti3.equals(dti) + def test_date_range_out_of_bounds(self): # GH#14187 with pytest.raises(OutOfBoundsDatetime): From 87acff1ee8ddd8fbfe28c5e93e25b0c6a7364ad4 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 12 Dec 2018 13:26:56 -0800 Subject: [PATCH 2/8] add GH reference --- pandas/tests/indexes/datetimes/test_date_range.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index f0997f3b6cd57..bc9719b036366 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -81,6 +81,7 @@ def test_date_range_timestamp_equiv_preserve_frequency(self): class TestDateRanges(TestData): def test_date_range_multiplication_overflow(self): + # GH#24255 # check that overflows in calculating `addend = periods * stride` # are caught with tm.assert_produces_warning(None): @@ -95,6 +96,7 @@ def test_date_range_multiplication_overflow(self): date_range('1969-05-04', periods=200000000, freq='30000D') def test_date_range_unsigned_overflow_handling(self): + # GH#24255 # case where `addend = periods * stride` overflows int64 bounds # but not uint64 bounds dti = date_range(start='1677-09-22', end='2262-04-11', freq='D') From 67f98f11b04e3596949e34c0444d05a76068a963 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 12 Dec 2018 17:24:14 -0800 Subject: [PATCH 3/8] cleanup, simplification, better error message --- pandas/core/arrays/datetimes.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6ca1bcdbfedbd..5873b19b5a517 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1698,9 +1698,13 @@ def _generate_regular_range(cls, start, end, periods, freq): e = _generate_range_overflow_safe(b, periods, stride, side='start') tz = start.tz elif end is not None: - e = Timestamp(end).value + stride + e = Timestamp(end).value b = _generate_range_overflow_safe(e, periods, stride, side='end') tz = end.tz + + # add an additional step to `e` because np.arange(b, e) will + # not include `e` + e += stride else: raise ValueError("at least 'start' or 'end' should be specified " "if a 'period' is given.") @@ -1752,7 +1756,7 @@ def _generate_range_overflow_safe(endpoint, periods, stride, i64max = np.iinfo(np.int64).max msg = ('Cannot generate range with {side}={endpoint} and ' 'periods={periods}' - .format(side=side, endpoint=endpoint, periods=periods)) + .format(side=side, endpoint=Timestamp(endpoint), periods=periods)) with np.errstate(over="raise"): # if periods * strides cannot be multiplied within the *uint64* bounds, @@ -1772,12 +1776,6 @@ def _generate_range_overflow_safe(endpoint, periods, stride, # no chance of not-overflowing raise tslib.OutOfBoundsDatetime(msg) - elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max): - # in _generate_regular_range we added `stride` thereby overflowing - # the bounds. Adjust to fix this. - return _generate_range_overflow_safe(endpoint - stride, - periods - 1, stride, side) - # split into smaller pieces return _generate_range_recurse(endpoint, periods, stride, side) @@ -1849,7 +1847,6 @@ def _generate_range_recurse(endpoint, periods, stride, side): mid_periods = periods // 2 remaining = periods - mid_periods assert 0 < remaining < periods, (remaining, periods, endpoint, stride) - print(periods, mid_periods, endpoint, stride, side) midpoint = _generate_range_overflow_safe(endpoint, mid_periods, stride, side) From 1bf5642b9c00463d92116284aa6d80d8f9c67a74 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 13 Dec 2018 16:52:36 -0800 Subject: [PATCH 4/8] handle missed corner case --- pandas/core/arrays/datetimes.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 5873b19b5a517..bb1fb5f10dc97 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1698,13 +1698,9 @@ def _generate_regular_range(cls, start, end, periods, freq): e = _generate_range_overflow_safe(b, periods, stride, side='start') tz = start.tz elif end is not None: - e = Timestamp(end).value + e = Timestamp(end).value + stride b = _generate_range_overflow_safe(e, periods, stride, side='end') tz = end.tz - - # add an additional step to `e` because np.arange(b, e) will - # not include `e` - e += stride else: raise ValueError("at least 'start' or 'end' should be specified " "if a 'period' is given.") @@ -1753,10 +1749,10 @@ def _generate_range_overflow_safe(endpoint, periods, stride, # GH#14187 raise instead of incorrectly wrapping around assert side in ['start', 'end'] - i64max = np.iinfo(np.int64).max + i64max = np.uint64(np.iinfo(np.int64).max) msg = ('Cannot generate range with {side}={endpoint} and ' 'periods={periods}' - .format(side=side, endpoint=Timestamp(endpoint), periods=periods)) + .format(side=side, endpoint=endpoint, periods=periods)) with np.errstate(over="raise"): # if periods * strides cannot be multiplied within the *uint64* bounds, @@ -1776,6 +1772,12 @@ def _generate_range_overflow_safe(endpoint, periods, stride, # no chance of not-overflowing raise tslib.OutOfBoundsDatetime(msg) + elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max): + # in _generate_regular_range we added `stride` thereby overflowing + # the bounds. Adjust to fix this. + return _generate_range_overflow_safe(endpoint - stride, + periods - 1, stride, side) + # split into smaller pieces return _generate_range_recurse(endpoint, periods, stride, side) From 9af17d4bb9572b121ee1e2af2f97054c01859d5d Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sat, 15 Dec 2018 16:40:45 -0800 Subject: [PATCH 5/8] more tests, docstrings --- pandas/core/arrays/datetimes.py | 89 ++++++++----------- .../indexes/datetimes/test_date_range.py | 34 +++++++ 2 files changed, 71 insertions(+), 52 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6e7f687b674e2..fcddcba3f5b17 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1728,7 +1728,14 @@ def _generate_regular_range(cls, start, end, periods, freq): raise ValueError("at least 'start' or 'end' should be specified " "if a 'period' is given.") - values = np.arange(b, e, stride, dtype=np.int64) + with np.errstate(over="raise"): + try: + values = np.arange(b, e, stride, dtype=np.int64) + except FloatingPointError: + xdr = [b] + while xdr[-1] != e: + xdr.append(xdr[-1] + stride) + values = np.array(xdr[:-1], dtype=np.int64) else: tz = None @@ -1757,9 +1764,13 @@ def _generate_range_overflow_safe(endpoint, periods, stride, Parameters ---------- endpoint : int + nanosecond timestamp of the known endpoint of the desired range periods : int + number of periods in the desired range stride : int + nanoseconds between periods in the desired range side : {'start', 'end'} + which end of the range `endpoint` refers to Returns ------- @@ -1790,8 +1801,8 @@ def _generate_range_overflow_safe(endpoint, periods, stride, return _generate_range_overflow_safe_signed( endpoint, periods, stride, side) - elif ((endpoint > 0 and side == 'start') or - (endpoint < 0 and side == 'end')): + elif ((endpoint > 0 and side == 'start' and stride > 0) or + (endpoint < 0 and side == 'end' and stride > 0)): # no chance of not-overflowing raise tslib.OutOfBoundsDatetime(msg) @@ -1802,7 +1813,13 @@ def _generate_range_overflow_safe(endpoint, periods, stride, periods - 1, stride, side) # split into smaller pieces - return _generate_range_recurse(endpoint, periods, stride, side) + mid_periods = periods // 2 + remaining = periods - mid_periods + assert 0 < remaining < periods, (remaining, periods, endpoint, stride) + + midpoint = _generate_range_overflow_safe(endpoint, mid_periods, + stride, side) + return _generate_range_overflow_safe(midpoint, remaining, stride, side) def _generate_range_overflow_safe_signed(endpoint, periods, stride, side): @@ -1824,58 +1841,26 @@ def _generate_range_overflow_safe_signed(endpoint, periods, stride, side): # FloatingPointError; with reversed signed we risk OverflowError pass + # if stride and endpoint had opposite signs, then endpoint + addend + # should never overflow. so they must have the same signs + assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0) + if stride > 0: # watch out for very special case in which we just slightly # exceed implementation bounds, but when passing the result to # np.arange will get a result slightly within the bounds - if endpoint >= 0: - result = np.uint64(endpoint) + np.uint64(addend) - i64max = np.uint64(np.iinfo(np.int64).max) - if result <= i64max + np.uint64(stride): - return result - else: - return _generate_range_recurse(endpoint, periods, - np.abs(stride), side) - elif stride < 0 and endpoint > 0: - return _generate_range_recurse(np.uint64(endpoint), periods, - np.abs(stride), side) - - try: - other_end = checked_add_with_arr(np.int64(endpoint), - addend) - except OverflowError: - raise tslib.OutOfBoundsDatetime('Cannot generate range with ' - '{side}={endpoint} and ' - 'periods={periods}' - .format(side=side, endpoint=endpoint, - periods=periods)) - return other_end - - -def _generate_range_recurse(endpoint, periods, stride, side): - """ - Avoid problems in int64/uint64 mismatch by splitting range generation into - smaller pieces. - - Parameters - ---------- - endpoint : int - periods : int - stride : int - side : {'start', 'end'} - - Returns - ------- - other_end : int - """ - # split into smaller pieces - mid_periods = periods // 2 - remaining = periods - mid_periods - assert 0 < remaining < periods, (remaining, periods, endpoint, stride) - - midpoint = _generate_range_overflow_safe(endpoint, mid_periods, - stride, side) - return _generate_range_overflow_safe(midpoint, remaining, stride, side) + assert endpoint >= 0 + result = np.uint64(endpoint) + np.uint64(addend) + i64max = np.uint64(np.iinfo(np.int64).max) + assert result > i64max + if result <= i64max + np.uint64(stride): + return result + + raise tslib.OutOfBoundsDatetime('Cannot generate range with ' + '{side}={endpoint} and ' + 'periods={periods}' + .format(side=side, endpoint=endpoint, + periods=periods)) # ------------------------------------------------------------------- diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index d33b25d618f64..918d9c4ed820e 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -115,6 +115,40 @@ def test_date_range_unsigned_overflow_handling(self): dti3 = date_range(end=dti[-1], periods=len(dti), freq='D') assert dti3.equals(dti) + def test_date_range_int64_overflow_non_recoverable(self): + # GH#24255 + # case with start later than 1970-01-01, overflow int64 but not uint64 + msg = "Cannot generate range with" + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range(start='1970-02-01', periods=106752 * 24, freq='H') + + # case with end before 1970-01-01, overflow int64 but not uint64 + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range(end='1969-11-14', periods=106752 * 24, freq='H') + + def test_date_range_int64_overflow_stride_endpoint_different_signs(self): + # cases where stride * periods overflow int64 and stride/endpoint + # have different signs + start = Timestamp('2262-02-23') + end = Timestamp('1969-11-14') + + expected = date_range(start=start, end=end, freq='-1H') + assert expected[0] == start + assert expected[-1] == end + + dti = date_range(end=end, periods=len(expected), freq='-1H') + tm.assert_index_equal(dti, expected) + + start2 = Timestamp('1970-02-01') + end2 = Timestamp('1677-10-22') + + expected2 = date_range(start=start2, end=end2, freq='-1H') + assert expected2[0] == start2 + assert expected2[-1] == end2 + + dti2 = date_range(start=start2, periods=len(expected2), freq='-1H') + tm.assert_index_equal(dti2, expected2) + def test_date_range_out_of_bounds(self): # GH#14187 with pytest.raises(OutOfBoundsDatetime): From 40c2f5bb5efdb72d4039acb2b40dc2cc0bedb2a0 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 21 Dec 2018 15:38:56 -0800 Subject: [PATCH 6/8] move range-generation functions to arrays._ranges --- pandas/core/arrays/_ranges.py | 186 ++++++++++++++++++++++++++++++++ pandas/core/arrays/datetimes.py | 185 +------------------------------ 2 files changed, 190 insertions(+), 181 deletions(-) create mode 100644 pandas/core/arrays/_ranges.py diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py new file mode 100644 index 0000000000000..91356f66fc66e --- /dev/null +++ b/pandas/core/arrays/_ranges.py @@ -0,0 +1,186 @@ +# -*- coding: utf-8 -*- +""" +Helper functions to generate range-like data for DatetimeArray +(and possibly TimedeltaArray/PeriodArray) +""" + +import numpy as np + +from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp + +from pandas.tseries.offsets import Tick, generate_range + + +def generate_regular_range(start, end, periods, freq): + """ + Generate a range of dates with the spans between dates described by + the given `freq` DateOffset. + + Parameters + ---------- + start : Timestamp or None + first point of produced date range + end : Timestamp or None + last point of produced date range + periods : int + number of periods in produced date range + freq : DateOffset + describes space between dates in produced date range + + Returns + ------- + ndarray[np.int64] representing nanosecond unix timestamps + """ + if isinstance(freq, Tick): + stride = freq.nanos + if periods is None: + b = Timestamp(start).value + # cannot just use e = Timestamp(end) + 1 because arange breaks when + # stride is too large, see GH10887 + e = (b + (Timestamp(end).value - b) // stride * stride + + stride // 2 + 1) + # end.tz == start.tz by this point due to _generate implementation + tz = start.tz + elif start is not None: + b = Timestamp(start).value + e = _generate_range_overflow_safe(b, periods, stride, side='start') + tz = start.tz + elif end is not None: + e = Timestamp(end).value + stride + b = _generate_range_overflow_safe(e, periods, stride, side='end') + tz = end.tz + else: + raise ValueError("at least 'start' or 'end' should be specified " + "if a 'period' is given.") + + with np.errstate(over="raise"): + try: + values = np.arange(b, e, stride, dtype=np.int64) + except FloatingPointError: + xdr = [b] + while xdr[-1] != e: + xdr.append(xdr[-1] + stride) + values = np.array(xdr[:-1], dtype=np.int64) + + else: + tz = None + # start and end should have the same timezone by this point + if start is not None: + tz = start.tz + elif end is not None: + tz = end.tz + + xdr = generate_range(start=start, end=end, + periods=periods, offset=freq) + + values = np.array([x.value for x in xdr], dtype=np.int64) + + return values, tz + + +def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): + """ + Calculate the second endpoint for passing to np.arange, checking + to avoid an integer overflow. Catch OverflowError and re-raise + as OutOfBoundsDatetime. + + Parameters + ---------- + endpoint : int + nanosecond timestamp of the known endpoint of the desired range + periods : int + number of periods in the desired range + stride : int + nanoseconds between periods in the desired range + side : {'start', 'end'} + which end of the range `endpoint` refers to + + Returns + ------- + other_end : int + + Raises + ------ + OutOfBoundsDatetime + """ + # GH#14187 raise instead of incorrectly wrapping around + assert side in ['start', 'end'] + + i64max = np.uint64(np.iinfo(np.int64).max) + msg = ('Cannot generate range with {side}={endpoint} and ' + 'periods={periods}' + .format(side=side, endpoint=endpoint, periods=periods)) + + with np.errstate(over="raise"): + # if periods * strides cannot be multiplied within the *uint64* bounds, + # we cannot salvage the operation by recursing, so raise + try: + addend = np.uint64(periods) * np.uint64(np.abs(stride)) + except FloatingPointError: + raise OutOfBoundsDatetime(msg) + + if np.abs(addend) <= i64max: + # relatively easy case without casting concerns + return _generate_range_overflow_safe_signed( + endpoint, periods, stride, side) + + elif ((endpoint > 0 and side == 'start' and stride > 0) or + (endpoint < 0 and side == 'end' and stride > 0)): + # no chance of not-overflowing + raise OutOfBoundsDatetime(msg) + + elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max): + # in _generate_regular_range we added `stride` thereby overflowing + # the bounds. Adjust to fix this. + return _generate_range_overflow_safe(endpoint - stride, + periods - 1, stride, side) + + # split into smaller pieces + mid_periods = periods // 2 + remaining = periods - mid_periods + assert 0 < remaining < periods, (remaining, periods, endpoint, stride) + + midpoint = _generate_range_overflow_safe(endpoint, mid_periods, + stride, side) + return _generate_range_overflow_safe(midpoint, remaining, stride, side) + + +def _generate_range_overflow_safe_signed(endpoint, periods, stride, side): + """ + A special case for _generate_range_overflow_safe where `periods * stride` + can be calculated without overflowing int64 bounds. + """ + assert side in ['start', 'end'] + if side == 'end': + stride *= -1 + + with np.errstate(over="raise"): + addend = np.int64(periods) * np.int64(stride) + try: + # easy case with no overflows + return np.int64(endpoint) + addend + except (FloatingPointError, OverflowError): + # with endpoint negative and addend positive we risk + # FloatingPointError; with reversed signed we risk OverflowError + pass + + # if stride and endpoint had opposite signs, then endpoint + addend + # should never overflow. so they must have the same signs + assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0) + + if stride > 0: + # watch out for very special case in which we just slightly + # exceed implementation bounds, but when passing the result to + # np.arange will get a result slightly within the bounds + assert endpoint >= 0 + result = np.uint64(endpoint) + np.uint64(addend) + i64max = np.uint64(np.iinfo(np.int64).max) + assert result > i64max + if result <= i64max + np.uint64(stride): + return result + + raise OutOfBoundsDatetime('Cannot generate range with ' + '{side}={endpoint} and ' + 'periods={periods}' + .format(side=side, endpoint=endpoint, + periods=periods)) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index bd66f56f40c59..8fff4ef1442ee 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -24,10 +24,11 @@ from pandas.core import ops from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com from pandas.tseries.frequencies import get_period_alias, to_offset -from pandas.tseries.offsets import Day, Tick, generate_range +from pandas.tseries.offsets import Day, Tick _midnight = time(0, 0) @@ -306,7 +307,8 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if end is not None: end = end.tz_localize(None) # TODO: consider re-implementing _cached_range; GH#17914 - index = _generate_regular_range(cls, start, end, periods, freq) + values, tz = generate_regular_range(start, end, periods, freq) + index = cls._simple_new(values, freq=freq, tz=tz) if tz is not None and index.tz is None: arr = conversion.tz_localize_to_utc( @@ -1685,185 +1687,6 @@ def maybe_convert_dtype(data, copy): return data, copy -def _generate_regular_range(cls, start, end, periods, freq): - """ - Generate a range of dates with the spans between dates described by - the given `freq` DateOffset. - - Parameters - ---------- - cls : class - start : Timestamp or None - first point of produced date range - end : Timestamp or None - last point of produced date range - periods : int - number of periods in produced date range - freq : DateOffset - describes space between dates in produced date range - - Returns - ------- - ndarray[np.int64] representing nanosecond unix timestamps - - """ - if isinstance(freq, Tick): - stride = freq.nanos - if periods is None: - b = Timestamp(start).value - # cannot just use e = Timestamp(end) + 1 because arange breaks when - # stride is too large, see GH10887 - e = (b + (Timestamp(end).value - b) // stride * stride + - stride // 2 + 1) - # end.tz == start.tz by this point due to _generate implementation - tz = start.tz - elif start is not None: - b = Timestamp(start).value - e = _generate_range_overflow_safe(b, periods, stride, side='start') - tz = start.tz - elif end is not None: - e = Timestamp(end).value + stride - b = _generate_range_overflow_safe(e, periods, stride, side='end') - tz = end.tz - else: - raise ValueError("at least 'start' or 'end' should be specified " - "if a 'period' is given.") - - with np.errstate(over="raise"): - try: - values = np.arange(b, e, stride, dtype=np.int64) - except FloatingPointError: - xdr = [b] - while xdr[-1] != e: - xdr.append(xdr[-1] + stride) - values = np.array(xdr[:-1], dtype=np.int64) - - else: - tz = None - # start and end should have the same timezone by this point - if start is not None: - tz = start.tz - elif end is not None: - tz = end.tz - - xdr = generate_range(start=start, end=end, - periods=periods, offset=freq) - - values = np.array([x.value for x in xdr], dtype=np.int64) - - data = cls._simple_new(values, freq=freq, tz=tz) - return data - - -def _generate_range_overflow_safe(endpoint, periods, stride, - side='start'): - """ - Calculate the second endpoint for passing to np.arange, checking - to avoid an integer overflow. Catch OverflowError and re-raise - as OutOfBoundsDatetime. - - Parameters - ---------- - endpoint : int - nanosecond timestamp of the known endpoint of the desired range - periods : int - number of periods in the desired range - stride : int - nanoseconds between periods in the desired range - side : {'start', 'end'} - which end of the range `endpoint` refers to - - Returns - ------- - other_end : int - - Raises - ------ - OutOfBoundsDatetime - """ - # GH#14187 raise instead of incorrectly wrapping around - assert side in ['start', 'end'] - - i64max = np.uint64(np.iinfo(np.int64).max) - msg = ('Cannot generate range with {side}={endpoint} and ' - 'periods={periods}' - .format(side=side, endpoint=endpoint, periods=periods)) - - with np.errstate(over="raise"): - # if periods * strides cannot be multiplied within the *uint64* bounds, - # we cannot salvage the operation by recursing, so raise - try: - addend = np.uint64(periods) * np.uint64(np.abs(stride)) - except FloatingPointError: - raise tslib.OutOfBoundsDatetime(msg) - - if np.abs(addend) <= i64max: - # relatively easy case without casting concerns - return _generate_range_overflow_safe_signed( - endpoint, periods, stride, side) - - elif ((endpoint > 0 and side == 'start' and stride > 0) or - (endpoint < 0 and side == 'end' and stride > 0)): - # no chance of not-overflowing - raise tslib.OutOfBoundsDatetime(msg) - - elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max): - # in _generate_regular_range we added `stride` thereby overflowing - # the bounds. Adjust to fix this. - return _generate_range_overflow_safe(endpoint - stride, - periods - 1, stride, side) - - # split into smaller pieces - mid_periods = periods // 2 - remaining = periods - mid_periods - assert 0 < remaining < periods, (remaining, periods, endpoint, stride) - - midpoint = _generate_range_overflow_safe(endpoint, mid_periods, - stride, side) - return _generate_range_overflow_safe(midpoint, remaining, stride, side) - - -def _generate_range_overflow_safe_signed(endpoint, periods, stride, side): - """ - A special case for _generate_range_overflow_safe where `periods * stride` - can be calculated without overflowing int64 bounds. - """ - assert side in ['start', 'end'] - if side == 'end': - stride *= -1 - - with np.errstate(over="raise"): - addend = np.int64(periods) * np.int64(stride) - try: - # easy case with no overflows - return np.int64(endpoint) + addend - except (FloatingPointError, OverflowError): - # with endpoint negative and addend positive we risk - # FloatingPointError; with reversed signed we risk OverflowError - pass - - # if stride and endpoint had opposite signs, then endpoint + addend - # should never overflow. so they must have the same signs - assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0) - - if stride > 0: - # watch out for very special case in which we just slightly - # exceed implementation bounds, but when passing the result to - # np.arange will get a result slightly within the bounds - assert endpoint >= 0 - result = np.uint64(endpoint) + np.uint64(addend) - i64max = np.uint64(np.iinfo(np.int64).max) - assert result > i64max - if result <= i64max + np.uint64(stride): - return result - - raise tslib.OutOfBoundsDatetime('Cannot generate range with ' - '{side}={endpoint} and ' - 'periods={periods}' - .format(side=side, endpoint=endpoint, - periods=periods)) - - # ------------------------------------------------------------------- # Validation and Inference From 018ccca8bdccba385574bd191df048cf398e1873 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Fri, 21 Dec 2018 17:16:00 -0800 Subject: [PATCH 7/8] avoid over-writing tz --- pandas/core/arrays/datetimes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8fff4ef1442ee..467a6ba7ecc94 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -307,8 +307,8 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if end is not None: end = end.tz_localize(None) # TODO: consider re-implementing _cached_range; GH#17914 - values, tz = generate_regular_range(start, end, periods, freq) - index = cls._simple_new(values, freq=freq, tz=tz) + values, _tz = generate_regular_range(start, end, periods, freq) + index = cls._simple_new(values, freq=freq, tz=_tz) if tz is not None and index.tz is None: arr = conversion.tz_localize_to_utc( From 43d3d8344fd282104ea66f6e87c6165cad39187c Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Sun, 23 Dec 2018 08:47:53 -0800 Subject: [PATCH 8/8] overflow comment --- pandas/core/arrays/_ranges.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 91356f66fc66e..66c1b8e158672 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -54,6 +54,8 @@ def generate_regular_range(start, end, periods, freq): "if a 'period' is given.") with np.errstate(over="raise"): + # If the range is sufficiently large, np.arange may overflow + # and incorrectly return an empty array if not caught. try: values = np.arange(b, e, stride, dtype=np.int64) except FloatingPointError: