diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py new file mode 100644 index 0000000000000..66c1b8e158672 --- /dev/null +++ b/pandas/core/arrays/_ranges.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- +""" +Helper functions to generate range-like data for DatetimeArray +(and possibly TimedeltaArray/PeriodArray) +""" + +import numpy as np + +from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp + +from pandas.tseries.offsets import Tick, generate_range + + +def generate_regular_range(start, end, periods, freq): + """ + Generate a range of dates with the spans between dates described by + the given `freq` DateOffset. + + Parameters + ---------- + start : Timestamp or None + first point of produced date range + end : Timestamp or None + last point of produced date range + periods : int + number of periods in produced date range + freq : DateOffset + describes space between dates in produced date range + + Returns + ------- + ndarray[np.int64] representing nanosecond unix timestamps + """ + if isinstance(freq, Tick): + stride = freq.nanos + if periods is None: + b = Timestamp(start).value + # cannot just use e = Timestamp(end) + 1 because arange breaks when + # stride is too large, see GH10887 + e = (b + (Timestamp(end).value - b) // stride * stride + + stride // 2 + 1) + # end.tz == start.tz by this point due to _generate implementation + tz = start.tz + elif start is not None: + b = Timestamp(start).value + e = _generate_range_overflow_safe(b, periods, stride, side='start') + tz = start.tz + elif end is not None: + e = Timestamp(end).value + stride + b = _generate_range_overflow_safe(e, periods, stride, side='end') + tz = end.tz + else: + raise ValueError("at least 'start' or 'end' should be specified " + "if a 'period' is given.") + + with np.errstate(over="raise"): + # If the range is sufficiently large, np.arange may overflow + # and incorrectly return an empty array if not caught. + try: + values = np.arange(b, e, stride, dtype=np.int64) + except FloatingPointError: + xdr = [b] + while xdr[-1] != e: + xdr.append(xdr[-1] + stride) + values = np.array(xdr[:-1], dtype=np.int64) + + else: + tz = None + # start and end should have the same timezone by this point + if start is not None: + tz = start.tz + elif end is not None: + tz = end.tz + + xdr = generate_range(start=start, end=end, + periods=periods, offset=freq) + + values = np.array([x.value for x in xdr], dtype=np.int64) + + return values, tz + + +def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): + """ + Calculate the second endpoint for passing to np.arange, checking + to avoid an integer overflow. Catch OverflowError and re-raise + as OutOfBoundsDatetime. + + Parameters + ---------- + endpoint : int + nanosecond timestamp of the known endpoint of the desired range + periods : int + number of periods in the desired range + stride : int + nanoseconds between periods in the desired range + side : {'start', 'end'} + which end of the range `endpoint` refers to + + Returns + ------- + other_end : int + + Raises + ------ + OutOfBoundsDatetime + """ + # GH#14187 raise instead of incorrectly wrapping around + assert side in ['start', 'end'] + + i64max = np.uint64(np.iinfo(np.int64).max) + msg = ('Cannot generate range with {side}={endpoint} and ' + 'periods={periods}' + .format(side=side, endpoint=endpoint, periods=periods)) + + with np.errstate(over="raise"): + # if periods * strides cannot be multiplied within the *uint64* bounds, + # we cannot salvage the operation by recursing, so raise + try: + addend = np.uint64(periods) * np.uint64(np.abs(stride)) + except FloatingPointError: + raise OutOfBoundsDatetime(msg) + + if np.abs(addend) <= i64max: + # relatively easy case without casting concerns + return _generate_range_overflow_safe_signed( + endpoint, periods, stride, side) + + elif ((endpoint > 0 and side == 'start' and stride > 0) or + (endpoint < 0 and side == 'end' and stride > 0)): + # no chance of not-overflowing + raise OutOfBoundsDatetime(msg) + + elif (side == 'end' and endpoint > i64max and endpoint - stride <= i64max): + # in _generate_regular_range we added `stride` thereby overflowing + # the bounds. Adjust to fix this. + return _generate_range_overflow_safe(endpoint - stride, + periods - 1, stride, side) + + # split into smaller pieces + mid_periods = periods // 2 + remaining = periods - mid_periods + assert 0 < remaining < periods, (remaining, periods, endpoint, stride) + + midpoint = _generate_range_overflow_safe(endpoint, mid_periods, + stride, side) + return _generate_range_overflow_safe(midpoint, remaining, stride, side) + + +def _generate_range_overflow_safe_signed(endpoint, periods, stride, side): + """ + A special case for _generate_range_overflow_safe where `periods * stride` + can be calculated without overflowing int64 bounds. + """ + assert side in ['start', 'end'] + if side == 'end': + stride *= -1 + + with np.errstate(over="raise"): + addend = np.int64(periods) * np.int64(stride) + try: + # easy case with no overflows + return np.int64(endpoint) + addend + except (FloatingPointError, OverflowError): + # with endpoint negative and addend positive we risk + # FloatingPointError; with reversed signed we risk OverflowError + pass + + # if stride and endpoint had opposite signs, then endpoint + addend + # should never overflow. so they must have the same signs + assert (stride > 0 and endpoint >= 0) or (stride < 0 and endpoint <= 0) + + if stride > 0: + # watch out for very special case in which we just slightly + # exceed implementation bounds, but when passing the result to + # np.arange will get a result slightly within the bounds + assert endpoint >= 0 + result = np.uint64(endpoint) + np.uint64(addend) + i64max = np.uint64(np.iinfo(np.int64).max) + assert result > i64max + if result <= i64max + np.uint64(stride): + return result + + raise OutOfBoundsDatetime('Cannot generate range with ' + '{side}={endpoint} and ' + 'periods={periods}' + .format(side=side, endpoint=endpoint, + periods=periods)) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 2d1330dd87152..a933f41faab67 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -24,10 +24,11 @@ from pandas.core import ops from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com from pandas.tseries.frequencies import get_period_alias, to_offset -from pandas.tseries.offsets import Day, Tick, generate_range +from pandas.tseries.offsets import Day, Tick _midnight = time(0, 0) @@ -306,7 +307,8 @@ def _generate_range(cls, start, end, periods, freq, tz=None, if end is not None: end = end.tz_localize(None) # TODO: consider re-implementing _cached_range; GH#17914 - index = _generate_regular_range(cls, start, end, periods, freq) + values, _tz = generate_regular_range(start, end, periods, freq) + index = cls._simple_new(values, freq=freq, tz=_tz) if tz is not None and index.tz is None: arr = conversion.tz_localize_to_utc( @@ -1715,107 +1717,6 @@ def maybe_convert_dtype(data, copy): return data, copy -def _generate_regular_range(cls, start, end, periods, freq): - """ - Generate a range of dates with the spans between dates described by - the given `freq` DateOffset. - - Parameters - ---------- - cls : class - start : Timestamp or None - first point of produced date range - end : Timestamp or None - last point of produced date range - periods : int - number of periods in produced date range - freq : DateOffset - describes space between dates in produced date range - - Returns - ------- - ndarray[np.int64] representing nanosecond unix timestamps - - """ - if isinstance(freq, Tick): - stride = freq.nanos - if periods is None: - b = Timestamp(start).value - # cannot just use e = Timestamp(end) + 1 because arange breaks when - # stride is too large, see GH10887 - e = (b + (Timestamp(end).value - b) // stride * stride + - stride // 2 + 1) - # end.tz == start.tz by this point due to _generate implementation - tz = start.tz - elif start is not None: - b = Timestamp(start).value - e = _generate_range_overflow_safe(b, periods, stride, side='start') - tz = start.tz - elif end is not None: - e = Timestamp(end).value + stride - b = _generate_range_overflow_safe(e, periods, stride, side='end') - tz = end.tz - else: - raise ValueError("at least 'start' or 'end' should be specified " - "if a 'period' is given.") - - values = np.arange(b, e, stride, dtype=np.int64) - - else: - tz = None - # start and end should have the same timezone by this point - if start is not None: - tz = start.tz - elif end is not None: - tz = end.tz - - xdr = generate_range(start=start, end=end, - periods=periods, offset=freq) - - values = np.array([x.value for x in xdr], dtype=np.int64) - - data = cls._simple_new(values, freq=freq, tz=tz) - return data - - -def _generate_range_overflow_safe(endpoint, periods, stride, side='start'): - """ - Calculate the second endpoint for passing to np.arange, checking - to avoid an integer overflow. Catch OverflowError and re-raise - as OutOfBoundsDatetime. - - Parameters - ---------- - endpoint : int - periods : int - stride : int - side : {'start', 'end'} - - Returns - ------- - other_end : int - - Raises - ------ - OutOfBoundsDatetime - """ - # GH#14187 raise instead of incorrectly wrapping around - assert side in ['start', 'end'] - if side == 'end': - stride *= -1 - - try: - other_end = checked_add_with_arr(np.int64(endpoint), - np.int64(periods) * stride) - except OverflowError: - raise tslib.OutOfBoundsDatetime('Cannot generate range with ' - '{side}={endpoint} and ' - 'periods={periods}' - .format(side=side, endpoint=endpoint, - periods=periods)) - return other_end - - # ------------------------------------------------------------------- # Validation and Inference diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index d150a91fe7f18..a9bece248e9d0 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -88,6 +88,67 @@ def test_date_range_nat(self): with pytest.raises(ValueError, match=msg): date_range(start=pd.NaT, end='2016-01-01', freq='D') + def test_date_range_multiplication_overflow(self): + # GH#24255 + # check that overflows in calculating `addend = periods * stride` + # are caught + with tm.assert_produces_warning(None): + # we should _not_ be seeing a overflow RuntimeWarning + dti = date_range(start='1677-09-22', periods=213503, freq='D') + + assert dti[0] == Timestamp('1677-09-22') + assert len(dti) == 213503 + + msg = "Cannot generate range with" + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range('1969-05-04', periods=200000000, freq='30000D') + + def test_date_range_unsigned_overflow_handling(self): + # GH#24255 + # case where `addend = periods * stride` overflows int64 bounds + # but not uint64 bounds + dti = date_range(start='1677-09-22', end='2262-04-11', freq='D') + + dti2 = date_range(start=dti[0], periods=len(dti), freq='D') + assert dti2.equals(dti) + + dti3 = date_range(end=dti[-1], periods=len(dti), freq='D') + assert dti3.equals(dti) + + def test_date_range_int64_overflow_non_recoverable(self): + # GH#24255 + # case with start later than 1970-01-01, overflow int64 but not uint64 + msg = "Cannot generate range with" + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range(start='1970-02-01', periods=106752 * 24, freq='H') + + # case with end before 1970-01-01, overflow int64 but not uint64 + with pytest.raises(OutOfBoundsDatetime, match=msg): + date_range(end='1969-11-14', periods=106752 * 24, freq='H') + + def test_date_range_int64_overflow_stride_endpoint_different_signs(self): + # cases where stride * periods overflow int64 and stride/endpoint + # have different signs + start = Timestamp('2262-02-23') + end = Timestamp('1969-11-14') + + expected = date_range(start=start, end=end, freq='-1H') + assert expected[0] == start + assert expected[-1] == end + + dti = date_range(end=end, periods=len(expected), freq='-1H') + tm.assert_index_equal(dti, expected) + + start2 = Timestamp('1970-02-01') + end2 = Timestamp('1677-10-22') + + expected2 = date_range(start=start2, end=end2, freq='-1H') + assert expected2[0] == start2 + assert expected2[-1] == end2 + + dti2 = date_range(start=start2, periods=len(expected2), freq='-1H') + tm.assert_index_equal(dti2, expected2) + def test_date_range_out_of_bounds(self): # GH#14187 with pytest.raises(OutOfBoundsDatetime):