diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 20d149493951f..fe657936c403e 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -18,7 +18,7 @@ def setup(self): self.float = pd.Float64Index(np.random.randn(N).repeat(5)) # Convenience naming. - self.checked_add = pd.core.nanops._checked_add_with_arr + self.checked_add = pd.core.algorithms.checked_add_with_arr self.arr = np.arange(1000000) self.arrpos = np.arange(1000000) @@ -26,6 +26,9 @@ def setup(self): self.arrmixed = np.array([1, -1]).repeat(500000) self.strings = tm.makeStringIndex(100000) + self.arr_nan = np.random.choice([True, False], size=1000000) + self.arrmixed_nan = np.random.choice([True, False], size=1000000) + # match self.uniques = tm.makeStringIndex(1000).values self.all = self.uniques.repeat(10) @@ -69,6 +72,16 @@ def time_add_overflow_neg_arr(self): def time_add_overflow_mixed_arr(self): self.checked_add(self.arr, self.arrmixed) + def time_add_overflow_first_arg_nan(self): + self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan) + + def time_add_overflow_second_arg_nan(self): + self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan) + + def time_add_overflow_both_arg_nan(self): + self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan, + b_mask=self.arrmixed_nan) + class Hashing(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index e264fb15f3e67..8c486370c5eed 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -213,6 +213,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0d4d4143e6b9b..b2702ea0acca7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -27,6 +27,7 @@ _ensure_float64, _ensure_int64, is_list_like) +from pandas.compat.numpy import _np_version_under1p10 from pandas.types.missing import isnull import pandas.core.common as com @@ -550,6 +551,95 @@ def rank(values, axis=0, method='average', na_option='keep', return ranks + +def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): + """ + Perform array addition that checks for underflow and overflow. + + Performs the addition of an int64 array and an int64 integer (or array) + but checks that they do not result in overflow first. For elements that + are indicated to be NaN, whether or not there is overflow for that element + is automatically ignored. + + Parameters + ---------- + arr : array addend. + b : array or scalar addend. + arr_mask : boolean array or None + array indicating which elements to exclude from checking + b_mask : boolean array or boolean or None + array or scalar indicating which element(s) to exclude from checking + + Returns + ------- + sum : An array for elements x + b for each element x in arr if b is + a scalar or an array for elements x + y for each element pair + (x, y) in (arr, b). + + Raises + ------ + OverflowError if any x + y exceeds the maximum or minimum int64 value. + """ + def _broadcast(arr_or_scalar, shape): + """ + Helper function to broadcast arrays / scalars to the desired shape. + """ + if _np_version_under1p10: + if lib.isscalar(arr_or_scalar): + out = np.empty(shape) + out.fill(arr_or_scalar) + else: + out = arr_or_scalar + else: + out = np.broadcast_to(arr_or_scalar, shape) + return out + + # For performance reasons, we broadcast 'b' to the new array 'b2' + # so that it has the same size as 'arr'. + b2 = _broadcast(b, arr.shape) + if b_mask is not None: + # We do the same broadcasting for b_mask as well. + b2_mask = _broadcast(b_mask, arr.shape) + else: + b2_mask = None + + # For elements that are NaN, regardless of their value, we should + # ignore whether they overflow or not when doing the checked add. + if arr_mask is not None and b2_mask is not None: + not_nan = np.logical_not(arr_mask | b2_mask) + elif arr_mask is not None: + not_nan = np.logical_not(arr_mask) + elif b_mask is not None: + not_nan = np.logical_not(b2_mask) + else: + not_nan = np.empty(arr.shape, dtype=bool) + not_nan.fill(True) + + # gh-14324: For each element in 'arr' and its corresponding element + # in 'b2', we check the sign of the element in 'b2'. If it is positive, + # we then check whether its sum with the element in 'arr' exceeds + # np.iinfo(np.int64).max. If so, we have an overflow error. If it + # it is negative, we then check whether its sum with the element in + # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow + # error as well. + mask1 = b2 > 0 + mask2 = b2 < 0 + + if not mask1.any(): + to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any() + elif not mask2.any(): + to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() + else: + to_raise = (((np.iinfo(np.int64).max - + b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or + ((np.iinfo(np.int64).min - + b2[mask2] > arr[mask2]) & not_nan[mask2]).any()) + + if to_raise: + raise OverflowError("Overflow in int64 addition") + return arr + b + + _rank1d_functions = { 'float64': algos.rank_1d_float64, 'int64': algos.rank_1d_int64, diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d7d68ad536be5..a76e348b7dee2 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -11,7 +11,6 @@ import pandas.hashtable as _hash from pandas import compat, lib, algos, tslib -from pandas.compat.numpy import _np_version_under1p10 from pandas.types.common import (_ensure_int64, _ensure_object, _ensure_float64, _get_dtype, is_float, is_scalar, @@ -810,57 +809,3 @@ def unique1d(values): table = _hash.PyObjectHashTable(len(values)) uniques = table.unique(_ensure_object(values)) return uniques - - -def _checked_add_with_arr(arr, b): - """ - Performs the addition of an int64 array and an int64 integer (or array) - but checks that they do not result in overflow first. - - Parameters - ---------- - arr : array addend. - b : array or scalar addend. - - Returns - ------- - sum : An array for elements x + b for each element x in arr if b is - a scalar or an array for elements x + y for each element pair - (x, y) in (arr, b). - - Raises - ------ - OverflowError if any x + y exceeds the maximum or minimum int64 value. - """ - # For performance reasons, we broadcast 'b' to the new array 'b2' - # so that it has the same size as 'arr'. - if _np_version_under1p10: - if lib.isscalar(b): - b2 = np.empty(arr.shape) - b2.fill(b) - else: - b2 = b - else: - b2 = np.broadcast_to(b, arr.shape) - - # gh-14324: For each element in 'arr' and its corresponding element - # in 'b2', we check the sign of the element in 'b2'. If it is positive, - # we then check whether its sum with the element in 'arr' exceeds - # np.iinfo(np.int64).max. If so, we have an overflow error. If it - # it is negative, we then check whether its sum with the element in - # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow - # error as well. - mask1 = b2 > 0 - mask2 = b2 < 0 - - if not mask1.any(): - to_raise = (np.iinfo(np.int64).min - b2 > arr).any() - elif not mask2.any(): - to_raise = (np.iinfo(np.int64).max - b2 < arr).any() - else: - to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any()) - - if to_raise: - raise OverflowError("Overflow in int64 addition") - return arr + b diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index f89f41abd0d35..d0c909b9c1b30 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1129,6 +1129,55 @@ def test_ensure_platform_int(): assert (result is arr) +def test_int64_add_overflow(): + # see gh-14068 + msg = "Overflow in int64 addition" + m = np.iinfo(np.int64).max + n = np.iinfo(np.int64).min + + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), m) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([n, n]), n) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([n, n]), np.array([n, n])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, n]), np.array([n, n])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_mask=np.array([False, True])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + b_mask=np.array([False, True])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_mask=np.array([False, True]), + b_mask=np.array([False, True])) + with tm.assertRaisesRegexp(OverflowError, msg): + with tm.assert_produces_warning(RuntimeWarning): + algos.checked_add_with_arr(np.array([m, m]), + np.array([np.nan, m])) + + # Check that the nan boolean arrays override whether or not + # the addition overflows. We don't check the result but just + # the fact that an OverflowError is not raised. + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_mask=np.array([True, True])) + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + b_mask=np.array([True, True])) + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_mask=np.array([True, False]), + b_mask=np.array([False, True])) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index be634228b1b6e..dd3a49de55d73 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1002,28 +1002,6 @@ def prng(self): return np.random.RandomState(1234) -def test_int64_add_overflow(): - # see gh-14068 - msg = "Overflow in int64 addition" - m = np.iinfo(np.int64).max - n = np.iinfo(np.int64).min - - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([m, m]), m) - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m])) - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([n, n]), n) - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n])) - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n])) - with tm.assertRaisesRegexp(OverflowError, msg): - with tm.assert_produces_warning(RuntimeWarning): - nanops._checked_add_with_arr(np.array([m, m]), - np.array([np.nan, m])) - - if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s' diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 8e24f430108b3..63e56e09e91fe 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -16,6 +16,7 @@ ABCPeriodIndex, ABCIndexClass) from pandas.types.missing import isnull from pandas.core import common as com, algorithms +from pandas.core.algorithms import checked_add_with_arr from pandas.core.common import AbstractMethodError import pandas.formats.printing as printing @@ -688,7 +689,8 @@ def _add_delta_td(self, other): # return the i8 result view inc = tslib._delta_to_nanoseconds(other) - new_values = (self.asi8 + inc).view('i8') + new_values = checked_add_with_arr(self.asi8, inc, + arr_mask=self._isnan).view('i8') if self.hasnans: new_values[self._isnan] = tslib.iNaT return new_values.view('i8') @@ -703,7 +705,9 @@ def _add_delta_tdi(self, other): self_i8 = self.asi8 other_i8 = other.asi8 - new_values = self_i8 + other_i8 + new_values = checked_add_with_arr(self_i8, other_i8, + arr_mask=self._isnan, + b_mask=other._isnan) if self.hasnans or other.hasnans: mask = (self._isnan) | (other._isnan) new_values[mask] = tslib.iNaT diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 7e77d8baf3b2c..1585aac0c8ead 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -20,8 +20,8 @@ import pandas.compat as compat from pandas.compat import u from pandas.tseries.frequencies import to_offset +from pandas.core.algorithms import checked_add_with_arr from pandas.core.base import _shared_docs -from pandas.core.nanops import _checked_add_with_arr from pandas.indexes.base import _index_shared_docs import pandas.core.common as com import pandas.types.concat as _concat @@ -347,7 +347,7 @@ def _add_datelike(self, other): else: other = Timestamp(other) i8 = self.asi8 - result = _checked_add_with_arr(i8, other.value) + result = checked_add_with_arr(i8, other.value) result = self._maybe_mask_results(result, fill_value=tslib.iNaT) return DatetimeIndex(result, name=self.name, copy=False) diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index ca957ca0394d1..fc95b17b9b52d 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -1958,11 +1958,34 @@ def test_add_overflow(self): with tm.assertRaisesRegexp(OverflowError, msg): Timestamp('2000') + to_timedelta(106580, 'D') + _NaT = int(pd.NaT) + 1 msg = "Overflow in int64 addition" with tm.assertRaisesRegexp(OverflowError, msg): to_timedelta([106580], 'D') + Timestamp('2000') with tm.assertRaisesRegexp(OverflowError, msg): Timestamp('2000') + to_timedelta([106580], 'D') + with tm.assertRaisesRegexp(OverflowError, msg): + to_timedelta([_NaT]) - Timedelta('1 days') + with tm.assertRaisesRegexp(OverflowError, msg): + to_timedelta(['5 days', _NaT]) - Timedelta('1 days') + with tm.assertRaisesRegexp(OverflowError, msg): + (to_timedelta([_NaT, '5 days', '1 hours']) - + to_timedelta(['7 seconds', _NaT, '4 hours'])) + + # These should not overflow! + exp = TimedeltaIndex([pd.NaT]) + result = to_timedelta([pd.NaT]) - Timedelta('1 days') + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex(['4 days', pd.NaT]) + result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days') + tm.assert_index_equal(result, exp) + + exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours']) + result = (to_timedelta([pd.NaT, '5 days', '1 hours']) + + to_timedelta(['7 seconds', pd.NaT, '4 hours'])) + tm.assert_index_equal(result, exp) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],