Skip to content

BUG: Prevent addition overflow with TimedeltaIndex #14816

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,17 @@ def setup(self):
self.float = pd.Float64Index(np.random.randn(N).repeat(5))

# Convenience naming.
self.checked_add = pd.core.nanops._checked_add_with_arr
self.checked_add = pd.core.algorithms.checked_add_with_arr

self.arr = np.arange(1000000)
self.arrpos = np.arange(1000000)
self.arrneg = np.arange(-1000000, 0)
self.arrmixed = np.array([1, -1]).repeat(500000)
self.strings = tm.makeStringIndex(100000)

self.arr_nan = np.random.choice([True, False], size=1000000)
self.arrmixed_nan = np.random.choice([True, False], size=1000000)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the difference between those two?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those are the masks for self.arr and self.arrmixed respectively.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But they are defined exactly the same, that's the reason I am confused on why two different variables are needed and the mask is not just reused. (but they are of course both random, so not exactly the same)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I fail to see your confusion here. Why re-use the mask when that won't be the case in reality (i.e. the masks will likely be different in most cases)?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, no problem (I also merged BTW :-)), I was just skimming the code, and saw two apparantly identical lines of code, hence my comment (which was not needed)


# match
self.uniques = tm.makeStringIndex(1000).values
self.all = self.uniques.repeat(10)
Expand Down Expand Up @@ -69,6 +72,16 @@ def time_add_overflow_neg_arr(self):
def time_add_overflow_mixed_arr(self):
self.checked_add(self.arr, self.arrmixed)

def time_add_overflow_first_arg_nan(self):
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan)

def time_add_overflow_second_arg_nan(self):
self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan)

def time_add_overflow_both_arg_nan(self):
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan,
b_mask=self.arrmixed_nan)


class Hashing(object):
goal_time = 0.2
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ Performance Improvements
Bug Fixes
~~~~~~~~~

- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`)


Expand Down
90 changes: 90 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
_ensure_float64,
_ensure_int64,
is_list_like)
from pandas.compat.numpy import _np_version_under1p10
from pandas.types.missing import isnull

import pandas.core.common as com
Expand Down Expand Up @@ -550,6 +551,95 @@ def rank(values, axis=0, method='average', na_option='keep',

return ranks


def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
"""
Perform array addition that checks for underflow and overflow.

Performs the addition of an int64 array and an int64 integer (or array)
but checks that they do not result in overflow first. For elements that
are indicated to be NaN, whether or not there is overflow for that element
is automatically ignored.

Parameters
----------
arr : array addend.
b : array or scalar addend.
arr_mask : boolean array or None
array indicating which elements to exclude from checking
b_mask : boolean array or boolean or None
array or scalar indicating which element(s) to exclude from checking

Returns
-------
sum : An array for elements x + b for each element x in arr if b is
a scalar or an array for elements x + y for each element pair
(x, y) in (arr, b).

Raises
------
OverflowError if any x + y exceeds the maximum or minimum int64 value.
"""
def _broadcast(arr_or_scalar, shape):
"""
Helper function to broadcast arrays / scalars to the desired shape.
"""
if _np_version_under1p10:
if lib.isscalar(arr_or_scalar):
out = np.empty(shape)
out.fill(arr_or_scalar)
else:
out = arr_or_scalar
else:
out = np.broadcast_to(arr_or_scalar, shape)
return out

# For performance reasons, we broadcast 'b' to the new array 'b2'
# so that it has the same size as 'arr'.
b2 = _broadcast(b, arr.shape)
if b_mask is not None:
# We do the same broadcasting for b_mask as well.
b2_mask = _broadcast(b_mask, arr.shape)
else:
b2_mask = None

# For elements that are NaN, regardless of their value, we should
# ignore whether they overflow or not when doing the checked add.
if arr_mask is not None and b2_mask is not None:
not_nan = np.logical_not(arr_mask | b2_mask)
elif arr_mask is not None:
not_nan = np.logical_not(arr_mask)
elif b_mask is not None:
not_nan = np.logical_not(b2_mask)
else:
not_nan = np.empty(arr.shape, dtype=bool)
not_nan.fill(True)

# gh-14324: For each element in 'arr' and its corresponding element
# in 'b2', we check the sign of the element in 'b2'. If it is positive,
# we then check whether its sum with the element in 'arr' exceeds
# np.iinfo(np.int64).max. If so, we have an overflow error. If it
# it is negative, we then check whether its sum with the element in
# 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
# error as well.
mask1 = b2 > 0
mask2 = b2 < 0

if not mask1.any():
to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()
elif not mask2.any():
to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
else:
to_raise = (((np.iinfo(np.int64).max -
b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or
((np.iinfo(np.int64).min -
b2[mask2] > arr[mask2]) & not_nan[mask2]).any())

if to_raise:
raise OverflowError("Overflow in int64 addition")
return arr + b


_rank1d_functions = {
'float64': algos.rank_1d_float64,
'int64': algos.rank_1d_int64,
Expand Down
55 changes: 0 additions & 55 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

import pandas.hashtable as _hash
from pandas import compat, lib, algos, tslib
from pandas.compat.numpy import _np_version_under1p10
from pandas.types.common import (_ensure_int64, _ensure_object,
_ensure_float64, _get_dtype,
is_float, is_scalar,
Expand Down Expand Up @@ -810,57 +809,3 @@ def unique1d(values):
table = _hash.PyObjectHashTable(len(values))
uniques = table.unique(_ensure_object(values))
return uniques


def _checked_add_with_arr(arr, b):
"""
Performs the addition of an int64 array and an int64 integer (or array)
but checks that they do not result in overflow first.

Parameters
----------
arr : array addend.
b : array or scalar addend.

Returns
-------
sum : An array for elements x + b for each element x in arr if b is
a scalar or an array for elements x + y for each element pair
(x, y) in (arr, b).

Raises
------
OverflowError if any x + y exceeds the maximum or minimum int64 value.
"""
# For performance reasons, we broadcast 'b' to the new array 'b2'
# so that it has the same size as 'arr'.
if _np_version_under1p10:
if lib.isscalar(b):
b2 = np.empty(arr.shape)
b2.fill(b)
else:
b2 = b
else:
b2 = np.broadcast_to(b, arr.shape)

# gh-14324: For each element in 'arr' and its corresponding element
# in 'b2', we check the sign of the element in 'b2'. If it is positive,
# we then check whether its sum with the element in 'arr' exceeds
# np.iinfo(np.int64).max. If so, we have an overflow error. If it
# it is negative, we then check whether its sum with the element in
# 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
# error as well.
mask1 = b2 > 0
mask2 = b2 < 0

if not mask1.any():
to_raise = (np.iinfo(np.int64).min - b2 > arr).any()
elif not mask2.any():
to_raise = (np.iinfo(np.int64).max - b2 < arr).any()
else:
to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or
(np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any())

if to_raise:
raise OverflowError("Overflow in int64 addition")
return arr + b
49 changes: 49 additions & 0 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -1129,6 +1129,55 @@ def test_ensure_platform_int():
assert (result is arr)


def test_int64_add_overflow():
# see gh-14068
msg = "Overflow in int64 addition"
m = np.iinfo(np.int64).max
n = np.iinfo(np.int64).min

with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), m)
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]))
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([n, n]), n)
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([n, n]), np.array([n, n]))
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, n]), np.array([n, n]))
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
arr_mask=np.array([False, True]))
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
b_mask=np.array([False, True]))
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
arr_mask=np.array([False, True]),
b_mask=np.array([False, True]))
with tm.assertRaisesRegexp(OverflowError, msg):
with tm.assert_produces_warning(RuntimeWarning):
algos.checked_add_with_arr(np.array([m, m]),
np.array([np.nan, m]))

# Check that the nan boolean arrays override whether or not
# the addition overflows. We don't check the result but just
# the fact that an OverflowError is not raised.
with tm.assertRaises(AssertionError):
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
arr_mask=np.array([True, True]))
with tm.assertRaises(AssertionError):
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
b_mask=np.array([True, True]))
with tm.assertRaises(AssertionError):
with tm.assertRaisesRegexp(OverflowError, msg):
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
arr_mask=np.array([True, False]),
b_mask=np.array([False, True]))


if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down
22 changes: 0 additions & 22 deletions pandas/tests/test_nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1002,28 +1002,6 @@ def prng(self):
return np.random.RandomState(1234)


def test_int64_add_overflow():
# see gh-14068
msg = "Overflow in int64 addition"
m = np.iinfo(np.int64).max
n = np.iinfo(np.int64).min

with tm.assertRaisesRegexp(OverflowError, msg):
nanops._checked_add_with_arr(np.array([m, m]), m)
with tm.assertRaisesRegexp(OverflowError, msg):
nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]))
with tm.assertRaisesRegexp(OverflowError, msg):
nanops._checked_add_with_arr(np.array([n, n]), n)
with tm.assertRaisesRegexp(OverflowError, msg):
nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n]))
with tm.assertRaisesRegexp(OverflowError, msg):
nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n]))
with tm.assertRaisesRegexp(OverflowError, msg):
with tm.assert_produces_warning(RuntimeWarning):
nanops._checked_add_with_arr(np.array([m, m]),
np.array([np.nan, m]))


if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'
Expand Down
8 changes: 6 additions & 2 deletions pandas/tseries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
ABCPeriodIndex, ABCIndexClass)
from pandas.types.missing import isnull
from pandas.core import common as com, algorithms
from pandas.core.algorithms import checked_add_with_arr
from pandas.core.common import AbstractMethodError

import pandas.formats.printing as printing
Expand Down Expand Up @@ -688,7 +689,8 @@ def _add_delta_td(self, other):
# return the i8 result view

inc = tslib._delta_to_nanoseconds(other)
new_values = (self.asi8 + inc).view('i8')
new_values = checked_add_with_arr(self.asi8, inc,
arr_mask=self._isnan).view('i8')
if self.hasnans:
new_values[self._isnan] = tslib.iNaT
return new_values.view('i8')
Expand All @@ -703,7 +705,9 @@ def _add_delta_tdi(self, other):

self_i8 = self.asi8
other_i8 = other.asi8
new_values = self_i8 + other_i8
new_values = checked_add_with_arr(self_i8, other_i8,
arr_mask=self._isnan,
b_mask=other._isnan)
if self.hasnans or other.hasnans:
mask = (self._isnan) | (other._isnan)
new_values[mask] = tslib.iNaT
Expand Down
4 changes: 2 additions & 2 deletions pandas/tseries/tdi.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
import pandas.compat as compat
from pandas.compat import u
from pandas.tseries.frequencies import to_offset
from pandas.core.algorithms import checked_add_with_arr
from pandas.core.base import _shared_docs
from pandas.core.nanops import _checked_add_with_arr
from pandas.indexes.base import _index_shared_docs
import pandas.core.common as com
import pandas.types.concat as _concat
Expand Down Expand Up @@ -347,7 +347,7 @@ def _add_datelike(self, other):
else:
other = Timestamp(other)
i8 = self.asi8
result = _checked_add_with_arr(i8, other.value)
result = checked_add_with_arr(i8, other.value)
result = self._maybe_mask_results(result, fill_value=tslib.iNaT)
return DatetimeIndex(result, name=self.name, copy=False)

Expand Down
23 changes: 23 additions & 0 deletions pandas/tseries/tests/test_timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1958,11 +1958,34 @@ def test_add_overflow(self):
with tm.assertRaisesRegexp(OverflowError, msg):
Timestamp('2000') + to_timedelta(106580, 'D')

_NaT = int(pd.NaT) + 1
msg = "Overflow in int64 addition"
with tm.assertRaisesRegexp(OverflowError, msg):
to_timedelta([106580], 'D') + Timestamp('2000')
with tm.assertRaisesRegexp(OverflowError, msg):
Timestamp('2000') + to_timedelta([106580], 'D')
with tm.assertRaisesRegexp(OverflowError, msg):
to_timedelta([_NaT]) - Timedelta('1 days')
with tm.assertRaisesRegexp(OverflowError, msg):
to_timedelta(['5 days', _NaT]) - Timedelta('1 days')
with tm.assertRaisesRegexp(OverflowError, msg):
(to_timedelta([_NaT, '5 days', '1 hours']) -
to_timedelta(['7 seconds', _NaT, '4 hours']))

# These should not overflow!
exp = TimedeltaIndex([pd.NaT])
result = to_timedelta([pd.NaT]) - Timedelta('1 days')
tm.assert_index_equal(result, exp)

exp = TimedeltaIndex(['4 days', pd.NaT])
result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days')
tm.assert_index_equal(result, exp)

exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours'])
result = (to_timedelta([pd.NaT, '5 days', '1 hours']) +
to_timedelta(['7 seconds', pd.NaT, '4 hours']))
tm.assert_index_equal(result, exp)


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down