Skip to content

Commit d238fbb

Browse files
committed
BUG: Prevent addition overflow with TimedeltaIndex
Expands checked-add array addition introduced in gh-14237 to include all other addition cases (i.e. TimedeltaIndex and Timedelta). Follow-up to gh-14453.
1 parent 033d345 commit d238fbb

File tree

9 files changed

+174
-81
lines changed

9 files changed

+174
-81
lines changed

asv_bench/benchmarks/algorithms.py

+13
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ def setup(self):
2424
self.arrneg = np.arange(-1000000, 0)
2525
self.arrmixed = np.array([1, -1]).repeat(500000)
2626

27+
self.arr_nan = np.random.choice([True, False], size=1000000)
28+
self.arrmixed_nan = np.random.choice([True, False], size=1000000)
29+
2730
# match
2831
self.uniques = tm.makeStringIndex(1000).values
2932
self.all = self.uniques.repeat(10)
@@ -64,6 +67,16 @@ def time_add_overflow_neg_arr(self):
6467
def time_add_overflow_mixed_arr(self):
6568
self.checked_add(self.arr, self.arrmixed)
6669

70+
def time_add_overflow_first_arg_nan(self):
    """Time the checked add with a mask on the first addend only."""
    add = self.checked_add
    add(self.arr, self.arrmixed, arr_mask=self.arr_nan)
72+
73+
def time_add_overflow_second_arg_nan(self):
    """Time the checked add with a mask on the second addend only."""
    # The mask for the second addend is stored as 'arrmixed_nan';
    # no 'arrmixed_arr_nan' attribute is ever created.
    self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan)
75+
76+
def time_add_overflow_both_arg_nan(self):
    """Time the checked add with masks supplied for both addends."""
    # The mask for the second addend is stored as 'arrmixed_nan';
    # no 'arrmixed_arr_nan' attribute is ever created.
    self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan,
                     b_mask=self.arrmixed_nan)
79+
6780

6881
class Hashing(object):
6982
goal_time = 0.2

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ Bug Fixes
145145
~~~~~~~~~
146146

147147
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises an error with ``astype()`` for Series and DataFrames (:issue:`14265`)
148+
- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
148149

149150

150151

pandas/core/algorithms.py

+89
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
_ensure_float64,
2828
_ensure_int64,
2929
is_list_like)
30+
from pandas.compat.numpy import _np_version_under1p10
3031
from pandas.types.missing import isnull
3132

3233
import pandas.core.common as com
@@ -40,6 +41,94 @@
4041
# top-level algos #
4142
# --------------- #
4243

44+
def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
    """
    Perform array addition that checks for underflow and overflow.

    Performs the addition of an int64 array and an int64 integer (or array)
    but checks that they do not result in overflow first. For elements that
    are indicated to be NaN, whether or not there is overflow for that element
    is automatically ignored.

    Parameters
    ----------
    arr : array addend.
    b : array or scalar addend.
    arr_mask : boolean array or None
        array indicating which elements to exclude from checking
    b_mask : boolean array or boolean or None
        array or scalar indicating which element(s) to exclude from checking

    Returns
    -------
    sum : An array for elements x + b for each element x in arr if b is
          a scalar or an array for elements x + y for each element pair
          (x, y) in (arr, b).

    Raises
    ------
    OverflowError if any x + y exceeds the maximum or minimum int64 value.
    """
    def _broadcast(arr_or_scalar, shape):
        """
        Helper function to broadcast arrays / scalars to the desired shape.

        This function is compatible with different versions of NumPy and is
        implemented for performance reasons.
        """
        if _np_version_under1p10:
            if lib.isscalar(arr_or_scalar):
                # Allocate with the scalar's own dtype. The default float64
                # buffer cannot represent every int64 exactly (so values
                # near the limits get rounded) and cannot be combined with
                # a boolean mask using '|'.
                out = np.empty(shape, dtype=np.asarray(arr_or_scalar).dtype)
                out.fill(arr_or_scalar)
            else:
                out = arr_or_scalar
        else:
            out = np.broadcast_to(arr_or_scalar, shape)
        return out

    b2 = _broadcast(b, arr.shape)
    if b_mask is not None:
        b2_mask = _broadcast(b_mask, arr.shape)
    else:
        b2_mask = None

    # For elements that are NaN, regardless of their value, we should
    # ignore whether they overflow or not when doing the checked add.
    if arr_mask is not None and b2_mask is not None:
        not_nan = np.logical_not(arr_mask | b2_mask)
    elif arr_mask is not None:
        not_nan = np.logical_not(arr_mask)
    elif b2_mask is not None:
        not_nan = np.logical_not(b2_mask)
    else:
        not_nan = np.ones(arr.shape, dtype=bool)

    # gh-14324: For each element in 'arr' and its corresponding element
    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
    # we then check whether its sum with the element in 'arr' exceeds
    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
    # is negative, we then check whether its sum with the element in
    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
    # error as well.
    i8 = np.iinfo(np.int64)
    mask1 = b2 > 0
    mask2 = b2 < 0

    if not mask1.any():
        # No positive addends: only underflow is possible.
        to_raise = ((i8.min - b2 > arr) & not_nan).any()
    elif not mask2.any():
        # No negative addends: only overflow is possible.
        to_raise = ((i8.max - b2 < arr) & not_nan).any()
    else:
        # Mixed signs: check each sign group against its own limit.
        to_raise = (((i8.max - b2[mask1] < arr[mask1]) &
                     not_nan[mask1]).any() or
                    ((i8.min - b2[mask2] > arr[mask2]) &
                     not_nan[mask2]).any())

    if to_raise:
        raise OverflowError("Overflow in int64 addition")
    return arr + b
130+
131+
43132
def match(to_match, values, na_sentinel=-1):
44133
"""
45134
Compute locations of to_match into values

pandas/core/nanops.py

-55
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111

1212
import pandas.hashtable as _hash
1313
from pandas import compat, lib, algos, tslib
14-
from pandas.compat.numpy import _np_version_under1p10
1514
from pandas.types.common import (_ensure_int64, _ensure_object,
1615
_ensure_float64, _get_dtype,
1716
is_float, is_scalar,
@@ -810,57 +809,3 @@ def unique1d(values):
810809
table = _hash.PyObjectHashTable(len(values))
811810
uniques = table.unique(_ensure_object(values))
812811
return uniques
813-
814-
815-
def _checked_add_with_arr(arr, b):
816-
"""
817-
Performs the addition of an int64 array and an int64 integer (or array)
818-
but checks that they do not result in overflow first.
819-
820-
Parameters
821-
----------
822-
arr : array addend.
823-
b : array or scalar addend.
824-
825-
Returns
826-
-------
827-
sum : An array for elements x + b for each element x in arr if b is
828-
a scalar or an array for elements x + y for each element pair
829-
(x, y) in (arr, b).
830-
831-
Raises
832-
------
833-
OverflowError if any x + y exceeds the maximum or minimum int64 value.
834-
"""
835-
# For performance reasons, we broadcast 'b' to the new array 'b2'
836-
# so that it has the same size as 'arr'.
837-
if _np_version_under1p10:
838-
if lib.isscalar(b):
839-
b2 = np.empty(arr.shape)
840-
b2.fill(b)
841-
else:
842-
b2 = b
843-
else:
844-
b2 = np.broadcast_to(b, arr.shape)
845-
846-
# gh-14324: For each element in 'arr' and its corresponding element
847-
# in 'b2', we check the sign of the element in 'b2'. If it is positive,
848-
# we then check whether its sum with the element in 'arr' exceeds
849-
# np.iinfo(np.int64).max. If so, we have an overflow error. If it
850-
# it is negative, we then check whether its sum with the element in
851-
# 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
852-
# error as well.
853-
mask1 = b2 > 0
854-
mask2 = b2 < 0
855-
856-
if not mask1.any():
857-
to_raise = (np.iinfo(np.int64).min - b2 > arr).any()
858-
elif not mask2.any():
859-
to_raise = (np.iinfo(np.int64).max - b2 < arr).any()
860-
else:
861-
to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or
862-
(np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any())
863-
864-
if to_raise:
865-
raise OverflowError("Overflow in int64 addition")
866-
return arr + b

pandas/tests/test_algos.py

+49
Original file line numberDiff line numberDiff line change
@@ -1129,6 +1129,55 @@ def test_ensure_platform_int():
11291129
assert (result is arr)
11301130

11311131

1132+
def test_int64_add_overflow():
    """Check overflow detection and mask handling in checked_add_with_arr."""
    # see gh-14068
    msg = "Overflow in int64 addition"
    m = np.iinfo(np.int64).max
    n = np.iinfo(np.int64).min

    with tm.assertRaisesRegexp(OverflowError, msg):
        algos.checked_add_with_arr(np.array([m, m]), m)
    with tm.assertRaisesRegexp(OverflowError, msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]))
    with tm.assertRaisesRegexp(OverflowError, msg):
        algos.checked_add_with_arr(np.array([n, n]), n)
    with tm.assertRaisesRegexp(OverflowError, msg):
        algos.checked_add_with_arr(np.array([n, n]), np.array([n, n]))
    with tm.assertRaisesRegexp(OverflowError, msg):
        algos.checked_add_with_arr(np.array([m, n]), np.array([n, n]))

    # A mask that leaves at least one overflowing element unmasked
    # must still raise.
    with tm.assertRaisesRegexp(OverflowError, msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                                   arr_mask=np.array([False, True]))
    with tm.assertRaisesRegexp(OverflowError, msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                                   b_mask=np.array([False, True]))
    with tm.assertRaisesRegexp(OverflowError, msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                                   arr_mask=np.array([False, True]),
                                   b_mask=np.array([False, True]))
    with tm.assertRaisesRegexp(OverflowError, msg):
        with tm.assert_produces_warning(RuntimeWarning):
            algos.checked_add_with_arr(np.array([m, m]),
                                       np.array([np.nan, m]))

    # Check that the nan boolean arrays override whether or not
    # the addition overflows. We don't check the result but just
    # the fact that an OverflowError is not raised. The calls are made
    # directly so that any exception (an OverflowError with any message,
    # or anything else unexpected) propagates and fails the test.
    algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                               arr_mask=np.array([True, True]))
    algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                               b_mask=np.array([True, True]))
    algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
                               arr_mask=np.array([True, False]),
                               b_mask=np.array([False, True]))
1179+
1180+
11321181
if __name__ == '__main__':
11331182
import nose
11341183
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/tests/test_nanops.py

-22
Original file line numberDiff line numberDiff line change
@@ -1002,28 +1002,6 @@ def prng(self):
10021002
return np.random.RandomState(1234)
10031003

10041004

1005-
def test_int64_add_overflow():
1006-
# see gh-14068
1007-
msg = "Overflow in int64 addition"
1008-
m = np.iinfo(np.int64).max
1009-
n = np.iinfo(np.int64).min
1010-
1011-
with tm.assertRaisesRegexp(OverflowError, msg):
1012-
nanops._checked_add_with_arr(np.array([m, m]), m)
1013-
with tm.assertRaisesRegexp(OverflowError, msg):
1014-
nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]))
1015-
with tm.assertRaisesRegexp(OverflowError, msg):
1016-
nanops._checked_add_with_arr(np.array([n, n]), n)
1017-
with tm.assertRaisesRegexp(OverflowError, msg):
1018-
nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n]))
1019-
with tm.assertRaisesRegexp(OverflowError, msg):
1020-
nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n]))
1021-
with tm.assertRaisesRegexp(OverflowError, msg):
1022-
with tm.assert_produces_warning(RuntimeWarning):
1023-
nanops._checked_add_with_arr(np.array([m, m]),
1024-
np.array([np.nan, m]))
1025-
1026-
10271005
if __name__ == '__main__':
10281006
import nose
10291007
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'

pandas/tseries/base.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
ABCPeriodIndex, ABCIndexClass)
1717
from pandas.types.missing import isnull
1818
from pandas.core import common as com, algorithms
19+
from pandas.core.algorithms import checked_add_with_arr
1920
from pandas.core.common import AbstractMethodError
2021

2122
import pandas.formats.printing as printing
@@ -684,7 +685,8 @@ def _add_delta_td(self, other):
684685
# return the i8 result view
685686

686687
inc = tslib._delta_to_nanoseconds(other)
687-
new_values = (self.asi8 + inc).view('i8')
688+
new_values = checked_add_with_arr(self.asi8, inc,
689+
arr_mask=self._isnan).view('i8')
688690
if self.hasnans:
689691
new_values[self._isnan] = tslib.iNaT
690692
return new_values.view('i8')
@@ -699,7 +701,9 @@ def _add_delta_tdi(self, other):
699701

700702
self_i8 = self.asi8
701703
other_i8 = other.asi8
702-
new_values = self_i8 + other_i8
704+
new_values = checked_add_with_arr(self_i8, other_i8,
705+
arr_mask=self._isnan,
706+
b_mask=other._isnan)
703707
if self.hasnans or other.hasnans:
704708
mask = (self._isnan) | (other._isnan)
705709
new_values[mask] = tslib.iNaT

pandas/tseries/tdi.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
import pandas.compat as compat
2121
from pandas.compat import u
2222
from pandas.tseries.frequencies import to_offset
23+
from pandas.core.algorithms import checked_add_with_arr
2324
from pandas.core.base import _shared_docs
24-
from pandas.core.nanops import _checked_add_with_arr
2525
from pandas.indexes.base import _index_shared_docs
2626
import pandas.core.common as com
2727
import pandas.types.concat as _concat
@@ -347,7 +347,7 @@ def _add_datelike(self, other):
347347
else:
348348
other = Timestamp(other)
349349
i8 = self.asi8
350-
result = _checked_add_with_arr(i8, other.value)
350+
result = checked_add_with_arr(i8, other.value)
351351
result = self._maybe_mask_results(result, fill_value=tslib.iNaT)
352352
return DatetimeIndex(result, name=self.name, copy=False)
353353

pandas/tseries/tests/test_timedeltas.py

+14
Original file line numberDiff line numberDiff line change
@@ -1964,6 +1964,20 @@ def test_add_overflow(self):
19641964
with tm.assertRaisesRegexp(OverflowError, msg):
19651965
Timestamp('2000') + to_timedelta([106580], 'D')
19661966

1967+
# These should not overflow!
1968+
exp = TimedeltaIndex([pd.NaT])
1969+
result = to_timedelta([pd.NaT]) - Timedelta('1 days')
1970+
tm.assert_index_equal(result, exp)
1971+
1972+
exp = TimedeltaIndex(['4 days', pd.NaT])
1973+
result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days')
1974+
tm.assert_index_equal(result, exp)
1975+
1976+
exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours'])
1977+
result = (to_timedelta([pd.NaT, '5 days', '1 hours']) +
1978+
to_timedelta(['7 seconds', pd.NaT, '4 hours']))
1979+
tm.assert_index_equal(result, exp)
1980+
19671981
if __name__ == '__main__':
19681982
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
19691983
exit=False)

0 commit comments

Comments
 (0)