Skip to content

Commit ac5ec9d

Browse files
gfyoung authored and ischurov committed
BUG: Prevent addition overflow with TimedeltaIndex (pandas-dev#14816)
Expands the checked-add array addition introduced in pandas-dev gh-14237 to include all other addition cases (i.e. TimedeltaIndex and Timedelta). Follow-up to pandas-dev gh-14453. In addition, moves the checked-add function to core/algorithms.
1 parent a189108 commit ac5ec9d

File tree

9 files changed

+185
-82
lines changed

9 files changed

+185
-82
lines changed

asv_bench/benchmarks/algorithms.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,17 @@ def setup(self):
1818
self.float = pd.Float64Index(np.random.randn(N).repeat(5))
1919

2020
# Convenience naming.
21-
self.checked_add = pd.core.nanops._checked_add_with_arr
21+
self.checked_add = pd.core.algorithms.checked_add_with_arr
2222

2323
self.arr = np.arange(1000000)
2424
self.arrpos = np.arange(1000000)
2525
self.arrneg = np.arange(-1000000, 0)
2626
self.arrmixed = np.array([1, -1]).repeat(500000)
2727
self.strings = tm.makeStringIndex(100000)
2828

29+
self.arr_nan = np.random.choice([True, False], size=1000000)
30+
self.arrmixed_nan = np.random.choice([True, False], size=1000000)
31+
2932
# match
3033
self.uniques = tm.makeStringIndex(1000).values
3134
self.all = self.uniques.repeat(10)
@@ -69,6 +72,16 @@ def time_add_overflow_neg_arr(self):
6972
def time_add_overflow_mixed_arr(self):
7073
self.checked_add(self.arr, self.arrmixed)
7174

75+
def time_add_overflow_first_arg_nan(self):
76+
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan)
77+
78+
def time_add_overflow_second_arg_nan(self):
79+
self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan)
80+
81+
def time_add_overflow_both_arg_nan(self):
82+
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan,
83+
b_mask=self.arrmixed_nan)
84+
7285

7386
class Hashing(object):
7487
goal_time = 0.2

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ Performance Improvements
234234
Bug Fixes
235235
~~~~~~~~~
236236

237+
- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
237238
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises an error with ``astype()`` for Series and DataFrames (:issue:`14265`)
238239

239240

pandas/core/algorithms.py

+90
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
_ensure_float64,
2828
_ensure_int64,
2929
is_list_like)
30+
from pandas.compat.numpy import _np_version_under1p10
3031
from pandas.types.missing import isnull
3132

3233
import pandas.core.common as com
@@ -550,6 +551,95 @@ def rank(values, axis=0, method='average', na_option='keep',
550551

551552
return ranks
552553

554+
555+
def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
    """
    Perform array addition that checks for underflow and overflow.

    Performs the addition of an int64 array and an int64 integer (or array)
    but checks that they do not result in overflow first. For elements that
    are indicated to be NaN, whether or not there is overflow for that element
    is automatically ignored.

    Parameters
    ----------
    arr : array addend.
    b : array or scalar addend.
    arr_mask : boolean array or None
        array indicating which elements to exclude from checking
    b_mask : boolean array or boolean or None
        array or scalar indicating which element(s) to exclude from checking

    Returns
    -------
    sum : An array for elements x + b for each element x in arr if b is
          a scalar or an array for elements x + y for each element pair
          (x, y) in (arr, b).

    Raises
    ------
    OverflowError if any x + y exceeds the maximum or minimum int64 value.

    Notes
    -----
    Masked positions are only excluded from the overflow *check*; the
    returned array is still the plain ``arr + b``, so callers are expected
    to overwrite those positions themselves.
    """
    def _broadcast(arr_or_scalar, shape):
        """
        Helper function to broadcast arrays / scalars to the desired shape.
        """
        # Probe for the function rather than a version flag so the helper
        # only depends on numpy itself.
        broadcast_to = getattr(np, 'broadcast_to', None)
        if broadcast_to is not None:
            return broadcast_to(arr_or_scalar, shape)

        # Fallback for numpy releases without 'broadcast_to'.
        if np.ndim(arr_or_scalar) == 0:
            # Keep the dtype of the scalar: a default (float64) buffer
            # would round large int64 values and would turn a boolean
            # mask into floats, breaking the '|' / '&' operations below.
            out = np.empty(shape, dtype=np.asarray(arr_or_scalar).dtype)
            out.fill(arr_or_scalar)
            return out
        return arr_or_scalar

    # For performance reasons, we broadcast 'b' to the new array 'b2'
    # so that it has the same size as 'arr'.
    b2 = _broadcast(b, arr.shape)

    # We do the same broadcasting for b_mask as well.
    b2_mask = None if b_mask is None else _broadcast(b_mask, arr.shape)

    # For elements that are NaN, regardless of their value, we should
    # ignore whether they overflow or not when doing the checked add.
    if arr_mask is not None and b2_mask is not None:
        not_nan = np.logical_not(arr_mask | b2_mask)
    elif arr_mask is not None:
        not_nan = np.logical_not(arr_mask)
    elif b2_mask is not None:
        not_nan = np.logical_not(b2_mask)
    else:
        not_nan = np.ones(arr.shape, dtype=bool)

    i8max = np.iinfo(np.int64).max
    i8min = np.iinfo(np.int64).min

    # gh-14324: For each element in 'arr' and its corresponding element
    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
    # we then check whether its sum with the element in 'arr' exceeds
    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
    # is negative, we then check whether its sum with the element in
    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
    # error as well.
    #
    # The bound is only ever combined with elements of the matching sign
    # ('max - positive', 'min - negative'), so the subtraction used for
    # the check cannot itself overflow.
    mask1 = b2 > 0
    mask2 = b2 < 0

    if not mask1.any():
        # No positive addends: only underflow is possible.
        to_raise = ((i8min - b2 > arr) & not_nan).any()
    elif not mask2.any():
        # No negative addends: only overflow is possible.
        to_raise = ((i8max - b2 < arr) & not_nan).any()
    else:
        # Mixed signs: check each sign group against its own bound.
        to_raise = (((i8max - b2[mask1] < arr[mask1]) &
                     not_nan[mask1]).any() or
                    ((i8min - b2[mask2] > arr[mask2]) &
                     not_nan[mask2]).any())

    if to_raise:
        raise OverflowError("Overflow in int64 addition")
    return arr + b
641+
642+
553643
_rank1d_functions = {
554644
'float64': algos.rank_1d_float64,
555645
'int64': algos.rank_1d_int64,

pandas/core/nanops.py

-55
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111

1212
import pandas.hashtable as _hash
1313
from pandas import compat, lib, algos, tslib
14-
from pandas.compat.numpy import _np_version_under1p10
1514
from pandas.types.common import (_ensure_int64, _ensure_object,
1615
_ensure_float64, _get_dtype,
1716
is_float, is_scalar,
@@ -810,57 +809,3 @@ def unique1d(values):
810809
table = _hash.PyObjectHashTable(len(values))
811810
uniques = table.unique(_ensure_object(values))
812811
return uniques
813-
814-
815-
def _checked_add_with_arr(arr, b):
816-
"""
817-
Performs the addition of an int64 array and an int64 integer (or array)
818-
but checks that they do not result in overflow first.
819-
820-
Parameters
821-
----------
822-
arr : array addend.
823-
b : array or scalar addend.
824-
825-
Returns
826-
-------
827-
sum : An array for elements x + b for each element x in arr if b is
828-
a scalar or an array for elements x + y for each element pair
829-
(x, y) in (arr, b).
830-
831-
Raises
832-
------
833-
OverflowError if any x + y exceeds the maximum or minimum int64 value.
834-
"""
835-
# For performance reasons, we broadcast 'b' to the new array 'b2'
836-
# so that it has the same size as 'arr'.
837-
if _np_version_under1p10:
838-
if lib.isscalar(b):
839-
b2 = np.empty(arr.shape)
840-
b2.fill(b)
841-
else:
842-
b2 = b
843-
else:
844-
b2 = np.broadcast_to(b, arr.shape)
845-
846-
# gh-14324: For each element in 'arr' and its corresponding element
847-
# in 'b2', we check the sign of the element in 'b2'. If it is positive,
848-
# we then check whether its sum with the element in 'arr' exceeds
849-
# np.iinfo(np.int64).max. If so, we have an overflow error. If it
850-
# it is negative, we then check whether its sum with the element in
851-
# 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
852-
# error as well.
853-
mask1 = b2 > 0
854-
mask2 = b2 < 0
855-
856-
if not mask1.any():
857-
to_raise = (np.iinfo(np.int64).min - b2 > arr).any()
858-
elif not mask2.any():
859-
to_raise = (np.iinfo(np.int64).max - b2 < arr).any()
860-
else:
861-
to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or
862-
(np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any())
863-
864-
if to_raise:
865-
raise OverflowError("Overflow in int64 addition")
866-
return arr + b

pandas/tests/test_algos.py

+49
Original file line numberDiff line numberDiff line change
@@ -1129,6 +1129,55 @@ def test_ensure_platform_int():
11291129
assert (result is arr)
11301130

11311131

1132+
def test_int64_add_overflow():
1133+
# see gh-14068
1134+
msg = "Overflow in int64 addition"
1135+
m = np.iinfo(np.int64).max
1136+
n = np.iinfo(np.int64).min
1137+
1138+
with tm.assertRaisesRegexp(OverflowError, msg):
1139+
algos.checked_add_with_arr(np.array([m, m]), m)
1140+
with tm.assertRaisesRegexp(OverflowError, msg):
1141+
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]))
1142+
with tm.assertRaisesRegexp(OverflowError, msg):
1143+
algos.checked_add_with_arr(np.array([n, n]), n)
1144+
with tm.assertRaisesRegexp(OverflowError, msg):
1145+
algos.checked_add_with_arr(np.array([n, n]), np.array([n, n]))
1146+
with tm.assertRaisesRegexp(OverflowError, msg):
1147+
algos.checked_add_with_arr(np.array([m, n]), np.array([n, n]))
1148+
with tm.assertRaisesRegexp(OverflowError, msg):
1149+
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
1150+
arr_mask=np.array([False, True]))
1151+
with tm.assertRaisesRegexp(OverflowError, msg):
1152+
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
1153+
b_mask=np.array([False, True]))
1154+
with tm.assertRaisesRegexp(OverflowError, msg):
1155+
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
1156+
arr_mask=np.array([False, True]),
1157+
b_mask=np.array([False, True]))
1158+
with tm.assertRaisesRegexp(OverflowError, msg):
1159+
with tm.assert_produces_warning(RuntimeWarning):
1160+
algos.checked_add_with_arr(np.array([m, m]),
1161+
np.array([np.nan, m]))
1162+
1163+
# Check that the nan boolean arrays override whether or not
1164+
# the addition overflows. We don't check the result but just
1165+
# the fact that an OverflowError is not raised.
1166+
with tm.assertRaises(AssertionError):
1167+
with tm.assertRaisesRegexp(OverflowError, msg):
1168+
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
1169+
arr_mask=np.array([True, True]))
1170+
with tm.assertRaises(AssertionError):
1171+
with tm.assertRaisesRegexp(OverflowError, msg):
1172+
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
1173+
b_mask=np.array([True, True]))
1174+
with tm.assertRaises(AssertionError):
1175+
with tm.assertRaisesRegexp(OverflowError, msg):
1176+
algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
1177+
arr_mask=np.array([True, False]),
1178+
b_mask=np.array([False, True]))
1179+
1180+
11321181
if __name__ == '__main__':
11331182
import nose
11341183
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/tests/test_nanops.py

-22
Original file line numberDiff line numberDiff line change
@@ -1002,28 +1002,6 @@ def prng(self):
10021002
return np.random.RandomState(1234)
10031003

10041004

1005-
def test_int64_add_overflow():
1006-
# see gh-14068
1007-
msg = "Overflow in int64 addition"
1008-
m = np.iinfo(np.int64).max
1009-
n = np.iinfo(np.int64).min
1010-
1011-
with tm.assertRaisesRegexp(OverflowError, msg):
1012-
nanops._checked_add_with_arr(np.array([m, m]), m)
1013-
with tm.assertRaisesRegexp(OverflowError, msg):
1014-
nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]))
1015-
with tm.assertRaisesRegexp(OverflowError, msg):
1016-
nanops._checked_add_with_arr(np.array([n, n]), n)
1017-
with tm.assertRaisesRegexp(OverflowError, msg):
1018-
nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n]))
1019-
with tm.assertRaisesRegexp(OverflowError, msg):
1020-
nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n]))
1021-
with tm.assertRaisesRegexp(OverflowError, msg):
1022-
with tm.assert_produces_warning(RuntimeWarning):
1023-
nanops._checked_add_with_arr(np.array([m, m]),
1024-
np.array([np.nan, m]))
1025-
1026-
10271005
if __name__ == '__main__':
10281006
import nose
10291007
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'

pandas/tseries/base.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
ABCPeriodIndex, ABCIndexClass)
1717
from pandas.types.missing import isnull
1818
from pandas.core import common as com, algorithms
19+
from pandas.core.algorithms import checked_add_with_arr
1920
from pandas.core.common import AbstractMethodError
2021

2122
import pandas.formats.printing as printing
@@ -688,7 +689,8 @@ def _add_delta_td(self, other):
688689
# return the i8 result view
689690

690691
inc = tslib._delta_to_nanoseconds(other)
691-
new_values = (self.asi8 + inc).view('i8')
692+
new_values = checked_add_with_arr(self.asi8, inc,
693+
arr_mask=self._isnan).view('i8')
692694
if self.hasnans:
693695
new_values[self._isnan] = tslib.iNaT
694696
return new_values.view('i8')
@@ -703,7 +705,9 @@ def _add_delta_tdi(self, other):
703705

704706
self_i8 = self.asi8
705707
other_i8 = other.asi8
706-
new_values = self_i8 + other_i8
708+
new_values = checked_add_with_arr(self_i8, other_i8,
709+
arr_mask=self._isnan,
710+
b_mask=other._isnan)
707711
if self.hasnans or other.hasnans:
708712
mask = (self._isnan) | (other._isnan)
709713
new_values[mask] = tslib.iNaT

pandas/tseries/tdi.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
import pandas.compat as compat
2121
from pandas.compat import u
2222
from pandas.tseries.frequencies import to_offset
23+
from pandas.core.algorithms import checked_add_with_arr
2324
from pandas.core.base import _shared_docs
24-
from pandas.core.nanops import _checked_add_with_arr
2525
from pandas.indexes.base import _index_shared_docs
2626
import pandas.core.common as com
2727
import pandas.types.concat as _concat
@@ -347,7 +347,7 @@ def _add_datelike(self, other):
347347
else:
348348
other = Timestamp(other)
349349
i8 = self.asi8
350-
result = _checked_add_with_arr(i8, other.value)
350+
result = checked_add_with_arr(i8, other.value)
351351
result = self._maybe_mask_results(result, fill_value=tslib.iNaT)
352352
return DatetimeIndex(result, name=self.name, copy=False)
353353

pandas/tseries/tests/test_timedeltas.py

+23
Original file line numberDiff line numberDiff line change
@@ -1958,11 +1958,34 @@ def test_add_overflow(self):
19581958
with tm.assertRaisesRegexp(OverflowError, msg):
19591959
Timestamp('2000') + to_timedelta(106580, 'D')
19601960

1961+
_NaT = int(pd.NaT) + 1
19611962
msg = "Overflow in int64 addition"
19621963
with tm.assertRaisesRegexp(OverflowError, msg):
19631964
to_timedelta([106580], 'D') + Timestamp('2000')
19641965
with tm.assertRaisesRegexp(OverflowError, msg):
19651966
Timestamp('2000') + to_timedelta([106580], 'D')
1967+
with tm.assertRaisesRegexp(OverflowError, msg):
1968+
to_timedelta([_NaT]) - Timedelta('1 days')
1969+
with tm.assertRaisesRegexp(OverflowError, msg):
1970+
to_timedelta(['5 days', _NaT]) - Timedelta('1 days')
1971+
with tm.assertRaisesRegexp(OverflowError, msg):
1972+
(to_timedelta([_NaT, '5 days', '1 hours']) -
1973+
to_timedelta(['7 seconds', _NaT, '4 hours']))
1974+
1975+
# These should not overflow!
1976+
exp = TimedeltaIndex([pd.NaT])
1977+
result = to_timedelta([pd.NaT]) - Timedelta('1 days')
1978+
tm.assert_index_equal(result, exp)
1979+
1980+
exp = TimedeltaIndex(['4 days', pd.NaT])
1981+
result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days')
1982+
tm.assert_index_equal(result, exp)
1983+
1984+
exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours'])
1985+
result = (to_timedelta([pd.NaT, '5 days', '1 hours']) +
1986+
to_timedelta(['7 seconds', pd.NaT, '4 hours']))
1987+
tm.assert_index_equal(result, exp)
1988+
19661989

19671990
if __name__ == '__main__':
19681991
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)