Skip to content

Commit 7990eec

Browse files
committed
BUG: Prevent addition overflow with TimedeltaIndex
Expands checked-add array addition introduced in gh-14237 to include all other addition cases (i.e. TimedeltaIndex and Timedelta). Follow-up to gh-14453.
1 parent b6de920 commit 7990eec

File tree

5 files changed

+108
-16
lines changed

5 files changed

+108
-16
lines changed

asv_bench/benchmarks/algorithms.py

+13
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ def setup(self):
2424
self.arrneg = np.arange(-1000000, 0)
2525
self.arrmixed = np.array([1, -1]).repeat(500000)
2626

27+
self.arr_nan = np.random.choice([True, False], size=1000000)
28+
self.arrmixed_nan = np.random.choice([True, False], size=1000000)
29+
2730
# match
2831
self.uniques = tm.makeStringIndex(1000).values
2932
self.all = self.uniques.repeat(10)
@@ -64,6 +67,16 @@ def time_add_overflow_neg_arr(self):
6467
def time_add_overflow_mixed_arr(self):
6568
self.checked_add(self.arr, self.arrmixed)
6669

70+
def time_add_overflow_first_arg_nan(self):
71+
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan)
72+
73+
def time_add_overflow_second_arg_nan(self):
74+
self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan)
75+
76+
def time_add_overflow_both_arg_nan(self):
77+
self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan,
78+
b_mask=self.arrmixed_nan)
79+
6780

6881
class Hashing(object):
6982
goal_time = 0.2

pandas/core/nanops.py

+48-14
Original file line numberDiff line numberDiff line change
@@ -812,15 +812,23 @@ def unique1d(values):
812812
return uniques
813813

814814

815-
def _checked_add_with_arr(arr, b):
815+
def _checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
816816
"""
817+
Perform array addition that checks for underflow and overflow.
818+
817819
Performs the addition of an int64 array and an int64 integer (or array)
818-
but checks that they do not result in overflow first.
820+
but checks that they do not result in overflow first. For elements that
821+
are indicated to be NaN, whether or not there is overflow for that element
822+
is automatically ignored.
819823
820824
Parameters
821825
----------
822826
arr : array addend.
823827
b : array or scalar addend.
828+
arr_mask : boolean array or None
829+
array indicating which elements to exclude from checking
830+
b_mask : boolean array or boolean or None
831+
array or scalar indicating which element(s) to exclude from checking
824832
825833
Returns
826834
-------
@@ -832,16 +840,40 @@ def _checked_add_with_arr(arr, b):
832840
------
833841
OverflowError if any x + y exceeds the maximum or minimum int64 value.
834842
"""
835-
# For performance reasons, we broadcast 'b' to the new array 'b2'
836-
# so that it has the same size as 'arr'.
837-
if _np_version_under1p10:
838-
if lib.isscalar(b):
839-
b2 = np.empty(arr.shape)
840-
b2.fill(b)
843+
def _broadcast(arr_or_scalar, shape):
844+
"""
845+
Helper function to broadcast arrays / scalars to the desired shape.
846+
847+
This function is compatible with different versions of NumPy and is
848+
implemented for performance reasons.
849+
"""
850+
if _np_version_under1p10:
851+
if lib.isscalar(arr_or_scalar):
852+
out = np.empty(shape)
853+
out.fill(arr_or_scalar)
854+
else:
855+
out = arr_or_scalar
841856
else:
842-
b2 = b
857+
out = np.broadcast_to(arr_or_scalar, shape)
858+
return out
859+
860+
b2 = _broadcast(b, arr.shape)
861+
if b_mask is not None:
862+
b2_mask = _broadcast(b_mask, arr.shape)
863+
else:
864+
b2_mask = None
865+
866+
# For elements that are NaN, regardless of their value, we should
867+
# ignore whether they overflow or not when doing the checked add.
868+
if arr_mask is not None and b2_mask is not None:
869+
not_nan = np.logical_not(arr_mask | b2_mask)
870+
elif arr_mask is not None:
871+
not_nan = np.logical_not(arr_mask)
872+
elif b_mask is not None:
873+
not_nan = np.logical_not(b2_mask)
843874
else:
844-
b2 = np.broadcast_to(b, arr.shape)
875+
not_nan = np.empty(arr.shape, dtype=bool)
876+
not_nan.fill(True)
845877

846878
# gh-14324: For each element in 'arr' and its corresponding element
847879
# in 'b2', we check the sign of the element in 'b2'. If it is positive,
@@ -854,12 +886,14 @@ def _checked_add_with_arr(arr, b):
854886
mask2 = b2 < 0
855887

856888
if not mask1.any():
857-
to_raise = (np.iinfo(np.int64).min - b2 > arr).any()
889+
to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()
858890
elif not mask2.any():
859-
to_raise = (np.iinfo(np.int64).max - b2 < arr).any()
891+
to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
860892
else:
861-
to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or
862-
(np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any())
893+
to_raise = (((np.iinfo(np.int64).max -
894+
b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or
895+
((np.iinfo(np.int64).min -
896+
b2[mask2] > arr[mask2]) & not_nan[mask2]).any())
863897

864898
if to_raise:
865899
raise OverflowError("Overflow in int64 addition")

pandas/tests/test_nanops.py

+27
Original file line numberDiff line numberDiff line change
@@ -1018,11 +1018,38 @@ def test_int64_add_overflow():
10181018
nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n]))
10191019
with tm.assertRaisesRegexp(OverflowError, msg):
10201020
nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n]))
1021+
with tm.assertRaisesRegexp(OverflowError, msg):
1022+
nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]),
1023+
arr_mask=np.array([False, True]))
1024+
with tm.assertRaisesRegexp(OverflowError, msg):
1025+
nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]),
1026+
b_mask=np.array([False, True]))
1027+
with tm.assertRaisesRegexp(OverflowError, msg):
1028+
nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]),
1029+
arr_mask=np.array([False, True]),
1030+
b_mask=np.array([False, True]))
10211031
with tm.assertRaisesRegexp(OverflowError, msg):
10221032
with tm.assert_produces_warning(RuntimeWarning):
10231033
nanops._checked_add_with_arr(np.array([m, m]),
10241034
np.array([np.nan, m]))
10251035

1036+
# Check that the nan boolean arrays override whether or not
1037+
# the addition overflows. We don't check the result but just
1038+
# the fact that an OverflowError is not raised.
1039+
with tm.assertRaises(AssertionError):
1040+
with tm.assertRaisesRegexp(OverflowError, msg):
1041+
nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]),
1042+
arr_mask=np.array([True, True]))
1043+
with tm.assertRaises(AssertionError):
1044+
with tm.assertRaisesRegexp(OverflowError, msg):
1045+
nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]),
1046+
b_mask=np.array([True, True]))
1047+
with tm.assertRaises(AssertionError):
1048+
with tm.assertRaisesRegexp(OverflowError, msg):
1049+
nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]),
1050+
arr_mask=np.array([True, False]),
1051+
b_mask=np.array([False, True]))
1052+
10261053

10271054
if __name__ == '__main__':
10281055
import nose

pandas/tseries/base.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pandas.types.missing import isnull
1818
from pandas.core import common as com, algorithms
1919
from pandas.core.common import AbstractMethodError
20+
from pandas.core.nanops import _checked_add_with_arr
2021

2122
import pandas.formats.printing as printing
2223
import pandas.tslib as tslib
@@ -684,7 +685,8 @@ def _add_delta_td(self, other):
684685
# return the i8 result view
685686

686687
inc = tslib._delta_to_nanoseconds(other)
687-
new_values = (self.asi8 + inc).view('i8')
688+
new_values = _checked_add_with_arr(self.asi8, inc,
689+
arr_mask=self._isnan).view('i8')
688690
if self.hasnans:
689691
new_values[self._isnan] = tslib.iNaT
690692
return new_values.view('i8')
@@ -699,7 +701,9 @@ def _add_delta_tdi(self, other):
699701

700702
self_i8 = self.asi8
701703
other_i8 = other.asi8
702-
new_values = self_i8 + other_i8
704+
new_values = _checked_add_with_arr(self_i8, other_i8,
705+
arr_mask=self._isnan,
706+
b_mask=other._isnan)
703707
if self.hasnans or other.hasnans:
704708
mask = (self._isnan) | (other._isnan)
705709
new_values[mask] = tslib.iNaT

pandas/tseries/tests/test_timedeltas.py

+14
Original file line numberDiff line numberDiff line change
@@ -1964,6 +1964,20 @@ def test_add_overflow(self):
19641964
with tm.assertRaisesRegexp(OverflowError, msg):
19651965
Timestamp('2000') + to_timedelta([106580], 'D')
19661966

1967+
# These should not overflow!
1968+
exp = TimedeltaIndex([pd.NaT])
1969+
result = to_timedelta([pd.NaT]) - Timedelta('1 days')
1970+
tm.assert_index_equal(result, exp)
1971+
1972+
exp = TimedeltaIndex(['4 days', pd.NaT])
1973+
result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days')
1974+
tm.assert_index_equal(result, exp)
1975+
1976+
exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours'])
1977+
result = (to_timedelta([pd.NaT, '5 days', '1 hours']) +
1978+
to_timedelta(['7 seconds', pd.NaT, '4 hours']))
1979+
tm.assert_index_equal(result, exp)
1980+
19671981
if __name__ == '__main__':
19681982
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
19691983
exit=False)

0 commit comments

Comments
 (0)