BUG: Prevent addition overflow with TimedeltaIndex (pandas-dev#14816)

gfyoung · ischurov · commit ac5ec9d1e8bc · 2016-12-19T04:05:03.000+03:00
Expands checked-add array addition introduced in pandas-devgh-14237 to include all other addition cases (i.e. TimedeltaIndex and Timedelta). Follow-up to pandas-devgh-14453. In addition, move checked add function to core/algorithms.
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
@@ -18,14 +18,17 @@ def setup(self):
         self.float = pd.Float64Index(np.random.randn(N).repeat(5))
 
         # Convenience naming.
-        self.checked_add = pd.core.nanops._checked_add_with_arr
+        self.checked_add = pd.core.algorithms.checked_add_with_arr
 
         self.arr = np.arange(1000000)
         self.arrpos = np.arange(1000000)
         self.arrneg = np.arange(-1000000, 0)
         self.arrmixed = np.array([1, -1]).repeat(500000)
         self.strings = tm.makeStringIndex(100000)
 
+        self.arr_nan = np.random.choice([True, False], size=1000000)
+        self.arrmixed_nan = np.random.choice([True, False], size=1000000)
+
         # match
         self.uniques = tm.makeStringIndex(1000).values
         self.all = self.uniques.repeat(10)
@@ -69,6 +72,16 @@ def time_add_overflow_neg_arr(self):
     def time_add_overflow_mixed_arr(self):
         self.checked_add(self.arr, self.arrmixed)
 
+    def time_add_overflow_first_arg_nan(self):
+        self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan)
+
+    def time_add_overflow_second_arg_nan(self):
+        self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan)
+
+    def time_add_overflow_both_arg_nan(self):
+        self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan,
+                         b_mask=self.arrmixed_nan)
+
 
 class Hashing(object):
     goal_time = 0.2
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -234,6 +234,7 @@ Performance Improvements
 Bug Fixes
 ~~~~~~~~~
 
+- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
 - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`)
 
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -27,6 +27,7 @@
                                  _ensure_float64,
                                  _ensure_int64,
                                  is_list_like)
+from pandas.compat.numpy import _np_version_under1p10
 from pandas.types.missing import isnull
 
 import pandas.core.common as com
@@ -550,6 +551,95 @@ def rank(values, axis=0, method='average', na_option='keep',
 
     return ranks
 
+
+def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None):
+    """
+    Perform array addition that checks for underflow and overflow.
+
+    Performs the addition of an int64 array and an int64 integer (or array)
+    but checks that they do not result in overflow first. For elements that
+    are indicated to be NaN, whether or not there is overflow for that element
+    is automatically ignored.
+
+    Parameters
+    ----------
+    arr : array addend.
+    b : array or scalar addend.
+    arr_mask : boolean array or None
+        array indicating which elements to exclude from checking
+    b_mask : boolean array or boolean or None
+        array or scalar indicating which element(s) to exclude from checking
+
+    Returns
+    -------
+    sum : An array for elements x + b for each element x in arr if b is
+          a scalar or an array for elements x + y for each element pair
+          (x, y) in (arr, b).
+
+    Raises
+    ------
+    OverflowError if any x + y exceeds the maximum or minimum int64 value.
+    """
+    def _broadcast(arr_or_scalar, shape):
+        """
+        Helper function to broadcast arrays / scalars to the desired shape.
+        """
+        if _np_version_under1p10:
+            if lib.isscalar(arr_or_scalar):
+                out = np.empty(shape)
+                out.fill(arr_or_scalar)
+            else:
+                out = arr_or_scalar
+        else:
+            out = np.broadcast_to(arr_or_scalar, shape)
+        return out
+
+    # For performance reasons, we broadcast 'b' to the new array 'b2'
+    # so that it has the same size as 'arr'.
+    b2 = _broadcast(b, arr.shape)
+    if b_mask is not None:
+        # We do the same broadcasting for b_mask as well.
+        b2_mask = _broadcast(b_mask, arr.shape)
+    else:
+        b2_mask = None
+
+    # For elements that are NaN, regardless of their value, we should
+    # ignore whether they overflow or not when doing the checked add.
+    if arr_mask is not None and b2_mask is not None:
+        not_nan = np.logical_not(arr_mask | b2_mask)
+    elif arr_mask is not None:
+        not_nan = np.logical_not(arr_mask)
+    elif b_mask is not None:
+        not_nan = np.logical_not(b2_mask)
+    else:
+        not_nan = np.empty(arr.shape, dtype=bool)
+        not_nan.fill(True)
+
+    # gh-14324: For each element in 'arr' and its corresponding element
+    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
+    # we then check whether its sum with the element in 'arr' exceeds
+    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
+    # it is negative, we then check whether its sum with the element in
+    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
+    # error as well.
+    mask1 = b2 > 0
+    mask2 = b2 < 0
+
+    if not mask1.any():
+        to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any()
+    elif not mask2.any():
+        to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any()
+    else:
+        to_raise = (((np.iinfo(np.int64).max -
+                      b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or
+                    ((np.iinfo(np.int64).min -
+                      b2[mask2] > arr[mask2]) & not_nan[mask2]).any())
+
+    if to_raise:
+        raise OverflowError("Overflow in int64 addition")
+    return arr + b
+
+
 _rank1d_functions = {
     'float64': algos.rank_1d_float64,
     'int64': algos.rank_1d_int64,
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -11,7 +11,6 @@
 
 import pandas.hashtable as _hash
 from pandas import compat, lib, algos, tslib
-from pandas.compat.numpy import _np_version_under1p10
 from pandas.types.common import (_ensure_int64, _ensure_object,
                                  _ensure_float64, _get_dtype,
                                  is_float, is_scalar,
@@ -810,57 +809,3 @@ def unique1d(values):
         table = _hash.PyObjectHashTable(len(values))
         uniques = table.unique(_ensure_object(values))
     return uniques
-
-
-def _checked_add_with_arr(arr, b):
-    """
-    Performs the addition of an int64 array and an int64 integer (or array)
-    but checks that they do not result in overflow first.
-
-    Parameters
-    ----------
-    arr : array addend.
-    b : array or scalar addend.
-
-    Returns
-    -------
-    sum : An array for elements x + b for each element x in arr if b is
-          a scalar or an array for elements x + y for each element pair
-          (x, y) in (arr, b).
-
-    Raises
-    ------
-    OverflowError if any x + y exceeds the maximum or minimum int64 value.
-    """
-    # For performance reasons, we broadcast 'b' to the new array 'b2'
-    # so that it has the same size as 'arr'.
-    if _np_version_under1p10:
-        if lib.isscalar(b):
-            b2 = np.empty(arr.shape)
-            b2.fill(b)
-        else:
-            b2 = b
-    else:
-        b2 = np.broadcast_to(b, arr.shape)
-
-    # gh-14324: For each element in 'arr' and its corresponding element
-    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
-    # we then check whether its sum with the element in 'arr' exceeds
-    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
-    # it is negative, we then check whether its sum with the element in
-    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
-    # error as well.
-    mask1 = b2 > 0
-    mask2 = b2 < 0
-
-    if not mask1.any():
-        to_raise = (np.iinfo(np.int64).min - b2 > arr).any()
-    elif not mask2.any():
-        to_raise = (np.iinfo(np.int64).max - b2 < arr).any()
-    else:
-        to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or
-                    (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any())
-
-    if to_raise:
-        raise OverflowError("Overflow in int64 addition")
-    return arr + b
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -1129,6 +1129,55 @@ def test_ensure_platform_int():
     assert (result is arr)
 
 
+def test_int64_add_overflow():
+    # see gh-14068
+    msg = "Overflow in int64 addition"
+    m = np.iinfo(np.int64).max
+    n = np.iinfo(np.int64).min
+
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        algos.checked_add_with_arr(np.array([m, m]), m)
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]))
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        algos.checked_add_with_arr(np.array([n, n]), n)
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        algos.checked_add_with_arr(np.array([n, n]), np.array([n, n]))
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        algos.checked_add_with_arr(np.array([m, n]), np.array([n, n]))
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+                                   arr_mask=np.array([False, True]))
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+                                   b_mask=np.array([False, True]))
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+                                   arr_mask=np.array([False, True]),
+                                   b_mask=np.array([False, True]))
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        with tm.assert_produces_warning(RuntimeWarning):
+            algos.checked_add_with_arr(np.array([m, m]),
+                                       np.array([np.nan, m]))
+
+    # Check that the nan boolean arrays override whether or not
+    # the addition overflows. We don't check the result but just
+    # the fact that an OverflowError is not raised.
+    with tm.assertRaises(AssertionError):
+        with tm.assertRaisesRegexp(OverflowError, msg):
+            algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+                                       arr_mask=np.array([True, True]))
+    with tm.assertRaises(AssertionError):
+        with tm.assertRaisesRegexp(OverflowError, msg):
+            algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+                                       b_mask=np.array([True, True]))
+    with tm.assertRaises(AssertionError):
+        with tm.assertRaisesRegexp(OverflowError, msg):
+            algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]),
+                                       arr_mask=np.array([True, False]),
+                                       b_mask=np.array([False, True]))
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
@@ -1002,28 +1002,6 @@ def prng(self):
         return np.random.RandomState(1234)
 
 
-def test_int64_add_overflow():
-    # see gh-14068
-    msg = "Overflow in int64 addition"
-    m = np.iinfo(np.int64).max
-    n = np.iinfo(np.int64).min
-
-    with tm.assertRaisesRegexp(OverflowError, msg):
-        nanops._checked_add_with_arr(np.array([m, m]), m)
-    with tm.assertRaisesRegexp(OverflowError, msg):
-        nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]))
-    with tm.assertRaisesRegexp(OverflowError, msg):
-        nanops._checked_add_with_arr(np.array([n, n]), n)
-    with tm.assertRaisesRegexp(OverflowError, msg):
-        nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n]))
-    with tm.assertRaisesRegexp(OverflowError, msg):
-        nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n]))
-    with tm.assertRaisesRegexp(OverflowError, msg):
-        with tm.assert_produces_warning(RuntimeWarning):
-            nanops._checked_add_with_arr(np.array([m, m]),
-                                         np.array([np.nan, m]))
-
-
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
@@ -16,6 +16,7 @@
                                   ABCPeriodIndex, ABCIndexClass)
 from pandas.types.missing import isnull
 from pandas.core import common as com, algorithms
+from pandas.core.algorithms import checked_add_with_arr
 from pandas.core.common import AbstractMethodError
 
 import pandas.formats.printing as printing
@@ -688,7 +689,8 @@ def _add_delta_td(self, other):
         # return the i8 result view
 
         inc = tslib._delta_to_nanoseconds(other)
-        new_values = (self.asi8 + inc).view('i8')
+        new_values = checked_add_with_arr(self.asi8, inc,
+                                          arr_mask=self._isnan).view('i8')
         if self.hasnans:
             new_values[self._isnan] = tslib.iNaT
         return new_values.view('i8')
@@ -703,7 +705,9 @@ def _add_delta_tdi(self, other):
 
         self_i8 = self.asi8
         other_i8 = other.asi8
-        new_values = self_i8 + other_i8
+        new_values = checked_add_with_arr(self_i8, other_i8,
+                                          arr_mask=self._isnan,
+                                          b_mask=other._isnan)
         if self.hasnans or other.hasnans:
             mask = (self._isnan) | (other._isnan)
             new_values[mask] = tslib.iNaT
diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py
@@ -20,8 +20,8 @@
 import pandas.compat as compat
 from pandas.compat import u
 from pandas.tseries.frequencies import to_offset
+from pandas.core.algorithms import checked_add_with_arr
 from pandas.core.base import _shared_docs
-from pandas.core.nanops import _checked_add_with_arr
 from pandas.indexes.base import _index_shared_docs
 import pandas.core.common as com
 import pandas.types.concat as _concat
@@ -347,7 +347,7 @@ def _add_datelike(self, other):
         else:
             other = Timestamp(other)
             i8 = self.asi8
-            result = _checked_add_with_arr(i8, other.value)
+            result = checked_add_with_arr(i8, other.value)
             result = self._maybe_mask_results(result, fill_value=tslib.iNaT)
         return DatetimeIndex(result, name=self.name, copy=False)
 
diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py
@@ -1958,11 +1958,34 @@ def test_add_overflow(self):
         with tm.assertRaisesRegexp(OverflowError, msg):
             Timestamp('2000') + to_timedelta(106580, 'D')
 
+        _NaT = int(pd.NaT) + 1
         msg = "Overflow in int64 addition"
         with tm.assertRaisesRegexp(OverflowError, msg):
             to_timedelta([106580], 'D') + Timestamp('2000')
         with tm.assertRaisesRegexp(OverflowError, msg):
             Timestamp('2000') + to_timedelta([106580], 'D')
+        with tm.assertRaisesRegexp(OverflowError, msg):
+            to_timedelta([_NaT]) - Timedelta('1 days')
+        with tm.assertRaisesRegexp(OverflowError, msg):
+            to_timedelta(['5 days', _NaT]) - Timedelta('1 days')
+        with tm.assertRaisesRegexp(OverflowError, msg):
+            (to_timedelta([_NaT, '5 days', '1 hours']) -
+             to_timedelta(['7 seconds', _NaT, '4 hours']))
+
+        # These should not overflow!
+        exp = TimedeltaIndex([pd.NaT])
+        result = to_timedelta([pd.NaT]) - Timedelta('1 days')
+        tm.assert_index_equal(result, exp)
+
+        exp = TimedeltaIndex(['4 days', pd.NaT])
+        result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days')
+        tm.assert_index_equal(result, exp)
+
+        exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours'])
+        result = (to_timedelta([pd.NaT, '5 days', '1 hours']) +
+                  to_timedelta(['7 seconds', pd.NaT, '4 hours']))
+        tm.assert_index_equal(result, exp)
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],