BUG: Catch overflow in both directions for checked add

gfyoung · gfyoung · commit 85ca49d5eb22 · 2016-10-16T15:30:43.000-04:00
1) Add checks to ensure that add overflow does not
   occur both in the positive or negative directions.
2) Add benchmarks to ensure that operations involving
   this checked add function are significantly impacted.
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
@@ -15,6 +15,14 @@ def setup(self):
         self.int = pd.Int64Index(np.arange(N).repeat(5))
         self.float = pd.Float64Index(np.random.randn(N).repeat(5))
 
+        # Convenience naming.
+        self.checked_add = pd.core.nanops._checked_add_with_arr
+
+        self.arr = np.arange(1000000)
+        self.arrpos = np.arange(1000000)
+        self.arrneg = np.arange(-1000000, 0)
+        self.arrmixed = np.array([1, -1]).repeat(500000)
+
     def time_int_factorize(self):
         self.int.factorize()
 
@@ -29,3 +37,21 @@ def time_int_duplicated(self):
 
     def time_float_duplicated(self):
         self.float.duplicated()
+
+    def time_add_overflow_pos_scalar(self):
+        self.checked_add(self.arr, 1)
+
+    def time_add_overflow_neg_scalar(self):
+        self.checked_add(self.arr, -1)
+
+    def time_add_overflow_zero_scalar(self):
+        self.checked_add(self.arr, 0)
+
+    def time_add_overflow_pos_arr(self):
+        self.checked_add(self.arr, self.arrpos)
+
+    def time_add_overflow_neg_arr(self):
+        self.checked_add(self.arr, self.arrneg)
+
+    def time_add_overflow_mixed_arr(self):
+        self.checked_add(self.arr, self.arrmixed)
diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py
@@ -1,5 +1,5 @@
 from .pandas_vb_common import *
-from pandas import to_timedelta
+from pandas import to_timedelta, Timestamp
 
 
 class timedelta_convert_int(object):
@@ -47,3 +47,14 @@ def time_timedelta_convert_coerce(self):
 
     def time_timedelta_convert_ignore(self):
         to_timedelta(self.arr, errors='ignore')
+
+
+class timedelta_add_overflow(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.td = to_timedelta(np.arange(1000000))
+        self.ts = Timestamp('2000')
+
+    def test_add_td_ts(self):
+        self.td + self.ts
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -11,6 +11,7 @@
 
 import pandas.hashtable as _hash
 from pandas import compat, lib, algos, tslib
+from pandas.compat.numpy import _np_version_under1p10
 from pandas.types.common import (_ensure_int64, _ensure_object,
                                  _ensure_float64, _get_dtype,
                                  is_float, is_scalar,
@@ -829,9 +830,37 @@ def _checked_add_with_arr(arr, b):
 
     Raises
     ------
-    OverflowError if any x + y exceeds the maximum int64 value.
+    OverflowError if any x + y exceeds the maximum or minimum int64 value.
     """
-    if (np.iinfo(np.int64).max - b < arr).any():
-        raise OverflowError("Python int too large to "
-                            "convert to C long")
+    # For performance reasons, we broadcast 'b' to the new array 'b2'
+    # so that it has the same size as 'arr'.
+    if _np_version_under1p10:
+        if lib.isscalar(b):
+            b2 = np.empty(arr.shape)
+            b2.fill(b)
+        else:
+            b2 = b
+    else:
+        b2 = np.broadcast_to(b, arr.shape)
+
+    # gh-14324: For each element in 'arr' and its corresponding element
+    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
+    # we then check whether its sum with the element in 'arr' exceeds
+    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
+    # it is negative, we then check whether its sum with the element in
+    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
+    # error as well.
+    mask1 = b2 > 0
+    mask2 = b2 < 0
+
+    if not mask1.any():
+        to_raise = (np.iinfo(np.int64).min - b2 > arr).any()
+    elif not mask2.any():
+        to_raise = (np.iinfo(np.int64).max - b2 < arr).any()
+    else:
+        to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or
+                    (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any())
+
+    if to_raise:
+        raise OverflowError("Overflow in int64 addition")
     return arr + b
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
@@ -1004,13 +1004,20 @@ def prng(self):
 
 def test_int64_add_overflow():
     # see gh-14068
-    msg = "too (big|large) to convert"
+    msg = "Overflow in int64 addition"
     m = np.iinfo(np.int64).max
+    n = np.iinfo(np.int64).min
 
     with tm.assertRaisesRegexp(OverflowError, msg):
         nanops._checked_add_with_arr(np.array([m, m]), m)
     with tm.assertRaisesRegexp(OverflowError, msg):
         nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m]))
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        nanops._checked_add_with_arr(np.array([n, n]), n)
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n]))
+    with tm.assertRaisesRegexp(OverflowError, msg):
+        nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n]))
     with tm.assertRaisesRegexp(OverflowError, msg):
         with tm.assert_produces_warning(RuntimeWarning):
             nanops._checked_add_with_arr(np.array([m, m]),
diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py
@@ -1957,6 +1957,8 @@ def test_add_overflow(self):
             to_timedelta(106580, 'D') + Timestamp('2000')
         with tm.assertRaisesRegexp(OverflowError, msg):
             Timestamp('2000') + to_timedelta(106580, 'D')
+
+        msg = "Overflow in int64 addition"
         with tm.assertRaisesRegexp(OverflowError, msg):
             to_timedelta([106580], 'D') + Timestamp('2000')
         with tm.assertRaisesRegexp(OverflowError, msg):