From 1da4a6d98b0f679b639af388c9a25222887c68d4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Fri, 28 Aug 2015 21:40:15 -0400 Subject: [PATCH] BUG: Bug in incorrection computation of .mean() on timedelta64[ns] because of overflow #9442 --- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/core/common.py | 1 + pandas/core/nanops.py | 16 ++++++++++++---- pandas/tseries/tests/test_timedeltas.py | 19 +++++++++++++++++++ 4 files changed, 33 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 1607d81543946..3e81a923a114c 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -725,6 +725,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`) - Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`) - Bug in ``DataFrame.apply`` when function returns categorical series. 
(:issue:`9573`) - Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`) diff --git a/pandas/core/common.py b/pandas/core/common.py index 245535e47abd8..72ea6d14456b0 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -63,6 +63,7 @@ def __str__(self): _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max _int32_max = np.iinfo(np.int32).max +_int64_max = np.iinfo(np.int64).max # define abstract base classes to enable isinstance type checking on our # objects diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index c70fb6339517d..447a273a1e171 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -21,7 +21,8 @@ is_bool_dtype, is_object_dtype, is_datetime64_dtype, is_timedelta64_dtype, is_datetime_or_timedelta_dtype, _get_dtype, - is_int_or_datetime_dtype, is_any_int_dtype) + is_int_or_datetime_dtype, is_any_int_dtype, + _int64_max) class disallow(object): @@ -145,7 +146,7 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): else: if fill_value_typ == '+inf': # need the max int here - return np.iinfo(np.int64).max + return _int64_max else: return tslib.iNaT @@ -223,7 +224,12 @@ def _wrap_results(result, dtype): result = result.view(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): - result = lib.Timedelta(result) + + # raise if we have a timedelta64[ns] which is too large + if np.fabs(result) > _int64_max: + raise ValueError("overflow in timedelta operation") + + result = lib.Timedelta(result, unit='ns') else: result = result.astype('i8').view(dtype) @@ -247,6 +253,8 @@ def nansum(values, axis=None, skipna=True): dtype_sum = dtype_max if is_float_dtype(dtype): dtype_sum = dtype + elif is_timedelta64_dtype(dtype): + dtype_sum = np.float64 the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask) @@ -260,7 +268,7 @@ def nanmean(values, axis=None, skipna=True): dtype_sum = dtype_max dtype_count = np.float64 - if 
is_integer_dtype(dtype): + if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 753e76fd1faea..4870fbd55f33e 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -686,6 +686,25 @@ def test_timedelta_ops(self): s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'), Timestamp('2015-02-15')]) self.assertEqual(s.diff().median(), timedelta(days=6)) + def test_overflow(self): + # GH 9442 + s = Series(pd.date_range('20130101',periods=100000,freq='H')) + s[0] += pd.Timedelta('1s 1ms') + + # mean + result = (s-s.min()).mean() + expected = pd.Timedelta((pd.DatetimeIndex((s-s.min())).asi8/len(s)).sum()) + + # the computation is converted to float so might be some loss of precision + self.assertTrue(np.allclose(result.value/1000, expected.value/1000)) + + # sum + self.assertRaises(ValueError, lambda : (s-s.min()).sum()) + s1 = s[0:10000] + self.assertRaises(ValueError, lambda : (s1-s1.min()).sum()) + s2 = s[0:1000] + result = (s2-s2.min()).sum() + def test_timedelta_ops_scalar(self): # GH 6808 base = pd.to_datetime('20130101 09:01:12.123456')