Skip to content

Commit ba0704f

Browse files
committed
Merge pull request #10926 from jreback/timedelta_mean
BUG: Bug in incorrect computation of .mean() on timedelta64[ns] because of overflow #9442
2 parents f43746c + 1da4a6d commit ba0704f

File tree

4 files changed

+33
-4
lines changed

4 files changed

+33
-4
lines changed

doc/source/whatsnew/v0.17.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,7 @@ Performance Improvements
725725
Bug Fixes
726726
~~~~~~~~~
727727

728+
- Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`)
728729
- Bug in ``DataFrame.to_html(index=False)`` rendering an unnecessary ``name`` row (:issue:`10344`)
729730
- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
730731
- Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`)

pandas/core/common.py

+1
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def __str__(self):
6363
_int8_max = np.iinfo(np.int8).max
6464
_int16_max = np.iinfo(np.int16).max
6565
_int32_max = np.iinfo(np.int32).max
66+
_int64_max = np.iinfo(np.int64).max
6667

6768
# define abstract base classes to enable isinstance type checking on our
6869
# objects

pandas/core/nanops.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@
2121
is_bool_dtype, is_object_dtype,
2222
is_datetime64_dtype, is_timedelta64_dtype,
2323
is_datetime_or_timedelta_dtype, _get_dtype,
24-
is_int_or_datetime_dtype, is_any_int_dtype)
24+
is_int_or_datetime_dtype, is_any_int_dtype,
25+
_int64_max)
2526

2627

2728
class disallow(object):
@@ -145,7 +146,7 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
145146
else:
146147
if fill_value_typ == '+inf':
147148
# need the max int here
148-
return np.iinfo(np.int64).max
149+
return _int64_max
149150
else:
150151
return tslib.iNaT
151152

@@ -223,7 +224,12 @@ def _wrap_results(result, dtype):
223224
result = result.view(dtype)
224225
elif is_timedelta64_dtype(dtype):
225226
if not isinstance(result, np.ndarray):
226-
result = lib.Timedelta(result)
227+
228+
# raise if we have a timedelta64[ns] which is too large
229+
if np.fabs(result) > _int64_max:
230+
raise ValueError("overflow in timedelta operation")
231+
232+
result = lib.Timedelta(result, unit='ns')
227233
else:
228234
result = result.astype('i8').view(dtype)
229235

@@ -247,6 +253,8 @@ def nansum(values, axis=None, skipna=True):
247253
dtype_sum = dtype_max
248254
if is_float_dtype(dtype):
249255
dtype_sum = dtype
256+
elif is_timedelta64_dtype(dtype):
257+
dtype_sum = np.float64
250258
the_sum = values.sum(axis, dtype=dtype_sum)
251259
the_sum = _maybe_null_out(the_sum, axis, mask)
252260

@@ -260,7 +268,7 @@ def nanmean(values, axis=None, skipna=True):
260268

261269
dtype_sum = dtype_max
262270
dtype_count = np.float64
263-
if is_integer_dtype(dtype):
271+
if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype):
264272
dtype_sum = np.float64
265273
elif is_float_dtype(dtype):
266274
dtype_sum = dtype

pandas/tseries/tests/test_timedeltas.py

+19
Original file line numberDiff line numberDiff line change
@@ -686,6 +686,25 @@ def test_timedelta_ops(self):
686686
s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'), Timestamp('2015-02-15')])
687687
self.assertEqual(s.diff().median(), timedelta(days=6))
688688

689+
def test_overflow(self):
690+
# GH 9442
691+
s = Series(pd.date_range('20130101',periods=100000,freq='H'))
692+
s[0] += pd.Timedelta('1s 1ms')
693+
694+
# mean
695+
result = (s-s.min()).mean()
696+
expected = pd.Timedelta((pd.DatetimeIndex((s-s.min())).asi8/len(s)).sum())
697+
698+
# the computation is converted to float so might be some loss of precision
699+
self.assertTrue(np.allclose(result.value/1000, expected.value/1000))
700+
701+
# sum
702+
self.assertRaises(ValueError, lambda : (s-s.min()).sum())
703+
s1 = s[0:10000]
704+
self.assertRaises(ValueError, lambda : (s1-s1.min()).sum())
705+
s2 = s[0:1000]
706+
result = (s2-s2.min()).sum()
707+
689708
def test_timedelta_ops_scalar(self):
690709
# GH 6808
691710
base = pd.to_datetime('20130101 09:01:12.123456')

0 commit comments

Comments
 (0)