Skip to content

BUG: Bug in incorrect computation of .mean() on timedelta64[ns] because of overflow #9442 #10926

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 29, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,7 @@ Performance Improvements
Bug Fixes
~~~~~~~~~

- Bug in incorrect computation of ``.mean()`` on ``timedelta64[ns]`` because of overflow (:issue:`9442`)
- Bug in ``DataFrame.to_html(index=False)`` renders unnecessary ``name`` row (:issue:`10344`)
- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)
- Bug in ``to_datetime`` with invalid dates and formats supplied (:issue:`10154`)
Expand Down
1 change: 1 addition & 0 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def __str__(self):
_int8_max = np.iinfo(np.int8).max
_int16_max = np.iinfo(np.int16).max
_int32_max = np.iinfo(np.int32).max
_int64_max = np.iinfo(np.int64).max

# define abstract base classes to enable isinstance type checking on our
# objects
Expand Down
16 changes: 12 additions & 4 deletions pandas/core/nanops.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
is_bool_dtype, is_object_dtype,
is_datetime64_dtype, is_timedelta64_dtype,
is_datetime_or_timedelta_dtype, _get_dtype,
is_int_or_datetime_dtype, is_any_int_dtype)
is_int_or_datetime_dtype, is_any_int_dtype,
_int64_max)


class disallow(object):
Expand Down Expand Up @@ -145,7 +146,7 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
else:
if fill_value_typ == '+inf':
# need the max int here
return np.iinfo(np.int64).max
return _int64_max
else:
return tslib.iNaT

Expand Down Expand Up @@ -223,7 +224,12 @@ def _wrap_results(result, dtype):
result = result.view(dtype)
elif is_timedelta64_dtype(dtype):
if not isinstance(result, np.ndarray):
result = lib.Timedelta(result)

# raise if we have a timedelta64[ns] which is too large
if np.fabs(result) > _int64_max:
raise ValueError("overflow in timedelta operation")

result = lib.Timedelta(result, unit='ns')
else:
result = result.astype('i8').view(dtype)

Expand All @@ -247,6 +253,8 @@ def nansum(values, axis=None, skipna=True):
dtype_sum = dtype_max
if is_float_dtype(dtype):
dtype_sum = dtype
elif is_timedelta64_dtype(dtype):
dtype_sum = np.float64
the_sum = values.sum(axis, dtype=dtype_sum)
the_sum = _maybe_null_out(the_sum, axis, mask)

Expand All @@ -260,7 +268,7 @@ def nanmean(values, axis=None, skipna=True):

dtype_sum = dtype_max
dtype_count = np.float64
if is_integer_dtype(dtype):
if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype):
dtype_sum = np.float64
elif is_float_dtype(dtype):
dtype_sum = dtype
Expand Down
19 changes: 19 additions & 0 deletions pandas/tseries/tests/test_timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,25 @@ def test_timedelta_ops(self):
s = Series([Timestamp('2015-02-03'), Timestamp('2015-02-07'), Timestamp('2015-02-15')])
self.assertEqual(s.diff().median(), timedelta(days=6))

def test_overflow(self):
# Regression test for overflow in timedelta64[ns] reductions: .mean() should
# return a usable value on a long series, and .sum() should raise ValueError
# rather than return a wrapped-around result.
# GH 9442
# 100000 hourly timestamps; the first one is nudged by 1s 1ms so the
# differences computed below are not all whole multiples of an hour.
s = Series(pd.date_range('20130101',periods=100000,freq='H'))
s[0] += pd.Timedelta('1s 1ms')

# mean
result = (s-s.min()).mean()
# Reference value: divide each int64 nanosecond value by the length *before*
# summing, so the running total stays small.
expected = pd.Timedelta((pd.DatetimeIndex((s-s.min())).asi8/len(s)).sum())

# the computation is converted to float so might be some loss of precision
# (both sides are scaled down by 1000 and compared with np.allclose rather
# than with exact equality)
self.assertTrue(np.allclose(result.value/1000, expected.value/1000))

# sum
# Summing the full series, and its first 10000 elements, is expected to
# raise ValueError.
self.assertRaises(ValueError, lambda : (s-s.min()).sum())
s1 = s[0:10000]
self.assertRaises(ValueError, lambda : (s1-s1.min()).sum())
# The first 1000 elements are expected to sum without raising.
# NOTE(review): `result` is not asserted against anything here, so this only
# checks that no exception is raised -- consider asserting the value.
s2 = s[0:1000]
result = (s2-s2.min()).sum()

def test_timedelta_ops_scalar(self):
# GH 6808
base = pd.to_datetime('20130101 09:01:12.123456')
Expand Down