NOTE(review): this patch was reconstructed from a copy whose line breaks and
indentation had been collapsed. No token was changed. Blank lines were restored
so that every hunk matches its "@@ -a,b +c,d @@" line counts. Indentation of
context and added lines is inferred from Python syntax, and the position of the
two blank context lines in the v0.17.0.txt hunk is a guess -- verify against
the target tree before applying (or use `git apply --ignore-whitespace`).

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 1cff74d41f686..6f04b0358394f 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -63,6 +63,7 @@ Bug Fixes
 - Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)
 
 
+- Bug in ``mean()`` where integer dtypes can overflow (:issue:`10172`)
 - Bug where Panel.from_dict does not set dtype when specified (:issue:`10058`)
 - Bug in ``Timestamp``'s' ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. (:issue:`10050`)
 - Bug in ``NaT`` raises ``AttributeError`` when accessing to ``daysinmonth``, ``dayofweek`` properties. (:issue:`10096`)
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index e921a9d562bc1..0df160618b7c3 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -20,7 +20,7 @@
                                 is_complex_dtype, is_integer_dtype,
                                 is_bool_dtype, is_object_dtype,
                                 is_datetime64_dtype, is_timedelta64_dtype,
-                                is_datetime_or_timedelta_dtype,
+                                is_datetime_or_timedelta_dtype, _get_dtype,
                                 is_int_or_datetime_dtype, is_any_int_dtype)
 
 
@@ -254,8 +254,16 @@ def nansum(values, axis=None, skipna=True):
 @bottleneck_switch()
 def nanmean(values, axis=None, skipna=True):
     values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
-    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_max))
-    count = _get_counts(mask, axis)
+
+    dtype_sum = dtype_max
+    dtype_count = np.float64
+    if is_integer_dtype(dtype):
+        dtype_sum = np.float64
+    elif is_float_dtype(dtype):
+        dtype_sum = dtype
+        dtype_count = dtype
+    count = _get_counts(mask, axis, dtype=dtype_count)
+    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
 
     if axis is not None and getattr(the_sum, 'ndim', False):
         the_mean = the_sum / count
@@ -557,15 +565,16 @@ def _maybe_arg_null_out(result, axis, mask, skipna):
     return result
 
 
-def _get_counts(mask, axis):
+def _get_counts(mask, axis, dtype=float):
+    dtype = _get_dtype(dtype)
     if axis is None:
-        return float(mask.size - mask.sum())
+        return dtype.type(mask.size - mask.sum())
 
     count = mask.shape[axis] - mask.sum(axis)
     try:
-        return count.astype(float)
+        return count.astype(dtype)
     except AttributeError:
-        return np.array(count, dtype=float)
+        return np.array(count, dtype=dtype)
 
 
 def _maybe_null_out(result, axis, mask):
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index 2a605cba8a6c0..1adb8a5d9217c 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -5,7 +5,7 @@
 
 import numpy as np
 
-from pandas.core.common import isnull
+from pandas.core.common import isnull, is_integer_dtype
 import pandas.core.nanops as nanops
 import pandas.util.testing as tm
 
@@ -323,6 +323,32 @@ def test_nanmean(self):
                         allow_complex=False, allow_obj=False,
                         allow_str=False, allow_date=False, allow_tdelta=True)
 
+    def test_nanmean_overflow(self):
+        # GH 10155
+        # In the previous implementation mean can overflow for int dtypes, it
+        # is now consistent with numpy
+        from pandas import Series
+
+        # numpy < 1.9.0 is not computing this correctly
+        from distutils.version import LooseVersion
+        if LooseVersion(np.__version__) >= '1.9.0':
+            for a in [2 ** 55, -2 ** 55, 20150515061816532]:
+                s = Series(a, index=range(500), dtype=np.int64)
+                result = s.mean()
+                np_result = s.values.mean()
+                self.assertEqual(result, a)
+                self.assertEqual(result, np_result)
+                self.assertTrue(result.dtype == np.float64)
+
+        # check returned dtype
+        for dtype in [np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]:
+            s = Series(range(10), dtype=dtype)
+            result = s.mean()
+            if is_integer_dtype(dtype):
+                self.assertTrue(result.dtype == np.float64)
+            else:
+                self.assertTrue(result.dtype == dtype)
+
     def test_nanmedian(self):
         self.check_funs(nanops.nanmedian, np.median,
                         allow_complex=False, allow_str=False, allow_date=False,