pandas-dev · shoyer · May 30, 2015 · May 16, 2015 · jreback · May 29, 2015
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -63,6 +63,7 @@ Bug Fixes
 - Bug in ``Categorical`` repr with ``display.width`` of ``None`` in Python 3 (:issue:`10087`)
 
 
+- Bug in ``mean()`` where integer dtypes can overflow (:issue:`10172`)
 - Bug where Panel.from_dict does not set dtype when specified (:issue:`10058`)
 - Bug in ``Timestamp``'s' ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. (:issue:`10050`)
 - Bug in ``NaT`` raises ``AttributeError`` when accessing to ``daysinmonth``, ``dayofweek`` properties. (:issue:`10096`)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -20,7 +20,7 @@
                                 is_complex_dtype, is_integer_dtype,
                                 is_bool_dtype, is_object_dtype,
                                 is_datetime64_dtype, is_timedelta64_dtype,
-                                is_datetime_or_timedelta_dtype,
+                                is_datetime_or_timedelta_dtype, _get_dtype,
                                 is_int_or_datetime_dtype, is_any_int_dtype)
 
 
@@ -254,8 +254,16 @@ def nansum(values, axis=None, skipna=True):
 @bottleneck_switch()
 def nanmean(values, axis=None, skipna=True):
     values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
-    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_max))
-    count = _get_counts(mask, axis)
+
+    dtype_sum = dtype_max
+    dtype_count = np.float64
+    if is_integer_dtype(dtype):
+        dtype_sum = np.float64
+    elif is_float_dtype(dtype):
+        dtype_sum = dtype
+        dtype_count = dtype
+    count = _get_counts(mask, axis, dtype=dtype_count)
+    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
 
     if axis is not None and getattr(the_sum, 'ndim', False):
         the_mean = the_sum / count
@@ -557,15 +565,16 @@ def _maybe_arg_null_out(result, axis, mask, skipna):
     return result
 
 
-def _get_counts(mask, axis):
+def _get_counts(mask, axis, dtype=float):
+    dtype = _get_dtype(dtype)
     if axis is None:
-        return float(mask.size - mask.sum())
+        return dtype.type(mask.size - mask.sum())
 
     count = mask.shape[axis] - mask.sum(axis)
     try:
-        return count.astype(float)
+        return count.astype(dtype)
     except AttributeError:
-        return np.array(count, dtype=float)
+        return np.array(count, dtype=dtype)
 
 
 def _maybe_null_out(result, axis, mask):

diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
@@ -5,7 +5,7 @@
 
 import numpy as np
 
-from pandas.core.common import isnull
+from pandas.core.common import isnull, is_integer_dtype
 import pandas.core.nanops as nanops
 import pandas.util.testing as tm
 
@@ -323,6 +323,32 @@ def test_nanmean(self):
                         allow_complex=False, allow_obj=False,
                         allow_str=False, allow_date=False, allow_tdelta=True)
 
+    def test_nanmean_overflow(self):
+        # GH 10155
+        # In the previous implementation mean can overflow for int dtypes, it
+        # is now consistent with numpy
+        from pandas import Series
+
+        # numpy < 1.9.0 is not computing this correctly
+        from distutils.version import LooseVersion
+        if LooseVersion(np.__version__) >= '1.9.0':
+            for a in [2 ** 55, -2 ** 55, 20150515061816532]:
+                s = Series(a, index=range(500), dtype=np.int64)
+                result = s.mean()
+                np_result = s.values.mean()
+                self.assertEqual(result, a)
+                self.assertEqual(result, np_result)
+                self.assertTrue(result.dtype == np.float64)
+
+        # check returned dtype
+        for dtype in [np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]:
+            s = Series(range(10), dtype=dtype)
+            result = s.mean()
+            if is_integer_dtype(dtype):
+                self.assertTrue(result.dtype == np.float64)
+            else:
+                self.assertTrue(result.dtype == dtype)
+
     def test_nanmedian(self):
         self.check_funs(nanops.nanmedian, np.median,
                         allow_complex=False, allow_str=False, allow_date=False,