Fixed bug pandas-dev#9733 where stat functions returned a Python scalar for an empty Series

remiremi · remiremi · commit 69ffca6ee7c5 · 2015-08-21T10:04:42.000+02:00
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -701,3 +701,5 @@ Bug Fixes
 - Bug in ``iloc`` allowing memory outside bounds of a Series to be accessed with negative integers (:issue:`10779`)
 - Bug in ``read_msgpack`` where encoding is not respected (:issue:`10580`)
 - Bug preventing access to the first index when using ``iloc`` with a list containing the appropriate negative integer (:issue:`10547`, :issue:`10779`)
+
+- Bug in stat functions (``sum``, ``mean``, etc.) returning a Python scalar for an empty ``Series`` (:issue:`9733`)
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -2484,11 +2484,18 @@ def is_integer_dtype(arr_or_dtype):
     return (issubclass(tipo, np.integer) and
             not issubclass(tipo, (np.datetime64, np.timedelta64)))
 
+
 def is_int64_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
     return issubclass(tipo, np.int64)
 
 
+def is_unsigned_integer_dtype(arr_or_dtype):
+    tipo = _get_dtype_type(arr_or_dtype)
+    return (issubclass(tipo, np.unsignedinteger) and
+            not issubclass(tipo, (np.datetime64, np.timedelta64)))
+
+
 def is_int_or_datetime_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
     return (issubclass(tipo, np.integer) or
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -18,6 +18,7 @@
                                 is_float, is_integer, is_complex,
                                 is_float_dtype,
                                 is_complex_dtype, is_integer_dtype,
+                                is_unsigned_integer_dtype,
                                 is_bool_dtype, is_object_dtype,
                                 is_datetime64_dtype, is_timedelta64_dtype,
                                 is_datetime_or_timedelta_dtype, _get_dtype,
@@ -67,21 +68,7 @@ def f(values, axis=None, skipna=True, **kwds):
                     if k not in kwds:
                         kwds[k] = v
             try:
-                if self.zero_value is not None and values.size == 0:
-                    if values.ndim == 1:
-
-                        # wrap the 0's if needed
-                        if is_timedelta64_dtype(values):
-                            return lib.Timedelta(0)
-                        return 0
-                    else:
-                        result_shape = (values.shape[:axis] +
-                                        values.shape[axis + 1:])
-                        result = np.empty(result_shape)
-                        result.fill(0)
-                        return result
-
-                if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype,
+                if values.size != 0 and _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype,
                                                                bn_name):
                     result = bn_func(values, axis=axis, **kwds)
 
@@ -187,7 +174,10 @@ def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
     # return a platform independent precision dtype
     dtype_max = dtype
     if is_integer_dtype(dtype) or is_bool_dtype(dtype):
-        dtype_max = np.int64
+        if is_unsigned_integer_dtype(dtype):
+            dtype_max = np.uint64
+        else:
+            dtype_max = np.int64
     elif is_float_dtype(dtype):
         dtype_max = np.float64
 
@@ -241,14 +231,14 @@ def nanall(values, axis=None, skipna=True):
 
 
 @disallow('M8')
-@bottleneck_switch(zero_value=0)
+@bottleneck_switch()
 def nansum(values, axis=None, skipna=True):
     values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
     dtype_sum = dtype_max
     if is_float_dtype(dtype):
         dtype_sum = dtype
     the_sum = values.sum(axis, dtype=dtype_sum)
-    the_sum = _maybe_null_out(the_sum, axis, mask)
+    the_sum = _maybe_null_out(the_sum, axis, mask, False)
 
     return _wrap_results(the_sum, dtype)
 
@@ -414,7 +404,7 @@ def nanmin(values, axis=None, skipna=True):
             result = values.min(axis)
 
     result = _wrap_results(result, dtype)
-    return _maybe_null_out(result, axis, mask)
+    return _maybe_null_out(result, axis, mask, True)
 
 
 @bottleneck_switch()
@@ -445,7 +435,7 @@ def nanmax(values, axis=None, skipna=True):
             result = values.max(axis)
 
     result = _wrap_results(result, dtype)
-    return _maybe_null_out(result, axis, mask)
+    return _maybe_null_out(result, axis, mask, True)
 
 
 def nanargmax(values, axis=None, skipna=True):
@@ -554,7 +544,7 @@ def nanprod(values, axis=None, skipna=True):
         values = values.copy()
         values[mask] = 1
     result = values.prod(axis)
-    return _maybe_null_out(result, axis, mask)
+    return _maybe_null_out(result, axis, mask, False)
 
 
 def _maybe_arg_null_out(result, axis, mask, skipna):
@@ -588,9 +578,11 @@ def _get_counts(mask, axis, dtype=float):
         return np.array(count, dtype=dtype)
 
 
-def _maybe_null_out(result, axis, mask):
+def _maybe_null_out(result, axis, mask, null_on_empty):
     if axis is not None and getattr(result, 'ndim', False):
         null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
+        if not null_on_empty:
+            null_mask = null_mask & (mask.shape[axis] > 0)
         if np.any(null_mask):
             if np.iscomplexobj(result):
                 result = result.astype('c16')
@@ -599,9 +591,8 @@ def _maybe_null_out(result, axis, mask):
             result[null_mask] = np.nan
     else:
         null_mask = mask.size - mask.sum()
-        if null_mask == 0:
-            result = np.nan
-
+        if null_mask == 0 and (mask.size > 0 or null_on_empty):
+            return np.nan
     return result
 
 
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -3616,15 +3616,44 @@ def test_ops_consistency_on_empty(self):
         # GH 7869
         # consistency on empty
 
-        # float
-        result = Series(dtype=float).sum()
-        self.assertEqual(result,0)
+        # Test type of empty Series
 
-        result = Series(dtype=float).mean()
-        self.assertTrue(isnull(result))
+        ops = ['median', 'mean', 'sum', 'prod']
 
-        result = Series(dtype=float).median()
-        self.assertTrue(isnull(result))
+        # First test numpy types
+        # Just make sure that numpy and pandas have the same return type
+        for dtype in ['int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', 'float16', 'float32',
+                      'float64', 'complex64', 'complex128']:
+            s = Series(dtype=dtype)
+            for op in ops:
+                result = getattr(s, op)()
+                np_type = getattr(np, dtype)
+                reference = getattr(np, op)(np_type([]))
+                if np.isnan(reference):
+                    self.assertTrue(np.isnan(result),
+                                    msg="%s on empty %s Series: expecting nan, got %s" % (op, dtype, str(result)))
+                else:
+                    self.assertEqual(result.dtype, reference.dtype,
+                                     msg="%s on empty %s Series: returned type %s, expected %s" %
+                                         (op, dtype, str(result.dtype), str(reference.dtype)))
+                    self.assertEqual(result, reference,
+                                     msg='%s on empty %s Series: expected %s but received %s' %
+                                         (op, dtype, str(reference), str(result)))
+
+        # Test str/unicode types
+        str_series = Series(dtype='str')
+        unicode_series = Series(dtype='unicode')
+        for op in ['median', 'mean', 'prod']:
+            # TODO: these operations should raise type errors
+            # self.assertRaises(TypeError, getattr(str_series, op)(),
+            #                 msg="%s on empty str Series should raise TypeError" % op)
+            # self.assertRaises(TypeError, getattr(unicode_series, op)(),
+            #                 msg="%s on empty unicode Series should raise TypeError" % op)
+            pass
+
+        # TODO: these operations should return empty strings
+        # self.assertEqual('', str_series.sum())
+        # self.assertEqual('', unicode_series.sum())
 
         # timedelta64[ns]
         result = Series(dtype='m8[ns]').sum()