WIP: fix bug pandas-dev#9733, where stat functions returned a Python scalar for an empty Series

remiremi · remiremi · commit c84ed78cd2e5 · 2015-04-17T16:54:55.000+02:00
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -2452,6 +2452,11 @@ def is_integer_dtype(arr_or_dtype):
     return (issubclass(tipo, np.integer) and
             not issubclass(tipo, (np.datetime64, np.timedelta64)))
 
+def is_unsigned_integer_dtype(arr_or_dtype):
+    """Return True if arr_or_dtype resolves to an unsigned integer type."""
+    tipo = _get_dtype_type(arr_or_dtype)
+    return issubclass(tipo, np.unsignedinteger)  # M8/m8 types can never match
+
 
 def is_int_or_datetime_dtype(arr_or_dtype):
     tipo = _get_dtype_type(arr_or_dtype)
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
@@ -21,6 +21,7 @@
                                 is_float, is_integer, is_complex,
                                 is_float_dtype, is_floating_dtype,
                                 is_complex_dtype, is_integer_dtype,
+                                is_unsigned_integer_dtype,
                                 is_bool_dtype, is_object_dtype,
                                 is_datetime64_dtype, is_timedelta64_dtype,
                                 is_datetime_or_timedelta_dtype,
@@ -70,21 +71,7 @@ def f(values, axis=None, skipna=True, **kwds):
                     if k not in kwds:
                         kwds[k] = v
             try:
-                if self.zero_value is not None and values.size == 0:
-                    if values.ndim == 1:
-
-                        # wrap the 0's if needed
-                        if is_timedelta64_dtype(values):
-                            return lib.Timedelta(0)
-                        return values.dtype.type(0)
-                    else:
-                        result_shape = (values.shape[:axis] +
-                                        values.shape[axis + 1:])
-                        result = np.empty(result_shape)
-                        result.fill(0)
-                        return result
-
-                if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype,
+                if values.size != 0 and _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype,
                                                                bn_name):
                     result = bn_func(values, axis=axis, **kwds)
 
@@ -190,7 +177,10 @@ def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
     # return a platform independent precision dtype
     dtype_max = dtype
     if is_integer_dtype(dtype) or is_bool_dtype(dtype):
-        dtype_max = np.int64
+        if is_unsigned_integer_dtype(dtype):
+            dtype_max = np.uint64
+        else:
+            dtype_max = np.int64
     elif is_float_dtype(dtype):
         dtype_max = np.float64
 
@@ -244,10 +234,10 @@ def nanall(values, axis=None, skipna=True):
 
 
 @disallow('M8')
-@bottleneck_switch(zero_value=0)
+@bottleneck_switch()
 def nansum(values, axis=None, skipna=True):
     values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
-    the_sum = values.sum(axis, dtype=dtype_max)
+    the_sum = values.sum(axis)  # NOTE(review): dtype_max now unused; confirm
     the_sum = _maybe_null_out(the_sum, axis, mask)
 
     return _wrap_results(the_sum, dtype)
@@ -571,6 +561,8 @@ def _get_counts(mask, axis):
 
 
 def _maybe_null_out(result, axis, mask):
+    if mask.size == 0:
+        return result
     if axis is not None and getattr(result, 'ndim', False):
         null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
         if np.any(null_mask):
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -2592,12 +2592,6 @@ def testit():
                 self.assertRaisesRegexp(NotImplementedError, name, f,
                                         self.series, numeric_only=True)
 
-            # Test type of empty Series
-            s = Series()
-            self.assertEqual(s.dtype, s.sum().dtype)
-            s = Series(dtype=np.int64)
-            self.assertEqual(s.dtype, s.sum().dtype)
-
         testit()
 
         try:
@@ -3399,15 +3393,35 @@ def test_ops_consistency_on_empty(self):
         # GH 7869
         # consistency on empty
 
-        # float
-        result = Series(dtype=float).sum()
-        self.assertEqual(result,0)
+        # Test type of empty Series
 
-        result = Series(dtype=float).mean()
-        self.assertTrue(isnull(result))
+        ops = ['median', 'mean', 'sum', 'prod']
 
-        result = Series(dtype=float).median()
-        self.assertTrue(isnull(result))
+        # First test numpy types
+        for dtype in ['int8', 'int16', 'int32', 'int64', 'uint8', 'uint16', 'uint32', 'uint64', 'float16', 'float32',
+                      'float64', 'complex64', 'complex128']:
+            s = Series(dtype=dtype)
+            for op in ops:
+                result = getattr(s, op)()
+                np_type = getattr(np, dtype)
+                reference = getattr(np, op)(np_type([]))
+                if np.isnan(reference):
+                    self.assertTrue(np.isnan(result), msg="Expecting nan, got %s" % (str(result)))
+                else:
+                    self.assertEqual(result.dtype, reference.dtype, msg="Failed to %s on %s: returned type %s, expected %s" % (op, dtype, str(result.dtype), str(reference.dtype)))
+                    self.assertEqual(result, reference,
+                                     msg='Different result for empty %s with dtype=%s: expected %s but received %s' %
+                                         (op, dtype, str(reference), str(result)))
+
+        # Test str/unicode types
+        str_series = Series(dtype='str')
+        unicode_series = Series(dtype='unicode')
+        for op in ['median', 'mean', 'prod']:
+            # these reductions are expected to be NaN for an empty Series
+            self.assertTrue(np.isnan(getattr(str_series, op)()))
+            self.assertTrue(np.isnan(getattr(unicode_series, op)()))
+        self.assertEqual('', str_series.sum())
+        self.assertEqual('', unicode_series.sum())
 
         # timedelta64[ns]
         result = Series(dtype='m8[ns]').sum()