
[WIP] Prod/Sum of all-NA / all-empty #18871

Closed
wants to merge 4 commits
97 changes: 97 additions & 0 deletions doc/source/whatsnew/v0.22.0.txt
@@ -8,6 +8,103 @@ deprecations, new features, enhancements, and performance improvements along
with a large number of bug fixes. We recommend that all users upgrade to this
version.

.. _whatsnew_0220.na_sum:

Pandas 0.22.0 changes the handling of empty and all-NA sums and products. In
summary:

* The sum of an all-NA or empty series is now 0
* The product of an all-NA or empty series is now 1
* We've added an ``empty_is_na`` keyword to the ``sum`` and ``prod`` methods
to control whether the sum or product of an empty series should be NA. The
default is ``False``. To restore the 0.21 behavior, use
``empty_is_na=True``.

Some background: in pandas 0.21.1, we fixed a long-standing inconsistency in
the return value of the sum of an all-NA series, which depended on whether or
not bottleneck was installed. See
:ref:`whatsnew_0210.api_breaking.bottleneck`. At the same time, we changed the
sum and prod of an empty Series to also be ``NaN``.

Based on feedback, we've partially reverted those changes. The default sum
for all-NA and empty series is now 0 (1 for ``prod``). You can restore the
pandas 0.21.0 behavior of returning ``NaN`` with the ``empty_is_na`` keyword.

*pandas 0.21.0*

.. code-block:: ipython

In [1]: import pandas as pd

In [2]: import numpy as np

In [3]: pd.Series([]).sum()
Out[3]: nan

In [4]: pd.Series([np.nan]).sum()
Out[4]: nan

*pandas 0.22.0*

.. ipython:: python

pd.Series([]).sum()
pd.Series([np.nan]).sum()

To have the sum of an empty series return ``NaN``, use the ``empty_is_na``
keyword. Because ``skipna`` removes NAs before reducing, summing an all-NA
series is conceptually the same as summing an empty one; ``empty_is_na``
controls the return value once the NAs have been removed.

.. ipython:: python

pd.Series([]).sum(empty_is_na=True)
pd.Series([np.nan]).sum(empty_is_na=True)
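The rule can be sketched standalone in NumPy. This is an illustration of the
semantics only, not the pandas implementation, and the function name is
hypothetical:

```python
import numpy as np

def nansum_sketch(values, empty_is_na=False):
    # NAs contribute the additive identity, 0, when skipped
    mask = np.isnan(values)
    filled = np.where(mask, 0.0, values)
    if empty_is_na and mask.all():
        # empty after dropping NAs -> NaN
        return np.nan
    return float(filled.sum())

nansum_sketch(np.array([]))                          # 0.0
nansum_sketch(np.array([np.nan]), empty_is_na=True)  # nan
```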

Note that this affects some other places in the library:

1. Grouping by a Categorical with some unobserved categories

*pandas 0.21.0*

.. code-block:: ipython

In [3]: grouper = pd.Categorical(['a', 'a'], categories=['a', 'b'])

In [4]: pd.Series([1, 2]).groupby(grouper).sum()
Out[4]:
a 3.0
b NaN
dtype: float64

*pandas 0.22.0*

.. ipython:: python

grouper = pd.Categorical(['a', 'a'], categories=['a', 'b'])
pd.Series([1, 2]).groupby(grouper).sum()

2. Upsampling

*pandas 0.21.0*

.. code-block:: ipython

In [5]: idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02'])

In [6]: pd.Series([1, 2], index=idx).resample('12H').sum()
Out[6]:
2017-01-01 00:00:00 1.0
2017-01-01 12:00:00 NaN
2017-01-02 00:00:00 2.0
Freq: 12H, dtype: float64

*pandas 0.22.0*

.. ipython:: python

idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02'])
pd.Series([1, 2], index=idx).resample("12H").sum()
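``prod`` follows the same rule with 1 as the multiplicative identity. A
standalone NumPy sketch of the semantics (hypothetical name, not the pandas
internals):

```python
import numpy as np

def nanprod_sketch(values, empty_is_na=False):
    # NAs contribute the multiplicative identity, 1, when skipped
    mask = np.isnan(values)
    filled = np.where(mask, 1.0, values)
    if empty_is_na and mask.all():
        return np.nan
    return float(filled.prod())

nanprod_sketch(np.array([]))        # 1.0
nanprod_sketch(np.array([np.nan]))  # 1.0
```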

.. _whatsnew_0220.enhancements:

New features
4 changes: 2 additions & 2 deletions pandas/_libs/groupby_helper.pxi.in
@@ -89,7 +89,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = NAN
out[i, j] = 0
else:
out[i, j] = sumx[i, j]

@@ -148,7 +148,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
for i in range(ncounts):
for j in range(K):
if nobs[i, j] == 0:
out[i, j] = NAN
out[i, j] = 1
else:
out[i, j] = prodx[i, j]
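The two Cython hunks above apply the identity-element rule per bin: groups with
``nobs == 0`` now receive 0 (for ``add``) or 1 (for ``prod``) instead of
``NaN``. A rough pure-Python sketch of the same loop, with illustrative names:

```python
import numpy as np

def group_reduce_sketch(values, labels, n_groups, op="add"):
    unit = 0.0 if op == "add" else 1.0
    out = np.full(n_groups, unit)
    nobs = np.zeros(n_groups, dtype=int)
    for v, lab in zip(values, labels):
        if not np.isnan(v):
            nobs[lab] += 1
            out[lab] = out[lab] + v if op == "add" else out[lab] * v
    # bins with nobs == 0 keep the identity element instead of NaN
    return out

group_reduce_sketch(np.array([1.0, 2.0]), np.array([0, 0]), 2)  # array([3., 0.])
```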

60 changes: 46 additions & 14 deletions pandas/core/generic.py
@@ -7310,7 +7310,8 @@ def _add_numeric_operations(cls):
@Substitution(outname='mad',
desc="Return the mean absolute deviation of the values "
"for the requested axis",
name1=name, name2=name2, axis_descr=axis_descr)
name1=name, name2=name2, axis_descr=axis_descr,
empty_is_na='')
@Appender(_num_doc)
def mad(self, axis=None, skipna=None, level=None):
if skipna is None:
@@ -7351,7 +7352,7 @@ def mad(self, axis=None, skipna=None, level=None):
@Substitution(outname='compounded',
desc="Return the compound percentage of the values for "
"the requested axis", name1=name, name2=name2,
axis_descr=axis_descr)
axis_descr=axis_descr, empty_is_na='')
@Appender(_num_doc)
def compound(self, axis=None, skipna=None, level=None):
if skipna is None:
@@ -7375,10 +7376,11 @@ def compound(self, axis=None, skipna=None, level=None):
lambda y, axis: np.maximum.accumulate(y, axis), "max",
-np.inf, np.nan)

cls.sum = _make_stat_function(
cls.sum = _make_empty_stat_function(
cls, 'sum', name, name2, axis_descr,
'Return the sum of the values for the requested axis',
nanops.nansum)
nanops.nansum,
empty_is_na=False)
cls.mean = _make_stat_function(
cls, 'mean', name, name2, axis_descr,
'Return the mean of the values for the requested axis',
@@ -7394,10 +7396,11 @@ def compound(self, axis=None, skipna=None, level=None):
"by N-1\n",
nanops.nankurt)
cls.kurtosis = cls.kurt
cls.prod = _make_stat_function(
cls.prod = _make_empty_stat_function(
cls, 'prod', name, name2, axis_descr,
'Return the product of the values for the requested axis',
nanops.nanprod)
nanops.nanprod,
empty_is_na=False)
cls.product = cls.prod
cls.median = _make_stat_function(
cls, 'median', name, name2, axis_descr,
@@ -7520,14 +7523,14 @@ def _doc_parms(cls):
----------
axis : %(axis_descr)s
skipna : boolean, default True
Exclude NA/null values. If an entire row/column is NA or empty, the result
will be NA
Exclude NA/null values before computing the result.
level : int or level name, default None
If the axis is a MultiIndex (hierarchical), count along a
particular level, collapsing into a %(name1)s
numeric_only : boolean, default None
Include only float, int, boolean columns. If None, will attempt to use
everything, then use only numeric data. Not implemented for Series.
everything, then use only numeric data. Not implemented for
Series.%(empty_is_na)s

Returns
-------
@@ -7584,7 +7587,7 @@ def _doc_parms(cls):
axis : %(axis_descr)s
skipna : boolean, default True
Exclude NA/null values. If an entire row/column is NA, the result
will be NA
will be NA.

Returns
-------
@@ -7598,16 +7601,45 @@ def _doc_parms(cls):

"""

_empty_is_na_doc = """
empty_is_na : bool, default False
The result of operating on an empty array should be NA. The default
behavior is for the sum of an empty array to be 0, and the product
of an empty array to be 1.

When ``skipna=True``, "empty" refers to whether or not the array
is empty after removing NAs. So operating on an all-NA array with
``skipna=True`` will be NA when ``empty_is_na`` is True.
"""


def _make_empty_stat_function(cls, name, name1, name2, axis_descr, desc, f,
empty_is_na=False):
@Substitution(outname=name, desc=desc, name1=name1, name2=name2,
axis_descr=axis_descr, empty_is_na=_empty_is_na_doc)
@Appender(_num_doc)
def stat_func(self, axis=None, skipna=True, level=None, numeric_only=None,
empty_is_na=empty_is_na, **kwargs):
nv.validate_stat_func(tuple(), kwargs, fname=name)
if axis is None:
axis = self._stat_axis_number
if level is not None:
return self._agg_by_level(name, axis=axis, level=level,
skipna=skipna, empty_is_na=empty_is_na)
return self._reduce(f, name, axis=axis, skipna=skipna,
numeric_only=numeric_only,
empty_is_na=empty_is_na)

return set_function_name(stat_func, name, cls)
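``_make_empty_stat_function`` above is a factory that bakes the new keyword
into each generated method. A minimal standalone sketch of the pattern (all
names here are illustrative, not the pandas decorators or reducers):

```python
import math

def make_stat_function(name, reducer, empty_is_na_default=False):
    # Generate a method that forwards empty_is_na to the reducer
    def stat_func(self, empty_is_na=empty_is_na_default):
        return reducer(self.values, empty_is_na=empty_is_na)
    stat_func.__name__ = name
    return stat_func

def nansum_reducer(values, empty_is_na=False):
    vals = [v for v in values if not math.isnan(v)]
    if not vals and empty_is_na:
        return math.nan
    return sum(vals)

class Box:
    def __init__(self, values):
        self.values = values

Box.sum = make_stat_function("sum", nansum_reducer)
Box([float("nan")]).sum()  # 0
```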


def _make_stat_function(cls, name, name1, name2, axis_descr, desc, f):
@Substitution(outname=name, desc=desc, name1=name1, name2=name2,
axis_descr=axis_descr)
axis_descr=axis_descr, empty_is_na='')
@Appender(_num_doc)
def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
def stat_func(self, axis=None, skipna=True, level=None, numeric_only=None,
**kwargs):
nv.validate_stat_func(tuple(), kwargs, fname=name)
if skipna is None:
skipna = True
if axis is None:
axis = self._stat_axis_number
if level is not None:
28 changes: 19 additions & 9 deletions pandas/core/nanops.py
@@ -107,7 +107,8 @@ def f(values, axis=None, skipna=True, **kwds):
if k not in kwds:
kwds[k] = v
try:
if values.size == 0:
# TODO: NaT
if values.size == 0 and kwds.get('empty_is_na'):

# we either return np.nan or pd.NaT
if is_numeric_dtype(values):
@@ -155,6 +156,7 @@ def _bn_ok_dtype(dt, name):
# Bottleneck chokes on datetime64
if (not is_object_dtype(dt) and not is_datetime_or_timedelta_dtype(dt)):

# TODO: handle this overflow
# GH 15507
# bottleneck does not properly upcast during the sum
# so can overflow
@@ -163,6 +165,9 @@
# further we also want to preserve NaN when all elements
# are NaN, unlike bottleneck/numpy which consider this
# to be 0

# https://github.com/kwgoodman/bottleneck/issues/180
# No upcast for boolean -> int
if name in ['nansum', 'nanprod']:
return False

@@ -303,22 +308,21 @@ def nanall(values, axis=None, skipna=True):


@disallow('M8')
@bottleneck_switch()
def nansum(values, axis=None, skipna=True):
@bottleneck_switch(empty_is_na=False)
def nansum(values, axis=None, skipna=True, empty_is_na=False):
values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
dtype_sum = dtype_max
if is_float_dtype(dtype):
dtype_sum = dtype
elif is_timedelta64_dtype(dtype):
dtype_sum = np.float64
the_sum = values.sum(axis, dtype=dtype_sum)
the_sum = _maybe_null_out(the_sum, axis, mask)
the_sum = _maybe_null_out(the_sum, axis, mask, empty_is_na)

return _wrap_results(the_sum, dtype)


@disallow('M8')
@bottleneck_switch()
def nanmean(values, axis=None, skipna=True):
values, mask, dtype, dtype_max = _get_values(values, skipna, 0)

@@ -641,13 +645,15 @@ def nankurt(values, axis=None, skipna=True):


@disallow('M8', 'm8')
def nanprod(values, axis=None, skipna=True):
@bottleneck_switch(empty_is_na=False)
def nanprod(values, axis=None, skipna=True, empty_is_na=False):
mask = isna(values)
if skipna and not is_any_int_dtype(values):
values = values.copy()
values[mask] = 1
result = values.prod(axis)
return _maybe_null_out(result, axis, mask)

return _maybe_null_out(result, axis, mask, empty_is_na, unit=1.0)


def _maybe_arg_null_out(result, axis, mask, skipna):
Expand Down Expand Up @@ -683,9 +689,13 @@ def _get_counts(mask, axis, dtype=float):
return np.array(count, dtype=dtype)


def _maybe_null_out(result, axis, mask):
def _maybe_null_out(result, axis, mask, empty_is_na=True, unit=0.0):
if axis is not None and getattr(result, 'ndim', False):
null_mask = (mask.shape[axis] - mask.sum(axis)) == 0

if not empty_is_na:
null_mask[result == unit] = False

if np.any(null_mask):
if is_numeric_dtype(result):
if np.iscomplexobj(result):
@@ -698,7 +708,7 @@ def _maybe_null_out(result, axis, mask):
result[null_mask] = None
elif result is not tslib.NaT:
null_mask = mask.size - mask.sum()
if null_mask == 0:
if null_mask == 0 and empty_is_na:
result = np.nan

return result
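A 2-D sketch of the ``_maybe_null_out`` rule: all-NA slices would previously
be nulled out unconditionally; with ``empty_is_na=False``, slices whose result
already equals the identity element (``unit``) are left alone. Names here are
illustrative, not the pandas internals:

```python
import numpy as np

def maybe_null_out_sketch(result, mask, axis, empty_is_na, unit=0.0):
    result = result.astype(float)  # astype copies, so the input is untouched
    # a slice is "empty" when every element along axis is NA
    null_mask = (mask.shape[axis] - mask.sum(axis)) == 0
    if not empty_is_na:
        # keep slices whose result is already the identity element
        null_mask[result == unit] = False
    result[null_mask] = np.nan
    return result

values = np.array([[1.0, np.nan], [2.0, np.nan]])
mask = np.isnan(values)
summed = np.where(mask, 0.0, values).sum(axis=0)
maybe_null_out_sketch(summed, mask, axis=0, empty_is_na=False)  # array([3., 0.])
```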