pandas-dev
diff --git a/‎doc/source/whatsnew/v0.22.0.txt
+102-4 b/‎doc/source/whatsnew/v0.22.0.txt
+102-4
diff --git a/‎pandas/_libs/groupby_helper.pxi.in
+2-2 b/‎pandas/_libs/groupby_helper.pxi.in
+2-2
diff --git a/‎pandas/_libs/window.pyx
+11 b/‎pandas/_libs/window.pyx
+11
diff --git a/‎pandas/core/generic.py
+17-17 b/‎pandas/core/generic.py
+17-17
diff --git a/‎pandas/core/nanops.py
+2-2 b/‎pandas/core/nanops.py
+2-2
diff --git a/‎pandas/tests/frame/test_analytics.py
+19-10 b/‎pandas/tests/frame/test_analytics.py
+19-10
diff --git a/‎pandas/tests/groupby/test_aggregate.py
+35-8 b/‎pandas/tests/groupby/test_aggregate.py
+35-8
@@ -3,12 +3,110 @@
 v0.22.0
 -------
 
-This is a major release from 0.21.1 and includes a number of API changes,
-deprecations, new features, enhancements, and performance improvements along
-with a large number of bug fixes. We recommend that all users upgrade to this
-version.
+This is a major release from 0.21.1 and includes a single, API-breaking change.
+We recommend that all users upgrade to this version after carefully reading the
+release note (singular!).
 
 .. _whatsnew_0220.api_breaking:
 
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pandas 0.22.0 changes the handling of empty and all-NA sums and products. The
+summary is that
+
+* The sum of an all-NA or empty series is now 0
+* The product of an all-NA or empty series is now 1
+* We've added a ``min_count`` parameter to ``.sum`` and ``.prod`` to control
+  the minimum number of valid values for the result to be valid. If fewer than
+  ``min_count`` valid values are present, the result is NA. The default is
+  ``0``. To restore the 0.21 behavior, use ``min_count=1``.
+
+Some background: In pandas 0.21.0, we fixed a long-standing inconsistency
+in the return value of all-NA series depending on whether or not bottleneck
+was installed. See :ref:`whatsnew_0210.api_breaking.bottleneck`. At the same
+time, we changed the sum and prod of an empty Series to also be ``NaN``.
+
+Based on feedback, we've partially reverted those changes. The default sum
+for all-NA and empty series is now 0 (1 for ``prod``).
+
+*pandas 0.21*
+
+.. code-block:: ipython
+
+   In [1]: import pandas as pd
+
+   In [2]: import numpy as np
+
+   In [3]: pd.Series([]).sum()
+   Out[3]: nan
+
+   In [4]: pd.Series([np.nan]).sum()
+   Out[4]: nan
+
+*pandas 0.22.0*
+
+.. ipython:: python
+
+   pd.Series([]).sum()
+   pd.Series([np.nan]).sum()
+
+To have the sum of an empty series return ``NaN``, use the ``min_count``
+keyword. Thanks to the ``skipna`` parameter, the ``.sum`` on an all-NA
+series is conceptually the same as on an empty series. The ``min_count``
+parameter refers to the minimum number of *valid* values required for a
+non-NA sum or product.
+
+.. ipython:: python
+
+   pd.Series([]).sum(min_count=1)
+   pd.Series([np.nan]).sum(min_count=1)
+
+Note that this affects some other places in the library:
+
+1. Grouping by a Categorical with some unobserved categories
+
+*pandas 0.21*
+
+.. code-block:: ipython
+
+   In [3]: grouper = pd.Categorical(['a', 'a'], categories=['a', 'b'])
+
+   In [4]: pd.Series([1, 2]).groupby(grouper).sum()
+   Out[4]:
+   a    3.0
+   b    NaN
+   dtype: float64
+
+*pandas 0.22*
+
+.. ipython:: python
+
+   grouper = pd.Categorical(['a', 'a'], categories=['a', 'b'])
+   pd.Series([1, 2]).groupby(grouper).sum()
+
+   pd.Series([1, 2]).groupby(grouper).sum(min_count=1)
+
+2. Upsampling
+
+*pandas 0.21.0*
+
+.. code-block:: ipython
+
+   In [5]: idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02'])
+
+   In [6]: pd.Series([1, 2], index=idx).resample('12H').sum()
+   Out[6]:
+   2017-01-01 00:00:00    1.0
+   2017-01-01 12:00:00    NaN
+   2017-01-02 00:00:00    2.0
+   Freq: 12H, dtype: float64
+
+*pandas 0.22.0*
+
+.. ipython:: python
+
+   idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02'])
+   pd.Series([1, 2], index=idx).resample("12H").sum()
+
+   pd.Series([1, 2], index=idx).resample("12H").sum(min_count=1)
@@ -37,7 +37,7 @@ def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                        ndarray[int64_t] counts,
                        ndarray[{{c_type}}, ndim=2] values,
                        ndarray[int64_t] labels,
-                       Py_ssize_t min_count=1):
+                       Py_ssize_t min_count=0):
     """
     Only aggregates on axis=0
     """
@@ -101,7 +101,7 @@ def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         ndarray[int64_t] counts,
                         ndarray[{{c_type}}, ndim=2] values,
                         ndarray[int64_t] labels,
-                        Py_ssize_t min_count=1):
+                        Py_ssize_t min_count=0):
     """
     Only aggregates on axis=0
     """
 
@@ -443,10 +443,17 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp,
         double val, prev_x, sum_x = 0
         int64_t s, e
         int64_t nobs = 0, i, j, N
+        int64_t minp2 = -1
         bint is_variable
         ndarray[int64_t] start, end
         ndarray[double_t] output
 
+    if minp == 0:
+        # in get_window_indexer, we ensure that minp >= 1. That's fine for
+        # all cases except nobs = 0 (all missing values) and minp=0. For
+        # any other minp, the sum will be NA. For minp=0, the sum will be 0.
+        # So we track that here and pass it later if needed.
+        minp2 = 0
     start, end, N, win, minp, is_variable = get_window_indexer(input, win,
                                                                minp, index,
                                                                closed)
@@ -483,6 +490,8 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp,
                     for j in range(end[i - 1], e):
                         add_sum(input[j], &nobs, &sum_x)
 
+                if minp2 == 0:
+                    minp = 0
                 output[i] = calc_sum(minp, nobs, sum_x)
 
     else:
@@ -503,6 +512,8 @@ def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp,
                     prev_x = input[i - win]
                     remove_sum(prev_x, &nobs, &sum_x)
 
+                if minp2 == 0:
+                    minp = 0
                 output[i] = calc_sum(minp, nobs, sum_x)
 
     return output
 
@@ -7619,48 +7619,48 @@ def _doc_parms(cls):
 _sum_examples = """\
 Examples
 --------
-By default, the sum of an empty series is ``NaN``.
+By default, the sum of an empty series is ``0``.
 
->>> pd.Series([]).sum()  # min_count=1 is the default
-nan
+>>> pd.Series([]).sum()  # min_count=0 is the default
+0.0
 
 This can be controlled with the ``min_count`` parameter. For example, if
-you'd like the sum of an empty series to be 0, pass ``min_count=0``.
+you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
 
->>> pd.Series([]).sum(min_count=0)
-0.0
+>>> pd.Series([]).sum(min_count=1)
+nan
 
 Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
 empty series identically.
 
 >>> pd.Series([np.nan]).sum()
-nan
-
->>> pd.Series([np.nan]).sum(min_count=0)
 0.0
+
+>>> pd.Series([np.nan]).sum(min_count=1)
+nan
 """
 
 _prod_examples = """\
 Examples
 --------
-By default, the product of an empty series is ``NaN``
+By default, the product of an empty series is ``1``
 
 >>> pd.Series([]).prod()
-nan
+1.0
 
 This can be controlled with the ``min_count`` parameter
 
->>> pd.Series([]).prod(min_count=0)
-1.0
+>>> pd.Series([]).prod(min_count=1)
+nan
 
 Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
 empty series identically.
 
 >>> pd.Series([np.nan]).prod()
-nan
-
->>> pd.Series([np.nan]).sum(min_count=0)
 1.0
+
+>>> pd.Series([np.nan]).prod(min_count=1)
+nan
 """
 
 
@@ -7683,7 +7683,7 @@ def _make_min_count_stat_function(cls, name, name1, name2, axis_descr, desc,
                   examples=examples)
     @Appender(_num_doc)
     def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None,
-                  min_count=1,
+                  min_count=0,
                   **kwargs):
         nv.validate_stat_func(tuple(), kwargs, fname=name)
         if skipna is None:
 
@@ -308,7 +308,7 @@ def nanall(values, axis=None, skipna=True):
 
 @disallow('M8')
 @bottleneck_switch()
-def nansum(values, axis=None, skipna=True, min_count=1):
+def nansum(values, axis=None, skipna=True, min_count=0):
     values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
     dtype_sum = dtype_max
     if is_float_dtype(dtype):
@@ -645,7 +645,7 @@ def nankurt(values, axis=None, skipna=True):
 
 
 @disallow('M8', 'm8')
-def nanprod(values, axis=None, skipna=True, min_count=1):
+def nanprod(values, axis=None, skipna=True, min_count=0):
     mask = isna(values)
     if skipna and not is_any_int_dtype(values):
         values = values.copy()
 
@@ -478,7 +478,8 @@ def test_nunique(self):
                                Series({0: 1, 1: 3, 2: 2}))
 
     def test_sum(self):
-        self._check_stat_op('sum', np.sum, has_numeric_only=True)
+        self._check_stat_op('sum', np.sum, has_numeric_only=True,
+                            skipna_alternative=np.nansum)
 
         # mixed types (with upcasting happening)
         self._check_stat_op('sum', np.sum,
@@ -753,7 +754,8 @@ def alt(x):
 
     def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
                        has_numeric_only=False, check_dtype=True,
-                       check_dates=False, check_less_precise=False):
+                       check_dates=False, check_less_precise=False,
+                       skipna_alternative=None):
         if frame is None:
             frame = self.frame
             # set some NAs
@@ -774,15 +776,19 @@ def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
             assert len(result)
 
         if has_skipna:
-            def skipna_wrapper(x):
-                nona = x.dropna()
-                if len(nona) == 0:
-                    return np.nan
-                return alternative(nona)
-
             def wrapper(x):
                 return alternative(x.values)
 
+            if skipna_alternative:
+                def skipna_wrapper(x):
+                    return skipna_alternative(x.values)
+            else:
+                def skipna_wrapper(x):
+                    nona = x.dropna()
+                    if len(nona) == 0:
+                        return np.nan
+                    return alternative(nona)
+
             result0 = f(axis=0, skipna=False)
             result1 = f(axis=1, skipna=False)
             tm.assert_series_equal(result0, frame.apply(wrapper),
@@ -834,8 +840,11 @@ def wrapper(x):
             r0 = getattr(all_na, name)(axis=0)
             r1 = getattr(all_na, name)(axis=1)
             if name in ['sum', 'prod']:
-                assert np.isnan(r0).all()
-                assert np.isnan(r1).all()
+                unit = int(name == 'prod')
+                expected = pd.Series(unit, index=r0.index, dtype=r0.dtype)
+                tm.assert_series_equal(r0, expected)
+                expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
+                tm.assert_series_equal(r1, expected)
 
     def test_mode(self):
         df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
 
@@ -813,8 +813,6 @@ def test__cython_agg_general(self):
         ('mean', np.mean),
         ('median', lambda x: np.median(x) if len(x) > 0 else np.nan),
         ('var', lambda x: np.var(x, ddof=1)),
-        ('add', lambda x: np.sum(x) if len(x) > 0 else np.nan),
-        ('prod', np.prod),
         ('min', np.min),
         ('max', np.max), ]
     )
@@ -824,19 +822,48 @@ def test_cython_agg_empty_buckets(self, op, targop):
 
         # calling _cython_agg_general directly, instead of via the user API
         # which sets different values for min_count, so do that here.
-        if op in ('add', 'prod'):
-            min_count = 1
-        else:
-            min_count = -1
-        result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(
-            op, min_count=min_count)
+        result = df.groupby(pd.cut(df[0], grps))._cython_agg_general(op)
         expected = df.groupby(pd.cut(df[0], grps)).agg(lambda x: targop(x))
         try:
             tm.assert_frame_equal(result, expected)
         except BaseException as exc:
             exc.args += ('operation: %s' % op,)
             raise
 
+    def test_cython_agg_empty_buckets_nanops(self):
+        # GH-18869 can't call nanops on empty groups, so hardcode expected
+        # for these
+        df = pd.DataFrame([11, 12, 13], columns=['a'])
+        grps = range(0, 25, 5)
+        # add / sum
+        result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('add')
+        intervals = pd.interval_range(0, 20, freq=5)
+        expected = pd.DataFrame(
+            {"a": [0, 0, 36, 0]},
+            index=pd.CategoricalIndex(intervals, name='a', ordered=True))
+        tm.assert_frame_equal(result, expected)
+
+        # prod
+        result = df.groupby(pd.cut(df['a'], grps))._cython_agg_general('prod')
+        expected = pd.DataFrame(
+            {"a": [1, 1, 1716, 1]},
+            index=pd.CategoricalIndex(intervals, name='a', ordered=True))
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.")
+    def test_agg_category_nansum(self):
+        categories = ['a', 'b', 'c']
+        df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
+                                               categories=categories),
+                           'B': [1, 2, 3]})
+        result = df.groupby("A").B.agg(np.nansum)
+        expected = pd.Series([3, 3, 0],
+                             index=pd.CategoricalIndex(['a', 'b', 'c'],
+                                                       categories=categories,
+                                                       name='A'),
+                             name='B')
+        tm.assert_series_equal(result, expected)
+
     def test_agg_over_numpy_arrays(self):
         # GH 3788
         df = pd.DataFrame([[1, np.array([10, 20, 30])],