pandas-dev#31422: GroupBy.sum() now returns 0 (instead of NaN) for missing categories when grouping by multiple Categoricals. Tests are updated to reflect this expected output.

smithto1 · smithto1 · commit 4de1d6ba23c3 · 2020-07-09T00:19:24.000+01:00
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -363,7 +363,9 @@ def _wrap_series_output(
         return result
 
     def _wrap_aggregated_output(
-        self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
+        self,
+        output: Mapping[base.OutputKey, Union[Series, np.ndarray]],
+        fill_value: Scalar = np.NaN,
     ) -> Union[Series, DataFrame]:
         """
         Wraps the output of a SeriesGroupBy aggregation into the expected result.
@@ -385,7 +387,7 @@ def _wrap_aggregated_output(
         result = self._wrap_series_output(
             output=output, index=self.grouper.result_index
         )
-        return self._reindex_output(result)
+        return self._reindex_output(result, fill_value)
 
     def _wrap_transformed_output(
         self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]]
@@ -415,7 +417,11 @@ def _wrap_transformed_output(
         return result
 
     def _wrap_applied_output(
-        self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False
+        self,
+        keys: Index,
+        values: Optional[List[Any]],
+        not_indexed_same: bool = False,
+        fill_value: Scalar = np.NaN,
     ) -> FrameOrSeriesUnion:
         """
         Wrap the output of SeriesGroupBy.apply into the expected result.
@@ -465,7 +471,7 @@ def _get_index() -> Index:
             result = self.obj._constructor(
                 data=values, index=_get_index(), name=self._selection_name
             )
-            return self._reindex_output(result)
+            return self._reindex_output(result, fill_value)
 
     def _aggregate_named(self, func, *args, **kwargs):
         result = {}
@@ -1029,7 +1035,10 @@ def _cython_agg_general(
         agg_blocks, agg_items = self._cython_agg_blocks(
             how, alt=alt, numeric_only=numeric_only, min_count=min_count
         )
-        return self._wrap_agged_blocks(agg_blocks, items=agg_items)
+        fill_value = self._cython_func_fill_values.get(alt, np.NaN)
+        return self._wrap_agged_blocks(
+            agg_blocks, items=agg_items, fill_value=fill_value
+        )
 
     def _cython_agg_blocks(
         self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
@@ -1219,7 +1228,9 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
 
         return self.obj._constructor(result, columns=result_columns)
 
-    def _wrap_applied_output(self, keys, values, not_indexed_same=False):
+    def _wrap_applied_output(
+        self, keys, values, not_indexed_same=False, fill_value: Scalar = np.NaN
+    ):
         if len(keys) == 0:
             return self.obj._constructor(index=keys)
 
@@ -1380,7 +1391,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
                 if not self.as_index:
                     self._insert_inaxis_grouper_inplace(result)
 
-                return self._reindex_output(result)
+                return self._reindex_output(result, fill_value)
 
             # values are not series or array-like but scalars
             else:
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -888,8 +888,12 @@ def _python_apply_general(
         """
         keys, values, mutated = self.grouper.apply(f, data, self.axis)
 
+        fill_value = self._cython_func_fill_values.get(f, np.NaN)
         return self._wrap_applied_output(
-            keys, values, not_indexed_same=mutated or self.mutated
+            keys,
+            values,
+            not_indexed_same=mutated or self.mutated,
+            fill_value=fill_value,
         )
 
     def _iterate_slices(self) -> Iterable[Series]:
@@ -1010,6 +1014,8 @@ def _agg_general(
         result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
         return result
 
+    _cython_func_fill_values = {np.sum: 0}
+
     def _cython_agg_general(
         self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1
     ):
@@ -1045,7 +1051,9 @@ def _cython_agg_general(
         if len(output) == 0:
             raise DataError("No numeric types to aggregate")
 
-        return self._wrap_aggregated_output(output)
+        fill_value = self._cython_func_fill_values.get(alt, np.NaN)
+
+        return self._wrap_aggregated_output(output, fill_value)
 
     def _python_agg_general(
         self, func, *args, engine="cython", engine_kwargs=None, **kwargs
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -19,7 +19,7 @@
 import pandas._testing as tm
 
 
-def cartesian_product_for_groupers(result, args, names):
+def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN):
     """ Reindex to a cartesian production for the groupers,
     preserving the nature (Categorical) of each grouper
     """
@@ -33,7 +33,7 @@ def f(a):
         return a
 
     index = MultiIndex.from_product(map(f, args), names=names)
-    return result.reindex(index).sort_index()
+    return result.reindex(index, fill_value=fill_value).sort_index()
 
 
 _results_for_groupbys_with_missing_categories = dict(
@@ -309,7 +309,7 @@ def test_observed(observed):
     result = gb.sum()
     if not observed:
         expected = cartesian_product_for_groupers(
-            expected, [cat1, cat2, ["foo", "bar"]], list("ABC")
+            expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
         )
 
     tm.assert_frame_equal(result, expected)
@@ -319,7 +319,9 @@ def test_observed(observed):
     expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
     result = gb.sum()
     if not observed:
-        expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB"))
+        expected = cartesian_product_for_groupers(
+            expected, [cat1, cat2], list("AB"), fill_value=0
+        )
 
     tm.assert_frame_equal(result, expected)
 
@@ -1188,9 +1190,10 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
         names=["A", "B"],
     ).sortlevel()
 
-    expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
+    expected = Series(data=[2, 4, 0, 1, 0, 3], index=index, name="C")
     grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
     result = getattr(grouped, operation)(sum)
+
     tm.assert_series_equal(result, expected)
 
 
@@ -1340,15 +1343,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
         )
         request.node.add_marker(mark)
 
-    if reduction_func == "sum":  # GH 31422
-        mark = pytest.mark.xfail(
-            reason=(
-                "sum should return 0 but currently returns NaN. "
-                "This is a known bug. See GH 31422."
-            )
-        )
-        request.node.add_marker(mark)
-
     df = pd.DataFrame(
         {
             "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
@@ -1369,8 +1363,11 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
         val = result.loc[idx]
         assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)
 
-    # If we expect unobserved values to be zero, we also expect the dtype to be int
-    if zero_or_nan == 0:
+    # If we expect unobserved values to be zero, we also expect the dtype to be int.
+    # Except for .sum(). If the observed categories sum to dtype=float (i.e. their
+    # sums have decimals), then the zeros for the missing categories should also be
+    # floats.
+    if zero_or_nan == 0 and reduction_func != "sum":
         assert np.issubdtype(result.dtype, np.integer)
 
 
@@ -1412,15 +1409,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
     if reduction_func == "ngroup":
         pytest.skip("ngroup does not return the Categories on the index")
 
-    if reduction_func == "sum":  # GH 31422
-        mark = pytest.mark.xfail(
-            reason=(
-                "sum should return 0 but currently returns NaN. "
-                "This is a known bug. See GH 31422."
-            )
-        )
-        request.node.add_marker(mark)
-
     df = pd.DataFrame(
         {
             "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),