BUG: GroupBy.count() and GroupBy.sum() incorreclty return NaN instead of 0 for missing categories (#35280)

smithto1 · web-flow · commit a21ce87329c3 · 2020-08-07T11:21:11.000-04:00
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -152,6 +152,7 @@ Plotting
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
+- Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`)
 - Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`)
 -
 -
@@ -160,7 +161,7 @@ Groupby/resample/rolling
 Reshaping
 ^^^^^^^^^
 
--
+- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`)
 -
 
 Sparse
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1829,7 +1829,13 @@ def count(self):
         )
         blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)]
 
-        return self._wrap_agged_blocks(blocks, items=data.items)
+        # If we are grouping on categoricals we want unobserved categories to
+        # return zero, rather than the default of NaN which the reindexing in
+        # _wrap_agged_blocks() returns. GH 35028
+        with com.temp_setattr(self, "observed", True):
+            result = self._wrap_agged_blocks(blocks, items=data.items)
+
+        return self._reindex_output(result, fill_value=0)
 
     def nunique(self, dropna: bool = True):
         """
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1536,9 +1536,19 @@ def size(self) -> FrameOrSeriesUnion:
 
     @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0)
     def sum(self, numeric_only: bool = True, min_count: int = 0):
-        return self._agg_general(
-            numeric_only=numeric_only, min_count=min_count, alias="add", npfunc=np.sum
-        )
+
+        # If we are grouping on categoricals we want unobserved categories to
+        # return zero, rather than the default of NaN which the reindexing in
+        # _agg_general() returns. GH #31422
+        with com.temp_setattr(self, "observed", True):
+            result = self._agg_general(
+                numeric_only=numeric_only,
+                min_count=min_count,
+                alias="add",
+                npfunc=np.sum,
+            )
+
+        return self._reindex_output(result, fill_value=0)
 
     @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0)
     def prod(self, numeric_only: bool = True, min_count: int = 0):
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -19,7 +19,7 @@
 import pandas._testing as tm
 
 
-def cartesian_product_for_groupers(result, args, names):
+def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN):
     """ Reindex to a cartesian production for the groupers,
     preserving the nature (Categorical) of each grouper
     """
@@ -33,7 +33,7 @@ def f(a):
         return a
 
     index = MultiIndex.from_product(map(f, args), names=names)
-    return result.reindex(index).sort_index()
+    return result.reindex(index, fill_value=fill_value).sort_index()
 
 
 _results_for_groupbys_with_missing_categories = dict(
@@ -309,7 +309,7 @@ def test_observed(observed):
     result = gb.sum()
     if not observed:
         expected = cartesian_product_for_groupers(
-            expected, [cat1, cat2, ["foo", "bar"]], list("ABC")
+            expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
         )
 
     tm.assert_frame_equal(result, expected)
@@ -319,7 +319,9 @@ def test_observed(observed):
     expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
     result = gb.sum()
     if not observed:
-        expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB"))
+        expected = cartesian_product_for_groupers(
+            expected, [cat1, cat2], list("AB"), fill_value=0
+        )
 
     tm.assert_frame_equal(result, expected)
 
@@ -1189,6 +1191,8 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
     ).sortlevel()
 
     expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
+    if operation == "agg":
+        expected = expected.fillna(0, downcast="infer")
     grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
     result = getattr(grouped, operation)(sum)
     tm.assert_series_equal(result, expected)
@@ -1338,15 +1342,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
         )
         request.node.add_marker(mark)
 
-    if reduction_func == "sum":  # GH 31422
-        mark = pytest.mark.xfail(
-            reason=(
-                "sum should return 0 but currently returns NaN. "
-                "This is a known bug. See GH 31422."
-            )
-        )
-        request.node.add_marker(mark)
-
     df = pd.DataFrame(
         {
             "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
@@ -1367,8 +1362,11 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
         val = result.loc[idx]
         assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)
 
-    # If we expect unobserved values to be zero, we also expect the dtype to be int
-    if zero_or_nan == 0:
+    # If we expect unobserved values to be zero, we also expect the dtype to be int.
+    # Except for .sum(). If the observed categories sum to dtype=float (i.e. their
+    # sums have decimals), then the zeros for the missing categories should also be
+    # floats.
+    if zero_or_nan == 0 and reduction_func != "sum":
         assert np.issubdtype(result.dtype, np.integer)
 
 
@@ -1410,24 +1408,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
     if reduction_func == "ngroup":
         pytest.skip("ngroup does not return the Categories on the index")
 
-    if reduction_func == "count":  # GH 35028
-        mark = pytest.mark.xfail(
-            reason=(
-                "DataFrameGroupBy.count returns np.NaN for missing "
-                "categories, when it should return 0. See GH 35028"
-            )
-        )
-        request.node.add_marker(mark)
-
-    if reduction_func == "sum":  # GH 31422
-        mark = pytest.mark.xfail(
-            reason=(
-                "sum should return 0 but currently returns NaN. "
-                "This is a known bug. See GH 31422."
-            )
-        )
-        request.node.add_marker(mark)
-
     df = pd.DataFrame(
         {
             "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
@@ -1817,7 +1817,7 @@ def test_categorical_aggfunc(self, observed):
             ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1"
         )
         expected_columns = pd.Index(["a", "b"], name="C2")
-        expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]])
+        expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64)
         expected = pd.DataFrame(
             expected_data, index=expected_index, columns=expected_columns
         )
@@ -1851,18 +1851,19 @@ def test_categorical_pivot_index_ordering(self, observed):
             values="Sales",
             index="Month",
             columns="Year",
-            dropna=observed,
+            observed=observed,
             aggfunc="sum",
         )
         expected_columns = pd.Int64Index([2013, 2014], name="Year")
         expected_index = pd.CategoricalIndex(
-            ["January"], categories=months, ordered=False, name="Month"
+            months, categories=months, ordered=False, name="Month"
         )
+        expected_data = [[320, 120]] + [[0, 0]] * 11
         expected = pd.DataFrame(
-            [[320, 120]], index=expected_index, columns=expected_columns
+            expected_data, index=expected_index, columns=expected_columns
         )
-        if not observed:
-            result = result.dropna().astype(np.int64)
+        if observed:
+            expected = expected.loc[["January"]]
 
         tm.assert_frame_equal(result, expected)
 

Original file line number	Diff line number	Diff line change
`@@ -152,6 +152,7 @@ Plotting`
`152`	`152`	`Groupby/resample/rolling`
`153`	`153`	`^^^^^^^^^^^^^^^^^^^^^^^^`
`154`	`154`
	`155`	+- Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`)
`155`	`156`	- Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`)
`156`	`157`	`-`
`157`	`158`	`-`
`@@ -160,7 +161,7 @@ Groupby/resample/rolling`
`160`	`161`	`Reshaping`
`161`	`162`	`^^^^^^^^^`
`162`	`163`
`163`		`--`
	`164`	+- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`)
`164`	`165`	`-`
`165`	`166`
`166`	`167`	`Sparse`