BUG: Series groupby does not include nan counts for all categorical labels (pandas-dev#17605) (pandas-dev#29690)

Oliver Hofkens · jacobaustin123 · commit 0939732b74f0 · 2019-11-20T11:26:41.000-05:00
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -196,6 +196,47 @@ New repr for :class:`pandas.core.arrays.IntervalArray`
 
    pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)])
 
+
+All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`)
+
+- :meth:`SeriesGroupBy.count`
+- :meth:`SeriesGroupBy.size`
+- :meth:`SeriesGroupBy.nunique`
+- :meth:`SeriesGroupBy.nth`
+
+.. ipython:: python
+
+    df = pd.DataFrame({
+        "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
+        "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")),
+        "value": [0.1] * 4,
+    })
+    df
+
+
+*pandas 0.25.x*
+
+.. code-block:: ipython
+
+   In [2]: df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()
+   Out[2]:
+   cat_1  cat_2
+   A      A        1
+          B        1
+   B      A        1
+          B        1
+   Name: value, dtype: int64
+
+
+*pandas 1.0.0*
+
+.. ipython:: python
+
+   df.groupby(["cat_1", "cat_2"], observed=False)["value"].count()
+
+
 .. _whatsnew_1000.api.other:
 
 Other API changes
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -557,7 +557,8 @@ def nunique(self, dropna: bool = True) -> Series:
             res, out = np.zeros(len(ri), dtype=out.dtype), res
             res[ids[idx]] = out
 
-        return Series(res, index=ri, name=self._selection_name)
+        result = Series(res, index=ri, name=self._selection_name)
+        return self._reindex_output(result, fill_value=0)
 
     @Appender(Series.describe.__doc__)
     def describe(self, **kwargs):
@@ -709,12 +710,13 @@ def count(self) -> Series:
         minlength = ngroups or 0
         out = np.bincount(ids[mask], minlength=minlength)
 
-        return Series(
+        result = Series(
             out,
             index=self.grouper.result_index,
             name=self._selection_name,
             dtype="int64",
         )
+        return self._reindex_output(result, fill_value=0)
 
     def _apply_to_column_groupbys(self, func):
         """ return a pass thru """
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -39,6 +39,7 @@ class providing the base-class of operations.
 )
 from pandas.core.dtypes.missing import isna, notna
 
+from pandas._typing import FrameOrSeries, Scalar
 from pandas.core import nanops
 import pandas.core.algorithms as algorithms
 from pandas.core.arrays import Categorical, try_cast_to_ea
@@ -1296,7 +1297,7 @@ def size(self):
 
         if isinstance(self.obj, Series):
             result.name = self.obj.name
-        return result
+        return self._reindex_output(result, fill_value=0)
 
     @classmethod
     def _add_numeric_operations(cls):
@@ -1740,6 +1741,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra
             if not self.observed and isinstance(result_index, CategoricalIndex):
                 out = out.reindex(result_index)
 
+            out = self._reindex_output(out)
             return out.sort_index() if self.sort else out
 
         # dropna is truthy
@@ -2380,7 +2382,9 @@ def tail(self, n=5):
         mask = self._cumcount_array(ascending=False) < n
         return self._selected_obj[mask]
 
-    def _reindex_output(self, output):
+    def _reindex_output(
+        self, output: FrameOrSeries, fill_value: Scalar = np.NaN
+    ) -> FrameOrSeries:
         """
         If we have categorical groupers, then we might want to make sure that
         we have a fully re-indexed output to the levels. This means expanding
@@ -2394,8 +2398,10 @@ def _reindex_output(self, output):
 
         Parameters
         ----------
-        output: Series or DataFrame
+        output : Series or DataFrame
             Object resulting from grouping and applying an operation.
+        fill_value : scalar, default np.NaN
+            Value to use for unobserved categories if self.observed is False.
 
         Returns
         -------
@@ -2426,7 +2432,11 @@ def _reindex_output(self, output):
         ).sortlevel()
 
         if self.as_index:
-            d = {self.obj._get_axis_name(self.axis): index, "copy": False}
+            d = {
+                self.obj._get_axis_name(self.axis): index,
+                "copy": False,
+                "fill_value": fill_value,
+            }
             return output.reindex(**d)
 
         # GH 13204
@@ -2448,7 +2458,9 @@ def _reindex_output(self, output):
         output = output.drop(labels=list(g_names), axis=1)
 
         # Set a temp index and reindex (possibly expanding)
-        output = output.set_index(self.grouper.result_index).reindex(index, copy=False)
+        output = output.set_index(self.grouper.result_index).reindex(
+            index, copy=False, fill_value=fill_value
+        )
 
         # Reset in-axis grouper columns
         # (using level numbers `g_nums` because level names may not be unique)
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -1252,3 +1252,82 @@ def test_get_nonexistent_category():
                 {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]}
             )
         )
+
+
+def test_series_groupby_on_2_categoricals_unobserved(
+    reduction_func: str, observed: bool
+):
+    # GH 17605: result length must match the number of groups implied by observed
+
+    if reduction_func == "ngroup":
+        pytest.skip("ngroup is not truly a reduction")
+
+    df = pd.DataFrame(
+        {
+            "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")),
+            "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")),
+            "value": [0.1] * 4,
+        }
+    )
+    args = {"nth": [0]}.get(reduction_func, [])
+
+    expected_length = 4 if observed else 16  # observed pairs vs. 4 x 4 product
+
+    series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"]
+    agg = getattr(series_groupby, reduction_func)
+    result = agg(*args)
+
+    assert len(result) == expected_length
+
+
+@pytest.mark.parametrize(
+    "func, zero_or_nan",
+    [
+        ("all", np.NaN),
+        ("any", np.NaN),
+        ("count", 0),
+        ("first", np.NaN),
+        ("idxmax", np.NaN),
+        ("idxmin", np.NaN),
+        ("last", np.NaN),
+        ("mad", np.NaN),
+        ("max", np.NaN),
+        ("mean", np.NaN),
+        ("median", np.NaN),
+        ("min", np.NaN),
+        ("nth", np.NaN),
+        ("nunique", 0),
+        ("prod", np.NaN),
+        ("quantile", np.NaN),
+        ("sem", np.NaN),
+        ("size", 0),
+        ("skew", np.NaN),
+        ("std", np.NaN),
+        ("sum", np.NaN),
+        ("var", np.NaN),
+    ],
+)
+def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan):
+    # GH 17605: check which fill value (0 or NaN) each reduction reports for
+    # the five category pairs involving "C", which never occur in the data
+    df = pd.DataFrame(
+        {
+            "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
+            "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")),
+            "value": [0.1] * 4,
+        }
+    )
+    unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")]
+    args = {"nth": [0]}.get(func, [])
+
+    series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
+    agg = getattr(series_groupby, func)
+    result = agg(*args)
+
+    for idx in unobserved:
+        val = result.loc[idx]
+        assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)
+
+    # When the expected fill value is 0, the result must also have an integer dtype
+    if zero_or_nan == 0:
+        assert np.issubdtype(result.dtype, np.integer)