Skip to content

BUG: Series groupby does not include nan counts for all categorical labels (#17605) #29690

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Nov 20, 2019
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupBy.agg` not able to use lambda function with named aggregation (:issue:`27519`)
- Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`)
- Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`)
- Bug in :meth:`SeriesGroupBy.count`, :meth:`SeriesGroupBy.size`, :meth:`SeriesGroupBy.nunique` and :meth:`SeriesGroupBy.nth` missing unobserved categories when ``observed=False`` (:issue:`17605`)

Reshaping
^^^^^^^^^
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,8 @@ def nunique(self, dropna: bool = True) -> Series:
res, out = np.zeros(len(ri), dtype=out.dtype), res
res[ids[idx]] = out

return Series(res, index=ri, name=self._selection_name)
result = Series(res, index=ri, name=self._selection_name)
return self._reindex_output(result, fill_value=0)

@Appender(Series.describe.__doc__)
def describe(self, **kwargs):
Expand Down Expand Up @@ -721,12 +722,13 @@ def count(self) -> Series:
minlength = ngroups or 0
out = np.bincount(ids[mask], minlength=minlength)

return Series(
result = Series(
out,
index=self.grouper.result_index,
name=self._selection_name,
dtype="int64",
)
return self._reindex_output(result, fill_value=0)

def _apply_to_column_groupbys(self, func):
""" return a pass thru """
Expand Down
15 changes: 11 additions & 4 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1296,7 +1296,7 @@ def size(self):

if isinstance(self.obj, Series):
result.name = self.obj.name
return result
return self._reindex_output(result, fill_value=0)

@classmethod
def _add_numeric_operations(cls):
Expand Down Expand Up @@ -1743,6 +1743,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra
if not self.observed and isinstance(result_index, CategoricalIndex):
out = out.reindex(result_index)

out = self._reindex_output(out)
return out.sort_index() if self.sort else out

# dropna is truthy
Expand Down Expand Up @@ -2383,7 +2384,7 @@ def tail(self, n=5):
mask = self._cumcount_array(ascending=False) < n
return self._selected_obj[mask]

def _reindex_output(self, output):
def _reindex_output(self, output, fill_value=np.NaN):
"""
If we have categorical groupers, then we might want to make sure that
we have a fully re-indexed output to the levels. This means expanding
Expand Down Expand Up @@ -2429,7 +2430,11 @@ def _reindex_output(self, output):
).sortlevel()

if self.as_index:
d = {self.obj._get_axis_name(self.axis): index, "copy": False}
d = {
self.obj._get_axis_name(self.axis): index,
"copy": False,
"fill_value": fill_value,
}
return output.reindex(**d)

# GH 13204
Expand All @@ -2451,7 +2456,9 @@ def _reindex_output(self, output):
output = output.drop(labels=list(g_names), axis=1)

# Set a temp index and reindex (possibly expanding)
output = output.set_index(self.grouper.result_index).reindex(index, copy=False)
output = output.set_index(self.grouper.result_index).reindex(
index, copy=False, fill_value=fill_value
)

# Reset in-axis grouper columns
# (using level numbers `g_nums` because level names may not be unique)
Expand Down
80 changes: 80 additions & 0 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1252,3 +1252,83 @@ def test_get_nonexistent_category():
{"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]}
)
)


@pytest.mark.parametrize("observed", [True, False])
def test_series_groupby_on_2_categoricals_unobserved(
    reduction_func: str, observed: bool
):
    # GH 17605
    # A SeriesGroupBy reduction over two categorical groupers must return one
    # row per observed combination when observed=True, and one row per
    # possible combination (4 categories x 4 categories) when observed=False.

    if reduction_func == "ngroup":
        pytest.skip("ngroup is not truly a reduction")

    frame = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=list("ABCD")),
            "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABCD")),
            "value": [0.1] * 4,
        }
    )
    # nth requires a positional argument; every other reduction takes none.
    call_args = [0] if reduction_func == "nth" else []

    grouped_series = frame.groupby(["cat_1", "cat_2"], observed=observed)["value"]
    result = getattr(grouped_series, reduction_func)(*call_args)

    assert len(result) == (4 if observed else 16)


@pytest.mark.parametrize(
    "func, zero_or_nan",
    [
        ("all", np.NaN),
        ("any", np.NaN),
        ("count", 0),
        ("first", np.NaN),
        ("idxmax", np.NaN),
        ("idxmin", np.NaN),
        ("last", np.NaN),
        ("mad", np.NaN),
        ("max", np.NaN),
        ("mean", np.NaN),
        ("median", np.NaN),
        ("min", np.NaN),
        ("nth", np.NaN),
        ("nunique", 0),
        ("prod", np.NaN),
        ("quantile", np.NaN),
        ("sem", np.NaN),
        ("size", 0),
        ("skew", np.NaN),
        ("std", np.NaN),
        ("sum", np.NaN),
        ("var", np.NaN),
    ],
)
def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan):
    # GH 17605
    # Unobserved category combinations must appear in the reduction output,
    # filled with 0 (counting-style reductions) or NaN (all others).
    frame = pd.DataFrame(
        {
            "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")),
            "value": [0.1] * 4,
        }
    )
    missing_combos = [("A", "C"), ("B", "C"), ("C", "A"), ("C", "B"), ("C", "C")]
    # nth requires a positional argument; every other reduction takes none.
    call_args = [0] if func == "nth" else []

    grouped = frame.groupby(["cat_1", "cat_2"], observed=False)["value"]
    result = getattr(grouped, func)(*call_args)

    for combo in missing_combos:
        filled = result.loc[combo]
        if pd.isna(zero_or_nan):
            assert pd.isna(filled)
        else:
            assert filled == zero_or_nan

    # A zero fill value implies a counting-style reduction, whose output
    # must keep an integer dtype rather than being upcast to float.
    if zero_or_nan == 0:
        assert np.issubdtype(result.dtype, np.integer)