BUG: ValueError on groupby with categoricals (#35253)

smithto1 · web-flow · commit 0ed1dcd5395a · 2020-07-13T18:22:45.000-04:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -1091,6 +1091,7 @@ Groupby/resample/rolling
 - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`)
 - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`)
 - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`)
+- Bug in :meth:`DataFrameGroupBy.first` and :meth:`DataFrameGroupBy.last` that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`)
 
 Reshaping
 ^^^^^^^^^
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -1058,7 +1058,11 @@ def _cython_agg_blocks(
                     #  reductions; see GH#28949
                     obj = obj.iloc[:, 0]
 
-                s = get_groupby(obj, self.grouper)
+                # Create SeriesGroupBy with observed=True so that it does
+                # not try to add missing categories if grouping over multiple
+                # Categoricals. This will be done later by self._reindex_output();
+                # doing it here creates an error. See GH#34951
+                s = get_groupby(obj, self.grouper, observed=True)
                 try:
                     result = s.aggregate(lambda x: alt(x, axis=self.axis))
                 except TypeError:
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
@@ -1669,3 +1669,53 @@ def test_categorical_transform():
     expected["status"] = expected["status"].astype(delivery_status_type)
 
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["first", "last"])
+def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals(
+    func: str, observed: bool
+):
+    # GH 34951: SeriesGroupBy case; unobserved category pairs are NaN unless observed
+    cat = pd.Categorical([0, 0, 1, 1])
+    val = [0, 1, 1, 0]
+    df = pd.DataFrame({"a": cat, "b": cat, "c": val})
+
+    idx = pd.Categorical([0, 1])
+    idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"])
+    expected_dict = {
+        "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"),
+        "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"),
+    }
+
+    expected = expected_dict[func]
+    if observed:
+        expected = expected.dropna().astype(np.int64)
+
+    srs_grp = df.groupby(["a", "b"], observed=observed)["c"]
+    result = getattr(srs_grp, func)()
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["first", "last"])
+def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals(
+    func: str, observed: bool
+):
+    # GH 34951: DataFrameGroupBy case; unobserved category pairs are NaN unless observed
+    cat = pd.Categorical([0, 0, 1, 1])
+    val = [0, 1, 1, 0]
+    df = pd.DataFrame({"a": cat, "b": cat, "c": val})
+
+    idx = pd.Categorical([0, 1])
+    idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"])
+    expected_dict = {
+        "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"),
+        "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"),
+    }
+
+    expected = expected_dict[func].to_frame()
+    if observed:
+        expected = expected.dropna().astype(np.int64)
+
+    df_grp = df.groupby(["a", "b"], observed=observed)
+    result = getattr(df_grp, func)()
+    tm.assert_frame_equal(result, expected)