diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5f93e08d51baa..85b29a58a1f15 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1091,6 +1091,7 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) +- Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1f49ee2b0b665..093e1d4ab3942 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1058,7 +1058,11 @@ def _cython_agg_blocks( # reductions; see GH#28949 obj = obj.iloc[:, 0] - s = get_groupby(obj, self.grouper) + # Create SeriesGroupBy with observed=True so that it does + # not try to add missing categories if grouping over multiple + # Categoricals. This will done by later self._reindex_output() + # Doing it here creates an error. See GH#34951 + s = get_groupby(obj, self.grouper, observed=True) try: result = s.aggregate(lambda x: alt(x, axis=self.axis)) except TypeError: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 118d928ac02f4..7e4513da37dc9 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1669,3 +1669,53 @@ def test_categorical_transform(): expected["status"] = expected["status"].astype(delivery_status_type) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["first", "last"]) +def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals( + func: str, observed: bool +): + # GH 34951 + cat = pd.Categorical([0, 0, 1, 1]) + val = [0, 1, 1, 0] + df = pd.DataFrame({"a": cat, "b": cat, "c": val}) + + idx = pd.Categorical([0, 1]) + idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"]) + expected_dict = { + "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"), + "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"), + } + + expected = expected_dict[func] + if observed: + expected = expected.dropna().astype(np.int64) + + srs_grp = df.groupby(["a", "b"], observed=observed)["c"] + result = getattr(srs_grp, func)() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["first", "last"]) +def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals( + func: str, observed: bool +): + # GH 34951 + cat = pd.Categorical([0, 0, 1, 1]) + val = [0, 1, 1, 0] + df = pd.DataFrame({"a": cat, "b": cat, "c": val}) + + idx = pd.Categorical([0, 1]) + idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"]) + expected_dict = { + "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"), + "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"), + } + + expected = expected_dict[func].to_frame() + if observed: + expected = expected.dropna().astype(np.int64) + + df_grp = df.groupby(["a", "b"], observed=observed) + result = getattr(df_grp, func)() + tm.assert_frame_equal(result, expected)