Skip to content

Commit 0ed1dcd

Browse files
authored
BUG: ValueError on groupby with categoricals (#35253)
1 parent af964ca commit 0ed1dcd

File tree

3 files changed

+56
-1
lines changed

3 files changed

+56
-1
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1091,6 +1091,7 @@ Groupby/resample/rolling
10911091
- Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`)
10921092
- Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`)
10931093
- Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`)
1094+
- Bug in :meth:`core.groupby.DataFrameGroupBy.first` and :meth:`core.groupby.DataFrameGroupBy.last` that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`)
10941095

10951096
Reshaping
10961097
^^^^^^^^^

pandas/core/groupby/generic.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1058,7 +1058,11 @@ def _cython_agg_blocks(
10581058
# reductions; see GH#28949
10591059
obj = obj.iloc[:, 0]
10601060

1061-
s = get_groupby(obj, self.grouper)
1061+
# Create SeriesGroupBy with observed=True so that it does
1062+
# not try to add missing categories if grouping over multiple
1063+
# Categoricals. This will be done later by self._reindex_output()
1064+
# Doing it here creates an error. See GH#34951
1065+
s = get_groupby(obj, self.grouper, observed=True)
10621066
try:
10631067
result = s.aggregate(lambda x: alt(x, axis=self.axis))
10641068
except TypeError:

pandas/tests/groupby/test_categorical.py

+50
Original file line numberDiff line numberDiff line change
@@ -1669,3 +1669,53 @@ def test_categorical_transform():
16691669
expected["status"] = expected["status"].astype(delivery_status_type)
16701670

16711671
tm.assert_frame_equal(result, expected)
1672+
1673+
1674+
@pytest.mark.parametrize("func", ["first", "last"])
1675+
def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals(
1676+
func: str, observed: bool
1677+
):
1678+
# GH 34951
1679+
cat = pd.Categorical([0, 0, 1, 1])
1680+
val = [0, 1, 1, 0]
1681+
df = pd.DataFrame({"a": cat, "b": cat, "c": val})
1682+
1683+
idx = pd.Categorical([0, 1])
1684+
idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"])
1685+
expected_dict = {
1686+
"first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"),
1687+
"last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"),
1688+
}
1689+
1690+
expected = expected_dict[func]
1691+
if observed:
1692+
expected = expected.dropna().astype(np.int64)
1693+
1694+
srs_grp = df.groupby(["a", "b"], observed=observed)["c"]
1695+
result = getattr(srs_grp, func)()
1696+
tm.assert_series_equal(result, expected)
1697+
1698+
1699+
@pytest.mark.parametrize("func", ["first", "last"])
1700+
def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals(
1701+
func: str, observed: bool
1702+
):
1703+
# GH 34951
1704+
cat = pd.Categorical([0, 0, 1, 1])
1705+
val = [0, 1, 1, 0]
1706+
df = pd.DataFrame({"a": cat, "b": cat, "c": val})
1707+
1708+
idx = pd.Categorical([0, 1])
1709+
idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"])
1710+
expected_dict = {
1711+
"first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"),
1712+
"last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"),
1713+
}
1714+
1715+
expected = expected_dict[func].to_frame()
1716+
if observed:
1717+
expected = expected.dropna().astype(np.int64)
1718+
1719+
df_grp = df.groupby(["a", "b"], observed=observed)
1720+
result = getattr(df_grp, func)()
1721+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)