diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index fe2886a022ad5..5223b36006102 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -806,6 +806,7 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.__len__` and :meth:`.SeriesGroupBy.__len__` would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`) - Bug in :meth:`.DataFrameGroupBy.any` that returned True for groups where all Timedelta values are NaT. (:issue:`59712`) +- Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupBy.groups` would fail when the groups were :class:`Categorical` with an NA value (:issue:`61356`) - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c9d874fc08dbe..f8e92b7e2650a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -12,11 +12,16 @@ import numpy as np +from pandas._libs import ( + algos as libalgos, +) from pandas._libs.tslibs import OutOfBoundsDatetime from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( + ensure_int64, + ensure_platform_int, is_list_like, is_scalar, ) @@ -38,7 +43,10 @@ ) from pandas.core.series import Series -from pandas.io.formats.printing import pprint_thing +from pandas.io.formats.printing import ( + PrettyDict, + pprint_thing, +) if TYPE_CHECKING: from collections.abc import ( @@ -668,8 +676,14 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: def groups(self) -> dict[Hashable, Index]: codes, uniques = self._codes_and_uniques uniques = Index._with_infer(uniques, name=self.name) - cats = Categorical.from_codes(codes, uniques, validate=False) - return self._index.groupby(cats) + + r, counts = libalgos.groupsort_indexer(ensure_platform_int(codes), len(uniques)) + counts = ensure_int64(counts).cumsum() + _result = (r[start:end] for start, end in zip(counts, counts[1:])) + # map to the label + result = {k: self._index.take(v) for k, v in zip(uniques, _result)} + + return PrettyDict(result) @property def observed_grouping(self) -> Grouping: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index e49be8c00b426..cae3013642739 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -506,6 +506,23 @@ def test_observed_groups(observed): tm.assert_dict_equal(result, expected) +def test_groups_na_category(dropna, observed): + # https://github.com/pandas-dev/pandas/issues/61356 + df = DataFrame( + {"cat": Categorical(["a", np.nan, "a"], categories=list("adb"))}, + index=list("xyz"), + ) + g = df.groupby("cat", observed=observed, dropna=dropna) + + result = g.groups + expected = {"a": Index(["x", "z"])} + if not dropna: + expected |= {np.nan: Index(["y"])} + if not observed: + expected |= {"b": Index([]), "d": Index([])} + tm.assert_dict_equal(result, expected) + + @pytest.mark.parametrize( "keys, expected_values, expected_index_levels", [