Skip to content

BUG: groupby reorders categorical categories #49131

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 24, 2022
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ Bug fixes
Categorical
^^^^^^^^^^^
- Bug in :meth:`Categorical.set_categories` losing dtype information (:issue:`48812`)
-
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` reordering categories when a :class:`Categorical` is used as a grouper (:issue:`48749`)

Datetimelike
^^^^^^^^^^^^
Expand Down
36 changes: 1 addition & 35 deletions pandas/core/groupby/categorical.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

from pandas.core.algorithms import unique1d
Expand All @@ -11,9 +9,6 @@
recode_for_categories,
)

if TYPE_CHECKING:
from pandas.core.indexes.api import CategoricalIndex


def recode_for_groupby(
c: Categorical, sort: bool, observed: bool
Expand Down Expand Up @@ -77,7 +72,7 @@ def recode_for_groupby(
# sort=False should order groups in as-encountered order (GH-8868)

# xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
all_codes = np.arange(c.categories.nunique(), dtype=np.int8)
all_codes = np.arange(c.categories.nunique())
# GH 38140: exclude nan from indexer for categories
unique_notnan_codes = unique1d(c.codes[c.codes != -1])
if c.ordered:
Expand All @@ -90,32 +85,3 @@ def recode_for_groupby(
take_codes = unique_notnan_codes

return Categorical(c, c.unique().categories.take(take_codes)), None


def recode_from_groupby(
    c: Categorical, sort: bool, ci: CategoricalIndex
) -> CategoricalIndex:
    """
    Give ``ci`` the full set of categories held by ``c``.

    Parameters
    ----------
    c : Categorical
        Supplies the categories that the result must carry.
    sort : bool
        The value of the sort parameter groupby was called with.
    ci : CategoricalIndex
        The index whose categories are to be recoded.

    Returns
    -------
    CategoricalIndex
        When ``sort`` is true, ``ci`` with its categories set to
        ``c.categories``. Otherwise ``ci`` with the categories of ``c``
        that it lacks added after its existing ones.
    """
    if not sort:
        # no sorting requested: keep the categories ``ci`` already has and
        # add the ones it is missing
        already_present = c.categories.isin(ci.categories)
        missing_cats = c.categories[~already_present]
        # error: "CategoricalIndex" has no attribute "add_categories"
        return ci.add_categories(missing_cats)  # type: ignore[attr-defined]

    # sorting requested: go back to the category ordering of ``c``
    # error: "CategoricalIndex" has no attribute "set_categories"
    return ci.set_categories(c.categories)  # type: ignore[attr-defined]
14 changes: 9 additions & 5 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,7 @@
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import ops
from pandas.core.groupby.categorical import (
recode_for_groupby,
recode_from_groupby,
)
from pandas.core.groupby.categorical import recode_for_groupby
from pandas.core.indexes.api import (
CategoricalIndex,
Index,
Expand Down Expand Up @@ -462,6 +459,7 @@ class Grouping:
_group_index: Index | None = None
_passed_categorical: bool
_all_grouper: Categorical | None
_orig_cats: Index | None
_index: Index

def __init__(
Expand All @@ -479,6 +477,7 @@ def __init__(
self._orig_grouper = grouper
self.grouping_vector = _convert_grouper(index, grouper)
self._all_grouper = None
self._orig_cats = None
self._index = index
self._sort = sort
self.obj = obj
Expand Down Expand Up @@ -529,6 +528,7 @@ def __init__(
# a passed Categorical
self._passed_categorical = True

self._orig_cats = self.grouping_vector.categories
self.grouping_vector, self._all_grouper = recode_for_groupby(
self.grouping_vector, sort, observed
)
Expand Down Expand Up @@ -646,7 +646,9 @@ def result_index(self) -> Index:
if self._all_grouper is not None:
group_idx = self.group_index
assert isinstance(group_idx, CategoricalIndex)
return recode_from_groupby(self._all_grouper, self._sort, group_idx)
categories = self._all_grouper.categories
# set_categories is dynamically added
return group_idx.set_categories(categories) # type: ignore[attr-defined]
return self.group_index

@cache_readonly
Expand Down Expand Up @@ -678,6 +680,8 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
uniques = Categorical.from_codes(
codes=ucodes, categories=categories, ordered=cat.ordered
)
if not self._observed:
uniques = uniques.reorder_categories(self._orig_cats)
return cat.codes, uniques

elif isinstance(self.grouping_vector, ops.BaseGrouper):
Expand Down
Loading