Skip to content

Commit 1294b19

Browse files
authored
BUG: groupby reorders categorical categories (#49131)
* BUG: groupby reorders categorical categories
* Tests and whatsnew
* type-ignore
* GH#
* Add test
* Add TODO
* GH#
* fixups
* Revert test change; catch warnings
1 parent 886f841 commit 1294b19

File tree

5 files changed

+240
-60
lines changed

5 files changed

+240
-60
lines changed

doc/source/whatsnew/v2.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ Bug fixes
247247
Categorical
248248
^^^^^^^^^^^
249249
- Bug in :meth:`Categorical.set_categories` losing dtype information (:issue:`48812`)
250-
-
250+
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` reordering the categories of a :class:`Categorical` used as a grouper (:issue:`48749`)
251251

252252
Datetimelike
253253
^^^^^^^^^^^^

pandas/core/groupby/categorical.py

+1-35
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
from __future__ import annotations
22

3-
from typing import TYPE_CHECKING
4-
53
import numpy as np
64

75
from pandas.core.algorithms import unique1d
@@ -11,9 +9,6 @@
119
recode_for_categories,
1210
)
1311

14-
if TYPE_CHECKING:
15-
from pandas.core.indexes.api import CategoricalIndex
16-
1712

1813
def recode_for_groupby(
1914
c: Categorical, sort: bool, observed: bool
@@ -77,7 +72,7 @@ def recode_for_groupby(
7772
# sort=False should order groups in as-encountered order (GH-8868)
7873

7974
# xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
80-
all_codes = np.arange(c.categories.nunique(), dtype=np.int8)
75+
all_codes = np.arange(c.categories.nunique())
8176
# GH 38140: exclude nan from indexer for categories
8277
unique_notnan_codes = unique1d(c.codes[c.codes != -1])
8378
if c.ordered:
@@ -90,32 +85,3 @@ def recode_for_groupby(
9085
take_codes = unique_notnan_codes
9186

9287
return Categorical(c, c.unique().categories.take(take_codes)), None
93-
94-
95-
def recode_from_groupby(
96-
c: Categorical, sort: bool, ci: CategoricalIndex
97-
) -> CategoricalIndex:
98-
"""
99-
Reverse the codes_to_groupby to account for sort / observed.
100-
101-
Parameters
102-
----------
103-
c : Categorical
104-
sort : bool
105-
The value of the sort parameter groupby was called with.
106-
ci : CategoricalIndex
107-
The codes / categories to recode
108-
109-
Returns
110-
-------
111-
CategoricalIndex
112-
"""
113-
# we re-order to the original category orderings
114-
if sort:
115-
# error: "CategoricalIndex" has no attribute "set_categories"
116-
return ci.set_categories(c.categories) # type: ignore[attr-defined]
117-
118-
# we are not sorting, so add unobserved to the end
119-
new_cats = c.categories[~c.categories.isin(ci.categories)]
120-
# error: "CategoricalIndex" has no attribute "add_categories"
121-
return ci.add_categories(new_cats) # type: ignore[attr-defined]

pandas/core/groupby/grouper.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,7 @@
3939
import pandas.core.common as com
4040
from pandas.core.frame import DataFrame
4141
from pandas.core.groupby import ops
42-
from pandas.core.groupby.categorical import (
43-
recode_for_groupby,
44-
recode_from_groupby,
45-
)
42+
from pandas.core.groupby.categorical import recode_for_groupby
4643
from pandas.core.indexes.api import (
4744
CategoricalIndex,
4845
Index,
@@ -462,6 +459,7 @@ class Grouping:
462459
_group_index: Index | None = None
463460
_passed_categorical: bool
464461
_all_grouper: Categorical | None
462+
_orig_cats: Index | None
465463
_index: Index
466464

467465
def __init__(
@@ -479,6 +477,7 @@ def __init__(
479477
self._orig_grouper = grouper
480478
self.grouping_vector = _convert_grouper(index, grouper)
481479
self._all_grouper = None
480+
self._orig_cats = None
482481
self._index = index
483482
self._sort = sort
484483
self.obj = obj
@@ -529,6 +528,7 @@ def __init__(
529528
# a passed Categorical
530529
self._passed_categorical = True
531530

531+
self._orig_cats = self.grouping_vector.categories
532532
self.grouping_vector, self._all_grouper = recode_for_groupby(
533533
self.grouping_vector, sort, observed
534534
)
@@ -646,7 +646,9 @@ def result_index(self) -> Index:
646646
if self._all_grouper is not None:
647647
group_idx = self.group_index
648648
assert isinstance(group_idx, CategoricalIndex)
649-
return recode_from_groupby(self._all_grouper, self._sort, group_idx)
649+
categories = self._all_grouper.categories
650+
# set_categories is dynamically added
651+
return group_idx.set_categories(categories) # type: ignore[attr-defined]
650652
return self.group_index
651653

652654
@cache_readonly
@@ -678,6 +680,8 @@ def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
678680
uniques = Categorical.from_codes(
679681
codes=ucodes, categories=categories, ordered=cat.ordered
680682
)
683+
if not self._observed:
684+
uniques = uniques.reorder_categories(self._orig_cats)
681685
return cat.codes, uniques
682686

683687
elif isinstance(self.grouping_vector, ops.BaseGrouper):

0 commit comments

Comments
 (0)