Skip to content

Commit e2f2e21

Browse files
authored
CLN: clean up groupby / categorical (#21753)
xref #21151
1 parent 44a88af commit e2f2e21

File tree

3 files changed

+105
-81
lines changed

3 files changed

+105
-81
lines changed

pandas/core/arrays/categorical.py

-67
Original file line numberDiff line numberDiff line change
@@ -698,73 +698,6 @@ def _set_categories(self, categories, fastpath=False):
698698

699699
self._dtype = new_dtype
700700

701-
def _codes_for_groupby(self, sort, observed):
    """
    Code the categories to ensure we can groupby for categoricals.

    If observed=True, we return a new Categorical with the observed
    categories only.

    If sort=False, return a copy of self, coded with categories as
    returned by .unique(), followed by any categories not appearing in
    the data. If sort=True, return self.

    This method is needed solely to ensure the categorical index of the
    GroupBy result has categories in the order of appearance in the data
    (GH-8868).

    Parameters
    ----------
    sort : boolean
        The value of the sort parameter groupby was called with.
    observed : boolean
        Account only for the observed values

    Returns
    -------
    Categorical
        If sort=False, the new categories are set to the order of
        appearance in codes (unless ordered=True, in which case the
        original order is preserved), followed by any unrepresented
        categories in the original order.
    """

    # we only care about observed values
    if observed:
        unique_codes = unique1d(self.codes)

        # codes equal to -1 are excluded so they never become a category
        take_codes = unique_codes[unique_codes != -1]
        if self.ordered:
            # ordered categoricals keep the original category ordering
            take_codes = np.sort(take_codes)

        # we recode according to the uniques
        categories = self.categories.take(take_codes)
        codes = _recode_for_categories(self.codes,
                                       self.categories,
                                       categories)

        # return a new categorical that maps our new codes
        # and categories
        dtype = CategoricalDtype(categories, ordered=self.ordered)
        return type(self)(codes, dtype=dtype, fastpath=True)

    # Already sorted according to self.categories; all is fine
    if sort:
        return self

    # sort=False should order groups in as-encountered order (GH-8868)
    cat = self.unique()

    # But for groupby to work, all categories should be present,
    # including those missing from the data (GH-13179), which .unique()
    # above dropped.  Rebind the result instead of mutating in place.
    cat = cat.add_categories(
        self.categories[~self.categories.isin(cat.categories)])

    return self.reorder_categories(cat.categories)
767-
768701
def _set_dtype(self, dtype):
769702
"""Internal method for directly updating the CategoricalDtype
770703

pandas/core/groupby/categorical.py

+99
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import numpy as np
2+
from pandas.core.algorithms import unique1d
3+
from pandas.core.arrays.categorical import (
4+
_recode_for_categories, CategoricalDtype, Categorical)
5+
6+
7+
def recode_for_groupby(c, sort, observed):
    """
    Code the categories to ensure we can groupby for categoricals.

    If observed=True, we return a new Categorical with the observed
    categories only, together with the original ``c``.

    If sort=False, return a recoded version of ``c`` whose categories are
    those returned by ``c.unique()``, followed by any categories not
    appearing in the data. If sort=True, return ``c`` itself.

    This function is needed solely to ensure the categorical index of the
    GroupBy result has categories in the order of appearance in the data
    (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : boolean
        The value of the sort parameter groupby was called with.
    observed : boolean
        Account only for the observed values

    Returns
    -------
    tuple of (Categorical, Categorical or None)
        The first element is the Categorical to group by. If sort=False,
        its categories are set to the order of appearance in codes
        (unless ordered=True, in which case the original order is
        preserved), followed by any unrepresented categories in the
        original order.

        The second element is the original ``c`` when observed=True,
        otherwise None.
    """

    # we only care about observed values
    if observed:
        unique_codes = unique1d(c.codes)

        # codes equal to -1 are excluded so they never become a category
        take_codes = unique_codes[unique_codes != -1]
        if c.ordered:
            # ordered categoricals keep the original category ordering
            take_codes = np.sort(take_codes)

        # we recode according to the uniques
        categories = c.categories.take(take_codes)
        codes = _recode_for_categories(c.codes,
                                       c.categories,
                                       categories)

        # return a new categorical that maps our new codes
        # and categories, plus the untouched original
        dtype = CategoricalDtype(categories, ordered=c.ordered)
        return Categorical(codes, dtype=dtype, fastpath=True), c

    # Already sorted according to c.categories; all is fine
    if sort:
        return c, None

    # sort=False should order groups in as-encountered order (GH-8868)
    cat = c.unique()

    # But for groupby to work, all categories should be present,
    # including those missing from the data (GH-13179), which .unique()
    # above dropped
    cat = cat.add_categories(
        c.categories[~c.categories.isin(cat.categories)])

    return c.reorder_categories(cat.categories), None
74+
75+
76+
def recode_from_groupby(c, sort, ci):
    """
    Reverse the recoding done by recode_for_groupby to account for
    sort / observed.

    Parameters
    ----------
    c : Categorical
        Supplies the full set of categories (``c.categories``).
    sort : boolean
        The value of the sort parameter groupby was called with.
    ci : CategoricalIndex
        The codes / categories to recode

    Returns
    -------
    CategoricalIndex
        If sort=True, ``ci`` with its categories set to ``c.categories``.
        Otherwise ``ci`` with the categories of ``c`` that it is missing
        appended after its existing categories.
    """

    # we re-order to the original category orderings
    if sort:
        return ci.set_categories(c.categories)

    # we are not sorting, so add unobserved to the end
    unobserved = c.categories[~c.categories.isin(ci.categories)]
    return ci.add_categories(unobserved)

pandas/core/groupby/groupby.py

+6-14
Original file line numberDiff line numberDiff line change
@@ -2994,9 +2994,9 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
29942994
# a passed Categorical
29952995
elif is_categorical_dtype(self.grouper):
29962996

2997-
self.all_grouper = self.grouper
2998-
self.grouper = self.grouper._codes_for_groupby(
2999-
self.sort, observed)
2997+
from pandas.core.groupby.categorical import recode_for_groupby
2998+
self.grouper, self.all_grouper = recode_for_groupby(
2999+
self.grouper, self.sort, observed)
30003000
categories = self.grouper.categories
30013001

30023002
# we make a CategoricalIndex out of the cat grouper
@@ -3073,17 +3073,9 @@ def labels(self):
30733073
@cache_readonly
30743074
def result_index(self):
30753075
if self.all_grouper is not None:
3076-
all_categories = self.all_grouper.categories
3077-
3078-
# we re-order to the original category orderings
3079-
if self.sort:
3080-
return self.group_index.set_categories(all_categories)
3081-
3082-
# we are not sorting, so add unobserved to the end
3083-
categories = self.group_index.categories
3084-
return self.group_index.add_categories(
3085-
all_categories[~all_categories.isin(categories)])
3086-
3076+
from pandas.core.groupby.categorical import recode_from_groupby
3077+
return recode_from_groupby(self.all_grouper,
3078+
self.sort, self.group_index)
30873079
return self.group_index
30883080

30893081
@property

0 commit comments

Comments
 (0)