Skip to content

Commit da877df

Browse files
committed
CLN: clean up groupby / categorical
xref pandas-dev#21151
1 parent d24a950 commit da877df

File tree

3 files changed

+106
-81
lines changed

3 files changed

+106
-81
lines changed

pandas/core/arrays/categorical.py

-67
Original file line number | Diff line number | Diff line change
@@ -698,73 +698,6 @@ def _set_categories(self, categories, fastpath=False):
698698

699699
self._dtype = new_dtype
700700

701-
def _codes_for_groupby(self, sort, observed):
702-
"""
703-
Code the categories to ensure we can groupby for categoricals.
704-
705-
If observed=True, we return a new Categorical with the observed
706-
categories only.
707-
708-
If sort=False, return a copy of self, coded with categories as
709-
returned by .unique(), followed by any categories not appearing in
710-
the data. If sort=True, return self.
711-
712-
This method is needed solely to ensure the categorical index of the
713-
GroupBy result has categories in the order of appearance in the data
714-
(GH-8868).
715-
716-
Parameters
717-
----------
718-
sort : boolean
719-
The value of the sort parameter groupby was called with.
720-
observed : boolean
721-
Account only for the observed values
722-
723-
Returns
724-
-------
725-
Categorical
726-
If sort=False, the new categories are set to the order of
727-
appearance in codes (unless ordered=True, in which case the
728-
original order is preserved), followed by any unrepresented
729-
categories in the original order.
730-
"""
731-
732-
# we only care about observed values
733-
if observed:
734-
unique_codes = unique1d(self.codes)
735-
cat = self.copy()
736-
737-
take_codes = unique_codes[unique_codes != -1]
738-
if self.ordered:
739-
take_codes = np.sort(take_codes)
740-
741-
# we recode according to the uniques
742-
categories = self.categories.take(take_codes)
743-
codes = _recode_for_categories(self.codes,
744-
self.categories,
745-
categories)
746-
747-
# return a new categorical that maps our new codes
748-
# and categories
749-
dtype = CategoricalDtype(categories, ordered=self.ordered)
750-
return type(self)(codes, dtype=dtype, fastpath=True)
751-
752-
# Already sorted according to self.categories; all is fine
753-
if sort:
754-
return self
755-
756-
# sort=False should order groups in as-encountered order (GH-8868)
757-
cat = self.unique()
758-
759-
# But for groupby to work, all categories should be present,
760-
# including those missing from the data (GH-13179), which .unique()
761-
# above dropped
762-
cat.add_categories(
763-
self.categories[~self.categories.isin(cat.categories)],
764-
inplace=True)
765-
766-
return self.reorder_categories(cat.categories)
767-
768701
def _set_dtype(self, dtype):
769702
"""Internal method for directly updating the CategoricalDtype
770703

pandas/core/groupby/categorical.py

+100
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,100 @@
1+
import numpy as np
2+
from pandas.core.algorithms import unique1d
3+
from pandas.core.arrays.categorical import (
4+
_recode_for_categories, CategoricalDtype, Categorical)
5+
from pandas import CategoricalIndex, Series
6+
7+
8+
def recode_for_groupby(c, sort, observed):
    """
    Recode a Categorical so that it can be used as a groupby key.

    With observed=True the result only carries the categories that
    actually occur in the codes of ``c``.

    With observed=False and sort=True, ``c`` is handed back untouched.
    With observed=False and sort=False, ``c`` is returned with its
    categories reordered to the order given by ``c.unique()``, followed
    by any categories not appearing in the data, so that the GroupBy
    result lists groups in order of appearance (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : boolean
        The value of the sort parameter groupby was called with.
    observed : boolean
        Account only for the observed values

    Returns
    -------
    tuple of (Categorical, Categorical or None)
        The first element is the recoded grouper. The second element is
        the original ``c`` when observed=True and None otherwise.
    """
    if not observed:
        if sort:
            # the existing categories already give the group order
            return c, None

        # sort=False: groups follow order of appearance (GH-8868)
        seen = c.unique()

        # every category must stay available to groupby, including
        # the ones absent from the data (GH-13179)
        unseen = c.categories[~c.categories.isin(seen.categories)]
        full_order = seen.add_categories(unseen).categories
        return c.reorder_categories(full_order), None

    # observed=True: keep only the codes that occur, skipping code -1
    present = unique1d(c.codes)
    present = present[present != -1]
    if c.ordered:
        present = np.sort(present)

    # map the old codes onto the reduced set of categories
    observed_categories = c.categories.take(present)
    new_codes = _recode_for_categories(
        c.codes, c.categories, observed_categories)

    observed_dtype = CategoricalDtype(observed_categories,
                                      ordered=c.ordered)
    recoded = Categorical(new_codes, dtype=observed_dtype, fastpath=True)
    return recoded, c
75+
76+
77+
def recode_from_groupby(c, sort, ci):
    """
    Restore the categories of ``c`` on a groupby result index.

    This is the counterpart of recode_for_groupby and accounts for
    sort / observed.

    Parameters
    ----------
    c : Categorical
    sort : boolean
        The value of the sort parameter groupby was called with.
    ci : CategoricalIndex
        The codes / categories to recode

    Returns
    -------
    CategoricalIndex
        With sort=True, ``ci`` set to the categories of ``c``. With
        sort=False, ``ci`` with the categories of ``c`` it lacks
        appended after its own.
    """
    original = c.categories

    if not sort:
        # not sorting: keep ci's order and add the unobserved at the end
        missing = original[~original.isin(ci.categories)]
        return ci.add_categories(missing)

    # sorting: go back to the original category ordering
    return ci.set_categories(original)

pandas/core/groupby/groupby.py

+6-14
Original file line number | Diff line number | Diff line change
@@ -2994,9 +2994,9 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
29942994
# a passed Categorical
29952995
elif is_categorical_dtype(self.grouper):
29962996

2997-
self.all_grouper = self.grouper
2998-
self.grouper = self.grouper._codes_for_groupby(
2999-
self.sort, observed)
2997+
from pandas.core.groupby.categorical import recode_for_groupby
2998+
self.grouper, self.all_grouper = recode_for_groupby(
2999+
self.grouper, self.sort, observed)
30003000
categories = self.grouper.categories
30013001

30023002
# we make a CategoricalIndex out of the cat grouper
@@ -3073,17 +3073,9 @@ def labels(self):
30733073
@cache_readonly
30743074
def result_index(self):
30753075
if self.all_grouper is not None:
3076-
all_categories = self.all_grouper.categories
3077-
3078-
# we re-order to the original category orderings
3079-
if self.sort:
3080-
return self.group_index.set_categories(all_categories)
3081-
3082-
# we are not sorting, so add unobserved to the end
3083-
categories = self.group_index.categories
3084-
return self.group_index.add_categories(
3085-
all_categories[~all_categories.isin(categories)])
3086-
3076+
from pandas.core.groupby.categorical import recode_from_groupby
3077+
return recode_from_groupby(self.all_grouper,
3078+
self.sort, self.group_index)
30873079
return self.group_index
30883080

30893081
@property

0 commit comments

Comments
 (0)