forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcategorical.py
120 lines (97 loc) · 3.75 KB
/
categorical.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from typing import (
Optional,
Tuple,
)
import numpy as np
from pandas.core.algorithms import unique1d
from pandas.core.arrays.categorical import (
Categorical,
CategoricalDtype,
recode_for_categories,
)
from pandas.core.indexes.api import CategoricalIndex
def recode_for_groupby(
c: Categorical, sort: bool, observed: bool
) -> Tuple[Categorical, Optional[Categorical]]:
"""
Code the categories to ensure we can groupby for categoricals.
If observed=True, we return a new Categorical with the observed
categories only.
If sort=False, return a copy of self, coded with categories as
returned by .unique(), followed by any categories not appearing in
the data. If sort=True, return self.
This method is needed solely to ensure the categorical index of the
GroupBy result has categories in the order of appearance in the data
(GH-8868).
Parameters
----------
c : Categorical
sort : bool
The value of the sort parameter groupby was called with.
observed : bool
Account only for the observed values
Returns
-------
Categorical
If sort=False, the new categories are set to the order of
appearance in codes (unless ordered=True, in which case the
original order is preserved), followed by any unrepresented
categories in the original order.
Categorical or None
If we are observed, return the original categorical, otherwise None
"""
# we only care about observed values
if observed:
# In cases with c.ordered, this is equivalent to
# return c.remove_unused_categories(), c
unique_codes = unique1d(c.codes)
take_codes = unique_codes[unique_codes != -1]
if c.ordered:
take_codes = np.sort(take_codes)
# we recode according to the uniques
categories = c.categories.take(take_codes)
codes = recode_for_categories(c.codes, c.categories, categories)
# return a new categorical that maps our new codes
# and categories
dtype = CategoricalDtype(categories, ordered=c.ordered)
return Categorical(codes, dtype=dtype, fastpath=True), c
# Already sorted according to c.categories; all is fine
if sort:
return c, None
# sort=False should order groups in as-encountered order (GH-8868)
cat = c.unique()
# See GH-38140 for block below
# exclude nan from indexer for categories
take_codes = cat.codes[cat.codes != -1]
if cat.ordered:
take_codes = np.sort(take_codes)
cat = cat.set_categories(cat.categories.take(take_codes))
# But for groupby to work, all categories should be present,
# including those missing from the data (GH-13179), which .unique()
# above dropped
cat = cat.add_categories(c.categories[~c.categories.isin(cat.categories)])
return c.reorder_categories(cat.categories), None
def recode_from_groupby(
c: Categorical, sort: bool, ci: CategoricalIndex
) -> CategoricalIndex:
"""
Reverse the codes_to_groupby to account for sort / observed.
Parameters
----------
c : Categorical
sort : bool
The value of the sort parameter groupby was called with.
ci : CategoricalIndex
The codes / categories to recode
Returns
-------
CategoricalIndex
"""
# we re-order to the original category orderings
if sort:
# error: "CategoricalIndex" has no attribute "set_categories"
return ci.set_categories(c.categories) # type: ignore[attr-defined]
# we are not sorting, so add unobserved to the end
new_cats = c.categories[~c.categories.isin(ci.categories)]
# error: "CategoricalIndex" has no attribute "add_categories"
return ci.add_categories(new_cats) # type: ignore[attr-defined]