|
| 1 | +import numpy as np |
| 2 | +from pandas.core.algorithms import unique1d |
| 3 | +from pandas.core.arrays.categorical import ( |
| 4 | + _recode_for_categories, CategoricalDtype, Categorical) |
| 5 | +from pandas import CategoricalIndex, Series |
| 6 | + |
| 7 | + |
def recode_for_groupby(c, sort, observed):
    """
    Prepare a Categorical so that it can be used as a groupby key.

    Three cases are handled:

    * ``observed`` is truthy: build a new Categorical whose categories are
      only those whose codes actually occur in ``c`` (the ``-1`` code is
      excluded). The codes are taken in the order ``unique1d`` yields them,
      or sorted ascending when ``c.ordered`` is set.
    * ``observed`` is falsy and ``sort`` is truthy: ``c`` is handed back
      untouched.
    * ``observed`` and ``sort`` are both falsy: ``c`` is re-coded so that
      its categories follow the order produced by ``c.unique()``, with any
      categories absent from that result appended afterwards in their
      original relative order (GH-8868, GH-13179).

    Parameters
    ----------
    c : Categorical
        The grouping key.
    sort : boolean
        The value of the sort parameter groupby was called with.
    observed : boolean
        Account only for the observed values.

    Returns
    -------
    Categorical
        The categorical to group on, as described above.
    Categorical or None
        ``c`` itself when ``observed`` is truthy, otherwise None.
    """
    if not observed:
        if sort:
            # Nothing to recode: c's own category order is used as-is.
            return c, None

        # sort=False: groups should come out in as-encountered order
        # (GH-8868), so start from the categories reported by .unique().
        seen = c.unique()

        # groupby still needs every category to be present, including
        # the ones missing from the data (GH-13179); append those after
        # the seen ones.
        not_seen = ~c.categories.isin(seen.categories)
        complete = seen.add_categories(c.categories[not_seen])

        return c.reorder_categories(complete.categories), None

    # Only the observed values matter from here on.
    present_codes = unique1d(c.codes)
    present_codes = present_codes[present_codes != -1]  # drop the -1 code
    if c.ordered:
        present_codes = np.sort(present_codes)

    # Re-express the codes of c relative to the reduced category set.
    observed_categories = c.categories.take(present_codes)
    new_codes = _recode_for_categories(c.codes,
                                       c.categories,
                                       observed_categories)

    # Wrap the new codes / categories in a fresh Categorical and pass the
    # original along as the second element.
    new_dtype = CategoricalDtype(observed_categories, ordered=c.ordered)
    return Categorical(new_codes, dtype=new_dtype, fastpath=True), c
| 75 | + |
| 76 | + |
def recode_from_groupby(c, sort, ci):
    """
    Bring the categories of ``ci`` back in line with those of ``c``.

    When ``sort`` is truthy the categories of ``ci`` are set to exactly
    ``c.categories``. Otherwise the existing categories of ``ci`` are kept
    and every category of ``c`` that ``ci`` lacks is appended at the end.

    Parameters
    ----------
    c : Categorical
        Supplies the reference categories.
    sort : boolean
        The value of the sort parameter groupby was called with.
    ci : CategoricalIndex
        The codes / categories to recode.

    Returns
    -------
    CategoricalIndex
    """
    if not sort:
        # Not sorting: keep ci's category order and tack the categories
        # it does not have yet onto the end.
        absent = ~c.categories.isin(ci.categories)
        return ci.add_categories(c.categories[absent])

    # Sorting: fall back to the category ordering of c.
    return ci.set_categories(c.categories)
0 commit comments