-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
REF: Move Categorical logic from Grouping (#46203) #46207
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,72 @@ | ||
from __future__ import annotations | ||
|
||
import dataclasses | ||
from typing import final | ||
|
||
import numpy as np | ||
|
||
from pandas._typing import npt | ||
|
||
from pandas.core.algorithms import unique1d | ||
from pandas.core.arrays.categorical import ( | ||
Categorical, | ||
CategoricalDtype, | ||
recode_for_categories, | ||
) | ||
from pandas.core.indexes.api import CategoricalIndex | ||
from pandas.core.indexes.api import ( | ||
CategoricalIndex, | ||
Index, | ||
) | ||
|
||
|
||
@final | ||
@dataclasses.dataclass | ||
class CategoricalGrouper: | ||
original_grouping_vector: Categorical | None | ||
new_grouping_vector: Categorical | ||
observed: bool | ||
sort: bool | ||
|
||
def result_index(self, group_index: Index) -> Index: | ||
if self.original_grouping_vector is None: | ||
return group_index | ||
assert isinstance(group_index, CategoricalIndex) | ||
return recode_from_groupby( | ||
self.original_grouping_vector, self.sort, group_index | ||
) | ||
|
||
@classmethod | ||
def make(cls, grouping_vector, sort: bool, observed: bool) -> CategoricalGrouper: | ||
new_grouping_vector, original_grouping_vector = recode_for_groupby( | ||
grouping_vector, sort, observed | ||
) | ||
return cls( | ||
original_grouping_vector=original_grouping_vector, | ||
new_grouping_vector=new_grouping_vector, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not totally happy with these names |
||
observed=observed, | ||
sort=sort, | ||
) | ||
|
||
def codes_and_uniques( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This whole function overlaps a ton with Also, perhaps |
||
self, cat: Categorical | ||
) -> tuple[npt.NDArray[np.signedinteger], Categorical]: | ||
""" | ||
This does a version of `algorithms.factorize()` that works on categoricals. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps should just rename this (and the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. seems reasonable There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Addressed this in #46258 |
||
""" | ||
categories = cat.categories | ||
|
||
if self.observed: | ||
ucodes = unique1d(cat.codes) | ||
ucodes = ucodes[ucodes != -1] | ||
if self.sort or cat.ordered: | ||
ucodes = np.sort(ucodes) | ||
else: | ||
ucodes = np.arange(len(categories)) | ||
|
||
uniques = Categorical.from_codes( | ||
codes=ucodes, categories=categories, ordered=cat.ordered | ||
) | ||
return cat.codes, uniques | ||
|
||
|
||
def recode_for_groupby( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,13 +8,15 @@ | |
TYPE_CHECKING, | ||
Any, | ||
Hashable, | ||
cast, | ||
final, | ||
) | ||
import warnings | ||
|
||
import numpy as np | ||
|
||
from pandas._typing import ( | ||
AnyArrayLike, | ||
ArrayLike, | ||
NDFrameT, | ||
npt, | ||
|
@@ -38,12 +40,8 @@ | |
import pandas.core.common as com | ||
from pandas.core.frame import DataFrame | ||
from pandas.core.groupby import ops | ||
from pandas.core.groupby.categorical import ( | ||
recode_for_groupby, | ||
recode_from_groupby, | ||
) | ||
from pandas.core.groupby.categorical import CategoricalGrouper | ||
from pandas.core.indexes.api import ( | ||
CategoricalIndex, | ||
Index, | ||
MultiIndex, | ||
) | ||
|
@@ -461,8 +459,7 @@ class Grouping: | |
|
||
_codes: npt.NDArray[np.signedinteger] | None = None | ||
_group_index: Index | None = None | ||
_passed_categorical: bool | ||
_all_grouper: Categorical | None | ||
_cat_grouper: CategoricalGrouper | None = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the big benefit of this PR: It condenses all of the other instance variables such as _all_grouper into this one container object, since those variables are irrelevant if we aren't dealing with categoricals. |
||
_index: Index | ||
|
||
def __init__( | ||
|
@@ -479,16 +476,12 @@ def __init__( | |
self.level = level | ||
self._orig_grouper = grouper | ||
self.grouping_vector = _convert_grouper(index, grouper) | ||
self._all_grouper = None | ||
self._index = index | ||
self._sort = sort | ||
self.obj = obj | ||
self._observed = observed | ||
self.in_axis = in_axis | ||
self._dropna = dropna | ||
|
||
self._passed_categorical = False | ||
|
||
# we have a single grouper which may be a myriad of things, | ||
# some of which are dependent on the passing in level | ||
|
||
|
@@ -527,13 +520,10 @@ def __init__( | |
self.grouping_vector = Index(ng, name=newgrouper.result_index.name) | ||
|
||
elif is_categorical_dtype(self.grouping_vector): | ||
# a passed Categorical | ||
self._passed_categorical = True | ||
|
||
self.grouping_vector, self._all_grouper = recode_for_groupby( | ||
self._cat_grouper = CategoricalGrouper.make( | ||
self.grouping_vector, sort, observed | ||
) | ||
|
||
self.grouping_vector = self._cat_grouper.new_grouping_vector | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See comment above, could change this to just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this the way you would do it? it seems much more natural to have There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for the review @jreback. I'll admit this felt like a kludge. I tried to grok how all the groupby logic interacts, but I mostly failed. There's a BaseGrouper as well as Grouper, but Grouper doesn't inherit from BaseGrouper? What's the difference between a Grouping and a Grouper? A lot of outdated docstrings and lack of typing annotations (and just plain loose typing, eg grouping_vector is sometimes not at all like a vector?) didn't help either. Even though my choice of name might make it appear otherwise, I wasn't really trying to fit it into any type hierarchy. It is just a container for a few bits of data with some help methods. I would love to clean this so it makes more sense, but I've been pounding my head on the wall long enough that I think I will need to give up without a bit of guidance. If you're willing to give a summary of how all the classes are intended to interact, then I can try to fix this up. But, I understand you're very busy so if you don't have time to onboard someone new then I get it. Thank you! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
You're right that the hierarchy/naming is confusing. If you can come up with better names (for the not-public-API items) please go for it. |
||
elif not isinstance( | ||
self.grouping_vector, (Series, Index, ExtensionArray, np.ndarray) | ||
): | ||
|
@@ -631,20 +621,23 @@ def group_arraylike(self) -> ArrayLike: | |
# _group_index is set in __init__ for MultiIndex cases | ||
return self._group_index._values | ||
|
||
elif self._all_grouper is not None: | ||
elif ( | ||
self._cat_grouper is not None | ||
and self._cat_grouper.original_grouping_vector is not None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not sure I like how we are reaching inside the _cat_grouper to ask it about original_grouping_vector. Maybe should make it more opaque like in
|
||
): | ||
# retain dtype for categories, including unobserved ones | ||
return self.result_index._values | ||
|
||
return self._codes_and_uniques[1] | ||
return cast(ArrayLike, self._codes_and_uniques[1]) | ||
|
||
@cache_readonly | ||
def result_index(self) -> Index: | ||
# result_index retains dtype for categories, including unobserved ones, | ||
# which group_index does not | ||
if self._all_grouper is not None: | ||
group_idx = self.group_index | ||
assert isinstance(group_idx, CategoricalIndex) | ||
return recode_from_groupby(self._all_grouper, self._sort, group_idx) | ||
""" | ||
result_index retains dtype for categories, including unobserved ones, | ||
which group_index does not | ||
""" | ||
if self._cat_grouper is not None: | ||
return self._cat_grouper.result_index(self.group_index) | ||
return self.group_index | ||
|
||
@cache_readonly | ||
|
@@ -657,26 +650,10 @@ def group_index(self) -> Index: | |
return Index._with_infer(uniques, name=self.name) | ||
|
||
@cache_readonly | ||
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]: | ||
if self._passed_categorical: | ||
# we make a CategoricalIndex out of the cat grouper | ||
# preserving the categories / ordered attributes | ||
cat = self.grouping_vector | ||
categories = cat.categories | ||
|
||
if self._observed: | ||
ucodes = algorithms.unique1d(cat.codes) | ||
ucodes = ucodes[ucodes != -1] | ||
if self._sort or cat.ordered: | ||
ucodes = np.sort(ucodes) | ||
else: | ||
ucodes = np.arange(len(categories)) | ||
|
||
uniques = Categorical.from_codes( | ||
codes=ucodes, categories=categories, ordered=cat.ordered | ||
) | ||
return cat.codes, uniques | ||
|
||
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], AnyArrayLike]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Notice the type annotation change here. I'm not sure why mypy was happy before, since algorithms.factorize returns uniques is typed as merely ArrayLike, mypy gets mad with the result from algorithms.factorize: Not totally sure this is the best solution, alternatively keep the return type as ArrayLike and do a cast with the result from algorithms.factorize. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like the factorize docstring/annotation is inaccurate. it can return ArrayLike, and im pretty sure does in the case(s) here |
||
uniques: AnyArrayLike | ||
if self._cat_grouper is not None: | ||
return self._cat_grouper.codes_and_uniques(self.grouping_vector) | ||
elif isinstance(self.grouping_vector, ops.BaseGrouper): | ||
# we have a list of groupers | ||
codes = self.grouping_vector.codes_info | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
instead of saving new_grouping_vector as an instance variable, could just return it from this method, since it isn't ever needed by this class.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why 'make' instead of just
__init__
?