Skip to content

REF: Move Categorical logic from Grouping (#46203) #46207

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 59 additions & 1 deletion pandas/core/groupby/categorical.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,72 @@
from __future__ import annotations

import dataclasses
from typing import final

import numpy as np

from pandas._typing import npt

from pandas.core.algorithms import unique1d
from pandas.core.arrays.categorical import (
Categorical,
CategoricalDtype,
recode_for_categories,
)
from pandas.core.indexes.api import CategoricalIndex
from pandas.core.indexes.api import (
CategoricalIndex,
Index,
)


@final
@dataclasses.dataclass
class CategoricalGrouper:
original_grouping_vector: Categorical | None
new_grouping_vector: Categorical
observed: bool
sort: bool

def result_index(self, group_index: Index) -> Index:
if self.original_grouping_vector is None:
return group_index
assert isinstance(group_index, CategoricalIndex)
return recode_from_groupby(
self.original_grouping_vector, self.sort, group_index
)

@classmethod
def make(cls, grouping_vector, sort: bool, observed: bool) -> CategoricalGrouper:
new_grouping_vector, original_grouping_vector = recode_for_groupby(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of saving new_grouping_vector as an instance variable, could just return it from this method, since it isn't ever needed by this class.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why 'make' instead of just __init__?

grouping_vector, sort, observed
)
return cls(
original_grouping_vector=original_grouping_vector,
new_grouping_vector=new_grouping_vector,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not totally happy with these names

observed=observed,
sort=sort,
)

def codes_and_uniques(
Copy link
Contributor Author

@NickCrews NickCrews Mar 3, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This whole function overlaps a ton with recode_for_groupby, maybe even doing the exact same thing. Maybe should get unified? In this refactor though I just moved code though, didn't want to change any content yet.

Also, perhaps recode_for_groupby and recode_from_groupby should just get absorbed into this class as methods?

self, cat: Categorical
) -> tuple[npt.NDArray[np.signedinteger], Categorical]:
"""
This does a version of `algorithms.factorize()` that works on categoricals.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps should just rename this (and the grouping.codes_and_uniques()) to factorize() to be more consistent, since really these are just wrappers for that.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems reasonable

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed this in #46258

"""
categories = cat.categories

if self.observed:
ucodes = unique1d(cat.codes)
ucodes = ucodes[ucodes != -1]
if self.sort or cat.ordered:
ucodes = np.sort(ucodes)
else:
ucodes = np.arange(len(categories))

uniques = Categorical.from_codes(
codes=ucodes, categories=categories, ordered=cat.ordered
)
return cat.codes, uniques


def recode_for_groupby(
Expand Down
65 changes: 21 additions & 44 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
TYPE_CHECKING,
Any,
Hashable,
cast,
final,
)
import warnings

import numpy as np

from pandas._typing import (
AnyArrayLike,
ArrayLike,
NDFrameT,
npt,
Expand All @@ -38,12 +40,8 @@
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import ops
from pandas.core.groupby.categorical import (
recode_for_groupby,
recode_from_groupby,
)
from pandas.core.groupby.categorical import CategoricalGrouper
from pandas.core.indexes.api import (
CategoricalIndex,
Index,
MultiIndex,
)
Expand Down Expand Up @@ -461,8 +459,7 @@ class Grouping:

_codes: npt.NDArray[np.signedinteger] | None = None
_group_index: Index | None = None
_passed_categorical: bool
_all_grouper: Categorical | None
_cat_grouper: CategoricalGrouper | None = None
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the big benefit of this PR: It condenses all of the other instance variables such as _all_grouper into this one container object, since those variables are irrelevant if we aren't dealing with categoricals.

_index: Index

def __init__(
Expand All @@ -479,16 +476,12 @@ def __init__(
self.level = level
self._orig_grouper = grouper
self.grouping_vector = _convert_grouper(index, grouper)
self._all_grouper = None
self._index = index
self._sort = sort
self.obj = obj
self._observed = observed
self.in_axis = in_axis
self._dropna = dropna

self._passed_categorical = False

# we have a single grouper which may be a myriad of things,
# some of which are dependent on the passing in level

Expand Down Expand Up @@ -527,13 +520,10 @@ def __init__(
self.grouping_vector = Index(ng, name=newgrouper.result_index.name)

elif is_categorical_dtype(self.grouping_vector):
# a passed Categorical
self._passed_categorical = True

self.grouping_vector, self._all_grouper = recode_for_groupby(
self._cat_grouper = CategoricalGrouper.make(
self.grouping_vector, sort, observed
)

self.grouping_vector = self._cat_grouper.new_grouping_vector
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See comment above, could change this to just self._cat_grouper, self.grouping_vector = CategoricalGrouper.make(...)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this the way you would do it? it seems much more natural to have self.groupbyin_vector itself returr be a class that can be handled, similar to BaseGrouper (in fact you should inherit from BaseGrouper)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the review @jreback. I'll admit this felt like a kludge. I tried to grok how all the groupby logic interacts, but I mostly failed. There's a BaseGrouper as well as Grouper, but Grouper doesn't inherit from BaseGrouper? What's the difference between a Grouping and a Grouper? A lot of outdated docstrings and lack of typing annotations (and just plain loose typing, eg grouping_vector is sometimes not at all like a vector?) didn't help either.

Even though my choice of name might make it appear otherwise, I wasn't really trying to fit it into any type hierarchy. It is just a container for a few bits of data with some help methods.

I would love to clean this so it makes more sense, but I've been pounding my head on the wall long enough that I think I will need to give up without a bit of guidance. If you're willing to give a summary of how all the classes are intended to interact, then I can try to fix this up. But, I understand you're very busy so if you don't have time to onboard someone new then I get it. Thank you!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a BaseGrouper as well as Grouper, but Grouper doesn't inherit from BaseGrouper? What's the difference between a Grouping and a Grouper?

You're right that the hierarchy/naming is confusing. If you can come up with better names (for the not-public-API items) please go for it.

elif not isinstance(
self.grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
):
Expand Down Expand Up @@ -631,20 +621,23 @@ def group_arraylike(self) -> ArrayLike:
# _group_index is set in __init__ for MultiIndex cases
return self._group_index._values

elif self._all_grouper is not None:
elif (
self._cat_grouper is not None
and self._cat_grouper.original_grouping_vector is not None
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure I like how we are reaching inside the _cat_grouper to ask it about original_grouping_vector. Maybe should make it more opaque like in result_index():

if self._cat_grouper is not None:
            return self._cat_grouper.group_arraylike(?????)

):
# retain dtype for categories, including unobserved ones
return self.result_index._values

return self._codes_and_uniques[1]
return cast(ArrayLike, self._codes_and_uniques[1])

@cache_readonly
def result_index(self) -> Index:
# result_index retains dtype for categories, including unobserved ones,
# which group_index does not
if self._all_grouper is not None:
group_idx = self.group_index
assert isinstance(group_idx, CategoricalIndex)
return recode_from_groupby(self._all_grouper, self._sort, group_idx)
"""
result_index retains dtype for categories, including unobserved ones,
which group_index does not
"""
if self._cat_grouper is not None:
return self._cat_grouper.result_index(self.group_index)
return self.group_index

@cache_readonly
Expand All @@ -657,26 +650,10 @@ def group_index(self) -> Index:
return Index._with_infer(uniques, name=self.name)

@cache_readonly
def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
if self._passed_categorical:
# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
cat = self.grouping_vector
categories = cat.categories

if self._observed:
ucodes = algorithms.unique1d(cat.codes)
ucodes = ucodes[ucodes != -1]
if self._sort or cat.ordered:
ucodes = np.sort(ucodes)
else:
ucodes = np.arange(len(categories))

uniques = Categorical.from_codes(
codes=ucodes, categories=categories, ordered=cat.ordered
)
return cat.codes, uniques

def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], AnyArrayLike]:
Copy link
Contributor Author

@NickCrews NickCrews Mar 3, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Notice the type annotation change here.

I'm not sure why mypy was happy before, since algorithms.factorize returns np.ndarray | Index for uniques, and an Index does not satisfy ArrayLike.

uniques is typed as merely ArrayLike, mypy gets mad with the result from algorithms.factorize:
error: Incompatible types in assignment (expression has type "Union[ndarray[Any, Any], Index]", variable has type "Union[ExtensionArray, ndarray[Any, Any]]") [assignment]

Not totally sure this is the best solution, alternatively keep the return type as ArrayLike and do a cast with the result from algorithms.factorize.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like the factorize docstring/annotation is inaccurate. it can return ArrayLike, and im pretty sure does in the case(s) here

uniques: AnyArrayLike
if self._cat_grouper is not None:
return self._cat_grouper.codes_and_uniques(self.grouping_vector)
elif isinstance(self.grouping_vector, ops.BaseGrouper):
# we have a list of groupers
codes = self.grouping_vector.codes_info
Expand Down