Skip to content

REF: do less in Grouping.__init__ #41375

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,11 +777,7 @@ def apply_series_value_counts():
# multi-index components
codes = self.grouper.reconstructed_codes
codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
# error: List item 0 has incompatible type "Union[ndarray, Any]";
# expected "Index"
levels = [ping.group_index for ping in self.grouper.groupings] + [
lev # type: ignore[list-item]
]
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
names = self.grouper.names + [self.obj.name]

if dropna:
Expand Down
87 changes: 54 additions & 33 deletions pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,9 @@ class Grouping:
* groups : dict of {group -> label_list}
"""

_codes: np.ndarray | None = None
_group_index: Index | None = None

def __init__(
self,
index: Index,
Expand All @@ -461,27 +464,25 @@ def __init__(
self.in_axis = in_axis
self.dropna = dropna

self._passed_categorical = False

# right place for this?
if isinstance(grouper, (Series, Index)) and name is None:
self.name = grouper.name

# we have a single grouper which may be a myriad of things,
# some of which are dependent on the passing in level

if level is not None:
if not isinstance(level, int):
if level not in index.names:
raise AssertionError(f"Level {level} not in index")
level = index.names.index(level)

ilevel = self._ilevel
if ilevel is not None:
if self.name is None:
self.name = index.names[level]
self.name = index.names[ilevel]

(
self.grouper,
self.grouper, # Index
self._codes,
self._group_index,
) = index._get_grouper_for_level(self.grouper, level)
) = index._get_grouper_for_level(self.grouper, ilevel)

# a passed Grouper like, directly get the grouper in the same way
# as single grouper groupby, use the group_info to get codes
Expand All @@ -502,32 +503,13 @@ def __init__(
self.grouper = grouper._get_grouper()

else:

# a passed Categorical
if is_categorical_dtype(self.grouper):
self._passed_categorical = True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't you just make this a property of the grouper itself?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think what you're suggesting is something like:

@property
def _passed_categorical(self):
    return is_categorical_dtype(self.grouper)

That is the longer-term goal, but at the moment we treat directly-passed Categoricals differently from Categoricals derived within __init__, so that property wouldn't be quite the same as this variable. I expect that the different treatment is unintentional, but I need to track that down to be sure.


self.grouper, self.all_grouper = recode_for_groupby(
self.grouper, self.sort, observed
)
categories = self.grouper.categories

# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
self._codes = self.grouper.codes
if observed:
codes = algorithms.unique1d(self.grouper.codes)
codes = codes[codes != -1]
if sort or self.grouper.ordered:
codes = np.sort(codes)
else:
codes = np.arange(len(categories))

self._group_index = CategoricalIndex(
Categorical.from_codes(
codes=codes, categories=categories, ordered=self.grouper.ordered
),
name=self.name,
)

# we are done
elif isinstance(self.grouper, Grouping):
Expand Down Expand Up @@ -564,8 +546,20 @@ def __repr__(self) -> str:
def __iter__(self):
return iter(self.indices)

_codes: np.ndarray | None = None
_group_index: Index | None = None
@cache_readonly
def _ilevel(self) -> int | None:
"""
If necessary, convert index level name to index level position.
"""
level = self.level
if level is None:
return None
if not isinstance(level, int):
index = self.index
if level not in index.names:
raise AssertionError(f"Level {level} not in index")
return index.names.index(level)
return level

@property
def ngroups(self) -> int:
Expand All @@ -582,6 +576,12 @@ def indices(self):

@property
def codes(self) -> np.ndarray:
if self._passed_categorical:
# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
cat = self.grouper
return cat.codes

if self._codes is None:
self._make_codes()
# error: Incompatible return value type (got "Optional[ndarray]",
Expand All @@ -592,12 +592,33 @@ def codes(self) -> np.ndarray:
def result_index(self) -> Index:
if self.all_grouper is not None:
group_idx = self.group_index
assert isinstance(group_idx, CategoricalIndex) # set in __init__
assert isinstance(group_idx, CategoricalIndex)
return recode_from_groupby(self.all_grouper, self.sort, group_idx)
return self.group_index

@property
@cache_readonly
def group_index(self) -> Index:
if self._passed_categorical:
# we make a CategoricalIndex out of the cat grouper
# preserving the categories / ordered attributes
cat = self.grouper
categories = cat.categories

if self.observed:
codes = algorithms.unique1d(cat.codes)
codes = codes[codes != -1]
if self.sort or cat.ordered:
codes = np.sort(codes)
else:
codes = np.arange(len(categories))

return CategoricalIndex(
Categorical.from_codes(
codes=codes, categories=categories, ordered=cat.ordered
),
name=self.name,
)

if self._group_index is None:
self._make_codes()
assert self._group_index is not None
Expand Down