-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
CLN: type up core.groupby.grouper.get_grouper #29458
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ | |
split-apply-combine paradigm. | ||
""" | ||
|
||
from typing import Optional, Tuple | ||
from typing import Hashable, List, Optional, Tuple | ||
import warnings | ||
|
||
import numpy as np | ||
|
@@ -26,7 +26,6 @@ | |
from pandas.core.arrays import Categorical, ExtensionArray | ||
import pandas.core.common as com | ||
from pandas.core.frame import DataFrame | ||
from pandas.core.generic import NDFrame | ||
from pandas.core.groupby.categorical import recode_for_groupby, recode_from_groupby | ||
from pandas.core.groupby.ops import BaseGrouper | ||
from pandas.core.index import CategoricalIndex, Index, MultiIndex | ||
|
@@ -134,7 +133,7 @@ def _get_grouper(self, obj, validate=True): | |
""" | ||
|
||
self._set_grouper(obj) | ||
self.grouper, exclusions, self.obj = _get_grouper( | ||
self.grouper, exclusions, self.obj = get_grouper( | ||
self.obj, | ||
[self.key], | ||
axis=self.axis, | ||
|
@@ -429,18 +428,18 @@ def groups(self) -> dict: | |
return self.index.groupby(Categorical.from_codes(self.codes, self.group_index)) | ||
|
||
|
||
def _get_grouper( | ||
obj: NDFrame, | ||
def get_grouper( | ||
obj: FrameOrSeries, | ||
key=None, | ||
axis: int = 0, | ||
level=None, | ||
sort=True, | ||
observed=False, | ||
mutated=False, | ||
validate=True, | ||
): | ||
) -> Tuple[BaseGrouper, List[Hashable], FrameOrSeries]: | ||
""" | ||
create and return a BaseGrouper, which is an internal | ||
Create and return a BaseGrouper, which is an internal | ||
mapping of how to create the grouper indexers. | ||
This may be composed of multiple Grouping objects, indicating | ||
multiple groupers | ||
|
@@ -456,9 +455,9 @@ def _get_grouper( | |
a BaseGrouper. | ||
|
||
If observed & we have a categorical grouper, only show the observed | ||
values | ||
values. | ||
|
||
If validate, then check for key/level overlaps | ||
If validate, then check for key/level overlaps. | ||
|
||
""" | ||
group_axis = obj._get_axis(axis) | ||
|
@@ -517,7 +516,7 @@ def _get_grouper( | |
if key.key is None: | ||
return grouper, [], obj | ||
else: | ||
return grouper, {key.key}, obj | ||
return grouper, [key.key], obj | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So is there no impact changing this from a set to a list? If not, does it even need to be a container? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This just makes the types consistent for all returns. A set or a list is not important when there's only 1 item in the container. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW It sometimes has lists with several items, so needs to be a list/container. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm not sure I follow. Maybe better to ask what the error was that mypy was giving? I assume this gets mutated outside of the function right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, it wasn't a mypy error, I think the function should have a clear return type, so (More general): I think function may have broad parameter types, but should strive to have clear return types, so there's less ambiguity over what you're looking at. Don't you agree? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. Yea I'm not necessarily saying the old container type was correct and this is wrong, I guess was just questioning why this even needs to be held in a container if it's simply one value being returned. Can be refactored or reviewed in a follow up if that's a big effort; not a blocker as is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I'm haven't checked if a single value could be returned, instead of the list. My motivation for doing this is mostly wanting to pin down what I'm looking at in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds good. Yea this part of code is tricky and could use some refactor, so this is all moving in the right direction |
||
|
||
# already have a BaseGrouper, just return it | ||
elif isinstance(key, BaseGrouper): | ||
|
@@ -530,10 +529,8 @@ def _get_grouper( | |
# unhashable elements of `key`. Any unhashable elements implies that | ||
# they wanted a list of keys. | ||
# https://github.com/pandas-dev/pandas/issues/18314 | ||
is_tuple = isinstance(key, tuple) | ||
all_hashable = is_tuple and is_hashable(key) | ||
|
||
if is_tuple: | ||
if isinstance(key, tuple): | ||
all_hashable = is_hashable(key) | ||
if ( | ||
all_hashable and key not in obj and set(key).issubset(obj) | ||
) or not all_hashable: | ||
|
@@ -573,7 +570,8 @@ def _get_grouper( | |
all_in_columns_index = all( | ||
g in obj.columns or g in obj.index.names for g in keys | ||
) | ||
elif isinstance(obj, Series): | ||
else: | ||
assert isinstance(obj, Series) | ||
all_in_columns_index = all(g in obj.index.names for g in keys) | ||
|
||
if not all_in_columns_index: | ||
|
@@ -586,8 +584,8 @@ def _get_grouper( | |
else: | ||
levels = [level] * len(keys) | ||
|
||
groupings = [] | ||
exclusions = [] | ||
groupings = [] # type: List[Grouping] | ||
exclusions = [] # type: List[Hashable] | ||
|
||
# if the actual grouper should be obj[key] | ||
def is_in_axis(key) -> bool: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
there are some places in these modules where
name
attrs are annotated asstr
orOptional[str]
where Hashable would be more accurate I think