diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index db734bb2f0c07..4d5acf527a867 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -1,3 +1,5 @@ +from typing import Optional, Tuple + import numpy as np from pandas.core.algorithms import unique1d @@ -6,9 +8,12 @@ CategoricalDtype, recode_for_categories, ) +from pandas.core.indexes.api import CategoricalIndex -def recode_for_groupby(c: Categorical, sort: bool, observed: bool): +def recode_for_groupby( + c: Categorical, sort: bool, observed: bool +) -> Tuple[Categorical, Optional[Categorical]]: """ Code the categories to ensure we can groupby for categoricals. @@ -73,7 +78,9 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool): return c.reorder_categories(cat.categories), None -def recode_from_groupby(c: Categorical, sort: bool, ci): +def recode_from_groupby( + c: Categorical, sort: bool, ci: CategoricalIndex +) -> CategoricalIndex: """ Reverse the codes_to_groupby to account for sort / observed. @@ -91,7 +98,8 @@ def recode_from_groupby(c: Categorical, sort: bool, ci): """ # we re-order to the original category orderings if sort: - return ci.set_categories(c.categories) + return ci.set_categories(c.categories) # type: ignore [attr-defined] # we are not sorting, so add unobserved to the end - return ci.add_categories(c.categories[~c.categories.isin(ci.categories)]) + new_cats = c.categories[~c.categories.isin(ci.categories)] + return ci.add_categories(new_cats) # type: ignore [attr-defined] diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 3172fb4e0e853..e39464628ccaa 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -23,6 +23,7 @@ Type, TypeVar, Union, + cast, ) import warnings @@ -83,7 +84,7 @@ from pandas.plotting import boxplot_frame_groupby if TYPE_CHECKING: - from pandas.core.internals import Block + from pandas.core.internals import Block # noqa:F401 NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) @@ -1591,7 +1592,7 @@ def _gotitem(self, key, ndim: int, subset=None): Parameters ---------- key : string / list of selections - ndim : 1,2 + ndim : {1, 2} requested ndim of result subset : object, default None subset to act on @@ -1617,7 +1618,7 @@ def _gotitem(self, key, ndim: int, subset=None): raise AssertionError("invalid ndim for _gotitem") - def _wrap_frame_output(self, result, obj) -> DataFrame: + def _wrap_frame_output(self, result, obj: DataFrame) -> DataFrame: result_index = self.grouper.levels[0] if self.axis == 0: @@ -1634,20 +1635,14 @@ def _get_data_to_aggregate(self) -> BlockManager: else: return obj._mgr - def _insert_inaxis_grouper_inplace(self, result): + def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None: # zip in reverse so we can always insert at loc 0 - izip = zip( - *map( - reversed, - ( - self.grouper.names, - self.grouper.get_group_levels(), - [grp.in_axis for grp in self.grouper.groupings], - ), - ) - ) columns = result.columns - for name, lev, in_axis in izip: + for name, lev, in_axis in zip( + reversed(self.grouper.names), + reversed(self.grouper.get_group_levels()), + reversed([grp.in_axis for grp in self.grouper.groupings]), + ): # GH #28549 # When using .apply(-), name will be in columns already if in_axis and name not in columns: @@ -1712,7 +1707,7 @@ def _wrap_transformed_output( return result - def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame: + def _wrap_agged_blocks(self, blocks: Sequence["Block"], items: Index) -> DataFrame: if not self.as_index: index = np.arange(blocks[0].values.shape[-1]) mgr = BlockManager(blocks, axes=[items, index]) @@ -1739,7 +1734,7 @@ def _iterate_column_groupbys(self): exclusions=self.exclusions, ) - def _apply_to_column_groupbys(self, func): + def _apply_to_column_groupbys(self, func) -> DataFrame: from pandas.core.reshape.concat import concat return concat( @@ -1748,7 +1743,7 @@ def _apply_to_column_groupbys(self, func): axis=1, ) - def count(self): + def count(self) -> DataFrame: """ Compute count of group, excluding missing values. @@ -1778,7 +1773,7 @@ def count(self): return self._reindex_output(result, fill_value=0) - def nunique(self, dropna: bool = True): + def nunique(self, dropna: bool = True) -> DataFrame: """ Return DataFrame with counts of unique elements in each position. @@ -1844,6 +1839,7 @@ def nunique(self, dropna: bool = True): ], axis=1, ) + results = cast(DataFrame, results) if axis_number == 1: results = results.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a91366af61d0d..651af2d314251 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -459,7 +459,7 @@ def f(self): @contextmanager -def _group_selection_context(groupby): +def _group_selection_context(groupby: "_GroupBy"): """ Set / reset the _group_selection_context. """ @@ -489,7 +489,7 @@ def __init__( keys: Optional[_KeysArgType] = None, axis: int = 0, level=None, - grouper: "Optional[ops.BaseGrouper]" = None, + grouper: Optional["ops.BaseGrouper"] = None, exclusions=None, selection=None, as_index: bool = True, @@ -734,7 +734,7 @@ def pipe(self, func, *args, **kwargs): plot = property(GroupByPlot) - def _make_wrapper(self, name): + def _make_wrapper(self, name: str) -> Callable: assert name in self._apply_allowlist with _group_selection_context(self): diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 8239a792c65dd..18970ea0544e4 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -568,7 +568,9 @@ def codes(self) -> np.ndarray: @cache_readonly def result_index(self) -> Index: if self.all_grouper is not None: - return recode_from_groupby(self.all_grouper, self.sort, self.group_index) + group_idx = self.group_index + assert isinstance(group_idx, CategoricalIndex) # set in __init__ + return recode_from_groupby(self.all_grouper, self.sort, group_idx) return self.group_index @property @@ -607,7 +609,7 @@ def get_grouper( mutated: bool = False, validate: bool = True, dropna: bool = True, -) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": +) -> Tuple["ops.BaseGrouper", List[Hashable], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 290680f380f5f..4dd5b7f30e7f0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -82,7 +82,7 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings: "Sequence[grouper.Grouping]", + groupings: Sequence["grouper.Grouping"], sort: bool = True, group_keys: bool = True, mutated: bool = False,