diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 877204f5bb2a2..62f75f52bb6ad 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -441,6 +441,9 @@ class Grouping: _codes: np.ndarray | None = None _group_index: Index | None = None + _passed_categorical: bool + _all_grouper: Categorical | None + _index: Index def __init__( self, @@ -456,13 +459,13 @@ def __init__( self.level = level self._orig_grouper = grouper self.grouping_vector = _convert_grouper(index, grouper) - self.all_grouper = None - self.index = index - self.sort = sort + self._all_grouper = None + self._index = index + self._sort = sort self.obj = obj - self.observed = observed + self._observed = observed self.in_axis = in_axis - self.dropna = dropna + self._dropna = dropna self._passed_categorical = False @@ -471,11 +474,15 @@ def __init__( ilevel = self._ilevel if ilevel is not None: + mapper = self.grouping_vector + # In extant tests, the new self.grouping_vector matches + # `index.get_level_values(ilevel)` whenever + # mapper is None and isinstance(index, MultiIndex) ( self.grouping_vector, # Index self._codes, self._group_index, - ) = index._get_grouper_for_level(self.grouping_vector, ilevel) + ) = index._get_grouper_for_level(mapper, ilevel) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes @@ -505,8 +512,8 @@ def __init__( # a passed Categorical self._passed_categorical = True - self.grouping_vector, self.all_grouper = recode_for_groupby( - self.grouping_vector, self.sort, observed + self.grouping_vector, self._all_grouper = recode_for_groupby( + self.grouping_vector, sort, observed ) elif not isinstance( @@ -517,11 +524,11 @@ def __init__( t = self.name or str(type(self.grouping_vector)) raise ValueError(f"Grouper for '{t}' not 1-dimensional") - self.grouping_vector = self.index.map(self.grouping_vector) + self.grouping_vector = index.map(self.grouping_vector) if not ( hasattr(self.grouping_vector, "__len__") - and len(self.grouping_vector) == len(self.index) + and len(self.grouping_vector) == len(index) ): grper = pprint_thing(self.grouping_vector) errmsg = ( @@ -546,7 +553,7 @@ def __iter__(self): def name(self) -> Hashable: ilevel = self._ilevel if ilevel is not None: - return self.index.names[ilevel] + return self._index.names[ilevel] if isinstance(self._orig_grouper, (Index, Series)): return self._orig_grouper.name @@ -569,7 +576,7 @@ def _ilevel(self) -> int | None: if level is None: return None if not isinstance(level, int): - index = self.index + index = self._index if level not in index.names: raise AssertionError(f"Level {level} not in index") return index.names.index(level) @@ -607,10 +614,10 @@ def group_arraylike(self) -> ArrayLike: @cache_readonly def result_index(self) -> Index: # TODO: what's the difference between result_index vs group_index? - if self.all_grouper is not None: + if self._all_grouper is not None: group_idx = self.group_index assert isinstance(group_idx, CategoricalIndex) - return recode_from_groupby(self.all_grouper, self.sort, group_idx) + return recode_from_groupby(self._all_grouper, self._sort, group_idx) return self.group_index @cache_readonly @@ -629,10 +636,10 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: cat = self.grouping_vector categories = cat.categories - if self.observed: + if self._observed: ucodes = algorithms.unique1d(cat.codes) ucodes = ucodes[ucodes != -1] - if self.sort or cat.ordered: + if self._sort or cat.ordered: ucodes = np.sort(ucodes) else: ucodes = np.arange(len(categories)) @@ -648,18 +655,18 @@ def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]: uniques = self.grouping_vector.result_arraylike else: # GH35667, replace dropna=False with na_sentinel=None - if not self.dropna: + if not self._dropna: na_sentinel = None else: na_sentinel = -1 codes, uniques = algorithms.factorize( - self.grouping_vector, sort=self.sort, na_sentinel=na_sentinel + self.grouping_vector, sort=self._sort, na_sentinel=na_sentinel ) return codes, uniques @cache_readonly def groups(self) -> dict[Hashable, np.ndarray]: - return self.index.groupby(Categorical.from_codes(self.codes, self.group_index)) + return self._index.groupby(Categorical.from_codes(self.codes, self.group_index)) def get_grouper( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b995a74c20a40..2ff97bf535e0c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -678,7 +678,7 @@ def __init__( self.axis = axis self._groupings: list[grouper.Grouping] = list(groupings) - self.sort = sort + self._sort = sort self.group_keys = group_keys self.mutated = mutated self.indexer = indexer @@ -891,7 +891,7 @@ def codes_info(self) -> np.ndarray: def _get_compressed_codes(self) -> tuple[np.ndarray, np.ndarray]: if len(self.groupings) > 1: group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True) - return compress_group_index(group_index, sort=self.sort) + return compress_group_index(group_index, sort=self._sort) ping = self.groupings[0] return ping.codes, np.arange(len(ping.group_index))