CLN: standardize naming in core.groupby (pandas-dev#41247)

jbrockmendel · JulianWgs · commit 3c038e004794 · 2021-07-03T13:09:29.000+02:00
diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py
@@ -1,7 +1,4 @@
-from typing import (
-    Optional,
-    Tuple,
-)
+from __future__ import annotations
 
 import numpy as np
 
@@ -16,7 +13,7 @@
 
 def recode_for_groupby(
     c: Categorical, sort: bool, observed: bool
-) -> Tuple[Categorical, Optional[Categorical]]:
+) -> tuple[Categorical, Categorical | None]:
     """
     Code the categories to ensure we can groupby for categoricals.
 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -366,7 +366,7 @@ def array_func(values: ArrayLike) -> ArrayLike:
                 )
             except NotImplementedError:
                 ser = Series(values)  # equiv 'obj' from outer frame
-                if self.grouper.ngroups > 0:
+                if self.ngroups > 0:
                     res_values, _ = self.grouper.agg_series(ser, alt)
                 else:
                     # equiv: res_values = self._python_agg_general(alt)
@@ -604,12 +604,12 @@ def _transform_fast(self, result) -> Series:
         fast version of transform, only applicable to
         builtin/cythonizable functions
         """
-        ids, _, ngroup = self.grouper.group_info
+        ids, _, _ = self.grouper.group_info
         result = result.reindex(self.grouper.result_index, copy=False)
         out = algorithms.take_nd(result._values, ids)
         return self.obj._constructor(out, index=self.obj.index, name=self.obj.name)
 
-    def filter(self, func, dropna=True, *args, **kwargs):
+    def filter(self, func, dropna: bool = True, *args, **kwargs):
         """
         Return a copy of a Series excluding elements from groups that
         do not satisfy the boolean criterion specified by func.
@@ -1445,7 +1445,7 @@ def _transform_fast(self, result: DataFrame) -> DataFrame:
         obj = self._obj_with_exclusions
 
         # for each col, reshape to size of original frame by take operation
-        ids, _, ngroup = self.grouper.group_info
+        ids, _, _ = self.grouper.group_info
         result = result.reindex(self.grouper.result_index, copy=False)
         output = [
             algorithms.take_nd(result.iloc[:, i].values, ids)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1108,13 +1108,13 @@ def _numba_prep(self, func, data):
             raise NotImplementedError(
                 "Numba engine can only be used with a single function."
             )
-        labels, _, n_groups = self.grouper.group_info
-        sorted_index = get_group_index_sorter(labels, n_groups)
-        sorted_labels = algorithms.take_nd(labels, sorted_index, allow_fill=False)
+        ids, _, ngroups = self.grouper.group_info
+        sorted_index = get_group_index_sorter(ids, ngroups)
+        sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False)
 
         sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
 
-        starts, ends = lib.generate_slices(sorted_labels, n_groups)
+        starts, ends = lib.generate_slices(sorted_ids, ngroups)
         return starts, ends, sorted_index, sorted_data
 
     @final
@@ -1258,11 +1258,12 @@ def _python_agg_general(self, func, *args, **kwargs):
         # iterate through "columns" ex exclusions to populate output dict
         output: dict[base.OutputKey, ArrayLike] = {}
 
+        if self.ngroups == 0:
+            # agg_series below assumes ngroups > 0
+            return self._python_apply_general(f, self._selected_obj)
+
         for idx, obj in enumerate(self._iterate_slices()):
             name = obj.name
-            if self.grouper.ngroups == 0:
-                # agg_series below assumes ngroups > 0
-                continue
 
             try:
                 # if this function is invalid for this dtype, we will ignore it.
@@ -1373,7 +1374,7 @@ def _apply_filter(self, indices, dropna):
         return filtered
 
     @final
-    def _cumcount_array(self, ascending: bool = True):
+    def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
         """
         Parameters
         ----------
@@ -2726,7 +2727,7 @@ def _get_cythonized_result(
 
         grouper = self.grouper
 
-        labels, _, ngroups = grouper.group_info
+        ids, _, ngroups = grouper.group_info
         output: dict[base.OutputKey, np.ndarray] = {}
         base_func = getattr(libgroupby, how)
 
@@ -2759,15 +2760,15 @@ def _get_cythonized_result(
                 if pre_processing:
                     try:
                         vals, inferences = pre_processing(vals)
-                    except TypeError as e:
-                        error_msg = str(e)
+                    except TypeError as err:
+                        error_msg = str(err)
                         continue
                 vals = vals.astype(cython_dtype, copy=False)
                 if needs_2d:
                     vals = vals.reshape((-1, 1))
                 func = partial(func, vals)
 
-            func = partial(func, labels)
+            func = partial(func, ids)
 
             if min_count is not None:
                 func = partial(func, min_count)
diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py
@@ -1,11 +1,10 @@
 """Common utilities for Numba operations with groupby ops"""
+from __future__ import annotations
+
 import inspect
 from typing import (
     Any,
     Callable,
-    Dict,
-    Optional,
-    Tuple,
 )
 
 import numpy as np
@@ -57,10 +56,10 @@ def f(values, index, ...):
 
 
 def generate_numba_agg_func(
-    args: Tuple,
-    kwargs: Dict[str, Any],
+    args: tuple,
+    kwargs: dict[str, Any],
     func: Callable[..., Scalar],
-    engine_kwargs: Optional[Dict[str, bool]],
+    engine_kwargs: dict[str, bool] | None,
 ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]:
     """
     Generate a numba jitted agg function specified by values from engine_kwargs.
@@ -117,10 +116,10 @@ def group_agg(
 
 
 def generate_numba_transform_func(
-    args: Tuple,
-    kwargs: Dict[str, Any],
+    args: tuple,
+    kwargs: dict[str, Any],
     func: Callable[..., np.ndarray],
-    engine_kwargs: Optional[Dict[str, bool]],
+    engine_kwargs: dict[str, bool] | None,
 ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]:
     """
     Generate a numba jitted transform function specified by values from engine_kwargs.
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -723,8 +723,8 @@ def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> DataSplitter:
 
         __finalize__ has not been called for the subsetted objects returned.
         """
-        comp_ids, _, ngroups = self.group_info
-        return get_splitter(data, comp_ids, ngroups, axis=axis)
+        ids, _, ngroups = self.group_info
+        return get_splitter(data, ids, ngroups, axis=axis)
 
     def _get_grouper(self):
         """
@@ -740,10 +740,10 @@ def _get_group_keys(self):
         if len(self.groupings) == 1:
             return self.levels[0]
         else:
-            comp_ids, _, ngroups = self.group_info
+            ids, _, ngroups = self.group_info
 
             # provide "flattened" iterator for multi-group setting
-            return get_flattened_list(comp_ids, ngroups, self.levels, self.codes)
+            return get_flattened_list(ids, ngroups, self.levels, self.codes)
 
     @final
     def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
@@ -846,9 +846,9 @@ def size(self) -> Series:
         """
         Compute group sizes.
         """
-        ids, _, ngroup = self.group_info
-        if ngroup:
-            out = np.bincount(ids[ids != -1], minlength=ngroup)
+        ids, _, ngroups = self.group_info
+        if ngroups:
+            out = np.bincount(ids[ids != -1], minlength=ngroups)
         else:
             out = []
         return Series(out, index=self.result_index, dtype="int64")
@@ -882,11 +882,11 @@ def group_info(self):
     @cache_readonly
     def codes_info(self) -> np.ndarray:
         # return the codes of items in original grouped axis
-        codes, _, _ = self.group_info
+        ids, _, _ = self.group_info
         if self.indexer is not None:
-            sorter = np.lexsort((codes, self.indexer))
-            codes = codes[sorter]
-        return codes
+            sorter = np.lexsort((ids, self.indexer))
+            ids = ids[sorter]
+        return ids
 
     @final
     def _get_compressed_codes(self) -> tuple[np.ndarray, np.ndarray]:
@@ -906,8 +906,8 @@ def ngroups(self) -> int:
     @property
     def reconstructed_codes(self) -> list[np.ndarray]:
         codes = self.codes
-        comp_ids, obs_ids, _ = self.group_info
-        return decons_obs_group_ids(comp_ids, obs_ids, self.shape, codes, xnull=True)
+        ids, obs_ids, _ = self.group_info
+        return decons_obs_group_ids(ids, obs_ids, self.shape, codes, xnull=True)
 
     @cache_readonly
     def result_index(self) -> Index:
@@ -954,13 +954,13 @@ def _cython_operation(
 
         cy_op = WrappedCythonOp(kind=kind, how=how)
 
-        comp_ids, _, _ = self.group_info
+        ids, _, _ = self.group_info
         ngroups = self.ngroups
         return cy_op.cython_operation(
             values=values,
             axis=axis,
             min_count=min_count,
-            comp_ids=comp_ids,
+            comp_ids=ids,
             ngroups=ngroups,
             **kwargs,
         )
@@ -997,26 +997,26 @@ def _aggregate_series_fast(
         #  - ngroups != 0
         func = com.is_builtin_func(func)
 
-        group_index, _, ngroups = self.group_info
+        ids, _, ngroups = self.group_info
 
         # avoids object / Series creation overhead
-        indexer = get_group_index_sorter(group_index, ngroups)
+        indexer = get_group_index_sorter(ids, ngroups)
         obj = obj.take(indexer)
-        group_index = group_index.take(indexer)
-        sgrouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups)
+        ids = ids.take(indexer)
+        sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups)
         result, counts = sgrouper.get_result()
         return result, counts
 
     @final
     def _aggregate_series_pure_python(self, obj: Series, func: F):
-        group_index, _, ngroups = self.group_info
+        ids, _, ngroups = self.group_info
 
         counts = np.zeros(ngroups, dtype=int)
         result = np.empty(ngroups, dtype="O")
         initialized = False
 
         # equiv: splitter = self._get_splitter(obj, axis=0)
-        splitter = get_splitter(obj, group_index, ngroups, axis=0)
+        splitter = get_splitter(obj, ids, ngroups, axis=0)
 
         for i, group in enumerate(splitter):
 
@@ -1152,7 +1152,7 @@ def indices(self):
     @cache_readonly
     def group_info(self):
         ngroups = self.ngroups
-        obs_group_ids = np.arange(ngroups)
+        obs_group_ids = np.arange(ngroups, dtype=np.int64)
         rep = np.diff(np.r_[0, self.bins])
 
         rep = ensure_platform_int(rep)
@@ -1163,7 +1163,7 @@ def group_info(self):
 
         return (
             ensure_platform_int(comp_ids),
-            obs_group_ids.astype("int64", copy=False),
+            obs_group_ids,
             ngroups,
         )