CLN: type annotations in groupby.grouper, groupby.ops (#29456)

jbrockmendel · WillAyd · commit 4b3027f79a2d · 2019-11-12T16:39:18.000-08:00
* Annotate groupby.ops

* annotations, needs debugging

* whitespace

* types

* circular import

* fix most mypy complaints

* fix mypy groupings

* merge cleanup
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -119,7 +119,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False):
     def ax(self):
         return self.grouper
 
-    def _get_grouper(self, obj, validate=True):
+    def _get_grouper(self, obj, validate: bool = True):
         """
         Parameters
         ----------
@@ -143,17 +143,18 @@ def _get_grouper(self, obj, validate=True):
         )
         return self.binner, self.grouper, self.obj
 
-    def _set_grouper(self, obj, sort=False):
+    def _set_grouper(self, obj: FrameOrSeries, sort: bool = False):
         """
         given an object and the specifications, setup the internal grouper
         for this particular specification
 
         Parameters
         ----------
-        obj : the subject object
+        obj : Series or DataFrame
         sort : bool, default False
             whether the resulting grouper should be sorted
         """
+        assert obj is not None
 
         if self.key is not None and self.level is not None:
             raise ValueError("The Grouper cannot specify both a key and a level!")
@@ -211,13 +212,13 @@ def groups(self):
 
     def __repr__(self) -> str:
         attrs_list = (
-            "{}={!r}".format(attr_name, getattr(self, attr_name))
+            "{name}={val!r}".format(name=attr_name, val=getattr(self, attr_name))
             for attr_name in self._attributes
             if getattr(self, attr_name) is not None
         )
         attrs = ", ".join(attrs_list)
         cls_name = self.__class__.__name__
-        return "{}({})".format(cls_name, attrs)
+        return "{cls}({attrs})".format(cls=cls_name, attrs=attrs)
 
 
 class Grouping:
@@ -372,7 +373,7 @@ def __init__(
                 self.grouper = self.grouper.astype("timedelta64[ns]")
 
     def __repr__(self) -> str:
-        return "Grouping({0})".format(self.name)
+        return "Grouping({name})".format(name=self.name)
 
     def __iter__(self):
         return iter(self.indices)
@@ -433,10 +434,10 @@ def get_grouper(
     key=None,
     axis: int = 0,
     level=None,
-    sort=True,
-    observed=False,
-    mutated=False,
-    validate=True,
+    sort: bool = True,
+    observed: bool = False,
+    mutated: bool = False,
+    validate: bool = True,
 ) -> Tuple[BaseGrouper, List[Hashable], FrameOrSeries]:
     """
     Create and return a BaseGrouper, which is an internal
@@ -670,7 +671,7 @@ def is_in_obj(gpr) -> bool:
     return grouper, exclusions, obj
 
 
-def _is_label_like(val):
+def _is_label_like(val) -> bool:
     return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))
 
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -36,6 +36,7 @@
 )
 from pandas.core.dtypes.missing import _maybe_fill, isna
 
+from pandas._typing import FrameOrSeries
 import pandas.core.algorithms as algorithms
 from pandas.core.base import SelectionMixin
 import pandas.core.common as com
@@ -89,12 +90,16 @@ def __init__(
 
         self._filter_empty_groups = self.compressed = len(groupings) != 1
         self.axis = axis
-        self.groupings = groupings  # type: Sequence[grouper.Grouping]
+        self._groupings = list(groupings)  # type: List[grouper.Grouping]
         self.sort = sort
         self.group_keys = group_keys
         self.mutated = mutated
         self.indexer = indexer
 
+    @property
+    def groupings(self) -> List["grouper.Grouping"]:
+        return self._groupings
+
     @property
     def shape(self):
         return tuple(ping.ngroups for ping in self.groupings)
@@ -106,7 +111,7 @@ def __iter__(self):
     def nkeys(self) -> int:
         return len(self.groupings)
 
-    def get_iterator(self, data, axis=0):
+    def get_iterator(self, data: FrameOrSeries, axis: int = 0):
         """
         Groupby iterator
 
@@ -120,7 +125,7 @@ def get_iterator(self, data, axis=0):
         for key, (i, group) in zip(keys, splitter):
             yield key, group
 
-    def _get_splitter(self, data, axis=0):
+    def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter":
         comp_ids, _, ngroups = self.group_info
         return get_splitter(data, comp_ids, ngroups, axis=axis)
 
@@ -142,13 +147,13 @@ def _get_group_keys(self):
             # provide "flattened" iterator for multi-group setting
             return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes)
 
-    def apply(self, f, data, axis: int = 0):
+    def apply(self, f, data: FrameOrSeries, axis: int = 0):
         mutated = self.mutated
         splitter = self._get_splitter(data, axis=axis)
         group_keys = self._get_group_keys()
         result_values = None
 
-        sdata = splitter._get_sorted_data()
+        sdata = splitter._get_sorted_data()  # type: FrameOrSeries
         if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)):
             # calling splitter.fast_apply will raise TypeError via apply_frame_axis0
             #  if we pass EA instead of ndarray
@@ -157,7 +162,7 @@ def apply(self, f, data, axis: int = 0):
 
         elif (
             com.get_callable_name(f) not in base.plotting_methods
-            and hasattr(splitter, "fast_apply")
+            and isinstance(splitter, FrameSplitter)
             and axis == 0
             # with MultiIndex, apply_frame_axis0 would raise InvalidApply
             # TODO: can we make this check prettier?
@@ -229,8 +234,7 @@ def names(self):
 
     def size(self) -> Series:
         """
-        Compute group sizes
-
+        Compute group sizes.
         """
         ids, _, ngroup = self.group_info
         ids = ensure_platform_int(ids)
@@ -292,7 +296,7 @@ def reconstructed_codes(self) -> List[np.ndarray]:
         return decons_obs_group_ids(comp_ids, obs_ids, self.shape, codes, xnull=True)
 
     @cache_readonly
-    def result_index(self):
+    def result_index(self) -> Index:
         if not self.compressed and len(self.groupings) == 1:
             return self.groupings[0].result_index.rename(self.names[0])
 
@@ -629,7 +633,7 @@ def agg_series(self, obj: Series, func):
                 raise
         return self._aggregate_series_pure_python(obj, func)
 
-    def _aggregate_series_fast(self, obj, func):
+    def _aggregate_series_fast(self, obj: Series, func):
         # At this point we have already checked that
         #  - obj.index is not a MultiIndex
         #  - obj is backed by an ndarray, not ExtensionArray
@@ -648,7 +652,7 @@ def _aggregate_series_fast(self, obj, func):
         result, counts = grouper.get_result()
         return result, counts
 
-    def _aggregate_series_pure_python(self, obj, func):
+    def _aggregate_series_pure_python(self, obj: Series, func):
 
         group_index, _, ngroups = self.group_info
 
@@ -705,7 +709,12 @@ class BinGrouper(BaseGrouper):
     """
 
     def __init__(
-        self, bins, binlabels, filter_empty=False, mutated=False, indexer=None
+        self,
+        bins,
+        binlabels,
+        filter_empty: bool = False,
+        mutated: bool = False,
+        indexer=None,
     ):
         self.bins = ensure_int64(bins)
         self.binlabels = ensure_index(binlabels)
@@ -739,7 +748,7 @@ def _get_grouper(self):
         """
         return self
 
-    def get_iterator(self, data: NDFrame, axis: int = 0):
+    def get_iterator(self, data: FrameOrSeries, axis: int = 0):
         """
         Groupby iterator
 
@@ -811,11 +820,9 @@ def names(self):
         return [self.binlabels.name]
 
     @property
-    def groupings(self):
-        from pandas.core.groupby.grouper import Grouping
-
+    def groupings(self) -> "List[grouper.Grouping]":
         return [
-            Grouping(lvl, lvl, in_axis=False, level=None, name=name)
+            grouper.Grouping(lvl, lvl, in_axis=False, level=None, name=name)
             for lvl, name in zip(self.levels, self.names)
         ]
 
@@ -856,7 +863,7 @@ def _is_indexed_like(obj, axes) -> bool:
 
 
 class DataSplitter:
-    def __init__(self, data, labels, ngroups, axis: int = 0):
+    def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0):
         self.data = data
         self.labels = ensure_int64(labels)
         self.ngroups = ngroups
@@ -887,15 +894,15 @@ def __iter__(self):
         for i, (start, end) in enumerate(zip(starts, ends)):
             yield i, self._chop(sdata, slice(start, end))
 
-    def _get_sorted_data(self):
+    def _get_sorted_data(self) -> FrameOrSeries:
         return self.data.take(self.sort_idx, axis=self.axis)
 
-    def _chop(self, sdata, slice_obj: slice):
+    def _chop(self, sdata, slice_obj: slice) -> NDFrame:
         raise AbstractMethodError(self)
 
 
 class SeriesSplitter(DataSplitter):
-    def _chop(self, sdata, slice_obj: slice):
+    def _chop(self, sdata: Series, slice_obj: slice) -> Series:
         return sdata._get_values(slice_obj)
 
 
@@ -907,14 +914,14 @@ def fast_apply(self, f, names):
         sdata = self._get_sorted_data()
         return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)
 
-    def _chop(self, sdata, slice_obj: slice):
+    def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame:
         if self.axis == 0:
             return sdata.iloc[slice_obj]
         else:
             return sdata._slice(slice_obj, axis=1)
 
 
-def get_splitter(data: NDFrame, *args, **kwargs):
+def get_splitter(data: FrameOrSeries, *args, **kwargs) -> DataSplitter:
     if isinstance(data, Series):
         klass = SeriesSplitter  # type: Type[DataSplitter]
     else: