From 98b53d78124951087c8a1fa9a00a70748273ab19 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 6 Nov 2019 20:41:55 -0800 Subject: [PATCH 1/8] Annotate groupby.ops --- pandas/core/groupby/ops.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 2c8aa1294451d..16a32c30a9857 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -106,7 +106,7 @@ def __iter__(self): def nkeys(self) -> int: return len(self.groupings) - def get_iterator(self, data, axis=0): + def get_iterator(self, data: NDFrame, axis: int = 0): """ Groupby iterator @@ -120,7 +120,7 @@ def get_iterator(self, data, axis=0): for key, (i, group) in zip(keys, splitter): yield key, group - def _get_splitter(self, data, axis=0): + def _get_splitter(self, data: NDFrame, axis: int = 0) -> "DataSplitter": comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -142,7 +142,7 @@ def _get_group_keys(self): # provide "flattened" iterator for multi-group setting return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) - def apply(self, f, data, axis: int = 0): + def apply(self, f, data: NDFrame, axis: int = 0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() @@ -157,7 +157,7 @@ def apply(self, f, data, axis: int = 0): elif ( com.get_callable_name(f) not in base.plotting_methods - and hasattr(splitter, "fast_apply") + and splitter.fast_apply is not None and axis == 0 # with MultiIndex, apply_frame_axis0 would raise InvalidApply # TODO: can we make this check prettier? 
@@ -292,7 +292,7 @@ def recons_codes(self): return decons_obs_group_ids(comp_ids, obs_ids, self.shape, codes, xnull=True) @cache_readonly - def result_index(self): + def result_index(self) -> Index: if not self.compressed and len(self.groupings) == 1: return self.groupings[0].result_index.rename(self.names[0]) @@ -607,7 +607,7 @@ def agg_series(self, obj: Series, func): raise return self._aggregate_series_pure_python(obj, func) - def _aggregate_series_fast(self, obj, func): + def _aggregate_series_fast(self, obj: Series, func): # At this point we have already checked that obj.index is not a MultiIndex # and that obj is backed by an ndarray, not ExtensionArray func = self._is_builtin_func(func) @@ -623,7 +623,7 @@ def _aggregate_series_fast(self, obj, func): result, counts = grouper.get_result() return result, counts - def _aggregate_series_pure_python(self, obj, func): + def _aggregate_series_pure_python(self, obj: Series, func): group_index, _, ngroups = self.group_info @@ -682,7 +682,12 @@ class BinGrouper(BaseGrouper): """ def __init__( - self, bins, binlabels, filter_empty=False, mutated=False, indexer=None + self, + bins, + binlabels, + filter_empty: bool = False, + mutated: bool = False, + indexer=None, ): self.bins = ensure_int64(bins) self.binlabels = ensure_index(binlabels) @@ -825,7 +830,9 @@ def _is_indexed_like(obj, axes) -> bool: class DataSplitter: - def __init__(self, data, labels, ngroups, axis: int = 0): + fast_apply = None + + def __init__(self, data: NDFrame, labels, ngroups: int, axis: int = 0): self.data = data self.labels = ensure_int64(labels) self.ngroups = ngroups @@ -856,7 +863,7 @@ def __iter__(self): for i, (start, end) in enumerate(zip(starts, ends)): yield i, self._chop(sdata, slice(start, end)) - def _get_sorted_data(self): + def _get_sorted_data(self) -> NDFrame: return self.data.take(self.sort_idx, axis=self.axis) def _chop(self, sdata, slice_obj: slice): @@ -864,7 +871,7 @@ def _chop(self, sdata, slice_obj: slice): class 
SeriesSplitter(DataSplitter): - def _chop(self, sdata, slice_obj: slice): + def _chop(self, sdata: Series, slice_obj: slice) -> Series: return sdata._get_values(slice_obj) @@ -876,14 +883,14 @@ def fast_apply(self, f, names): sdata = self._get_sorted_data() return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) - def _chop(self, sdata, slice_obj: slice): + def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: if self.axis == 0: return sdata.iloc[slice_obj] else: return sdata._slice(slice_obj, axis=1) -def get_splitter(data: NDFrame, *args, **kwargs): +def get_splitter(data: NDFrame, *args, **kwargs) -> DataSplitter: if isinstance(data, Series): klass = SeriesSplitter # type: Type[DataSplitter] else: From efd4a9b856b36484958c63531289ed78b17693f7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 6 Nov 2019 20:51:41 -0800 Subject: [PATCH 2/8] annotations, needs debugging --- pandas/core/groupby/grouper.py | 2 +- pandas/core/groupby/ops.py | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index dc6336b17ac1e..dc079190eb6bc 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -672,7 +672,7 @@ def is_in_obj(gpr) -> bool: return grouper, exclusions, obj -def _is_label_like(val): +def _is_label_like(val) -> bool: return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val)) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 16a32c30a9857..830c977885b6f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -41,7 +41,8 @@ import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import base, grouper +from pandas.core.groupby import base +from pandas.core.groupby.grouper import Grouping from pandas.core.index import Index, MultiIndex, ensure_index from pandas.core.series import 
Series from pandas.core.sorting import ( @@ -79,7 +80,7 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings: "Sequence[grouper.Grouping]", + groupings: Sequence[Grouping], sort: bool = True, group_keys: bool = True, mutated: bool = False, @@ -89,7 +90,7 @@ def __init__( self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis - self.groupings = groupings # type: Sequence[grouper.Grouping] + self.groupings = list(groupings) # type: List[Grouping] self.sort = sort self.group_keys = group_keys self.mutated = mutated @@ -788,9 +789,7 @@ def names(self): return [self.binlabels.name] @property - def groupings(self): - from pandas.core.groupby.grouper import Grouping - + def groupings(self) -> List[Grouping]: return [ Grouping(lvl, lvl, in_axis=False, level=None, name=name) for lvl, name in zip(self.levels, self.names) @@ -866,7 +865,7 @@ def __iter__(self): def _get_sorted_data(self) -> NDFrame: return self.data.take(self.sort_idx, axis=self.axis) - def _chop(self, sdata, slice_obj: slice): + def _chop(self, sdata: NDFrame, slice_obj: slice) -> NDFrame: raise AbstractMethodError(self) From 19332773191033db159f1be659be3d1c8bce37ff Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 6 Nov 2019 21:02:25 -0800 Subject: [PATCH 3/8] whitespace --- pandas/core/groupby/ops.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 830c977885b6f..ebeb55ac1d323 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -230,8 +230,7 @@ def names(self): def size(self) -> Series: """ - Compute group sizes - + Compute group sizes. 
""" ids, _, ngroup = self.group_info ids = ensure_platform_int(ids) From 9b6a87ab94ce4527dda1436ce520ce94b9e35ce7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 6 Nov 2019 21:04:32 -0800 Subject: [PATCH 4/8] types --- pandas/core/groupby/grouper.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index dc079190eb6bc..e120e605d616d 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -372,8 +372,8 @@ def __init__( self.grouper = self.grouper.astype("timedelta64[ns]") - def __repr__(self): - return "Grouping({0})".format(self.name) + def __repr__(self) -> str: + return "Grouping({name})".format(name=self.name) def __iter__(self): return iter(self.indices) @@ -434,10 +434,10 @@ def _get_grouper( key=None, axis: int = 0, level=None, - sort=True, - observed=False, - mutated=False, - validate=True, + sort: bool = True, + observed: bool = False, + mutated: bool = False, + validate: bool = True, ): """ create and return a BaseGrouper, which is an internal From d52add414042fc379c88edff90f174840eb850c2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 6 Nov 2019 21:20:17 -0800 Subject: [PATCH 5/8] circular import --- pandas/core/groupby/grouper.py | 13 +++++++------ pandas/core/groupby/ops.py | 11 +++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e120e605d616d..bae725f723715 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -120,7 +120,7 @@ def __init__(self, key=None, level=None, freq=None, axis=0, sort=False): def ax(self): return self.grouper - def _get_grouper(self, obj, validate=True): + def _get_grouper(self, obj, validate: bool = True): """ Parameters ---------- @@ -144,17 +144,18 @@ def _get_grouper(self, obj, validate=True): ) return self.binner, self.grouper, self.obj - def _set_grouper(self, obj, 
sort=False): + def _set_grouper(self, obj: NDFrame, sort: bool = False): """ given an object and the specifications, setup the internal grouper for this particular specification Parameters ---------- - obj : the subject object + obj : Series or DataFrame sort : bool, default False whether the resulting grouper should be sorted """ + assert obj is not None if self.key is not None and self.level is not None: raise ValueError("The Grouper cannot specify both a key and a level!") @@ -210,15 +211,15 @@ def _set_grouper(self, obj, sort=False): def groups(self): return self.grouper.groups - def __repr__(self): + def __repr__(self) -> str: attrs_list = ( - "{}={!r}".format(attr_name, getattr(self, attr_name)) + "{name}={val!r}".format(name=attr_name, val=getattr(self, attr_name)) for attr_name in self._attributes if getattr(self, attr_name) is not None ) attrs = ", ".join(attrs_list) cls_name = self.__class__.__name__ - return "{}({})".format(cls_name, attrs) + return "{cls}({attrs})".format(cls=cls_name, attrs=attrs) class Grouping: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ebeb55ac1d323..9c87a6b1b9c41 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -41,8 +41,7 @@ import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import base -from pandas.core.groupby.grouper import Grouping +from pandas.core.groupby import base, grouper from pandas.core.index import Index, MultiIndex, ensure_index from pandas.core.series import Series from pandas.core.sorting import ( @@ -80,7 +79,7 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings: Sequence[Grouping], + groupings: "Sequence[grouper.Grouping]", sort: bool = True, group_keys: bool = True, mutated: bool = False, @@ -90,7 +89,7 @@ def __init__( self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis - self.groupings = list(groupings) # type: 
List[Grouping] + self.groupings = list(groupings) # type: List[grouper.Grouping] self.sort = sort self.group_keys = group_keys self.mutated = mutated @@ -788,9 +787,9 @@ def names(self): return [self.binlabels.name] @property - def groupings(self) -> List[Grouping]: + def groupings(self) -> "List[grouper.Grouping]": return [ - Grouping(lvl, lvl, in_axis=False, level=None, name=name) + grouper.Grouping(lvl, lvl, in_axis=False, level=None, name=name) for lvl, name in zip(self.levels, self.names) ] From a7e6ad16eb7ceab3dee6bc366a99a60ae0d8ac53 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 7 Nov 2019 09:52:28 -0800 Subject: [PATCH 6/8] fix most mypy complaints --- pandas/core/groupby/ops.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 9c87a6b1b9c41..cbe012012fd29 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -36,6 +36,7 @@ ) from pandas.core.dtypes.missing import _maybe_fill, isna +from pandas._typing import FrameOrSeries import pandas.core.algorithms as algorithms from pandas.core.base import SelectionMixin import pandas.core.common as com @@ -106,7 +107,7 @@ def __iter__(self): def nkeys(self) -> int: return len(self.groupings) - def get_iterator(self, data: NDFrame, axis: int = 0): + def get_iterator(self, data: FrameOrSeries, axis: int = 0): """ Groupby iterator @@ -120,7 +121,7 @@ def get_iterator(self, data: NDFrame, axis: int = 0): for key, (i, group) in zip(keys, splitter): yield key, group - def _get_splitter(self, data: NDFrame, axis: int = 0) -> "DataSplitter": + def _get_splitter(self, data: FrameOrSeries, axis: int = 0) -> "DataSplitter": comp_ids, _, ngroups = self.group_info return get_splitter(data, comp_ids, ngroups, axis=axis) @@ -142,13 +143,13 @@ def _get_group_keys(self): # provide "flattened" iterator for multi-group setting return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) - 
def apply(self, f, data: NDFrame, axis: int = 0): + def apply(self, f, data: FrameOrSeries, axis: int = 0): mutated = self.mutated splitter = self._get_splitter(data, axis=axis) group_keys = self._get_group_keys() result_values = None - sdata = splitter._get_sorted_data() + sdata = splitter._get_sorted_data() # type: FrameOrSeries if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)): # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 # if we pass EA instead of ndarray @@ -157,7 +158,7 @@ def apply(self, f, data: NDFrame, axis: int = 0): elif ( com.get_callable_name(f) not in base.plotting_methods - and splitter.fast_apply is not None + and isinstance(splitter, FrameSplitter) and axis == 0 # with MultiIndex, apply_frame_axis0 would raise InvalidApply # TODO: can we make this check prettier? @@ -720,7 +721,7 @@ def _get_grouper(self): """ return self - def get_iterator(self, data: NDFrame, axis: int = 0): + def get_iterator(self, data: FrameOrSeries, axis: int = 0): """ Groupby iterator @@ -827,9 +828,7 @@ def _is_indexed_like(obj, axes) -> bool: class DataSplitter: - fast_apply = None - - def __init__(self, data: NDFrame, labels, ngroups: int, axis: int = 0): + def __init__(self, data: FrameOrSeries, labels, ngroups: int, axis: int = 0): self.data = data self.labels = ensure_int64(labels) self.ngroups = ngroups @@ -860,10 +859,10 @@ def __iter__(self): for i, (start, end) in enumerate(zip(starts, ends)): yield i, self._chop(sdata, slice(start, end)) - def _get_sorted_data(self) -> NDFrame: + def _get_sorted_data(self) -> FrameOrSeries: return self.data.take(self.sort_idx, axis=self.axis) - def _chop(self, sdata: NDFrame, slice_obj: slice) -> NDFrame: + def _chop(self, sdata, slice_obj: slice) -> NDFrame: raise AbstractMethodError(self) @@ -887,7 +886,7 @@ def _chop(self, sdata: DataFrame, slice_obj: slice) -> DataFrame: return sdata._slice(slice_obj, axis=1) -def get_splitter(data: NDFrame, *args, **kwargs) -> 
DataSplitter: +def get_splitter(data: FrameOrSeries, *args, **kwargs) -> DataSplitter: if isinstance(data, Series): klass = SeriesSplitter # type: Type[DataSplitter] else: From f03830247e16a583fc0467bc10b02ac8a0fc23ff Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 7 Nov 2019 14:00:27 -0800 Subject: [PATCH 7/8] fix mypy groupings --- pandas/core/groupby/ops.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 55fa3ad6ab435..f8e75f0d3782d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -90,12 +90,16 @@ def __init__( self._filter_empty_groups = self.compressed = len(groupings) != 1 self.axis = axis - self.groupings = list(groupings) # type: List[grouper.Grouping] + self._groupings = list(groupings) # type: List[grouper.Grouping] self.sort = sort self.group_keys = group_keys self.mutated = mutated self.indexer = indexer + @property + def groupings(self) -> List["grouper.Grouping"]: + return self._groupings + @property def shape(self): return tuple(ping.ngroups for ping in self.groupings) From 0b28143c5f7611776ac6d1677ae0aa8600078f09 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 8 Nov 2019 09:56:30 -0800 Subject: [PATCH 8/8] merge cleanup --- pandas/core/groupby/grouper.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 47d9510e8d59a..e6e3ee62459ca 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -143,7 +143,7 @@ def _get_grouper(self, obj, validate: bool = True): ) return self.binner, self.grouper, self.obj - def _set_grouper(self, obj: NDFrame, sort: bool = False): + def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): """ given an object and the specifications, setup the internal grouper for this particular specification @@ -435,6 +435,9 @@ def get_grouper( axis: int = 0, level=None, sort: bool = True, + 
observed: bool = False, + mutated: bool = False, + validate: bool = True, ) -> Tuple[BaseGrouper, List[Hashable], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal