From 3357d064db957ba5c7811bcb3248a518ed714526 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 27 Feb 2020 18:05:51 +0000 Subject: [PATCH 1/4] CLN/TYP: Groupby agg methods --- pandas/core/groupby/generic.py | 82 +++++++++++++++ pandas/core/groupby/groupby.py | 176 +++++++++++++++------------------ 2 files changed, 161 insertions(+), 97 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 69b143febeea2..6d89cd1366ea4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -87,6 +87,28 @@ if TYPE_CHECKING: from pandas.core.internals import Block +_agg_template = """ +Compute %(f)s of group values. + +Parameters +---------- +numeric_only : bool, default %(no)s + Include only float, int, boolean columns. If None, will attempt to use + everything, then use only numeric data. +min_count : int, default %(mc)s + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + +Returns +------- +%(return_type)s + Computed %(f)s of values within each group. + +See Also +-------- +%(return_type)s.groupby +""" + NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) # TODO(typing) the return value on this callable should be any *scalar*. 
@@ -813,6 +835,36 @@ def count(self) -> Series: ) return self._reindex_output(result, fill_value=0) + @Substitution(f="sum", no=True, mc=0, return_type="Series") + @Appender(_agg_template) + def sum(self, numeric_only=True, min_count=0) -> Series: + return super().sum(numeric_only=numeric_only, min_count=min_count) + + @Substitution(f="prod", no=True, mc=0, return_type="Series") + @Appender(_agg_template) + def prod(self, numeric_only=True, min_count=0) -> Series: + return super().prod(numeric_only=numeric_only, min_count=min_count) + + @Substitution(f="min", no=False, mc=-1, return_type="Series") + @Appender(_agg_template) + def min(self, numeric_only=False, min_count=-1) -> Series: + return super().min(numeric_only=numeric_only, min_count=min_count) + + @Substitution(f="max", no=False, mc=-1, return_type="Series") + @Appender(_agg_template) + def max(self, numeric_only=False, min_count=-1) -> Series: + return super().max(numeric_only=numeric_only, min_count=min_count) + + @Substitution(f="first", no=False, mc=-1, return_type="Series") + @Appender(_agg_template) + def first(self, numeric_only=False, min_count=-1) -> Series: + return super().first(numeric_only=numeric_only, min_count=min_count) + + @Substitution(f="last", no=False, mc=-1, return_type="Series") + @Appender(_agg_template) + def last(self, numeric_only=False, min_count=-1) -> Series: + return super().last(numeric_only=numeric_only, min_count=min_count) + def _apply_to_column_groupbys(self, func): """ return a pass thru """ return func(self) @@ -1900,6 +1952,36 @@ def groupby_series(obj, col=None): results.index = ibase.default_index(len(results)) return results + @Substitution(f="sum", no=True, mc=0, return_type="DataFrame") + @Appender(_agg_template) + def sum(self, numeric_only=True, min_count=0) -> DataFrame: + return super().sum(numeric_only=numeric_only, min_count=min_count) + + @Substitution(f="prod", no=True, mc=0, return_type="DataFrame") + @Appender(_agg_template) + def prod(self, 
numeric_only=True, min_count=0) -> DataFrame: + return super().prod(numeric_only=numeric_only, min_count=min_count) + + @Substitution(f="min", no=False, mc=-1, return_type="DataFrame") + @Appender(_agg_template) + def min(self, numeric_only=False, min_count=-1) -> DataFrame: + return super().min(numeric_only=numeric_only, min_count=min_count) + + @Substitution(f="max", no=False, mc=-1, return_type="DataFrame") + @Appender(_agg_template) + def max(self, numeric_only=False, min_count=-1) -> DataFrame: + return super().max(numeric_only=numeric_only, min_count=min_count) + + @Substitution(f="first", no=False, mc=-1, return_type="DataFrame") + @Appender(_agg_template) + def first(self, numeric_only=False, min_count=-1) -> DataFrame: + return super().first(numeric_only=numeric_only, min_count=min_count) + + @Substitution(f="last", no=False, mc=-1, return_type="DataFrame") + @Appender(_agg_template) + def last(self, numeric_only=False, min_count=-1) -> DataFrame: + return super().last(numeric_only=numeric_only, min_count=min_count) + boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b9b403ffdc69a..65420ee3a3ae1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -36,7 +36,6 @@ class providing the base-class of operations. 
from pandas._libs import Timestamp import pandas._libs.groupby as libgroupby from pandas._typing import FrameOrSeries, Scalar -from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -945,6 +944,32 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) + def _agg_general( + self, numeric_only=True, min_count=-1, *, alias: str, npfunc: Callable + ): + self._set_group_selection() + + # try a cython aggregation if we can + try: + return self._cython_agg_general( + how=alias, alt=npfunc, numeric_only=numeric_only, min_count=min_count, + ) + except DataError: + pass + except NotImplementedError as err: + if "function is not implemented for this dtype" in str( + err + ) or "category dtype not supported" in str(err): + # raised in _get_cython_function, in some cases can + # be trimmed by implementing cython funcs for more dtypes + pass + else: + raise + + # apply a non-cython aggregation + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + return result + def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ): @@ -1438,105 +1463,62 @@ def size(self): result = self._obj_1d_constructor(result) return self._reindex_output(result, fill_value=0) - @classmethod - def _add_numeric_operations(cls): - """ - Add numeric operations to the GroupBy generically. 
+ def sum(self, numeric_only=True, min_count=0): + return self._agg_general( + numeric_only=numeric_only, min_count=min_count, alias="add", npfunc=np.sum + ) + + def prod(self, numeric_only=True, min_count=0): + return self._agg_general( + numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod + ) + + def min(self, numeric_only=False, min_count=-1): + return self._agg_general( + numeric_only=numeric_only, min_count=min_count, alias="min", npfunc=np.min + ) + + def max(self, numeric_only=False, min_count=-1): + return self._agg_general( + numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max + ) + + @staticmethod + def _get_loc(x, axis: int = 0, *, loc: int): + """Helper function for first/last item that isn't NA. """ - def groupby_function( - name: str, - alias: str, - npfunc, - numeric_only: bool = True, - min_count: int = -1, - ): + def get_loc_notna(x, loc: int): + x = x.to_numpy() + x = x[notna(x)] + if len(x) == 0: + return np.nan + return x[loc] - _local_template = """ - Compute %(f)s of group values. - - Parameters - ---------- - numeric_only : bool, default %(no)s - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. - min_count : int, default %(mc)s - The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. - - Returns - ------- - Series or DataFrame - Computed %(f)s of values within each group. 
- """ - - @Substitution(name="groupby", f=name, no=numeric_only, mc=min_count) - @Appender(_common_see_also) - @Appender(_local_template) - def func(self, numeric_only=numeric_only, min_count=min_count): - self._set_group_selection() - - # try a cython aggregation if we can - try: - return self._cython_agg_general( - how=alias, - alt=npfunc, - numeric_only=numeric_only, - min_count=min_count, - ) - except DataError: - pass - except NotImplementedError as err: - if "function is not implemented for this dtype" in str( - err - ) or "category dtype not supported" in str(err): - # raised in _get_cython_function, in some cases can - # be trimmed by implementing cython funcs for more dtypes - pass - else: - raise - - # apply a non-cython aggregation - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - return result - - set_function_name(func, name, cls) - - return func - - def first_compat(x, axis=0): - def first(x): - x = x.to_numpy() - - x = x[notna(x)] - if len(x) == 0: - return np.nan - return x[0] - - if isinstance(x, DataFrame): - return x.apply(first, axis=axis) - else: - return first(x) - - def last_compat(x, axis=0): - def last(x): - x = x.to_numpy() - x = x[notna(x)] - if len(x) == 0: - return np.nan - return x[-1] - - if isinstance(x, DataFrame): - return x.apply(last, axis=axis) - else: - return last(x) - - cls.sum = groupby_function("sum", "add", np.sum, min_count=0) - cls.prod = groupby_function("prod", "prod", np.prod, min_count=0) - cls.min = groupby_function("min", "min", np.min, numeric_only=False) - cls.max = groupby_function("max", "max", np.max, numeric_only=False) - cls.first = groupby_function("first", "first", first_compat, numeric_only=False) - cls.last = groupby_function("last", "last", last_compat, numeric_only=False) + if isinstance(x, DataFrame): + return x.apply(get_loc_notna, axis=axis, loc=loc) + else: + return get_loc_notna(x, loc=loc) + + def first(self, numeric_only=False, min_count=-1): + first_compat = 
partial(self._get_loc, loc=0) + + return self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="first", + npfunc=first_compat, + ) + + def last(self, numeric_only=False, min_count=-1): + last_compat = partial(self._get_loc, loc=-1) + + return self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="last", + npfunc=last_compat, + ) @Substitution(name="groupby") @Appender(_common_see_also) From 4b96a52c85148c97e7d9e59d374255dc07799a22 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 27 Feb 2020 21:24:17 +0000 Subject: [PATCH 2/4] add parameter types + @doc decorator --- pandas/core/groupby/generic.py | 72 ++++++++++++++-------------------- pandas/core/groupby/groupby.py | 19 +++++---- pandas/util/_decorators.py | 6 +-- 3 files changed, 45 insertions(+), 52 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6d89cd1366ea4..bde9d21bb0b93 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -88,25 +88,25 @@ from pandas.core.internals import Block _agg_template = """ -Compute %(f)s of group values. +Compute {fname} of group values. Parameters ---------- -numeric_only : bool, default %(no)s +numeric_only : bool, default {no} Include only float, int, boolean columns. If None, will attempt to use everything, then use only numeric data. -min_count : int, default %(mc)s +min_count : int, default {mc} The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. Returns ------- -%(return_type)s - Computed %(f)s of values within each group. +{return_type} + Computed {fname} of values within each group. 
See Also -------- -%(return_type)s.groupby +{return_type}.groupby """ @@ -835,34 +835,28 @@ def count(self) -> Series: ) return self._reindex_output(result, fill_value=0) - @Substitution(f="sum", no=True, mc=0, return_type="Series") - @Appender(_agg_template) - def sum(self, numeric_only=True, min_count=0) -> Series: + @doc(_agg_template, fname="sum", no=True, mc=0, return_type="Series") + def sum(self, numeric_only: bool = True, min_count: int = 0) -> Series: return super().sum(numeric_only=numeric_only, min_count=min_count) - @Substitution(f="prod", no=True, mc=0, return_type="Series") - @Appender(_agg_template) - def prod(self, numeric_only=True, min_count=0) -> Series: + @doc(_agg_template, fname="prod", no=True, mc=0, return_type="Series") + def prod(self, numeric_only: bool = True, min_count: int = 0) -> Series: return super().prod(numeric_only=numeric_only, min_count=min_count) - @Substitution(f="min", no=False, mc=-1, return_type="Series") - @Appender(_agg_template) - def min(self, numeric_only=False, min_count=-1) -> Series: + @doc(_agg_template, fname="min", no=False, mc=-1, return_type="Series") + def min(self, numeric_only: bool = False, min_count: int = -1) -> Series: return super().min(numeric_only=numeric_only, min_count=min_count) - @Substitution(f="max", no=False, mc=-1, return_type="Series") - @Appender(_agg_template) - def max(self, numeric_only=False, min_count=-1) -> Series: + @doc(_agg_template, fname="max", no=False, mc=-1, return_type="Series") + def max(self, numeric_only: bool = False, min_count: int = -1) -> Series: return super().max(numeric_only=numeric_only, min_count=min_count) - @Substitution(f="first", no=False, mc=-1, return_type="Series") - @Appender(_agg_template) - def first(self, numeric_only=False, min_count=-1) -> Series: + @doc(_agg_template, fname="first", no=False, mc=-1, return_type="Series") + def first(self, numeric_only: bool = False, min_count: int = -1) -> Series: return super().first(numeric_only=numeric_only, 
min_count=min_count) - @Substitution(f="last", no=False, mc=-1, return_type="Series") - @Appender(_agg_template) - def last(self, numeric_only=False, min_count=-1) -> Series: + @doc(_agg_template, fname="last", no=False, mc=-1, return_type="Series") + def last(self, numeric_only: bool = False, min_count: int = -1) -> Series: return super().last(numeric_only=numeric_only, min_count=min_count) def _apply_to_column_groupbys(self, func): @@ -1952,34 +1946,28 @@ def groupby_series(obj, col=None): results.index = ibase.default_index(len(results)) return results - @Substitution(f="sum", no=True, mc=0, return_type="DataFrame") - @Appender(_agg_template) - def sum(self, numeric_only=True, min_count=0) -> DataFrame: + @doc(_agg_template, fname="sum", no=True, mc=0, return_type="DataFrame") + def sum(self, numeric_only: bool = True, min_count: int = 0) -> DataFrame: return super().sum(numeric_only=numeric_only, min_count=min_count) - @Substitution(f="prod", no=True, mc=0, return_type="DataFrame") - @Appender(_agg_template) - def prod(self, numeric_only=True, min_count=0) -> DataFrame: + @doc(_agg_template, fname="prod", no=True, mc=0, return_type="DataFrame") + def prod(self, numeric_only: bool = True, min_count: int = 0) -> DataFrame: return super().prod(numeric_only=numeric_only, min_count=min_count) - @Substitution(f="min", no=False, mc=-1, return_type="DataFrame") - @Appender(_agg_template) - def min(self, numeric_only=False, min_count=-1) -> DataFrame: + @doc(_agg_template, fname="min", no=False, mc=-1, return_type="DataFrame") + def min(self, numeric_only: bool = False, min_count: int = -1) -> DataFrame: return super().min(numeric_only=numeric_only, min_count=min_count) - @Substitution(f="max", no=False, mc=-1, return_type="DataFrame") - @Appender(_agg_template) - def max(self, numeric_only=False, min_count=-1) -> DataFrame: + @doc(_agg_template, fname="max", no=False, mc=-1, return_type="DataFrame") + def max(self, numeric_only: bool = False, min_count: int = -1) -> 
DataFrame: return super().max(numeric_only=numeric_only, min_count=min_count) - @Substitution(f="first", no=False, mc=-1, return_type="DataFrame") - @Appender(_agg_template) - def first(self, numeric_only=False, min_count=-1) -> DataFrame: + @doc(_agg_template, fname="first", no=False, mc=-1, return_type="DataFrame") + def first(self, numeric_only: bool = False, min_count: int = -1) -> DataFrame: return super().first(numeric_only=numeric_only, min_count=min_count) - @Substitution(f="last", no=False, mc=-1, return_type="DataFrame") - @Appender(_agg_template) - def last(self, numeric_only=False, min_count=-1) -> DataFrame: + @doc(_agg_template, fname="last", no=False, mc=-1, return_type="DataFrame") + def last(self, numeric_only: bool = False, min_count: int = -1) -> DataFrame: return super().last(numeric_only=numeric_only, min_count=min_count) boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 65420ee3a3ae1..ddd8df392109a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -945,7 +945,12 @@ def _wrap_applied_output(self, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) def _agg_general( - self, numeric_only=True, min_count=-1, *, alias: str, npfunc: Callable + self, + numeric_only: bool = True, + min_count: int = -1, + *, + alias: str, + npfunc: Callable, ): self._set_group_selection() @@ -1463,22 +1468,22 @@ def size(self): result = self._obj_1d_constructor(result) return self._reindex_output(result, fill_value=0) - def sum(self, numeric_only=True, min_count=0): + def sum(self, numeric_only: bool = True, min_count: int = 0): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="add", npfunc=np.sum ) - def prod(self, numeric_only=True, min_count=0): + def prod(self, numeric_only: bool = True, min_count: int = 0): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) 
- def min(self, numeric_only=False, min_count=-1): + def min(self, numeric_only: bool = False, min_count: int = -1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="min", npfunc=np.min ) - def max(self, numeric_only=False, min_count=-1): + def max(self, numeric_only: bool = False, min_count: int = -1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max ) @@ -1500,7 +1505,7 @@ def get_loc_notna(x, loc: int): else: return get_loc_notna(x, loc=loc) - def first(self, numeric_only=False, min_count=-1): + def first(self, numeric_only: bool = False, min_count: int = -1): first_compat = partial(self._get_loc, loc=0) return self._agg_general( @@ -1510,7 +1515,7 @@ def first(self, numeric_only=False, min_count=-1): npfunc=first_compat, ) - def last(self, numeric_only=False, min_count=-1): + def last(self, numeric_only: bool = False, min_count: int = -1): last_compat = partial(self._get_loc, loc=-1) return self._agg_general( diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 92bfce7ec9c83..0fac0ce4d9bd0 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -329,7 +329,7 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: return decorate -def doc(*args: Union[str, Callable], **kwargs: str) -> Callable[[F], F]: +def doc(*args: Union[str, Callable], **kwargs: Any) -> Callable[[F], F]: """ A decorator take docstring templates, concatenate them and perform string substitution on it. @@ -345,8 +345,8 @@ def doc(*args: Union[str, Callable], **kwargs: str) -> Callable[[F], F]: *args : str or callable The string / docstring / docstring template to be appended in order after default docstring under function. - **kwargs : str - The string which would be used to format docstring template. + **kwargs : Any + The objects which would be used to format docstring template. 
""" def decorator(func: F) -> F: From be54f6c4adfd7c2b607fc852fbd6d27166a29ccc Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 2 Mar 2020 21:24:12 +0000 Subject: [PATCH 3/4] Add doc string to GroupBy methods --- pandas/core/groupby/generic.py | 70 ---------------------------------- pandas/core/groupby/groupby.py | 27 +++++++++++-- 2 files changed, 24 insertions(+), 73 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bde9d21bb0b93..69b143febeea2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -87,28 +87,6 @@ if TYPE_CHECKING: from pandas.core.internals import Block -_agg_template = """ -Compute {fname} of group values. - -Parameters ----------- -numeric_only : bool, default {no} - Include only float, int, boolean columns. If None, will attempt to use - everything, then use only numeric data. -min_count : int, default {mc} - The required number of valid values to perform the operation. If fewer - than ``min_count`` non-NA values are present the result will be NA. - -Returns -------- -{return_type} - Computed {fname} of values within each group. - -See Also --------- -{return_type}.groupby -""" - NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) # TODO(typing) the return value on this callable should be any *scalar*. 
@@ -835,30 +813,6 @@ def count(self) -> Series: ) return self._reindex_output(result, fill_value=0) - @doc(_agg_template, fname="sum", no=True, mc=0, return_type="Series") - def sum(self, numeric_only: bool = True, min_count: int = 0) -> Series: - return super().sum(numeric_only=numeric_only, min_count=min_count) - - @doc(_agg_template, fname="prod", no=True, mc=0, return_type="Series") - def prod(self, numeric_only: bool = True, min_count: int = 0) -> Series: - return super().prod(numeric_only=numeric_only, min_count=min_count) - - @doc(_agg_template, fname="min", no=False, mc=-1, return_type="Series") - def min(self, numeric_only: bool = False, min_count: int = -1) -> Series: - return super().min(numeric_only=numeric_only, min_count=min_count) - - @doc(_agg_template, fname="max", no=False, mc=-1, return_type="Series") - def max(self, numeric_only: bool = False, min_count: int = -1) -> Series: - return super().max(numeric_only=numeric_only, min_count=min_count) - - @doc(_agg_template, fname="first", no=False, mc=-1, return_type="Series") - def first(self, numeric_only: bool = False, min_count: int = -1) -> Series: - return super().first(numeric_only=numeric_only, min_count=min_count) - - @doc(_agg_template, fname="last", no=False, mc=-1, return_type="Series") - def last(self, numeric_only: bool = False, min_count: int = -1) -> Series: - return super().last(numeric_only=numeric_only, min_count=min_count) - def _apply_to_column_groupbys(self, func): """ return a pass thru """ return func(self) @@ -1946,30 +1900,6 @@ def groupby_series(obj, col=None): results.index = ibase.default_index(len(results)) return results - @doc(_agg_template, fname="sum", no=True, mc=0, return_type="DataFrame") - def sum(self, numeric_only: bool = True, min_count: int = 0) -> DataFrame: - return super().sum(numeric_only=numeric_only, min_count=min_count) - - @doc(_agg_template, fname="prod", no=True, mc=0, return_type="DataFrame") - def prod(self, numeric_only: bool = True, min_count: int 
= 0) -> DataFrame: - return super().prod(numeric_only=numeric_only, min_count=min_count) - - @doc(_agg_template, fname="min", no=False, mc=-1, return_type="DataFrame") - def min(self, numeric_only: bool = False, min_count: int = -1) -> DataFrame: - return super().min(numeric_only=numeric_only, min_count=min_count) - - @doc(_agg_template, fname="max", no=False, mc=-1, return_type="DataFrame") - def max(self, numeric_only: bool = False, min_count: int = -1) -> DataFrame: - return super().max(numeric_only=numeric_only, min_count=min_count) - - @doc(_agg_template, fname="first", no=False, mc=-1, return_type="DataFrame") - def first(self, numeric_only: bool = False, min_count: int = -1) -> DataFrame: - return super().first(numeric_only=numeric_only, min_count=min_count) - - @doc(_agg_template, fname="last", no=False, mc=-1, return_type="DataFrame") - def last(self, numeric_only: bool = False, min_count: int = -1) -> DataFrame: - return super().last(numeric_only=numeric_only, min_count=min_count) - boxplot = boxplot_frame_groupby diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ddd8df392109a..916b6cd0616da 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -191,6 +191,24 @@ class providing the base-class of operations. """, ) +_groupby_agg_method_template = """ +Compute {fname} of group values. + +Parameters +---------- +numeric_only : bool, default {no} + Include only float, int, boolean columns. If None, will attempt to use + everything, then use only numeric data. +min_count : int, default {mc} + The required number of valid values to perform the operation. If fewer + than ``min_count`` non-NA values are present the result will be NA. + +Returns +------- +Series or DataFrame + Computed {fname} of values within each group. +""" + _pipe_template = """ Apply a function `func` with arguments to this %(klass)s object and return the function's result. 
@@ -1468,21 +1486,25 @@ def size(self): result = self._obj_1d_constructor(result) return self._reindex_output(result, fill_value=0) + @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) def sum(self, numeric_only: bool = True, min_count: int = 0): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="add", npfunc=np.sum ) + @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) def prod(self, numeric_only: bool = True, min_count: int = 0): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) + @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1) def min(self, numeric_only: bool = False, min_count: int = -1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="min", npfunc=np.min ) + @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1) def max(self, numeric_only: bool = False, min_count: int = -1): return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max @@ -1505,6 +1527,7 @@ def get_loc_notna(x, loc: int): else: return get_loc_notna(x, loc=loc) + @doc(_groupby_agg_method_template, fname="first", no=False, mc=-1) def first(self, numeric_only: bool = False, min_count: int = -1): first_compat = partial(self._get_loc, loc=0) @@ -1515,6 +1538,7 @@ def first(self, numeric_only: bool = False, min_count: int = -1): npfunc=first_compat, ) + @doc(_groupby_agg_method_template, fname="last", no=False, mc=-1) def last(self, numeric_only: bool = False, min_count: int = -1): last_compat = partial(self._get_loc, loc=-1) @@ -2623,9 +2647,6 @@ def _reindex_output( return output.reset_index(drop=True) -GroupBy._add_numeric_operations() - - @doc(GroupBy) def get_groupby( obj: NDFrame, From 279a89e1b5d007f712e063f4c626fbda6392c279 Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 15 May 2020 06:29:21 +0100 Subject: [PATCH 4/4] simplify first_compat and last_compat --- pandas/core/groupby/groupby.py 
| 54 ++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 916b6cd0616da..20da8ca3ec2b7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1165,6 +1165,27 @@ def _apply_filter(self, indices, dropna): OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame) +def get_loc_notna(obj: "Series", *, loc: int): + """Find the value in position ``loc`` after filtering ``obj`` for nan values. + + If ``obj`` is empty or has only nan values, np.nan is returned. + + Examples + -------- + >>> ser = pd.Series([np.nan, np.nan, 1, 2, np.nan]) + >>> get_loc_notna(ser, loc=0) # get first non-na + 1.0 + >>> get_loc_notna(ser, loc=-1) # get last non-na + 2.0 + """ + x = obj.to_numpy() + x = x[notna(x)] + + if len(x) == 0: + return np.nan + return x[loc] + + class GroupBy(_GroupBy[FrameOrSeries]): """ Class for grouping and aggregating relational data. @@ -1510,26 +1531,15 @@ def max(self, numeric_only: bool = False, min_count: int = -1): numeric_only=numeric_only, min_count=min_count, alias="max", npfunc=np.max ) - @staticmethod - def _get_loc(x, axis: int = 0, *, loc: int): - """Helper function for first/last item that isn't NA. - """ - - def get_loc_notna(x, loc: int): - x = x.to_numpy() - x = x[notna(x)] - if len(x) == 0: - return np.nan - return x[loc] - - if isinstance(x, DataFrame): - return x.apply(get_loc_notna, axis=axis, loc=loc) - else: - return get_loc_notna(x, loc=loc) - @doc(_groupby_agg_method_template, fname="first", no=False, mc=-1) def first(self, numeric_only: bool = False, min_count: int = -1): - first_compat = partial(self._get_loc, loc=0) + def first_compat(x, axis: int = 0): + """Helper function for first item that isn't NA. 
+ """ + if isinstance(x, DataFrame): + return x.apply(get_loc_notna, axis=axis, loc=0) + else: + return get_loc_notna(x, loc=0) return self._agg_general( numeric_only=numeric_only, @@ -1540,7 +1550,13 @@ def first(self, numeric_only: bool = False, min_count: int = -1): @doc(_groupby_agg_method_template, fname="last", no=False, mc=-1) def last(self, numeric_only: bool = False, min_count: int = -1): - last_compat = partial(self._get_loc, loc=-1) + def last_compat(x, axis: int = 0): + """Helper function for last item that isn't NA. + """ + if isinstance(x, DataFrame): + return x.apply(get_loc_notna, axis=axis, loc=-1) + else: + return get_loc_notna(x, loc=-1) return self._agg_general( numeric_only=numeric_only,