From 3dfef1a4a7569088e0adc419a9cf6d012e4f6bc7 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Fri, 25 Oct 2019 19:28:18 -0700
Subject: [PATCH 1/5] WIP: libgroupby getattr pattern

---
 pandas/_libs/groupby.pyx   |  4 ++--
 pandas/core/groupby/ops.py | 25 +++++++------------------
 2 files changed, 9 insertions(+), 20 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index b2ffbb3ecb4f2..e613889ce440e 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1019,8 +1019,8 @@ def group_nth(rank_t[:, :] out,
 def group_rank(float64_t[:, :] out,
                rank_t[:, :] values,
                const int64_t[:] labels,
-               bint is_datetimelike, object ties_method,
-               bint ascending, bint pct, object na_option):
+               bint is_datetimelike, object ties_method="average",
+               bint ascending=True, bint pct=False, object na_option="keep"):
     """
     Provides the rank of values within each group.

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 79b51ef57cd37..3c3ff588172a2 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -372,7 +372,7 @@ def get_group_levels(self):
             "min": "group_min",
             "max": "group_max",
             "mean": "group_mean",
-            "median": {"name": "group_median"},
+            "median": "group_median",
             "var": "group_var",
             "first": {
                 "name": "group_nth",
@@ -386,19 +386,7 @@ def get_group_levels(self):
             "cumsum": "group_cumsum",
             "cummin": "group_cummin",
             "cummax": "group_cummax",
-            "rank": {
-                "name": "group_rank",
-                "f": lambda func, a, b, c, d, e, **kwargs: func(
-                    a,
-                    b,
-                    c,
-                    e,
-                    kwargs.get("ties_method", "average"),
-                    kwargs.get("ascending", True),
-                    kwargs.get("pct", False),
-                    kwargs.get("na_option", "keep"),
-                ),
-            },
+            "rank": "group_rank",
         },
     }

@@ -445,6 +433,7 @@ def get_func(fname):
         ftype = self._cython_functions[kind][how]

         if isinstance(ftype, dict):
+            # we only get here with (kind, how) == ("aggregate", "first")
             func = afunc = get_func(ftype["name"])

             # a sub-function
@@ -575,7 +564,6 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs):
                 values,
                 labels,
                 func,
-                is_numeric,
                 is_datetimelike,
                 min_count,
             )
@@ -586,7 +574,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs):

             # TODO: min_count
             result = self._transform(
-                result, values, labels, func, is_numeric, is_datetimelike, **kwargs
+                result, values, labels, func, is_datetimelike, **kwargs
             )

         if is_integer_dtype(result) and not is_datetimelike:
@@ -633,7 +621,6 @@ def _aggregate(
         values,
         comp_ids,
         agg_func,
-        is_numeric,
         is_datetimelike,
         min_count=-1,
     ):
@@ -651,7 +638,6 @@ def _transform(
         values,
         comp_ids,
         transform_func,
-        is_numeric,
         is_datetimelike,
         **kwargs
     ):
@@ -660,6 +646,9 @@ def _transform(
         if values.ndim > 2:
             # punting for now
             raise NotImplementedError("number of dimensions is currently limited to 2")
+        elif transform_func is libgroupby.group_rank:
+            # different signature from the others
+            transform_func(result, values, comp_ids, is_datetimelike=is_datetimelike, **kwargs)
         else:
             transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)
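For orientation between patches: the series replaces the {"name": ..., "f": lambda ...} entries in _cython_functions with plain kernel names resolved via getattr, while keyword defaults on the Cython signatures take over the job the curried lambdas used to do. Below is a minimal, self-contained Python sketch of that lookup pattern; the stub module, kernel bodies, and helper names are illustrative stand-ins, not the real pandas/libgroupby code.

import types

# Stand-in kernels for the compiled libgroupby module. The keyword defaults
# on group_rank mirror the defaults patch 1 adds to the Cython signature.
def group_add(out, values, labels):
    # toy kernel: records a sum instead of writing into an output array
    out["sum"] = sum(values)

def group_rank(out, values, labels, ties_method="average", ascending=True,
               pct=False, na_option="keep"):
    # toy kernel: records which options it was called with
    out["rank_options"] = (ties_method, ascending, pct, na_option)

libgroupby_stub = types.SimpleNamespace(group_add=group_add, group_rank=group_rank)

# how -> kernel name, plain strings only (no dict-plus-lambda entries).
_cython_functions = {
    "aggregate": {"add": "group_add"},
    "transform": {"rank": "group_rank"},
}

def get_cython_func(kind, how):
    # the getattr lookup named in the commit subject
    name = _cython_functions[kind][how]
    func = getattr(libgroupby_stub, name, None)
    if func is None:
        raise NotImplementedError(f"{how} is not supported for kind {kind!r}")
    return func

out = {}
get_cython_func("transform", "rank")(out, [3.0, 1.0, 2.0], [0, 0, 0], ascending=False)
print(out)  # {'rank_options': ('average', False, False, 'keep')}

Callers that need non-default options pass them as keywords, which is what lets the per-entry lambda wrappers be deleted.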
"group_min", "max": "group_max", "mean": "group_mean", - "median": "group_median", + "median": {"name": "group_median"}, "var": "group_var", "first": { "name": "group_nth", @@ -433,7 +433,8 @@ def get_func(fname): ftype = self._cython_functions[kind][how] if isinstance(ftype, dict): - # we only get here with (kind, how) == ("aggregate", "first") + # we only get here with kind == "aggregate" and + # how == "first" or "median" func = afunc = get_func(ftype["name"]) # a sub-function From 2e0e76a5bb20205130ae096fc5c9fa81abc4abb3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 27 Oct 2019 11:07:38 -0700 Subject: [PATCH 3/5] blackify --- pandas/core/groupby/ops.py | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7198ddac805eb..72b97ee761e23 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -560,13 +560,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): ) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( - result, - counts, - values, - labels, - func, - is_datetimelike, - min_count, + result, counts, values, labels, func, is_datetimelike, min_count ) elif kind == "transform": result = _maybe_fill( @@ -616,14 +610,7 @@ def transform(self, values, how, axis=0, **kwargs): return self._cython_operation("transform", values, how, axis, **kwargs) def _aggregate( - self, - result, - counts, - values, - comp_ids, - agg_func, - is_datetimelike, - min_count=-1, + self, result, counts, values, comp_ids, agg_func, is_datetimelike, min_count=-1 ): if values.ndim > 2: # punting for now @@ -634,13 +621,7 @@ def _aggregate( return result def _transform( - self, - result, - values, - comp_ids, - transform_func, - is_datetimelike, - **kwargs + self, result, values, comp_ids, transform_func, is_datetimelike, **kwargs ): comp_ids, _, ngroups = self.group_info @@ -649,7 +630,9 @@ def _transform( raise NotImplementedError("number of dimensions is currently limited to 2") elif transform_func is libgroupby.group_rank: # different signature from the others - transform_func(result, values, comp_ids, is_datetimelike=is_datetimelike, **kwargs) + transform_func( + result, values, comp_ids, is_datetimelike=is_datetimelike, **kwargs + ) else: transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) From ce253d8e62f92a31f127584bb75b057770af595c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 29 Oct 2019 20:03:10 -0700 Subject: [PATCH 4/5] REF: remove libgroupby dict cases --- pandas/_libs/groupby.pyx | 2 +- pandas/core/groupby/ops.py | 29 +++++++---------------------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 1261d7392f4d5..217010b85c8f9 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -931,7 +931,7 @@ def group_last(rank_t[:, :] out, def group_nth(rank_t[:, :] out, int64_t[:] counts, rank_t[:, :] values, - const int64_t[:] labels, int64_t rank, + const int64_t[:] labels, int64_t rank=1, Py_ssize_t min_count=-1): """ Only aggregates on axis=0 diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 81fbb81f3f423..4dd30a3a5c55e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -319,12 +319,9 @@ def get_group_levels(self): "min": "group_min", "max": "group_max", "mean": "group_mean", - "median": {"name": "group_median"}, + "median": "group_median", "var": 
"group_var", - "first": { - "name": "group_nth", - "f": lambda func, a, b, c, d, e: func(a, b, c, d, 1, -1), - }, + "first": "group_nth", "last": "group_last", "ohlc": "group_ohlc", }, @@ -379,23 +376,7 @@ def get_func(fname): ftype = self._cython_functions[kind][how] - if isinstance(ftype, dict): - # we only get here with kind == "aggregate" and - # how == "first" or "median" - func = afunc = get_func(ftype["name"]) - - # a sub-function - f = ftype.get("f") - if f is not None: - - def wrapper(*args, **kwargs): - return f(afunc, *args, **kwargs) - - # need to curry our sub-function - func = wrapper - - else: - func = get_func(ftype) + func = get_func(ftype) if func is None: raise NotImplementedError( @@ -562,6 +543,10 @@ def _aggregate( if values.ndim > 2: # punting for now raise NotImplementedError("number of dimensions is currently limited to 2") + elif agg_func is libgroupby.group_nth: + # different signature from the others + # TODO: should we be using min_count instead of hard-coding it? + agg_func(result, counts, values, comp_ids, rank=1, min_count=-1) else: agg_func(result, counts, values, comp_ids, min_count) From eed591d57906c23fe66b0863a55ebb49da0e6a09 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 30 Oct 2019 16:16:40 -0700 Subject: [PATCH 5/5] add dummy arg for group_rank --- pandas/_libs/groupby.pyx | 4 ++++ pandas/core/groupby/ops.py | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 217010b85c8f9..98be97def3dda 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1022,6 +1022,7 @@ def group_nth(rank_t[:, :] out, def group_rank(float64_t[:, :] out, rank_t[:, :] values, const int64_t[:] labels, + int ngroups, bint is_datetimelike, object ties_method="average", bint ascending=True, bint pct=False, object na_option="keep"): """ @@ -1033,6 +1034,9 @@ def group_rank(float64_t[:, :] out, values : array of rank_t values to be ranked labels : array containing unique label for each group, with its ordering matching up to the corresponding record in `values` + ngroups : int + This parameter is not used, is needed to match signatures of other + groupby functions. is_datetimelike : bool, default False unused in this method but provided for call compatibility with other Cython transformations diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4dd30a3a5c55e..2a7fd079679a4 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -560,11 +560,6 @@ def _transform( if values.ndim > 2: # punting for now raise NotImplementedError("number of dimensions is currently limited to 2") - elif transform_func is libgroupby.group_rank: - # different signature from the others - transform_func( - result, values, comp_ids, is_datetimelike=is_datetimelike, **kwargs - ) else: transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)