Skip to content

REF: Simplify _cython_functions lookup #29246

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Oct 31, 2019
6 changes: 3 additions & 3 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -931,7 +931,7 @@ def group_last(rank_t[:, :] out,
def group_nth(rank_t[:, :] out,
int64_t[:] counts,
rank_t[:, :] values,
const int64_t[:] labels, int64_t rank,
const int64_t[:] labels, int64_t rank=1,
Py_ssize_t min_count=-1):
"""
Only aggregates on axis=0
Expand Down Expand Up @@ -1022,8 +1022,8 @@ def group_nth(rank_t[:, :] out,
def group_rank(float64_t[:, :] out,
rank_t[:, :] values,
const int64_t[:] labels,
bint is_datetimelike, object ties_method,
bint ascending, bint pct, object na_option):
bint is_datetimelike, object ties_method="average",
bint ascending=True, bint pct=False, object na_option="keep"):
"""
Provides the rank of values within each group.

Expand Down
76 changes: 17 additions & 59 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,12 +319,9 @@ def get_group_levels(self):
"min": "group_min",
"max": "group_max",
"mean": "group_mean",
"median": {"name": "group_median"},
"median": "group_median",
"var": "group_var",
"first": {
"name": "group_nth",
"f": lambda func, a, b, c, d, e: func(a, b, c, d, 1, -1),
},
"first": "group_nth",
"last": "group_last",
"ohlc": "group_ohlc",
},
Expand All @@ -333,19 +330,7 @@ def get_group_levels(self):
"cumsum": "group_cumsum",
"cummin": "group_cummin",
"cummax": "group_cummax",
"rank": {
"name": "group_rank",
"f": lambda func, a, b, c, d, e, **kwargs: func(
a,
b,
c,
e,
kwargs.get("ties_method", "average"),
kwargs.get("ascending", True),
kwargs.get("pct", False),
kwargs.get("na_option", "keep"),
),
},
"rank": "group_rank",
},
}

Expand Down Expand Up @@ -391,21 +376,7 @@ def get_func(fname):

ftype = self._cython_functions[kind][how]

if isinstance(ftype, dict):
func = afunc = get_func(ftype["name"])

# a sub-function
f = ftype.get("f")
if f is not None:

def wrapper(*args, **kwargs):
return f(afunc, *args, **kwargs)

# need to curry our sub-function
func = wrapper

else:
func = get_func(ftype)
func = get_func(ftype)

if func is None:
raise NotImplementedError(
Expand Down Expand Up @@ -517,14 +488,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs):
)
counts = np.zeros(self.ngroups, dtype=np.int64)
result = self._aggregate(
result,
counts,
values,
labels,
func,
is_numeric,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was is_numeric just not used at all?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAICT

is_datetimelike,
min_count,
result, counts, values, labels, func, is_datetimelike, min_count
)
elif kind == "transform":
result = _maybe_fill(
Expand All @@ -533,7 +497,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs):

# TODO: min_count
result = self._transform(
result, values, labels, func, is_numeric, is_datetimelike, **kwargs
result, values, labels, func, is_datetimelike, **kwargs
)

if is_integer_dtype(result) and not is_datetimelike:
Expand Down Expand Up @@ -574,39 +538,33 @@ def transform(self, values, how, axis=0, **kwargs):
return self._cython_operation("transform", values, how, axis, **kwargs)

def _aggregate(
self,
result,
counts,
values,
comp_ids,
agg_func,
is_numeric,
is_datetimelike,
min_count=-1,
self, result, counts, values, comp_ids, agg_func, is_datetimelike, min_count=-1
):
if values.ndim > 2:
# punting for now
raise NotImplementedError("number of dimensions is currently limited to 2")
elif agg_func is libgroupby.group_nth:
# different signature from the others
# TODO: should we be using min_count instead of hard-coding it?
agg_func(result, counts, values, comp_ids, rank=1, min_count=-1)
else:
agg_func(result, counts, values, comp_ids, min_count)

return result

def _transform(
self,
result,
values,
comp_ids,
transform_func,
is_numeric,
is_datetimelike,
**kwargs
self, result, values, comp_ids, transform_func, is_datetimelike, **kwargs
):

comp_ids, _, ngroups = self.group_info
if values.ndim > 2:
# punting for now
raise NotImplementedError("number of dimensions is currently limited to 2")
elif transform_func is libgroupby.group_rank:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May or may not be viable but is there a way to make transform_func a partial somewhere upstream to just freeze the one-off arguments that rank would need rather than special-casing here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think for that we'd need to do a partial on all the other cases, or just add the ngroups kwarg to the group_rank signature.

On the margin I'd like to have fewer partials/lambdas floating around the groupby code

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we just add ngroups to the rank signatures and then this would just work?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you update for this (and for nth above as well) so we don't have differing signatures

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm ambivalent about this. Having these two extra checks here is non-pretty, but changing the signature in the cython func means we have unused args/kwargs there, which is a code smell. @WillAyd thoughts?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with Jeff and think the unused args would be preferable, at least to try and make these as generic as possible. Could also add a check within the function bodies that they are unused.

I've been guilty of this in the past myself but I think adding special cases in methods like this for one-off function applications is more difficult to track over time

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've been outvoted, will change. In the meantime i'll draw your attention to #29294 which should hopefully fix the CI

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so this is easy to do for group_rank, but would mean a small behavior change for group_nth (which currently ignores min_count). do we want that?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok let's fix rank for now and discuss nth? (we can also fix the signature and just ignore the arg for now).

# different signature from the others
transform_func(
result, values, comp_ids, is_datetimelike=is_datetimelike, **kwargs
)
else:
transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

Expand Down