-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
API: add numeric_only support to groupby agg #58132
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -219,6 +219,25 @@ def _obj_with_exclusions(self): | |
else: | ||
return self.obj | ||
|
||
@final | ||
@cache_readonly | ||
def _obj_numeric_only_with_exclusions(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the performance of this change needs to be closely studied to see if it is really worth the complexity (and extra memory usage). In any case, I do not think it should be introduced here, but rather in its own PR. Can you follow the existing pattern of subsetting the data directly in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. related to comment I put below: |
||
if isinstance(self.obj, ABCSeries): | ||
return self.obj.select_dtypes("number") | ||
|
||
if self._selection is not None: | ||
return self.obj[self._selection_list].select_dtypes("number") | ||
|
||
if len(self.exclusions) > 0: | ||
# equivalent to `self.obj.drop(self.exclusions, axis=1) | ||
# but this avoids consolidating and making a copy | ||
# TODO: following GH#45287 can we now use .drop directly without | ||
# making a copy? | ||
obj = self.obj._drop_axis(self.exclusions, axis=1, only_slice=True) | ||
return obj.select_dtypes("number") | ||
else: | ||
return self.obj.select_dtypes("number") | ||
|
||
def __getitem__(self, key): | ||
if self._selection is not None: | ||
raise IndexError(f"Column(s) {self._selection} already selected") | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1553,7 +1553,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) | |
|
||
else: | ||
# try to treat as if we are passing a list | ||
gba = GroupByApply(self, [func], args=(), kwargs={}) | ||
gba = GroupByApply(self, [func], args=args, kwargs=kwargs) | ||
try: | ||
result = gba.agg() | ||
|
||
|
@@ -1582,15 +1582,18 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) | |
|
||
agg = aggregate | ||
|
||
def _python_agg_general(self, func, *args, **kwargs): | ||
def _python_agg_general(self, func, *args, numeric_only=False, **kwargs): | ||
f = lambda x: func(x, *args, **kwargs) | ||
|
||
if self.ngroups == 0: | ||
# e.g. test_evaluate_with_empty_groups different path gets different | ||
# result dtype in empty case. | ||
return self._python_apply_general(f, self._selected_obj, is_agg=True) | ||
|
||
obj = self._obj_with_exclusions | ||
if numeric_only: | ||
obj = self._obj_numeric_only_with_exclusions | ||
else: | ||
obj = self._obj_with_exclusions | ||
|
||
if not len(obj.columns): | ||
# e.g. test_margins_no_values_no_cols | ||
|
@@ -1605,19 +1608,19 @@ def _python_agg_general(self, func, *args, **kwargs): | |
res.columns = obj.columns.copy(deep=False) | ||
return self._wrap_aggregated_output(res) | ||
|
||
def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: | ||
def _aggregate_frame(self, func, *args, numeric_only=False, **kwargs) -> DataFrame: | ||
if self._grouper.nkeys != 1: | ||
raise AssertionError("Number of keys must be 1") | ||
|
||
obj = self._obj_with_exclusions | ||
|
||
mgr = self._get_data_to_aggregate(numeric_only=numeric_only) | ||
data = self._wrap_agged_manager(mgr) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see any advantage (or disadvantage - for that matter) of calling this obj vs data. Because of this, I think we should leave it as obj (to avoid the code churn). If you see a reason to prefer data over obj, let me know! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, the issue I was having on this point was because On my local branch, I simplified this by creating another object:
and in those segments:
Do you have any thoughts on this approach? |
||
result: dict[Hashable, NDFrame | np.ndarray] = {} | ||
for name, grp_df in self._grouper.get_iterator(obj): | ||
for name, grp_df in self._grouper.get_iterator(data): | ||
fres = func(grp_df, *args, **kwargs) | ||
result[name] = fres | ||
|
||
result_index = self._grouper.result_index | ||
out = self.obj._constructor(result, index=obj.columns, columns=result_index) | ||
out = self.obj._constructor(result, index=data.columns, columns=result_index) | ||
out = out.T | ||
|
||
return out | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1663,3 +1663,53 @@ def func(x): | |
msg = "length must not be 0" | ||
with pytest.raises(ValueError, match=msg): | ||
df.groupby("A", observed=False).agg(func) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"aggfunc", | ||
[ | ||
"mean", | ||
np.mean, | ||
["sum", "mean"], | ||
[np.sum, np.mean], | ||
["sum", np.mean], | ||
lambda x: x.mean(), | ||
{"A": "mean"}, | ||
{"A": "mean", "B": "sum"}, | ||
{"A": np.mean}, | ||
], | ||
ids=[ | ||
" string_mean ", | ||
" numpy_mean ", | ||
" list_of_str_and_str ", | ||
Comment on lines
+1682
to
+1684
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why the spaces? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Think it was ease for reading for myself when it was showing up on my tests, so it was easier to read
Will make sure to cleanup before taking off draft marker though. |
||
" list_of_numpy_and_numpy ", | ||
" list_of_str_and_numpy ", | ||
" lambda ", | ||
" dict_with_str ", | ||
" dict with 2 vars ", | ||
" dict with numpy", | ||
], | ||
) | ||
@pytest.mark.parametrize( | ||
"groupers", | ||
["groupby1", "groupby2", ["groupby1", "groupby2"]], | ||
ids=[" 1_grouper_str ", " 1_grouper_int ", " 2_groupers_str_and_int "], | ||
) | ||
@pytest.mark.parametrize( | ||
"numeric_only", [True, None], ids=[" numeric_only True ", " no_numeric_only_arg "] | ||
) # need to add other kwargs | ||
def test_different_combinations_of_groupby_agg(aggfunc, groupers, numeric_only): | ||
df = DataFrame( | ||
{ | ||
"A": [1, 2, 3, 4, 5], | ||
"B": [10, 20, 30, 40, 50], | ||
"groupby1": ["diamond", "diamond", "spade", "spade", "spade"], | ||
"groupby2": [1, 1, 1, 2, 2], | ||
"attr": ["a", "b", "c", "d", "e"], | ||
} | ||
) | ||
if numeric_only or isinstance(aggfunc, dict): | ||
df.groupby(by=groupers).agg(func=aggfunc, numeric_only=numeric_only) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You need to check that the result is as expected. See other tests above for examples. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yup, sorry, I was in the middle of this ticket when I encountered the bug around #39169 which was an issue because I will look into submitting a fix with an explicit |
||
else: | ||
with pytest.raises(TypeError): | ||
df.groupby(by=groupers).agg(func=aggfunc) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you revert this change. If you make it
**kwargs
, then users will run into an error if they try to pass through e.g.selected_obj
as a keyword argument.