Skip to content

API: add numeric_only support to groupby agg #58132

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 9 additions & 11 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,13 +303,14 @@ def agg_list_like(self) -> DataFrame | Series:
-------
Result of aggregation.
"""
return self.agg_or_apply_list_like(op_name="agg")
kwargs = self.kwargs
return self.agg_or_apply_list_like(op_name="agg", **kwargs)

def compute_list_like(
self,
op_name: Literal["agg", "apply"],
selected_obj: Series | DataFrame,
kwargs: dict[str, Any],
**kwargs: dict[str, Any],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you revert this change. If you make it **kwargs, then users will run into an error if they try to pass through e.g. selected_obj as a keyword argument.

) -> tuple[list[Hashable] | Index, list[Any]]:
"""
Compute agg/apply results for like-like input.
Expand Down Expand Up @@ -348,7 +349,6 @@ def compute_list_like(
)
new_res = getattr(colg, op_name)(a, *args, **kwargs)
results.append(new_res)

# make sure we find a good name
name = com.get_callable_name(a) or a
keys.append(name)
Expand Down Expand Up @@ -691,10 +691,9 @@ def agg_axis(self) -> Index:
return self.obj._get_agg_axis(self.axis)

def agg_or_apply_list_like(
self, op_name: Literal["agg", "apply"]
self, op_name: Literal["agg", "apply"], numeric_only=False, **kwargs
) -> DataFrame | Series:
obj = self.obj
kwargs = self.kwargs

if op_name == "apply":
if isinstance(self, FrameApply):
Expand All @@ -709,7 +708,7 @@ def agg_or_apply_list_like(
if getattr(obj, "axis", 0) == 1:
raise NotImplementedError("axis other than 0 is not supported")

keys, results = self.compute_list_like(op_name, obj, kwargs)
keys, results = self.compute_list_like(op_name, obj, **kwargs)
result = self.wrap_results_list_like(keys, results)
return result

Expand Down Expand Up @@ -1485,28 +1484,27 @@ def transform(self):
raise NotImplementedError

def agg_or_apply_list_like(
self, op_name: Literal["agg", "apply"]
self, op_name: Literal["agg", "apply"], numeric_only=False, **kwargs
) -> DataFrame | Series:
obj = self.obj
kwargs = self.kwargs
if op_name == "apply":
kwargs = {**kwargs, "by_row": False}

if getattr(obj, "axis", 0) == 1:
raise NotImplementedError("axis other than 0 is not supported")

if obj._selected_obj.ndim == 1:
# For SeriesGroupBy this matches _obj_with_exclusions
selected_obj = obj._selected_obj
elif numeric_only:
selected_obj = obj._obj_numeric_only_with_exclusions
else:
selected_obj = obj._obj_with_exclusions

# Only set as_index=True on groupby objects, not Window or Resample
# that inherit from this class.
with com.temp_setattr(
obj, "as_index", True, condition=hasattr(obj, "as_index")
):
keys, results = self.compute_list_like(op_name, selected_obj, kwargs)
keys, results = self.compute_list_like(op_name, selected_obj, **kwargs)
result = self.wrap_results_list_like(keys, results)
return result

Expand Down
19 changes: 19 additions & 0 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,25 @@ def _obj_with_exclusions(self):
else:
return self.obj

@final
@cache_readonly
def _obj_numeric_only_with_exclusions(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the performance of this change needs to be closely studied to see if it is really worth the complexity (and extra memory usage). In any case, I do not think it should be introduced here, but rather in its own PR. Can you follow the existing pattern of subsetting the data directly in the agg methods instead.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

related to comment I put below:

#58132 (comment)

if isinstance(self.obj, ABCSeries):
return self.obj.select_dtypes("number")

if self._selection is not None:
return self.obj[self._selection_list].select_dtypes("number")

if len(self.exclusions) > 0:
# equivalent to `self.obj.drop(self.exclusions, axis=1)
# but this avoids consolidating and making a copy
# TODO: following GH#45287 can we now use .drop directly without
# making a copy?
obj = self.obj._drop_axis(self.exclusions, axis=1, only_slice=True)
return obj.select_dtypes("number")
else:
return self.obj.select_dtypes("number")

def __getitem__(self, key):
if self._selection is not None:
raise IndexError(f"Column(s) {self._selection} already selected")
Expand Down
19 changes: 11 additions & 8 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1553,7 +1553,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)

else:
# try to treat as if we are passing a list
gba = GroupByApply(self, [func], args=(), kwargs={})
gba = GroupByApply(self, [func], args=args, kwargs=kwargs)
try:
result = gba.agg()

Expand Down Expand Up @@ -1582,15 +1582,18 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)

agg = aggregate

def _python_agg_general(self, func, *args, **kwargs):
def _python_agg_general(self, func, *args, numeric_only=False, **kwargs):
f = lambda x: func(x, *args, **kwargs)

if self.ngroups == 0:
# e.g. test_evaluate_with_empty_groups different path gets different
# result dtype in empty case.
return self._python_apply_general(f, self._selected_obj, is_agg=True)

obj = self._obj_with_exclusions
if numeric_only:
obj = self._obj_numeric_only_with_exclusions
else:
obj = self._obj_with_exclusions

if not len(obj.columns):
# e.g. test_margins_no_values_no_cols
Expand All @@ -1605,19 +1608,19 @@ def _python_agg_general(self, func, *args, **kwargs):
res.columns = obj.columns.copy(deep=False)
return self._wrap_aggregated_output(res)

def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
def _aggregate_frame(self, func, *args, numeric_only=False, **kwargs) -> DataFrame:
if self._grouper.nkeys != 1:
raise AssertionError("Number of keys must be 1")

obj = self._obj_with_exclusions

mgr = self._get_data_to_aggregate(numeric_only=numeric_only)
data = self._wrap_agged_manager(mgr)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see any advantage (or disadvantage - for that matter) of calling this obj vs data. Because of this, I think we should leave it as obj (to avoid the code churn).

If you see a reason to prefer data over obj, let me know!

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, the issue I was having on this point was because _obj_with_exclusions doesn't update when the dataframe is updated to remove non-numeric columns. I assume this is related to it being a @cache_readonly in pandas.core.base.py. My tentative workaround was to generate the data again.

On my local branch, I simplified this by creating another object:

@cache_readonly
def _obj_numeric_only_with_exclusions(self):

and in those segments:

if numeric_only:
    obj = self._obj_numeric_only_with_exclusions(self):
else:
    obj = self._obj_with_exclusions

Do you have any thoughts on this approach?

result: dict[Hashable, NDFrame | np.ndarray] = {}
for name, grp_df in self._grouper.get_iterator(obj):
for name, grp_df in self._grouper.get_iterator(data):
fres = func(grp_df, *args, **kwargs)
result[name] = fres

result_index = self._grouper.result_index
out = self.obj._constructor(result, index=obj.columns, columns=result_index)
out = self.obj._constructor(result, index=data.columns, columns=result_index)
out = out.T

return out
Expand Down
50 changes: 50 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -1663,3 +1663,53 @@ def func(x):
msg = "length must not be 0"
with pytest.raises(ValueError, match=msg):
df.groupby("A", observed=False).agg(func)


@pytest.mark.parametrize(
"aggfunc",
[
"mean",
np.mean,
["sum", "mean"],
[np.sum, np.mean],
["sum", np.mean],
lambda x: x.mean(),
{"A": "mean"},
{"A": "mean", "B": "sum"},
{"A": np.mean},
],
ids=[
" string_mean ",
" numpy_mean ",
" list_of_str_and_str ",
Comment on lines +1682 to +1684
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the spaces?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Think it was ease for reading for myself when it was showing up on my tests, so it was easier to read

xxx - yyyy - zzzz

Will make sure to cleanup before taking off draft marker though.

" list_of_numpy_and_numpy ",
" list_of_str_and_numpy ",
" lambda ",
" dict_with_str ",
" dict with 2 vars ",
" dict with numpy",
],
)
@pytest.mark.parametrize(
"groupers",
["groupby1", "groupby2", ["groupby1", "groupby2"]],
ids=[" 1_grouper_str ", " 1_grouper_int ", " 2_groupers_str_and_int "],
)
@pytest.mark.parametrize(
"numeric_only", [True, None], ids=[" numeric_only True ", " no_numeric_only_arg "]
) # need to add other kwargs
def test_different_combinations_of_groupby_agg(aggfunc, groupers, numeric_only):
df = DataFrame(
{
"A": [1, 2, 3, 4, 5],
"B": [10, 20, 30, 40, 50],
"groupby1": ["diamond", "diamond", "spade", "spade", "spade"],
"groupby2": [1, 1, 1, 2, 2],
"attr": ["a", "b", "c", "d", "e"],
}
)
if numeric_only or isinstance(aggfunc, dict):
df.groupby(by=groupers).agg(func=aggfunc, numeric_only=numeric_only)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need to check that the result is as expected. See other tests above for examples.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yup, sorry, I was in the middle of this ticket when I encountered the bug around #39169 which was an issue because numeric_only being fed as a kwarg sends it down this bugged path, then raised #58132 hoping to address that. We've discussed over there that this silent fix isn't ideal and we will be deprecating.

I will look into submitting a fix with an explicit numeric_only argument, as that can change the code path management.

else:
with pytest.raises(TypeError):
df.groupby(by=groupers).agg(func=aggfunc)
Loading