REF: Decouple Series.apply from Series.agg #53400
Changes from 11 commits

@@ -101,6 +101,7 @@ Other enhancements
- :meth:`DataFrame.unstack` gained the ``sort`` keyword to dictate whether the resulting :class:`MultiIndex` levels are sorted (:issue:`15105`)
- :meth:`SeriesGroupby.agg` and :meth:`DataFrameGroupby.agg` now support passing in multiple functions for ``engine="numba"`` (:issue:`53486`)
- Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`)
- Added a new parameter ``array_ops_only`` to :meth:`Series.apply`. When set to ``True`` the supplied callables will always operate on the whole Series (:issue:`53400`).

Review comment: by_row now; not array_ops_only.
Reply: Yeah, changed.

- Many read/to_* functions, such as :meth:`DataFrame.to_pickle` and :func:`read_csv`, support forwarding compression arguments to lzma.LZMAFile (:issue:`52979`)
- Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`)
- Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`)
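
For the ``Series.apply`` entry above: per the review comment, the keyword was renamed from ``array_ops_only`` to ``by_row`` within this PR, with ``by_row=False`` corresponding to "operate on the whole Series". A minimal usage sketch, assuming the renamed keyword (exact defaults may differ in the merged version):

import pandas as pd

ser = pd.Series([1, 2, 3])

# Default behavior: the callable is applied element-wise.
ser.apply(lambda x: x * 2)                    # -> Series with values [2, 4, 6]

# With by_row=False the callable receives the whole Series at once,
# so reductions work directly instead of going through .agg.
ser.apply(lambda s: s.sum(), by_row=False)    # -> 6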

@@ -16,6 +16,7 @@
    Iterable,
    Iterator,
    List,
    Literal,
    Sequence,
    cast,
)

@@ -288,6 +289,11 @@ def agg_list_like(self) -> DataFrame | Series:
        -------
        Result of aggregation.
        """
        return self.agg_or_apply_list_like(op_name="agg")

    def agg_or_apply_list_like(
        self, op_name: Literal["agg", "apply"]
    ) -> DataFrame | Series:
        from pandas.core.groupby.generic import (
            DataFrameGroupBy,
            SeriesGroupBy,

@@ -296,6 +302,9 @@ def agg_list_like(self) -> DataFrame | Series:
        obj = self.obj
        func = cast(List[AggFuncTypeBase], self.func)
        kwargs = self.kwargs
        if op_name == "apply":
            kwargs = {**kwargs, "by_row": False}

Comment on lines +306 to +307:
Review comment: @topper-123: shouldn't by_row here be True for backwards compatibility?
Reply: On second thought, I'm thinking this should now be
Reply: I think you are right. I'll make a new PR on that.

        if getattr(obj, "axis", 0) == 1:
            raise NotImplementedError("axis other than 0 is not supported")

@@ -313,8 +322,6 @@ def agg_list_like(self) -> DataFrame | Series:
        keys = []

        is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
        is_ser_or_df = isinstance(obj, (ABCDataFrame, ABCSeries))
        this_args = [self.axis, *self.args] if is_ser_or_df else self.args

        context_manager: ContextManager
        if is_groupby:

@@ -323,12 +330,19 @@ def agg_list_like(self) -> DataFrame | Series:
            context_manager = com.temp_setattr(obj, "as_index", True)
        else:
            context_manager = nullcontext()

        def include_axis(colg) -> bool:
            return isinstance(colg, ABCDataFrame) or (
                isinstance(colg, ABCSeries) and op_name == "agg"
            )

        with context_manager:
            # degenerate case
            if selected_obj.ndim == 1:
                for a in func:
                    colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
                    new_res = colg.aggregate(a, *this_args, **self.kwargs)
                    args = [self.axis, *self.args] if include_axis(colg) else self.args
                    new_res = getattr(colg, op_name)(a, *args, **kwargs)
                    results.append(new_res)

                    # make sure we find a good name

@@ -339,7 +353,8 @@ def agg_list_like(self) -> DataFrame | Series:
                indices = []
                for index, col in enumerate(selected_obj):
                    colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
                    new_res = colg.aggregate(func, *this_args, **self.kwargs)
                    args = [self.axis, *self.args] if include_axis(colg) else self.args
                    new_res = getattr(colg, op_name)(func, *args, **kwargs)
                    results.append(new_res)
                    indices.append(index)
                keys = selected_obj.columns.take(indices)
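
The loop above now chooses between ``agg`` and ``apply`` via ``op_name`` and decides per column group whether to forward the axis argument. A standalone sketch of that ``include_axis`` decision (illustrative only; the real helper uses the internal ABCDataFrame/ABCSeries checks):

import pandas as pd

def include_axis(colg, op_name: str) -> bool:
    # Mirrors the helper added above: DataFrame column groups always get
    # the axis argument; a Series only gets it on the "agg" path, since
    # Series.apply has no axis parameter to forward it to.
    return isinstance(colg, pd.DataFrame) or (
        isinstance(colg, pd.Series) and op_name == "agg"
    )

print(include_axis(pd.Series([1, 2]), "agg"))    # True
print(include_axis(pd.Series([1, 2]), "apply"))  # False
print(include_axis(pd.DataFrame(), "apply"))     # True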

@@ -366,15 +381,23 @@ def agg_dict_like(self) -> DataFrame | Series:
        -------
        Result of aggregation.
        """
        return self.agg_or_apply_dict_like(op_name="agg")

    def agg_or_apply_dict_like(
        self, op_name: Literal["agg", "apply"]
    ) -> DataFrame | Series:
        from pandas import Index
        from pandas.core.groupby.generic import (
            DataFrameGroupBy,
            SeriesGroupBy,
        )
        from pandas.core.reshape.concat import concat

        assert op_name in ["agg", "apply"]

        obj = self.obj
        func = cast(AggFuncTypeDict, self.func)
        kwds = {"by_row": False} if op_name == "apply" else {}

        if getattr(obj, "axis", 0) == 1:
            raise NotImplementedError("axis other than 0 is not supported")

@@ -387,7 +410,7 @@ def agg_dict_like(self) -> DataFrame | Series:
            selected_obj = obj._selected_obj
            selection = obj._selection

        func = self.normalize_dictlike_arg("agg", selected_obj, func)
        func = self.normalize_dictlike_arg(op_name, selected_obj, func)

        is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
        context_manager: ContextManager

@@ -404,17 +427,18 @@ def agg_dict_like(self) -> DataFrame | Series:
        )

        # Numba Groupby engine/engine-kwargs passthrough
        kwargs = {}
        if is_groupby:
            engine = self.kwargs.get("engine", None)
            engine_kwargs = self.kwargs.get("engine_kwargs", None)
            kwargs = {"engine": engine, "engine_kwargs": engine_kwargs}
            kwds.update({"engine": engine, "engine_kwargs": engine_kwargs})

Review comment: NBD, but I wonder why the change from kwargs to kwds? In pandas.core we overwhelmingly use kwargs instead of kwds.
Reply: kwargs would make a line further down exceed 88 characters and be reformatted to fill 3 lines. So a stylistic preference, but not a strong opinion.
Reply: I think consistency in variable names is more important here.

        with context_manager:
            if selected_obj.ndim == 1:
                # key only used for output
                colg = obj._gotitem(selection, ndim=1)
                result_data = [colg.agg(how, **kwargs) for _, how in func.items()]
                result_data = [
                    getattr(colg, op_name)(how, **kwds) for _, how in func.items()
                ]
                result_index = list(func.keys())
            elif is_non_unique_col:
                # key used for column selection and output

@@ -429,7 +453,7 @@ def agg_dict_like(self) -> DataFrame | Series:
                    label_to_indices[label].append(index)

                key_data = [
                    selected_obj._ixs(indice, axis=1).agg(how, **kwargs)
                    getattr(selected_obj._ixs(indice, axis=1), op_name)(how, **kwds)
                    for label, indices in label_to_indices.items()
                    for indice in indices
                ]

@@ -439,7 +463,7 @@ def agg_dict_like(self) -> DataFrame | Series:
            else:
                # key used for column selection and output
                result_data = [
                    obj._gotitem(key, ndim=1).agg(how, **kwargs)
                    getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwds)
                    for key, how in func.items()
                ]
                result_index = list(func.keys())
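
The dict-like branch above follows the same pattern: under op_name="apply" each per-column callable is routed through .apply with by_row=False instead of .agg. A small usage sketch (illustrative; the expected values assume simple reductions):

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]})

# Each callable receives its whole column (a Series) at once.
res = df.apply({"A": np.sum, "B": lambda s: s.max() - s.min()})
print(res)   # expected: A -> 6, B -> 20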

@@ -535,7 +559,7 @@ def apply_str(self) -> DataFrame | Series:
                self.kwargs["axis"] = self.axis
        return self._apply_str(obj, func, *self.args, **self.kwargs)

    def apply_multiple(self) -> DataFrame | Series:
    def apply_list_or_dict_like(self) -> DataFrame | Series:
        """
        Compute apply in case of a list-like or dict-like.

@@ -551,9 +575,9 @@ def apply_multiple(self) -> DataFrame | Series:
        kwargs = self.kwargs

        if is_dict_like(func):
            result = self.agg_dict_like()
            result = self.agg_or_apply_dict_like(op_name="apply")
        else:
            result = self.agg_list_like()
            result = self.agg_or_apply_list_like(op_name="apply")

        result = reconstruct_and_relabel_result(result, func, **kwargs)

@@ -693,8 +717,8 @@ def values(self):
    def apply(self) -> DataFrame | Series:
        """compute the results"""
        # dispatch to agg
        if is_list_like(self.func):
            return self.apply_multiple()
        if is_list_like(self.func) or is_dict_like(self.func):

Review comment: dicts are considered list-like; no need for the 2nd check here.
Reply: Ok, I changed it. I've changed the comment above instead to explain that dict-likes go here too.

            return self.apply_list_or_dict_like()

        # all empty
        if len(self.columns) == 0 and len(self.index) == 0:
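
The review point above is easy to check against the public type helpers: dicts already count as list-like, so the first condition alone covers both cases.

from pandas.api.types import is_dict_like, is_list_like

print(is_list_like({"a": "max"}))     # True  -> dicts are list-like
print(is_dict_like({"a": "max"}))     # True
print(is_dict_like(["min", "max"]))   # False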

@@ -1041,13 +1065,15 @@ def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame:
class SeriesApply(NDFrameApply):
    obj: Series
    axis: AxisInt = 0
    by_row: bool  # only relevant for apply()

    def __init__(
        self,
        obj: Series,
        func: AggFuncType,
        *,
        convert_dtype: bool | lib.NoDefault = lib.no_default,
        by_row: bool = True,
        args,
        kwargs,
    ) -> None:

@@ -1062,6 +1088,7 @@ def __init__(
                stacklevel=find_stack_level(),
            )
        self.convert_dtype = convert_dtype
        self.by_row = by_row

        super().__init__(
            obj,
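
Only the internal applicator is shown in this diff; the new flag can be exercised directly through it. A sketch using the constructor as introduced here (internal API, subject to change; args/kwargs are required keyword arguments per the signature above):

import pandas as pd
from pandas.core.apply import SeriesApply

ser = pd.Series([1, 2, 3])

# by_row=False makes apply_standard call the function on the whole Series.
result = SeriesApply(ser, lambda s: s.sum(), by_row=False, args=(), kwargs={}).apply()
print(result)   # 6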

@@ -1079,8 +1106,8 @@ def apply(self) -> DataFrame | Series:
            return self.apply_empty_result()

        # dispatch to agg
        if is_list_like(self.func):
            return self.apply_multiple()
        if is_list_like(self.func) or is_dict_like(self.func):

Review comment: ditto
Reply: Ok, changed.

            return self.apply_list_or_dict_like()

        if isinstance(self.func, str):
            # if we are a string, try to dispatch

@@ -1126,6 +1153,8 @@ def apply_standard(self) -> DataFrame | Series:
        if isinstance(func, np.ufunc):
            with np.errstate(all="ignore"):
                return func(obj, *self.args, **self.kwargs)
        elif not self.by_row:
            return func(obj, *self.args, **self.kwargs)

        if self.args or self.kwargs:
            # _map_values does not support args/kwargs
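
The two added lines slot the whole-Series call between the existing ufunc fast path and the element-wise mapping. A condensed sketch of the resulting dispatch order (simplified; the real method also handles convert_dtype and curries args/kwargs for _map_values):

import numpy as np
import pandas as pd

def apply_standard_sketch(obj: pd.Series, func, by_row: bool, args=(), kwargs=None):
    # Simplified mirror of SeriesApply.apply_standard after this change.
    kwargs = kwargs or {}
    if isinstance(func, np.ufunc):
        # 1) ufuncs operate on the whole Series/array in one call
        with np.errstate(all="ignore"):
            return func(obj, *args, **kwargs)
    elif not by_row:
        # 2) new in this PR: by_row=False hands the whole Series to func
        return func(obj, *args, **kwargs)
    # 3) otherwise fall back to element-wise mapping
    return obj.map(lambda x: func(x, *args, **kwargs))

ser = pd.Series([1.0, 4.0, 9.0])
print(apply_standard_sketch(ser, np.sqrt, by_row=True))              # ufunc path
print(apply_standard_sketch(ser, lambda s: s.mean(), by_row=False))  # whole-Series path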