Skip to content

API: Clarify difference between agg and apply for Series / DataFrame #49672

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 51 additions & 42 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
Iterable,
Iterator,
List,
Literal,
Sequence,
cast,
)
Expand Down Expand Up @@ -158,19 +159,54 @@ def agg(self) -> DataFrame | Series | None:
return self.apply_str()

if is_dict_like(arg):
return self.agg_dict_like()
return self.dict_like("agg")
elif is_list_like(arg):
# we require a list, but not a 'str'
return self.agg_list_like()
return self.list_like("agg")

if callable(arg):
f = com.get_cython_func(arg)
if f and not args and not kwargs:
return getattr(obj, f)()
elif not isinstance(obj, SelectionMixin):
# i.e. obj is Series or DataFrame
return self.agg_udf()

# caller can react
return None

def agg_udf(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is "udf" shorthand for here? Can you rename it to something more explicit and/or add a docstring & type hints to make this easier to understand?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

User Defined Function; I'd prefer sticking with the name, but I completely agree it should be expanded upon in a docstring.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

obj = self.obj
arg = cast(Callable, self.f)

if not isinstance(obj, SelectionMixin):
# i.e. obj is Series or DataFrame
selected_obj = obj
elif obj._selected_obj.ndim == 1:
# For SeriesGroupBy this matches _obj_with_exclusions
selected_obj = obj._selected_obj
else:
selected_obj = obj._obj_with_exclusions

results = []

if selected_obj.ndim == 1:
colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
return arg(colg)

indices = []
for index, col in enumerate(selected_obj):
colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
new_res = arg(colg)
results.append(new_res)
indices.append(index)
keys = selected_obj.columns.take(indices)

from pandas import Series

result = Series(results, index=keys)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we be using something._constructor here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could be using selected_obj._constructor_sliced

return result

def transform(self) -> DataFrame | Series:
"""
Transform a DataFrame or Series.
Expand Down Expand Up @@ -284,7 +320,7 @@ def transform_str_or_callable(self, func) -> DataFrame | Series:
except Exception:
return func(obj, *args, **kwargs)

def agg_list_like(self) -> DataFrame | Series:
def list_like(self, method: Literal["agg", "apply"]) -> DataFrame | Series:
"""
Compute aggregation in the case of a list-like argument.

Expand Down Expand Up @@ -316,7 +352,7 @@ def agg_list_like(self) -> DataFrame | Series:
if selected_obj.ndim == 1:
for a in arg:
colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
new_res = colg.aggregate(a)
new_res = getattr(colg, method)(a)
results.append(new_res)

# make sure we find a good name
Expand All @@ -328,7 +364,7 @@ def agg_list_like(self) -> DataFrame | Series:
indices = []
for index, col in enumerate(selected_obj):
colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
new_res = colg.aggregate(arg)
new_res = getattr(colg, method)(arg)
results.append(new_res)
indices.append(index)
keys = selected_obj.columns.take(indices)
Expand Down Expand Up @@ -357,7 +393,7 @@ def agg_list_like(self) -> DataFrame | Series:
)
return concatenated.reindex(full_ordered_index, copy=False)

def agg_dict_like(self) -> DataFrame | Series:
def dict_like(self, method: Literal["agg", "apply"]) -> DataFrame | Series:
"""
Compute aggregation in the case of a dict-like argument.

Expand All @@ -382,16 +418,17 @@ def agg_dict_like(self) -> DataFrame | Series:
selected_obj = obj._selected_obj
selection = obj._selection

arg = self.normalize_dictlike_arg("agg", selected_obj, arg)
arg = self.normalize_dictlike_arg(method, selected_obj, arg)

if selected_obj.ndim == 1:
# key only used for output
colg = obj._gotitem(selection, ndim=1)
results = {key: colg.agg(how) for key, how in arg.items()}
results = {key: getattr(colg, method)(how) for key, how in arg.items()}
else:
# key used for column selection and output
results = {
key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()
key: getattr(obj._gotitem(key, ndim=1), method)(how)
for key, how in arg.items()
}

# set the final keys
Expand All @@ -412,7 +449,7 @@ def agg_dict_like(self) -> DataFrame | Series:
ktu._set_names(selected_obj.columns.names)
keys_to_use = ktu

axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1
axis: AxisInt = 0 if isinstance(obj, ABCSeries) and method == "agg" else 1
result = concat(
{k: results[k] for k in keys_to_use}, # type: ignore[misc]
axis=axis,
Expand Down Expand Up @@ -477,7 +514,10 @@ def apply_multiple(self) -> DataFrame | Series:
result: Series, DataFrame, or None
Result when self.f is a list-like or dict-like, None otherwise.
"""
return self.obj.aggregate(self.f, self.axis, *self.args, **self.kwargs)
if is_dict_like(self.f):
return self.dict_like("apply")
else:
return self.list_like("apply")

def normalize_dictlike_arg(
self, how: str, obj: DataFrame | Series, func: AggFuncTypeDict
Expand Down Expand Up @@ -676,9 +716,6 @@ def agg(self):
if axis == 1:
result = result.T if result is not None else result

if result is None:
result = self.obj.apply(self.orig_f, axis, args=self.args, **self.kwargs)

return result

def apply_empty_result(self):
Expand Down Expand Up @@ -1009,34 +1046,6 @@ def apply(self) -> DataFrame | Series:
# self.f is Callable
return self.apply_standard()

def agg(self):
    """
    Aggregate ``self.obj``, falling back to evaluating a callable ``self.f``.

    The parent implementation is tried first.  When it returns ``None``,
    ``self.f`` is evaluated via ``self.obj.apply``; if that raises
    ``ValueError``, ``AttributeError`` or ``TypeError``, ``self.f`` is
    called on ``self.obj`` directly instead.
    """
    aggregated = super().agg()
    if aggregated is not None:
        return aggregated

    func, extra_kwargs = self.f, self.kwargs

    # string, list-like, and dict-like are entirely handled in super
    assert callable(func)

    # we can be called from an inner function which passes this
    # meta-data along; it is not needed here, so discard it
    extra_kwargs.pop("_level", None)

    # A regular apply is attempted first; this evaluates lambdas
    # row-by-row.  If the lambda instead expects a Series expression,
    # e.g. ``lambda x: x - x.quantile(0.25)``, that attempt fails and the
    # vectorized evaluation is used as the fallback.
    #
    # The vectorized evaluation cannot be tried FIRST, because then
    # .agg and .apply would have different semantics whenever the
    # operation is actually defined on the Series, e.g. str.
    try:
        return self.obj.apply(func)
    except (ValueError, AttributeError, TypeError):
        return func(self.obj)

def apply_empty_result(self) -> Series:
obj = self.obj
return obj._constructor(dtype=obj.dtype, index=obj.index).__finalize__(
Expand Down
35 changes: 16 additions & 19 deletions pandas/tests/apply/test_frame_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ def test_apply_axis1_with_ea():
"data, dtype",
[(1, None), (1, CategoricalDtype([1])), (Timestamp("2013-01-01", tz="UTC"), None)],
)
def test_agg_axis1_duplicate_index(data, dtype):
def test_apply_axis1_duplicate_index(data, dtype):
# GH 42380
expected = DataFrame([[data], [data]], index=["a", "a"], dtype=dtype)
result = expected.agg(lambda x: x, axis=1)
result = expected.apply(lambda x: x, axis=1)
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -1065,8 +1065,6 @@ def test_consistency_for_boxed(box, int_frame_const_col):


def test_agg_transform(axis, float_frame):
other_axis = 1 if axis in {0, "index"} else 0

with np.errstate(all="ignore"):

f_abs = np.abs(float_frame)
Expand All @@ -1080,25 +1078,17 @@ def test_agg_transform(axis, float_frame):
# list-like
result = float_frame.apply([np.sqrt], axis=axis)
expected = f_sqrt.copy()
if axis in {0, "index"}:
expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]])
else:
expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]])
expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]])
tm.assert_frame_equal(result, expected)

# multiple items in list
# these are in the order as if we are applying both
# functions per series and then concatting
result = float_frame.apply([np.abs, np.sqrt], axis=axis)
expected = zip_frames([f_abs, f_sqrt], axis=other_axis)
if axis in {0, "index"}:
expected.columns = MultiIndex.from_product(
[float_frame.columns, ["absolute", "sqrt"]]
)
else:
expected.index = MultiIndex.from_product(
[float_frame.index, ["absolute", "sqrt"]]
)
expected = zip_frames([f_abs, f_sqrt], axis=1)
expected.columns = MultiIndex.from_product(
[float_frame.columns, ["absolute", "sqrt"]]
)
tm.assert_frame_equal(result, expected)


Expand Down Expand Up @@ -1486,10 +1476,10 @@ def test_apply_empty_list_reduce():
tm.assert_series_equal(result, expected)


def test_apply_no_suffix_index():
def test_agg_no_suffix_index():
# GH36189
pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"])
result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()])
result = pdf.agg(["sum", lambda x: x.sum(), lambda x: x.sum()])
expected = DataFrame(
{"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "<lambda>", "<lambda>"]
)
Expand Down Expand Up @@ -1624,3 +1614,10 @@ def test_any_apply_keyword_non_zero_axis_regression():

result = df.apply("any", 1)
tm.assert_series_equal(result, expected)


def test_agg_list_aggregated():
    """Check that ``DataFrame.agg(list)`` gives a Series holding one list per column."""
    frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    expected = Series({"a": [1, 2, 3], "b": [4, 5, 6]})
    tm.assert_series_equal(frame.agg(list), expected)
Loading