ENH: new .agg for list-likes #43736


Closed · wants to merge 47 commits
Changes from all commits (47 commits)
3dfb779
ENH: new .agg for list-likes
rhshadrach Sep 6, 2021
9ef1eb0
Refactor single arg computation, test fixup
rhshadrach Sep 24, 2021
1974e07
Revert change to GroupBy.agg
rhshadrach Sep 25, 2021
af18184
Merge branch 'master' of https://github.com/pandas-dev/pandas into ne…
rhshadrach Sep 25, 2021
d7b6c7f
Rename option and methods
rhshadrach Sep 25, 2021
af002fc
Merge branch 'master' of https://github.com/pandas-dev/pandas into ne…
rhshadrach Oct 9, 2021
d412b4f
Merge fixups
rhshadrach Oct 9, 2021
0cea15b
BUG/ERR: sparse array cmp methods mismatched len (#43863)
mzeitlin11 Oct 4, 2021
665b304
Add deprecation tag for passing a string for ewm(times=...) (#43873)
mroeschke Oct 4, 2021
214ba4a
Make components of Suffixes Optional (#42544)
scravy Oct 4, 2021
9d6da6d
BUG: Fix dtypes for read_json (#42819)
r-raymond Oct 4, 2021
005598c
TST: dropping of nuisance columns for groupby ops #38815 (#43674)
horaceklai Oct 5, 2021
7afb062
BUG: retain EA dtypes in DataFrame __pos__, __neg__ (#43883)
jbrockmendel Oct 5, 2021
195f9cf
TST: Test Series' settitem with Interval and NaN (#43844)
ElDeveloper Oct 5, 2021
6021c06
PERF: tighter cython declarations, faster __iter__ (#43872)
jbrockmendel Oct 5, 2021
aa0a1d6
PERF: read_csv with memory_map=True when file encoding is UTF-8 (#437…
michal-gh Oct 6, 2021
ef35a19
TYP: enable reportMissingImports (#43790)
twoertwein Oct 6, 2021
eefd0f0
Don't suppress exception chaining for optional dependencies (#43882)
takluyver Oct 6, 2021
d3f5a44
BUG: DataFrame arithmetic with subclass where constructor is not the …
jbrockmendel Oct 6, 2021
1146215
REF: remove _get_attributes_dict (#43895)
jbrockmendel Oct 6, 2021
58ff02d
Annotates `indexers/utils.py` functions that don't return anything wi…
sobolevn Oct 6, 2021
c9b0a6d
CI: Test Python 3.10 on MacOS and Windows too (#43772)
lithomas1 Oct 6, 2021
28c28c7
ENH: ExponentialMovingWindow.sum (#43871)
mroeschke Oct 6, 2021
f157d4d
TST: slow collection in test_algos.py (#43898)
mzeitlin11 Oct 6, 2021
cdc7b4a
ENH: implement ExtensionArray.__array_ufunc__ (#43899)
jbrockmendel Oct 6, 2021
2688ca8
[ENH] introducing IntpHashMap and making unique_label_indices use int…
realead Oct 7, 2021
5fe8d7d
ENH: implement Index.__array_ufunc__ (#43904)
jbrockmendel Oct 7, 2021
a49977c
TST/REF: share/split index tests (#43905)
jbrockmendel Oct 7, 2021
4779171
TST/REF: misplaced Index.putmask tests (#43906)
jbrockmendel Oct 7, 2021
d4ae657
Add clarifications to the docs regarding `to_feather` (#43866)
jmakov Oct 7, 2021
bde9b11
TST/REF: collect/de-dup index tests (#43914)
jbrockmendel Oct 7, 2021
ecab3a2
BENCH: indexing_engines (#43916)
jbrockmendel Oct 7, 2021
adef17c
TST: avoid re-running tests 14 times (#43922)
jbrockmendel Oct 8, 2021
d5716c7
CLN: unnecessary warning-catching (#43919)
jbrockmendel Oct 8, 2021
505ed3f
TST/REF: fixturize (#43918)
jbrockmendel Oct 8, 2021
9ee956b
BUG: NumericIndex.insert (#43933)
jbrockmendel Oct 9, 2021
1e370aa
TST: Skip leaky test on Python 3.10 (#43910)
lithomas1 Oct 9, 2021
acb7650
ENH: EA.tolist (#43920)
jbrockmendel Oct 9, 2021
e12643e
fixed rolling for a decreasing index, added a test for that (#43928)
rosagold Oct 9, 2021
8a454e0
Merge branch 'master' of https://github.com/pandas-dev/pandas into ne…
rhshadrach Oct 10, 2021
3bba371
Added docs
rhshadrach Oct 18, 2021
4a69adf
Merge branch 'master' of https://github.com/pandas-dev/pandas into ne…
rhshadrach Oct 18, 2021
7abdff9
Make quotes consistent
rhshadrach Oct 18, 2021
a72a5eb
Fixup docs
rhshadrach Oct 19, 2021
f8aa318
Merge branch 'master' of https://github.com/pandas-dev/pandas into ne…
rhshadrach Nov 7, 2021
afc27ba
Merge cleanup
rhshadrach Nov 7, 2021
f42eb00
Merge branch 'new_udfs_list_agg' of https://github.com/rhshadrach/pan…
rhshadrach Nov 7, 2021
72 changes: 72 additions & 0 deletions doc/source/user_guide/future_udf_behavior.rst
@@ -0,0 +1,72 @@
.. _future_udf_behavior:

:orphan:

{{ header }}

*******************
Future UDF Behavior
*******************

pandas is experimenting with improving the behavior of methods that take a
user-defined function (UDF). These methods include ``.apply``, ``.agg``, ``.transform``,
and ``.filter``. The goal is to make these methods behave in a more predictable
and consistent manner, to reduce the complexity of their implementation, and to improve
performance where possible. This page details the differences between the old and
new behaviors and provides some context for each change being made.

A great number of changes are planned. To make the transition manageable for users,
all of the changes sit behind an experimental ``future_udf_behavior`` option, which is
currently subject to breaking changes without notice. Users can opt into the new
behavior and provide feedback. Once the improvements have been made, the option will
no longer be considered experimental, and pandas will raise a ``FutureWarning``
announcing that the default value of this option will be changed to ``True`` in a
future version. Once the default is ``True``, users can still override it to ``False``.
After a sufficient amount of time, pandas will remove this option altogether and only
the future behavior will remain.
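
For example, the new behavior can be enabled globally with ``pd.set_option`` or, as in
the examples on this page, locally with ``pd.option_context`` so that the change does
not leak outside the block; a minimal sketch:

.. ipython:: python

    pd.set_option("future_udf_behavior", True)
    pd.get_option("future_udf_behavior")
    pd.set_option("future_udf_behavior", False)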

``DataFrame.agg`` with list-likes
---------------------------------

Previously, using ``DataFrame.agg`` with a list-like argument produced a result that was
transposed compared with providing a single aggregation function.

.. ipython:: python

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

df.agg("sum")
df.agg(["sum"])

Under the future behavior this transpose no longer occurs, making the result consistent
with the single-function case.

.. ipython:: python

with pd.option_context("future_udf_behavior", True):
result = df.agg(["sum"])
result

with pd.option_context("future_udf_behavior", True):
result = df.agg(["sum", "mean"])
result
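
If the previous orientation (aggregation functions as rows) is ever needed, it can be
recovered by transposing the new result; a minimal sketch:

.. ipython:: python

    result.T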

``DataFrame.groupby(...).agg`` with list-likes
----------------------------------------------

Previously, using ``DataFrame.groupby(...).agg`` with a list-like argument placed the
original columns in the first level of the resulting hierarchical columns. As a result,
the columns produced by each aggregation function are separated from one another, which
is inconsistent with the result for a single aggregator.

.. ipython:: python

df.groupby("a").agg("sum")
df.groupby("a").agg(["sum", "min"])

Under the future behavior the levels are swapped, so that the columns for each
aggregation function are grouped together.

.. ipython:: python

with pd.option_context("future_udf_behavior", True):
result = df.groupby("a").agg(["sum", "min"])
result
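
The previous column-major grouping can be recovered by swapping and sorting the column
levels; a minimal sketch (the ordering within each group may differ from the legacy
output):

.. ipython:: python

    result.swaplevel(axis=1).sort_index(axis=1)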
92 changes: 90 additions & 2 deletions pandas/core/apply.py
@@ -22,7 +22,10 @@

import numpy as np

from pandas._config import option_context
from pandas._config import (
get_option,
option_context,
)

from pandas._libs import lib
from pandas._typing import (
@@ -169,7 +172,10 @@ def agg(self) -> DataFrame | Series | None:
return self.agg_dict_like()
elif is_list_like(arg):
# we require a list, but not a 'str'
return self.agg_list_like()
if get_option("future_udf_behavior"):
return self.future_list_like("agg")
else:
return self.agg_list_like()

if callable(arg):
f = com.get_cython_func(arg)
@@ -443,6 +449,88 @@ def agg_list_like(self) -> DataFrame | Series:
)
return concatenated.reindex(full_ordered_index, copy=False)

Review thread on future_list_single_arg:

Member: Same request from the other PR re naming; "future" won't be very helpful for a reader a year from now.

rhshadrach (Member, Author), Sep 29, 2021: The intention is to have "future_" methods alongside the current methods, all with the same prefix so they are easy to identify. Any such method is behind the option "future_udf_behavior", meaning it will only be called when the option is set to True. Assuming we do end up going forward with this new (experimental) behavior, once it is in a good place we will deprecate the option and then remove it.

A year from now, we will still have the option "future_udf_behavior", and in my opinion the "future_" prefix is meaningful and helpful - namely in its connection to this option. It is also the (intended) future behavior of the methods. When the option is removed, the "old" methods are removed and the "future_" methods are renamed by removing the prefix (none are public).

Member: You've clearly given this more thought than I have, so I'm going to stop complaining about this and will instead grumble to myself.

rhshadrach (Member, Author): The name is still up for improvement and suggestions are most welcome, but I wanted to explain why I felt "future" was appropriate.

def future_list_single_arg(
self, method: str, a: AggFuncTypeBase, result_dim: int | None
) -> tuple[int | None, AggFuncTypeBase | None, DataFrame | Series | None]:
name = None
result = None
try:
if isinstance(a, (tuple, list)):
# Handle (name, value) pairs
name, a = a
result = getattr(self.obj, method)(a)
if result_dim is None:
result_dim = getattr(result, "ndim", 0)
elif getattr(result, "ndim", 0) != result_dim:
raise ValueError("cannot combine transform and aggregation operations")
except TypeError:
pass
# make sure we find a good name
if name is None:
name = com.get_callable_name(a) or a
return result_dim, name, result

def future_list_like(self, method: str) -> DataFrame | Series:
"""
Compute aggregation in the case of a list-like argument.

Returns
-------
Result of aggregation.
"""
from pandas.core.reshape.concat import concat

obj = self.obj
arg = cast(List[AggFuncTypeBase], self.f)

results = []
keys = []
result_dim = None
failed_names = []

for a in arg:
result_dim, name, new_res = self.future_list_single_arg(
method, a, result_dim
)
if new_res is not None:
results.append(new_res)
keys.append(name)
else:
failed_names.append(a)

# if we are empty
if not len(results):
raise ValueError("no results")

if len(failed_names) > 0:
warnings.warn(
f"{failed_names} did not aggregate successfully. If any error is "
"raised this will raise in a future version of pandas. "
"Drop these columns/ops to avoid this warning.",
FutureWarning,
stacklevel=find_stack_level(),
)

try:
concatenated = concat(results, keys=keys, axis=1, sort=False)
except TypeError:
# we are concatting non-NDFrame objects,
# e.g. a list of scalars
from pandas import Series

result = Series(results, index=keys, name=obj.name)
return result
else:
# Concat uses the first index to determine the final indexing order.
# The union of a shorter first index with the other indices causes
# the index sorting to be different from the order of the aggregating
# functions. Reindex if this is the case.
index_size = concatenated.index.size
full_ordered_index = next(
result.index for result in results if result.index.size == index_size
)
return concatenated.reindex(full_ordered_index, copy=False)
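
# Illustrative note (not part of the diff): for a DataFrame obj and
# arg == ["sum", "mean"], each getattr(obj, method)(a) call above returns a
# Series indexed by obj's columns, so the concat-with-keys step is roughly
#     concat([obj.agg("sum"), obj.agg("mean")], keys=["sum", "mean"], axis=1)
# which yields one column block per aggregation function, labelled by keys.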

def agg_dict_like(self) -> DataFrame | Series:
"""
Compute aggregation in the case of a dict-like argument.
17 changes: 17 additions & 0 deletions pandas/core/config_init.py
@@ -511,6 +511,23 @@ def use_inf_as_na_cb(key):
validator=is_one_of_factory(["block", "array"]),
)

future_udf_behavior = """
: boolean
Whether to use the future UDF method implementations. Currently experimental.
Defaults to False.
"""


with cf.config_prefix("mode"):
cf.register_option(
"future_udf_behavior",
# Get the default from an environment variable, if set, otherwise defaults
# to False. This environment variable can be set for testing.
os.environ.get("PANDAS_FUTURE_UDF_BEHAVIOR", "false").lower() == "true",
future_udf_behavior,
validator=is_bool,
)
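
# Illustrative note (not part of the diff): os.environ.get runs when pandas is
# imported, so PANDAS_FUTURE_UDF_BEHAVIOR only influences the default value; at
# runtime the option can still be flipped, e.g.
#     pd.set_option("future_udf_behavior", True)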


# user warnings
chained_assignment = """
5 changes: 3 additions & 2 deletions pandas/core/frame.py
@@ -83,6 +83,7 @@
doc,
rewrite_axis_style_signature,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
validate_ascending,
validate_axis_style_args,
@@ -10016,7 +10017,7 @@ def _get_data() -> DataFrame:
"version this will raise TypeError. Select only valid "
"columns before calling the reduction.",
FutureWarning,
stacklevel=5,
stacklevel=find_stack_level(),
)

return out
@@ -10049,7 +10050,7 @@ def _get_data() -> DataFrame:
"version this will raise TypeError. Select only valid "
"columns before calling the reduction.",
FutureWarning,
stacklevel=5,
stacklevel=find_stack_level(),
)

if hasattr(result, "dtype"):
26 changes: 26 additions & 0 deletions pandas/core/groupby/generic.py
@@ -25,6 +25,8 @@

import numpy as np

from pandas._config import get_option

from pandas._libs import reduction as libreduction
from pandas._typing import (
ArrayLike,
@@ -873,6 +875,8 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
result.columns = columns

if result is None:
if get_option("future_udf_behavior"):
return self._future_agg(func, args, kwargs)

# grouper specific aggregations
if self.grouper.nkeys > 1:
@@ -923,6 +927,28 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)

return result

def _future_agg(self, func, args, kwargs):
if args or kwargs:
# test_pass_args_kwargs gets here (with and without as_index)
# can't return early
result = self._aggregate_frame(func, *args, **kwargs)

elif self.axis == 1 and self.grouper.nkeys == 1:
# _aggregate_multiple_funcs does not allow self.axis == 1
# Note: axis == 1 precludes 'not self.as_index', see __init__
result = self._aggregate_frame(func)
return result
else:
# test_groupby_as_index_series_scalar gets here
# with 'not self.as_index'
return self._python_agg_general(func, *args, **kwargs)

if not self.as_index:
self._insert_inaxis_grouper_inplace(result)
result.index = Index(range(len(result)))

return result

agg = aggregate

def _iterate_slices(self) -> Iterable[Series]: