ENH: Experimental Higher Order Methods API #45557

Closed · wants to merge 11 commits
80 changes: 80 additions & 0 deletions doc/source/user_guide/homs_api.rst
@@ -0,0 +1,80 @@
.. _homs:

:orphan:

{{ header }}

***************************
pandas Higher Order Methods
***************************

pandas is experimenting with improving the behavior of higher order methods (HOMs). These
are methods that take a function as an argument, often a user-defined function (UDF).
The modified methods include the following; a brief illustration of passing a UDF follows the list.

- :meth:`DataFrame.agg`
- :meth:`.DataFrameGroupBy.aggregate`
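
As a brief illustration (not tied to the ``api.use_hom`` option), a higher order
method simply receives a callable, often a user-defined function:

.. code-block:: python

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

    # A user-defined function (UDF) computing the range of each column.
    def value_range(col):
        return col.max() - col.min()

    df.agg(value_range)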

The goals are to make these methods behave in a more predictable and consistent manner,
to reduce the complexity of their implementation, and to improve performance where
possible. This page details the differences between the old and new behaviors and
provides some context for each change being made.

A great number of changes are planned. To make the transition manageable for users,
all changes are gated behind an experimental ``api.use_hom`` option. While the option
is experimental, pandas HOMs enabled through it are subject to breaking changes without
notice. Users can opt into the new behavior and provide feedback. Once the improvements
have been made, the option will be declared no longer experimental; from that point on,
any breaking changes will happen only when preceded by a ``FutureWarning`` and only in
a major release of pandas. After a period of community feedback, and when the behavior
is deemed ready for release, pandas will raise a ``FutureWarning`` announcing that the
default value of the option will become ``True`` in a future version. Once the default
is ``True``, users can still override it to ``False``. After a sufficient amount of
time, pandas will remove the option altogether and only the new behavior will remain.
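
For example, the option can be enabled for a block of code with ``pd.option_context``
(as in the examples below), enabled globally with ``pd.set_option``, or, mainly for
testing pandas itself, through the ``PANDAS_USE_HOM`` environment variable read at
import time. A minimal sketch:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})

    # Opt in for a single block of code.
    with pd.option_context("api.use_hom", True):
        result = df.agg(["sum"])

    # Or opt in globally for the session.
    pd.set_option("api.use_hom", True)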

``DataFrame.agg`` with list-likes
---------------------------------

Previously, using ``DataFrame.agg`` with a list-like argument would transpose the result when
compared with just providing a single aggregation function.

.. ipython:: python

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

    df.agg("sum")
    df.agg(["sum"])

With the option enabled, this transpose no longer occurs, making the result consistent
with the single-function case.

.. ipython:: python

    with pd.option_context("api.use_hom", True):
        result = df.agg(["sum"])
    result

    with pd.option_context("api.use_hom", True):
        result = df.agg(["sum", "mean"])
    result

``DataFrame.groupby(...).agg`` with list-likes
----------------------------------------------

Previously, using ``DataFrame.groupby(...).agg`` with a list-like argument would place the
original columns in the first level of the resulting hierarchical columns. As a result,
the columns produced by each aggregation function are separated, which is inconsistent
with the result for a single aggregator.

.. ipython:: python

    df.groupby("a").agg("sum")
    df.groupby("a").agg(["sum", "min"])

With the option enabled, the levels are swapped so that the columns for each aggregation
function are grouped together.

.. ipython:: python

    with pd.option_context("api.use_hom", True):
        result = df.groupby("a").agg(["sum", "min"])
    result
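
Users who prefer the previous arrangement can recover a layout with the original columns
in the outer level by swapping the column levels back, for example (a small sketch using
the ``result`` from above):

.. code-block:: python

    result.swaplevel(axis=1).sort_index(axis=1)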
8 changes: 8 additions & 0 deletions pandas/conftest.py
@@ -1780,3 +1780,11 @@ def using_array_manager(request):
    Fixture to check if the array manager is being used.
    """
    return pd.options.mode.data_manager == "array"


@pytest.fixture
def using_hom_api(request):
    """
    Fixture to check if the Higher Order Methods API is being used.
    """
    return pd.options.api.use_hom
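
For context, tests can request this fixture to branch on whichever behavior is active;
a hypothetical sketch (not part of this diff):

import pandas as pd

def test_agg_list_like_shape(using_hom_api):
    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
    result = df.agg(["sum"])
    if using_hom_api:
        # new behavior: the aggregation functions become the columns
        assert list(result.columns) == ["sum"]
    else:
        # old behavior: the result is transposed
        assert list(result.index) == ["sum"]
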
83 changes: 81 additions & 2 deletions pandas/core/apply.py
@@ -22,7 +22,10 @@

import numpy as np

from pandas._config import option_context
from pandas._config import (
    get_option,
    option_context,
)

from pandas._libs import lib
from pandas._typing import (
@@ -168,7 +171,10 @@ def agg(self) -> DataFrame | Series | None:
            return self.agg_dict_like()
        elif is_list_like(arg):
            # we require a list, but not a 'str'
            return self.agg_list_like()
            if get_option("api.use_hom"):
                return self.hom_list_like("agg")
            else:
                return self.agg_list_like()

        if callable(arg):
            f = com.get_cython_func(arg)
@@ -442,6 +448,79 @@ def agg_list_like(self) -> DataFrame | Series:
            )
            return concatenated.reindex(full_ordered_index, copy=False)

    def hom_list_single_arg(
        self, method: str, a: AggFuncTypeBase, result_dim: int | None
    ) -> tuple[int | None, AggFuncTypeBase | None, DataFrame | Series | None]:
        result = None
        if isinstance(a, (tuple, list)):
            # Handle (name, value) pairs
            name, a = a
        else:
            name = com.get_callable_name(a) or a
        try:
            result = getattr(self.obj, method)(a)
        except (TypeError, DataError):
            warnings.warn(
                f"{name} did not aggregate successfully. If any error is "
                "raised this will raise in a future version of pandas. "
                "Drop these columns/ops to avoid this warning.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        if result_dim is None:
            result_dim = getattr(result, "ndim", 0)
        elif getattr(result, "ndim", 0) != result_dim:
            raise ValueError("cannot combine transform and aggregation operations")

        return result_dim, name, result

Contributor (review comment on the warnings.warn call):
why do we want to warn here? (e.g. new api can we just raise)

Member (Author):
Right - new API will just raise. Starting out, I'd like to maintain as much consistency between api.use_hom=True and api.use_hom=False as possible, only making changes that would be hard/impossible to properly deprecate. Since this is already deprecated and will be changed in 2.0, I don't see a need to do it here.

    def hom_list_like(self, method: str) -> DataFrame | Series:
        """
        Compute aggregation in the case of a list-like argument.

        Returns
        -------
        Result of aggregation.
        """
        from pandas.core.reshape.concat import concat

        obj = self.obj
        arg = cast(List[AggFuncTypeBase], self.f)

        results = []
        keys = []
        result_dim = None

        for a in arg:
            result_dim, name, new_res = self.hom_list_single_arg(method, a, result_dim)
            if new_res is not None:
                results.append(new_res)
                keys.append(name)

        # if we are empty
        if not len(results):
            raise ValueError("no results")

        try:
            concatenated = concat(results, keys=keys, axis=1, sort=False)
        except TypeError:
            # we are concatting non-NDFrame objects,
            # e.g. a list of scalars
            from pandas import Series

            result = Series(results, index=keys, name=obj.name)
            return result
        else:
            # Concat uses the first index to determine the final indexing order.
            # The union of a shorter first index with the other indices causes
            # the index sorting to be different from the order of the aggregating
            # functions. Reindex if this is the case.
            index_size = concatenated.index.size
            full_ordered_index = next(
                result.index for result in results if result.index.size == index_size
            )
            return concatenated.reindex(full_ordered_index, copy=False)

    def agg_dict_like(self) -> DataFrame | Series:
        """
        Compute aggregation in the case of a dict-like argument.
17 changes: 17 additions & 0 deletions pandas/core/config_init.py
@@ -526,6 +526,23 @@ def use_inf_as_na_cb(key):
        validator=is_one_of_factory(["block", "array"]),
    )

use_hom_doc = """
: boolean
    Whether to use the Higher Order Methods implementations. Currently experimental.
    Defaults to False.
"""


with cf.config_prefix("api"):
    cf.register_option(
        "use_hom",
        # Get the default from an environment variable, if set, otherwise defaults
        # to False. This environment variable can be set for testing.
        os.environ.get("PANDAS_USE_HOM", "false").lower() == "true",
        use_hom_doc,
        validator=is_bool,
    )
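
Once registered, the option can be read and set through pandas' standard config API;
a small sketch:

import pandas as pd

pd.get_option("api.use_hom")       # False unless PANDAS_USE_HOM was set before import
pd.describe_option("api.use_hom")  # prints the use_hom_doc text
pd.set_option("api.use_hom", True)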


# user warnings
chained_assignment = """
Expand Down
26 changes: 26 additions & 0 deletions pandas/core/groupby/generic.py
@@ -26,6 +26,8 @@

import numpy as np

from pandas._config import get_option

from pandas._libs import reduction as libreduction
from pandas._typing import (
    ArrayLike,
@@ -876,6 +878,8 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
            result.columns = columns

        if result is None:
            if get_option("api.use_hom"):
                return self._hom_agg(func, args, kwargs)

            # grouper specific aggregations
            if self.grouper.nkeys > 1:
@@ -926,6 +930,28 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)

        return result

    def _hom_agg(self, func, args, kwargs):
        if args or kwargs:
            # test_pass_args_kwargs gets here (with and without as_index)
            # can't return early
            result = self._aggregate_frame(func, *args, **kwargs)

        elif self.axis == 1 and self.grouper.nkeys == 1:
            # _aggregate_multiple_funcs does not allow self.axis == 1
            # Note: axis == 1 precludes 'not self.as_index', see __init__
            result = self._aggregate_frame(func)
            return result
        else:
            # test_groupby_as_index_series_scalar gets here
            # with 'not self.as_index'
            return self._python_agg_general(func, *args, **kwargs)

        if not self.as_index:
            self._insert_inaxis_grouper_inplace(result)
            result.index = Index(range(len(result)))

        return result

Member (review comment):
some of this looks familiar. is it as de-duplicated as it can reasonably get?

Member (Author):
Perhaps not, but will be changed when implementing "agg always aggs" (#35725). In other words, duplicating further will likely have to be undone in the future.

    agg = aggregate

    def _iterate_slices(self) -> Iterable[Series]: