diff --git a/doc/source/user_guide/future_udf_behavior.rst b/doc/source/user_guide/future_udf_behavior.rst new file mode 100644 index 0000000000000..8871f767c9cb5 --- /dev/null +++ b/doc/source/user_guide/future_udf_behavior.rst @@ -0,0 +1,72 @@ +.. _future_udf_behavior: + +:orphan: + +{{ header }} + +******************* +Future UDF Behavior +******************* + +pandas is experimenting with improving the behavior of methods that take a +user-defined function (UDF). These methods include ``.apply``, ``.agg``, ``.transform``, +and ``.filter``. The goal is to make these methods behave in a more predictable +and consistent manner, reducing the complexity of their implementation, and improving +performance where possible. This page details the differences between the old and +new behaviors, as well as providing some context behind each change that is being made. + +There are a great number of changes that are planned. In order to transition in a +reasonable manner for users, all changes are behind an experimental "future_udf_behavior" +option. This is currently experimental and subject to breaking changes without notice. +Users can opt into the new behavior and provide feedback. Once the improvements have +been made, this option will be declared no longer experimental. pandas will then raise +a ``FutureWarning`` that the default value of this option will be set to ``True`` in +a future version. Once the default is ``True``, users can still override it to ``False``. +After a sufficient amount of time, pandas will remove this option altogether and only +the future behavior will remain. + +``DataFrame.agg`` with list-likes +--------------------------------- + +Previously, using ``DataFrame.agg`` with a list-like argument would transpose the result when +compared with just providing a single aggregation function. + +.. ipython:: python + + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + + df.agg("sum") + df.agg(["sum"]) + +This transpose no longer occurs, making the result more consistent. + +.. ipython:: python + + with pd.option_context("future_udf_behavior", True): + result = df.agg(["sum"]) + result + + with pd.option_context("future_udf_behavior", True): + result = df.agg(["sum", "mean"]) + result + +``DataFrame.groupby(...).agg`` with list-likes +---------------------------------------------- + +Previously, using ``DataFrame.groupby(...).agg`` with a list-like argument would put the +columns as the first level of the resulting hierarchical columns. The result is +that the columns for each aggregation function are separated, inconsistent with the result +for a single aggregator. + +.. ipython:: python + + df.groupby("a").agg("sum") + df.groupby("a").agg(["sum", "min"]) + +Now the levels are swapped, so that the columns for each aggregation are together. + +.. 
ipython:: python + + with pd.option_context("future_udf_behavior", True): + result = df.groupby("a").agg(["sum", "min"]) + result diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 291ad2b071665..558bc027e6ce5 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -22,7 +22,10 @@ import numpy as np -from pandas._config import option_context +from pandas._config import ( + get_option, + option_context, +) from pandas._libs import lib from pandas._typing import ( @@ -169,7 +172,10 @@ def agg(self) -> DataFrame | Series | None: return self.agg_dict_like() elif is_list_like(arg): # we require a list, but not a 'str' - return self.agg_list_like() + if get_option("future_udf_behavior"): + return self.future_list_like("agg") + else: + return self.agg_list_like() if callable(arg): f = com.get_cython_func(arg) @@ -443,6 +449,88 @@ def agg_list_like(self) -> DataFrame | Series: ) return concatenated.reindex(full_ordered_index, copy=False) + def future_list_single_arg( + self, method: str, a: AggFuncTypeBase, result_dim: int | None + ) -> tuple[int | None, AggFuncTypeBase | None, DataFrame | Series | None]: + name = None + result = None + try: + if isinstance(a, (tuple, list)): + # Handle (name, value) pairs + name, a = a + result = getattr(self.obj, method)(a) + if result_dim is None: + result_dim = getattr(result, "ndim", 0) + elif getattr(result, "ndim", 0) != result_dim: + raise ValueError("cannot combine transform and aggregation operations") + except TypeError: + pass + # make sure we find a good name + if name is None: + name = com.get_callable_name(a) or a + return result_dim, name, result + + def future_list_like(self, method: str) -> DataFrame | Series: + """ + Compute aggregation in the case of a list-like argument. + + Returns + ------- + Result of aggregation. + """ + from pandas.core.reshape.concat import concat + + obj = self.obj + arg = cast(List[AggFuncTypeBase], self.f) + + results = [] + keys = [] + result_dim = None + failed_names = [] + + for a in arg: + result_dim, name, new_res = self.future_list_single_arg( + method, a, result_dim + ) + if new_res is not None: + results.append(new_res) + keys.append(name) + else: + failed_names.append(a) + + # if we are empty + if not len(results): + raise ValueError("no results") + + if len(failed_names) > 0: + warnings.warn( + f"{failed_names} did not aggregate successfully. If any error is " + "raised this will raise in a future version of pandas. " + "Drop these columns/ops to avoid this warning.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + try: + concatenated = concat(results, keys=keys, axis=1, sort=False) + except TypeError: + # we are concatting non-NDFrame objects, + # e.g. a list of scalars + from pandas import Series + + result = Series(results, index=keys, name=obj.name) + return result + else: + # Concat uses the first index to determine the final indexing order. + # The union of a shorter first index with the other indices causes + # the index sorting to be different from the order of the aggregating + # functions. Reindex if this is the case. + index_size = concatenated.index.size + full_ordered_index = next( + result.index for result in results if result.index.size == index_size + ) + return concatenated.reindex(full_ordered_index, copy=False) + def agg_dict_like(self) -> DataFrame | Series: """ Compute aggregation in the case of a dict-like argument. 
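As the diff above shows, the new ``future_list_like`` helper is essentially a per-function dispatch followed by a ``concat`` along ``axis=1`` with the function names as keys (plus the error handling, name resolution and reindexing in the hunk). The following is a minimal standalone sketch of that combination step, using only plain pandas calls and the same illustrative frame as the new user-guide page; it illustrates the idea rather than the helper itself:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

    # Each requested aggregation is evaluated on its own, exactly as if it
    # had been passed to ``agg`` as a single function...
    results = [df.agg("sum"), df.agg("mean")]
    keys = ["sum", "mean"]

    # ...and the per-function results are then glued together along axis=1
    # with the function names as the concat keys.  Because the pieces here
    # are Series, the keys become the columns, which is why the future
    # behavior no longer transposes df.agg(["sum", "mean"]).
    print(pd.concat(results, keys=keys, axis=1, sort=False))

    # For groupby, each piece is a DataFrame, so the same keys become the
    # outer level of the resulting column MultiIndex, e.g. ("sum", "b"),
    # ("sum", "c"), ("min", "b"), ("min", "c"), matching the swapped level
    # order described in the new user-guide page.
    pieces = [df.groupby("a").agg("sum"), df.groupby("a").agg("min")]
    print(pd.concat(pieces, keys=["sum", "min"], axis=1, sort=False))
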
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 0081f8cd074b6..e7bdaae6a100d 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -511,6 +511,23 @@ def use_inf_as_na_cb(key): validator=is_one_of_factory(["block", "array"]), ) +future_udf_behavior = """ +: boolean + Whether to use the future UDF method implementations. Currently experimental. + Defaults to False. +""" + + +with cf.config_prefix("mode"): + cf.register_option( + "future_udf_behavior", + # Get the default from an environment variable, if set, otherwise defaults + # to False. This environment variable can be set for testing. + os.environ.get("PANDAS_FUTURE_UDF_BEHAVIOR", "false").lower() == "true", + future_udf_behavior, + validator=is_bool, + ) + # user warnings chained_assignment = """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ebf3428020652..3f46669c61683 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -83,6 +83,7 @@ doc, rewrite_axis_style_signature, ) +from pandas.util._exceptions import find_stack_level from pandas.util._validators import ( validate_ascending, validate_axis_style_args, @@ -10016,7 +10017,7 @@ def _get_data() -> DataFrame: "version this will raise TypeError. Select only valid " "columns before calling the reduction.", FutureWarning, - stacklevel=5, + stacklevel=find_stack_level(), ) return out @@ -10049,7 +10050,7 @@ def _get_data() -> DataFrame: "version this will raise TypeError. Select only valid " "columns before calling the reduction.", FutureWarning, - stacklevel=5, + stacklevel=find_stack_level(), ) if hasattr(result, "dtype"): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8a330d08bef78..d81c503139dc7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -25,6 +25,8 @@ import numpy as np +from pandas._config import get_option + from pandas._libs import reduction as libreduction from pandas._typing import ( ArrayLike, @@ -873,6 +875,8 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) result.columns = columns if result is None: + if get_option("future_udf_behavior"): + return self._future_agg(func, args, kwargs) # grouper specific aggregations if self.grouper.nkeys > 1: @@ -923,6 +927,28 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return result + def _future_agg(self, func, args, kwargs): + if args or kwargs: + # test_pass_args_kwargs gets here (with and without as_index) + # can't return early + result = self._aggregate_frame(func, *args, **kwargs) + + elif self.axis == 1 and self.grouper.nkeys == 1: + # _aggregate_multiple_funcs does not allow self.axis == 1 + # Note: axis == 1 precludes 'not self.as_index', see __init__ + result = self._aggregate_frame(func) + return result + else: + # test_groupby_as_index_series_scalar gets here + # with 'not self.as_index' + return self._python_agg_general(func, *args, **kwargs) + + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) + result.index = Index(range(len(result))) + + return result + agg = aggregate def _iterate_slices(self) -> Iterable[Series]: diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index f8c945bb496a8..a12ca64a7a0eb 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -13,6 +13,7 @@ Series, Timestamp, date_range, + get_option, ) import pandas._testing as tm from pandas.tests.frame.common import zip_frames @@ 
-639,6 +640,8 @@ def test_apply_dup_names_multi_agg(): # GH 21063 df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"]) expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"]) + if get_option("future_udf_behavior"): + expected = expected.T result = df.agg(["min"]) tm.assert_frame_equal(result, expected) @@ -1010,25 +1013,46 @@ def test_agg_transform(axis, float_frame): # list-like result = float_frame.apply([np.sqrt], axis=axis) expected = f_sqrt.copy() - if axis in {0, "index"}: - expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]]) + if get_option("future_udf_behavior"): + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [["sqrt"], float_frame.columns] + ) + else: + expected.index = MultiIndex.from_product([["sqrt"], float_frame.index]) else: - expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [float_frame.columns, ["sqrt"]] + ) + else: + expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]]) tm.assert_frame_equal(result, expected) # multiple items in list # these are in the order as if we are applying both # functions per series and then concatting result = float_frame.apply([np.abs, np.sqrt], axis=axis) - expected = zip_frames([f_abs, f_sqrt], axis=other_axis) - if axis in {0, "index"}: - expected.columns = MultiIndex.from_product( - [float_frame.columns, ["absolute", "sqrt"]] - ) + if get_option("future_udf_behavior"): + expected = pd.concat([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [["absolute", "sqrt"], float_frame.columns] + ) + else: + expected.index = MultiIndex.from_product( + [["absolute", "sqrt"], float_frame.index] + ) else: - expected.index = MultiIndex.from_product( - [float_frame.index, ["absolute", "sqrt"]] - ) + expected = zip_frames([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product( + [float_frame.columns, ["absolute", "sqrt"]] + ) + else: + expected.index = MultiIndex.from_product( + [float_frame.index, ["absolute", "sqrt"]] + ) tm.assert_frame_equal(result, expected) @@ -1040,6 +1064,8 @@ def test_demo(): expected = DataFrame( {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"] ) + if get_option("future_udf_behavior"): + expected = expected.T tm.assert_frame_equal(result, expected) result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]}) @@ -1086,22 +1112,31 @@ def test_agg_multiple_mixed_no_warning(): }, index=["min", "sum"], ) + if get_option("future_udf_behavior"): + expected = expected.T + match = "Dropping of nuisance columns" + else: + match = "did not aggregate successfully" # sorted index - with tm.assert_produces_warning( - FutureWarning, match=r"\['D'\] did not aggregate successfully" - ): + with tm.assert_produces_warning(FutureWarning, match=match): result = mdf.agg(["min", "sum"]) tm.assert_frame_equal(result, expected) - with tm.assert_produces_warning( - FutureWarning, match=r"\['D'\] did not aggregate successfully" - ): + if get_option("future_udf_behavior"): + match = "Dropping of nuisance columns" + else: + match = "did not aggregate successfully" + + with tm.assert_produces_warning(FutureWarning, match=match, check_stacklevel=False): result = mdf[["D", "C", "B", "A"]].agg(["sum", "min"]) # GH40420: the result of .agg should have an index that is sorted # according to the arguments provided to agg. 
- expected = expected[["D", "C", "B", "A"]].reindex(["sum", "min"]) + if get_option("future_udf_behavior"): + expected = expected.loc[["D", "C", "B", "A"], ["sum", "min"]] + else: + expected = expected[["D", "C", "B", "A"]].reindex(["sum", "min"]) tm.assert_frame_equal(result, expected) @@ -1120,6 +1155,8 @@ def test_agg_reduce(axis, float_frame): ) expected.columns = ["mean", "max", "sum"] expected = expected.T if axis in {0, "index"} else expected + if get_option("future_udf_behavior"): + expected = expected.T result = float_frame.agg(["mean", "max", "sum"], axis=axis) tm.assert_frame_equal(result, expected) @@ -1196,6 +1233,8 @@ def test_nuiscance_columns(): index=["min"], columns=df.columns, ) + if get_option("future_udf_behavior"): + expected = expected.T tm.assert_frame_equal(result, expected) with tm.assert_produces_warning( @@ -1205,13 +1244,17 @@ def test_nuiscance_columns(): expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"]) tm.assert_series_equal(result, expected) - with tm.assert_produces_warning( - FutureWarning, match=r"\['D'\] did not aggregate successfully" - ): + if get_option("future_udf_behavior"): + match = "Select only valid" + else: + match = "did not aggregate successfully" + with tm.assert_produces_warning(FutureWarning, match=match): result = df.agg(["sum"]) expected = DataFrame( [[6, 6.0, "foobarbaz"]], index=["sum"], columns=["A", "B", "C"] ) + if get_option("future_udf_behavior"): + expected = expected.T tm.assert_frame_equal(result, expected) @@ -1251,8 +1294,12 @@ def test_non_callable_aggregates(how): } ) - tm.assert_frame_equal(result1, result2, check_like=True) - tm.assert_frame_equal(result2, expected, check_like=True) + if get_option("future_udf_behavior"): + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result1, expected.T) + else: + tm.assert_frame_equal(result1, result2, check_like=True) + tm.assert_frame_equal(result2, expected, check_like=True) # Just functional string arg is same as calling df.arg() result = getattr(df, how)("count") @@ -1289,7 +1336,9 @@ def func(group_col): tm.assert_series_equal(result, expected) result = df.agg([func]) - expected = expected.to_frame("func").T + expected = expected.to_frame("func") + if not get_option("future_udf_behavior"): + expected = expected.T tm.assert_frame_equal(result, expected) @@ -1402,14 +1451,20 @@ def test_apply_empty_list_reduce(): tm.assert_series_equal(result, expected) -def test_apply_no_suffix_index(): +def test_apply_no_suffix_index(request): # GH36189 pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"]) - result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) - expected = DataFrame( - {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""] - ) - + result = pdf.apply([np.square, lambda x: x, lambda x: x]) + if get_option("future_udf_behavior"): + columns = MultiIndex.from_product( + [["square", "", ""], ["A", "B"]] + ) + expected = DataFrame(3 * [[16, 81, 4, 9, 4, 9]], columns=columns) + else: + columns = MultiIndex.from_product( + [["A", "B"], ["square", "", ""]] + ) + expected = DataFrame(3 * [[16, 4, 4, 81, 9, 9]], columns=columns) tm.assert_frame_equal(result, expected) @@ -1440,19 +1495,30 @@ def foo(s): return s.sum() / 2 aggs = ["sum", foo, "count", "min"] + klass = None if get_option("future_udf_behavior") else FutureWarning with tm.assert_produces_warning( - FutureWarning, match=r"\['item'\] did not aggregate successfully" + klass, match=r"\['item'\] did not aggregate successfully" ): result = df.agg(aggs) - expected = DataFrame( - { - 
"item": ["123456", np.nan, 6, "1"], - "att1": [21.0, 10.5, 6.0, 1.0], - "att2": [18.0, 9.0, 6.0, 0.0], - "att3": [17.0, 8.5, 6.0, 0.0], - }, - index=["sum", "foo", "count", "min"], - ) + if get_option("future_udf_behavior"): + expected = DataFrame( + { + "sum": ["123456", 21, 18, 17], + "count": [6, 6, 6, 6], + "min": ["1", 1, 0, 0], + }, + index=["item", "att1", "att2", "att3"], + ) + else: + expected = DataFrame( + { + "item": ["123456", np.nan, 6, "1"], + "att1": [21.0, 10.5, 6.0, 1.0], + "att2": [18.0, 9.0, 6.0, 0.0], + "att3": [17.0, 8.5, 6.0, 0.0], + }, + index=["sum", "foo", "count", "min"], + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index f178f85154319..095c3fbaf10fb 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -20,6 +20,7 @@ MultiIndex, Series, concat, + get_option, to_datetime, ) import pandas._testing as tm @@ -389,12 +390,13 @@ def test_multiple_functions_tuples_and_non_tuples(df): expected = df.groupby("A")["C"].agg(ex_funcs) tm.assert_frame_equal(result, expected) + klass = None if get_option("future_udf_behavior") else FutureWarning with tm.assert_produces_warning( - FutureWarning, match=r"\['B'\] did not aggregate successfully" + klass, match=r"\['B'\] did not aggregate successfully" ): result = df.groupby("A").agg(funcs) with tm.assert_produces_warning( - FutureWarning, match=r"\['B'\] did not aggregate successfully" + klass, match=r"\['B'\] did not aggregate successfully" ): expected = df.groupby("A").agg(ex_funcs) tm.assert_frame_equal(result, expected) @@ -555,12 +557,18 @@ def test_order_aggregate_multiple_funcs(): # GH 25692 df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]}) - res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) - result = res.columns.levels[1] + if get_option("future_udf_behavior"): + # TODO (GH 35725): This will not raise when agg-must-agg is implemented + msg = "Cannot concat indices that do not have the same number of levels" + with pytest.raises(AssertionError, match=msg): + df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) + else: + res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"]) + result = res.columns.levels[1] - expected = Index(["sum", "max", "mean", "ohlc", "min"]) + expected = Index(["sum", "max", "mean", "ohlc", "min"]) - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) @pytest.mark.parametrize("dtype", [np.int64, np.uint64]) @@ -1273,7 +1281,10 @@ def test_nonagg_agg(): g = df.groupby("a") result = g.agg(["cumsum"]) - result.columns = result.columns.droplevel(-1) + if get_option("future_udf_behavior"): + result.columns = result.columns.droplevel(0) + else: + result.columns = result.columns.droplevel(-1) expected = g.agg("cumsum") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 66b968e01eef1..c79878d71f5ae 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import get_option + import pandas.util._test_decorators as td import pandas as pd @@ -44,16 +46,20 @@ def test_agg_api(): def peak_to_peak(arr): return arr.max() - arr.min() + if get_option("future_udf_behavior"): + msg = "Dropping invalid columns" + else: + msg = r"\['key2'\] did not aggregate 
successfully" with tm.assert_produces_warning( FutureWarning, - match=r"\['key2'\] did not aggregate successfully", + match=msg, ): expected = grouped.agg([peak_to_peak]) expected.columns = ["data1", "data2"] with tm.assert_produces_warning( FutureWarning, - match=r"\['key2'\] did not aggregate successfully", + match=msg, ): result = grouped.agg(peak_to_peak) tm.assert_frame_equal(result, expected) @@ -203,13 +209,21 @@ def test_aggregate_api_consistency(): tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg([np.sum, np.mean]) - expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) - expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) + if get_option("future_udf_behavior"): + expected = pd.concat([c_sum, d_sum, c_mean, d_mean], axis=1) + expected.columns = MultiIndex.from_product([["sum", "mean"], ["C", "D"]]) + else: + expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) + expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) result = grouped[["D", "C"]].agg([np.sum, np.mean]) - expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) - expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) + if get_option("future_udf_behavior"): + expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) + expected.columns = MultiIndex.from_product([["sum", "mean"], ["D", "C"]]) + else: + expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) + expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]]) tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg({"C": "mean", "D": "sum"}) @@ -395,7 +409,10 @@ def P1(a): g = df.groupby("date") expected = g.agg([P1]) - expected.columns = expected.columns.levels[0] + if get_option("future_udf_behavior"): + expected.columns = expected.columns.levels[1] + else: + expected.columns = expected.columns.levels[0] result = g.agg(P1) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 3c402480ea2ec..56f0f2dddf80a 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -14,6 +14,7 @@ Series, Timestamp, date_range, + get_option, ) import pandas._testing as tm import pandas.core.nanops as nanops @@ -1138,7 +1139,10 @@ def test_apply_to_nullable_integer_returns_float(values, function): tm.assert_frame_equal(result, expected) result = groups.agg([function]) - expected.columns = MultiIndex.from_tuples([("b", function)]) + if get_option("future_udf_behavior"): + expected.columns = MultiIndex.from_tuples([(function, "b")]) + else: + expected.columns = MultiIndex.from_tuples([("b", function)]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 203d8abb465d0..2f1fc1efa26c7 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -20,6 +20,7 @@ Timedelta, Timestamp, date_range, + get_option, read_csv, to_datetime, ) @@ -588,15 +589,23 @@ def test_frame_multi_key_function_list(): grouped = data.groupby(["A", "B"]) funcs = [np.mean, np.std] + klass = None if get_option("future_udf_behavior") else FutureWarning with tm.assert_produces_warning( - FutureWarning, match=r"\['C'\] did not aggregate successfully" + klass, match=r"\['C'\] did not aggregate successfully" ): agged = grouped.agg(funcs) - expected = pd.concat( - 
[grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], - keys=["D", "E", "F"], - axis=1, - ) + if get_option("future_udf_behavior"): + expected = pd.concat( + [grouped.agg(funcs[0]), grouped.agg(funcs[1])], + keys=["mean", "std"], + axis=1, + ) + else: + expected = pd.concat( + [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)], + keys=["D", "E", "F"], + axis=1, + ) assert isinstance(agged.index, MultiIndex) assert isinstance(expected.index, MultiIndex) tm.assert_frame_equal(agged, expected) @@ -2080,9 +2089,14 @@ def test_groupby_agg_ohlc_non_first(): index=date_range("2018-01-01", periods=2, freq="D", name="dti"), ) - result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) - - tm.assert_frame_equal(result, expected) + if get_option("future_udf_behavior"): + # TODO (GH 35725): This will not raise when agg-must-agg is implemented + msg = "Cannot concat indices that do not have the same number of levels" + with pytest.raises(AssertionError, match=msg): + df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) + else: + result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"]) + tm.assert_frame_equal(result, expected) def test_groupby_multiindex_nat(): diff --git a/pandas/tests/resample/test_deprecated.py b/pandas/tests/resample/test_deprecated.py index 359c3cea62f9c..3de3694f1eb52 100644 --- a/pandas/tests/resample/test_deprecated.py +++ b/pandas/tests/resample/test_deprecated.py @@ -10,6 +10,7 @@ from pandas import ( DataFrame, Series, + get_option, ) import pandas._testing as tm from pandas.core.indexes.datetimes import date_range @@ -97,7 +98,10 @@ def test_resample_loffset_arg_type(frame, create_index, arg): result_agg = df.resample("2D", loffset="2H").agg(arg) if isinstance(arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + if get_option("future_udf_behavior"): + expected.columns = pd.MultiIndex.from_tuples([("mean", "value")]) + else: + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) tm.assert_frame_equal(result_agg, expected) @@ -216,7 +220,10 @@ def test_loffset_returns_datetimeindex(frame, kind, agg_arg): with tm.assert_produces_warning(FutureWarning): result_agg = df.resample("2D", loffset="2H", kind=kind).agg(agg_arg) if isinstance(agg_arg, list): - expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) + if get_option("future_udf_behavior"): + expected.columns = pd.MultiIndex.from_tuples([("mean", "value")]) + else: + expected.columns = pd.MultiIndex.from_tuples([("value", "mean")]) tm.assert_frame_equal(result_agg, expected) diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 10fabe234d218..476b29217a8c0 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -8,6 +8,7 @@ DataFrame, NamedAgg, Series, + get_option, ) import pandas._testing as tm from pandas.core.indexes.datetimes import date_range @@ -347,10 +348,17 @@ def test_agg(): b_std = r["B"].std() b_sum = r["B"].sum() - expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) + if get_option("future_udf_behavior"): + expected = pd.concat([a_mean, b_mean, a_std, b_std], axis=1) + expected.columns = pd.MultiIndex.from_product([["mean", "std"], ["A", "B"]]) + else: + expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) for t in cases: - warn = 
FutureWarning if t in cases[1:3] else None + if t in cases[1:3] and not get_option("future_udf_behavior"): + warn = FutureWarning + else: + warn = None with tm.assert_produces_warning( warn, match=r"\['date'\] did not aggregate successfully", @@ -629,11 +637,22 @@ def test_agg_with_datetime_index_list_agg_func(col_name): columns=[col_name], ) result = df.resample("1d").aggregate(["mean"]) - expected = DataFrame( - [47.5, 143.5, 195.5], - index=date_range(start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin"), - columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]), - ) + if get_option("future_udf_behavior"): + expected = DataFrame( + [47.5, 143.5, 195.5], + index=date_range( + start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin" + ), + columns=pd.MultiIndex(levels=[["mean"], [col_name]], codes=[[0], [0]]), + ) + else: + expected = DataFrame( + [47.5, 143.5, 195.5], + index=date_range( + start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin" + ), + columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]), + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 88607f4b036a0..080eb7dd9cd29 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -8,6 +8,8 @@ import numpy as np import pytest +from pandas._config import get_option + import pandas as pd from pandas import ( Categorical, @@ -1905,8 +1907,14 @@ def test_pivot_margins_name_unicode(self): frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek ) index = Index([1, 2, 3, greek], dtype="object", name="foo") - expected = DataFrame(index=index) - tm.assert_frame_equal(table, expected) + + if get_option("future_udf_behavior"): + expected = Series([1, 1, 1, 3], index=index) + expected.index.name = None + tm.assert_series_equal(table, expected) + else: + expected = DataFrame(index=index) + tm.assert_frame_equal(table, expected) def test_pivot_string_as_func(self): # GH #18713 diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index f84a579247630..0df8872c0b02f 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -10,6 +10,7 @@ Timestamp, concat, date_range, + get_option, timedelta_range, ) import pandas._testing as tm @@ -90,8 +91,12 @@ def test_agg(): b_std = r["B"].std() result = r.aggregate([np.mean, np.std]) - expected = concat([a_mean, a_std, b_mean, b_std], axis=1) - expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "std"]]) + if get_option("future_udf_behavior"): + expected = concat([a_mean, b_mean, a_std, b_std], axis=1) + expected.columns = MultiIndex.from_product([["mean", "std"], ["A", "B"]]) + else: + expected = concat([a_mean, a_std, b_mean, b_std], axis=1) + expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "std"]]) tm.assert_frame_equal(result, expected) result = r.aggregate({"A": np.mean, "B": np.std}) @@ -147,7 +152,10 @@ def test_agg_consistency(): r = df.rolling(window=3) result = r.agg([np.sum, np.mean]).columns - expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]]) + if get_option("future_udf_behavior"): + expected = MultiIndex.from_product([["sum", "mean"], list("AB")]) + else: + expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]]) tm.assert_index_equal(result, expected) result = r["A"].agg([np.sum, np.mean]).columns