diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 882f42ff18bdd..78b99a00d43ce 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -356,6 +356,12 @@ def test_agg_multiple_functions_maintain_order(df):
     tm.assert_index_equal(result.columns, exp_cols)
 
 
+def test_series_index_name(df):
+    # Grouping by the object df["A"] (not the label) must still name the index "A".
+    result = df.loc[:, ["C"]].groupby(df["A"]).agg(lambda x: x.mean())
+    assert result.index.name == "A"
+
+
 def test_agg_multiple_functions_same_name():
     # GH 30880
     df = DataFrame(
diff --git a/pandas/tests/groupby/methods/test_describe.py b/pandas/tests/groupby/methods/test_describe.py
index f38de8faddb59..c2ffcb04caa60 100644
--- a/pandas/tests/groupby/methods/test_describe.py
+++ b/pandas/tests/groupby/methods/test_describe.py
@@ -219,3 +219,73 @@ def test_describe_duplicate_columns():
     )
     expected.index.names = [1]
     tm.assert_frame_equal(result, expected)
+
+
+class TestGroupByNonCythonPaths:
+    # Tests for code not expected to go through cython paths.
+    # GH#5610: non-cython calls should not include the grouper.
+
+    @pytest.fixture
+    def df(self):
+        # Group 1 holds a single non-null "B"; group 3 holds none.
+        return DataFrame(
+            [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
+            columns=["A", "B", "C"],
+        )
+
+    @pytest.fixture
+    def gb(self, df):
+        # Grouped on "A" with the default as_index.
+        return df.groupby("A")
+
+    @pytest.fixture
+    def gni(self, df):
+        # Grouped on "A" with as_index=False.
+        return df.groupby("A", as_index=False)
+
+    def test_describe(self, df, gb, gni):
+        # The group keys label the rows; only "B" is summarised, so the
+        # grouper "A" never shows up among the described columns.
+        stats = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
+        columns = MultiIndex(levels=[["B"], stats], codes=[[0] * 8, list(range(8))])
+        group_keys = Index([1, 3], name="A")
+        expected = DataFrame(
+            [
+                [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
+                [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
+            ],
+            index=group_keys,
+            columns=columns,
+        )
+        result = gb.describe()
+        tm.assert_frame_equal(result, expected)
+
+        # With as_index=False the keys are expected as a regular column instead.
+        expected = expected.reset_index()
+        result = gni.describe()
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", [int, float, object])
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
+        {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
+        {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
+    ],
+)
+def test_groupby_empty_dataset(dtype, kwargs):
+    # GH#41575: describe on zero rows must match the one-row result cut to 0 rows
+    df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
+    df["B"] = df["B"].astype(int)
+    df["C"] = df["C"].astype(float)
+
+    result = df.iloc[:0].groupby("A").describe(**kwargs)
+    expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
+    tm.assert_frame_equal(result, expected)
+
+    result = df.iloc[:0].groupby("A").B.describe(**kwargs)
+    expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
+    expected.index = Index([])
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 2f2648b9293c5..60b386adb664a 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -1559,3 +1559,45 @@ def test_include_groups(include_groups):
     if not include_groups:
         expected = expected[["b"]]
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("f", [max, min, sum])
+@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]])  # single key, multi-key
+def test_builtins_apply(keys, f):
+    # see gh-8155: apply with a builtin must match apply with its numpy namesake
+    rs = np.random.default_rng(2)
+    df = DataFrame(rs.integers(1, 7, (10, 2)), columns=["jim", "joe"])
+    df["jolie"] = rs.standard_normal(10)
+
+    gb = df.groupby(keys)
+
+    fname = f.__name__
+
+    warn = None if f is not sum else FutureWarning
+    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
+    with tm.assert_produces_warning(
+        warn, match=msg, check_stacklevel=False, raise_on_extra_warnings=False
+    ):
+        # Also warns on deprecation GH#53425
+        result = gb.apply(f)
+    ngroups = len(df.drop_duplicates(subset=keys))
+
+    assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
+    assert result.shape == (ngroups, 3), assert_msg
+
+    npfunc = lambda x: getattr(np, fname)(x, axis=0)  # numpy's equivalent function
+    msg = "DataFrameGroupBy.apply operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        expected = gb.apply(npfunc)
+    tm.assert_frame_equal(result, expected)
+
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        expected2 = gb.apply(lambda x: npfunc(x))
+    tm.assert_frame_equal(result, expected2)
+
+    if f is not sum:
+        expected = gb.agg(fname).reset_index()
+        expected = expected.set_index(keys, drop=False)
+        tm.assert_frame_equal(result, expected, check_dtype=False)
+    # Reducing the per-group results again must equal reducing the whole frame.
+    tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0))
diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py
index eecb82cd5050b..25534865b3486 100644
--- a/pandas/tests/groupby/test_cumulative.py
+++ b/pandas/tests/groupby/test_cumulative.py
@@ -289,3 +289,30 @@ def test_nullable_int_not_cast_as_float(method, dtype, val):
     expected = DataFrame({"b": data}, dtype=dtype)
 
     tm.assert_frame_equal(result, expected)
+
+
+def test_cython_api2():
+    # this takes the fast apply path
+
+    # cumsum (GH5614): the grouping column "A" is not part of the result
+    df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
+    expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
+    result = df.groupby("A").cumsum()
+    tm.assert_frame_equal(result, expected)
+
+    # GH 5755 - cumsum is a transformer and should ignore as_index
+    result = df.groupby("A", as_index=False).cumsum()
+    tm.assert_frame_equal(result, expected)
+
+    # GH 13994: deprecated axis=1 gives the same result as the ungrouped frame method
+    msg = "DataFrameGroupBy.cumsum with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A").cumsum(axis=1)
+    expected = df.cumsum(axis=1)
+    tm.assert_frame_equal(result, expected)
+
+    msg = "DataFrameGroupBy.cumprod with axis=1 is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = df.groupby("A").cumprod(axis=1)
+    expected = df.cumprod(axis=1)
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index b840443aab347..399ea534ae373 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -1,4 +1,3 @@
-import builtins
 import re
 
 import numpy as np
@@ -10,73 +9,12 @@
 from pandas import (
     DataFrame,
     Index,
-    MultiIndex,
     Series,
     Timestamp,
     date_range,
 )
 import pandas._testing as tm
 from pandas.tests.groupby import get_groupby_method_args
-from pandas.util import _test_decorators as td
-
-
-def test_intercept_builtin_sum():
-    s = Series([1.0, 2.0, np.nan, 3.0])
-    grouped = s.groupby([0, 1, 2, 2])
-
-    msg = "using SeriesGroupBy.sum"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        # GH#53425
-        result = grouped.agg(builtins.sum)
-    msg = "using np.sum"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        # GH#53425
-        result2 = grouped.apply(builtins.sum)
-    expected = grouped.sum()
-    tm.assert_series_equal(result, expected)
-    tm.assert_series_equal(result2, expected)
-
-
-@pytest.mark.parametrize("f", [max, min, sum])
-@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]])  # Single key  # Multi-key
-def test_builtins_apply(keys, f):
-    # see gh-8155
-    rs = np.random.default_rng(2)
-    df = DataFrame(rs.integers(1, 7, (10, 2)), columns=["jim", "joe"])
-    df["jolie"] = rs.standard_normal(10)
-
-    gb = df.groupby(keys)
-
-    fname = f.__name__
-
-    warn = None if f is not sum else FutureWarning
-    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
-    with tm.assert_produces_warning(
-        warn, match=msg, check_stacklevel=False, raise_on_extra_warnings=False
-    ):
-        # Also warns on deprecation GH#53425
-        result = gb.apply(f)
-    ngroups = len(df.drop_duplicates(subset=keys))
-
-    assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
-    assert result.shape == (ngroups, 3), assert_msg
-
-    npfunc = lambda x: getattr(np, fname)(x, axis=0)  # numpy's equivalent function
-    msg = "DataFrameGroupBy.apply operated on the grouping columns"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        expected = gb.apply(npfunc)
-    tm.assert_frame_equal(result, expected)
-
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        expected2 = gb.apply(lambda x: npfunc(x))
-    tm.assert_frame_equal(result, expected2)
-
-    if f != sum:
-        expected = gb.agg(fname).reset_index()
-        expected.set_index(keys, inplace=True, drop=False)
-        tm.assert_frame_equal(result, expected, check_dtype=False)
-
-    tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0))
 
 
 class TestNumericOnly:
@@ -267,118 +205,6 @@ def _check(self, df, method, expected_columns, expected_columns_numeric):
             tm.assert_index_equal(result.columns, expected_columns)
 
 
-class TestGroupByNonCythonPaths:
-    # GH#5610 non-cython calls should not include the grouper
-    # Tests for code not expected to go through cython paths.
-
-    @pytest.fixture
-    def df(self):
-        df = DataFrame(
-            [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
-            columns=["A", "B", "C"],
-        )
-        return df
-
-    @pytest.fixture
-    def gb(self, df):
-        gb = df.groupby("A")
-        return gb
-
-    @pytest.fixture
-    def gni(self, df):
-        gni = df.groupby("A", as_index=False)
-        return gni
-
-    def test_describe(self, df, gb, gni):
-        # describe
-        expected_index = Index([1, 3], name="A")
-        expected_col = MultiIndex(
-            levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
-            codes=[[0] * 8, list(range(8))],
-        )
-        expected = DataFrame(
-            [
-                [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
-                [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
-            ],
-            index=expected_index,
-            columns=expected_col,
-        )
-        result = gb.describe()
-        tm.assert_frame_equal(result, expected)
-
-        expected = expected.reset_index()
-        result = gni.describe()
-        tm.assert_frame_equal(result, expected)
-
-
-def test_cython_api2():
-    # this takes the fast apply path
-
-    # cumsum (GH5614)
-    df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
-    expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
-    result = df.groupby("A").cumsum()
-    tm.assert_frame_equal(result, expected)
-
-    # GH 5755 - cumsum is a transformer and should ignore as_index
-    result = df.groupby("A", as_index=False).cumsum()
-    tm.assert_frame_equal(result, expected)
-
-    # GH 13994
-    msg = "DataFrameGroupBy.cumsum with axis=1 is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = df.groupby("A").cumsum(axis=1)
-    expected = df.cumsum(axis=1)
-    tm.assert_frame_equal(result, expected)
-
-    msg = "DataFrameGroupBy.cumprod with axis=1 is deprecated"
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        result = df.groupby("A").cumprod(axis=1)
-    expected = df.cumprod(axis=1)
-    tm.assert_frame_equal(result, expected)
-
-
-@pytest.mark.parametrize(
-    "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]
-)
-@pytest.mark.parametrize(
-    "method,data",
-    [
-        ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
-        ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
-        ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
-        ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
-        ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}),
-    ],
-)
-def test_groupby_non_arithmetic_agg_types(dtype, method, data):
-    # GH9311, GH6620
-    df = DataFrame(
-        [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}]
-    )
-
-    df["b"] = df.b.astype(dtype)
-
-    if "args" not in data:
-        data["args"] = []
-
-    if "out_type" in data:
-        out_type = data["out_type"]
-    else:
-        out_type = dtype
-
-    exp = data["df"]
-    df_out = DataFrame(exp)
-
-    df_out["b"] = df_out.b.astype(out_type)
-    df_out.set_index("a", inplace=True)
-
-    grpd = df.groupby("a")
-    t = getattr(grpd, method)(*data["args"])
-    tm.assert_frame_equal(t, df_out)
-
-
 @pytest.mark.parametrize(
     "i",
     [
@@ -493,78 +319,6 @@ def test_axis1_numeric_only(request, groupby_func, numeric_only):
         tm.assert_equal(result, expected)
 
 
-def scipy_sem(*args, **kwargs):
-    from scipy.stats import sem
-
-    return sem(*args, ddof=1, **kwargs)
-
-
-@pytest.mark.parametrize(
-    "op,targop",
-    [
-        ("mean", np.mean),
-        ("median", np.median),
-        ("std", np.std),
-        ("var", np.var),
-        ("sum", np.sum),
-        ("prod", np.prod),
-        ("min", np.min),
-        ("max", np.max),
-        ("first", lambda x: x.iloc[0]),
-        ("last", lambda x: x.iloc[-1]),
-        ("count", np.size),
-        pytest.param("sem", scipy_sem, marks=td.skip_if_no_scipy),
-    ],
-)
-def test_ops_general(op, targop):
-    df = DataFrame(np.random.default_rng(2).standard_normal(1000))
-    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)
-
-    result = getattr(df.groupby(labels), op)()
-    warn = None if op in ("first", "last", "count", "sem") else FutureWarning
-    msg = f"using DataFrameGroupBy.{op}"
-    with tm.assert_produces_warning(warn, match=msg):
-        expected = df.groupby(labels).agg(targop)
-    tm.assert_frame_equal(result, expected)
-
-
-def test_series_index_name(df):
-    grouped = df.loc[:, ["C"]].groupby(df["A"])
-    result = grouped.agg(lambda x: x.mean())
-    assert result.index.name == "A"
-
-
-@pytest.mark.parametrize(
-    "values",
-    [
-        {
-            "a": [1, 1, 1, 2, 2, 2, 3, 3, 3],
-            "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2],
-        },
-        {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]},
-    ],
-)
-@pytest.mark.parametrize("function", ["mean", "median", "var"])
-def test_apply_to_nullable_integer_returns_float(values, function):
-    # https://github.com/pandas-dev/pandas/issues/32219
-    output = 0.5 if function == "var" else 1.5
-    arr = np.array([output] * 3, dtype=float)
-    idx = Index([1, 2, 3], name="a", dtype="Int64")
-    expected = DataFrame({"b": arr}, index=idx).astype("Float64")
-
-    groups = DataFrame(values, dtype="Int64").groupby("a")
-
-    result = getattr(groups, function)()
-    tm.assert_frame_equal(result, expected)
-
-    result = groups.agg(function)
-    tm.assert_frame_equal(result, expected)
-
-    result = groups.agg([function])
-    expected.columns = MultiIndex.from_tuples([("b", function)])
-    tm.assert_frame_equal(result, expected)
-
-
 @pytest.mark.parametrize(
     "kernel, has_arg",
     [
@@ -781,31 +535,6 @@ def test_deprecate_numeric_only_series(dtype, groupby_func, request):
         tm.assert_series_equal(result, expected)
 
 
-@pytest.mark.parametrize("dtype", [int, float, object])
-@pytest.mark.parametrize(
-    "kwargs",
-    [
-        {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
-        {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
-        {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
-    ],
-)
-def test_groupby_empty_dataset(dtype, kwargs):
-    # GH#41575
-    df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
-    df["B"] = df["B"].astype(int)
-    df["C"] = df["C"].astype(float)
-
-    result = df.iloc[:0].groupby("A").describe(**kwargs)
-    expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
-    tm.assert_frame_equal(result, expected)
-
-    result = df.iloc[:0].groupby("A").B.describe(**kwargs)
-    expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
-    expected.index = Index([])
-    tm.assert_frame_equal(result, expected)
-
-
 def test_multiindex_group_all_columns_when_empty(groupby_func):
     # GH 32464
     df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"])
@@ -835,53 +564,3 @@ def test_duplicate_columns(request, groupby_func, as_index):
     if groupby_func not in ("size", "ngroup", "cumcount"):
         expected = expected.rename(columns={"c": "b"})
     tm.assert_equal(result, expected)
-
-
-@pytest.mark.parametrize(
-    "op",
-    [
-        "sum",
-        "prod",
-        "min",
-        "max",
-        "median",
-        "mean",
-        "skew",
-        "std",
-        "var",
-        "sem",
-    ],
-)
-@pytest.mark.parametrize("axis", [0, 1])
-@pytest.mark.parametrize("skipna", [True, False])
-@pytest.mark.parametrize("sort", [True, False])
-def test_regression_allowlist_methods(op, axis, skipna, sort):
-    # GH6944
-    # GH 17537
-    # explicitly test the allowlist methods
-    raw_frame = DataFrame([0])
-    if axis == 0:
-        frame = raw_frame
-        msg = "The 'axis' keyword in DataFrame.groupby is deprecated and will be"
-    else:
-        frame = raw_frame.T
-        msg = "DataFrame.groupby with axis=1 is deprecated"
-
-    with tm.assert_produces_warning(FutureWarning, match=msg):
-        grouped = frame.groupby(level=0, axis=axis, sort=sort)
-
-    if op == "skew":
-        # skew has skipna
-        result = getattr(grouped, op)(skipna=skipna)
-        expected = frame.groupby(level=0).apply(
-            lambda h: getattr(h, op)(axis=axis, skipna=skipna)
-        )
-        if sort:
-            expected = expected.sort_index(axis=axis)
-        tm.assert_frame_equal(result, expected)
-    else:
-        result = getattr(grouped, op)()
-        expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(axis=axis))
-        if sort:
-            expected = expected.sort_index(axis=axis)
-        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
index f48926e4d0c77..e5836ce1e61c9 100644
--- a/pandas/tests/groupby/test_reductions.py
+++ b/pandas/tests/groupby/test_reductions.py
@@ -17,6 +17,7 @@
     isna,
 )
 import pandas._testing as tm
+from pandas.util import _test_decorators as td
 
 
 @pytest.mark.parametrize("agg_func", ["any", "all"])
@@ -793,6 +794,23 @@ def test_empty_categorical(observed):
     tm.assert_series_equal(result, expected)
 
 
+def test_intercept_builtin_sum():
+    s = Series([1.0, 2.0, np.nan, 3.0])
+    grouped = s.groupby([0, 1, 2, 2])
+    # builtins.sum passed to agg/apply must warn and match grouped.sum()
+    msg = "using SeriesGroupBy.sum"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        # GH#53425
+        result = grouped.agg(builtins.sum)
+    msg = "using np.sum"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        # GH#53425
+        result2 = grouped.apply(builtins.sum)
+    expected = grouped.sum()
+    tm.assert_series_equal(result, expected)
+    tm.assert_series_equal(result2, expected)
+
+
 @pytest.mark.parametrize("min_count", [0, 10])
 def test_groupby_sum_mincount_boolean(min_count):
     b = True
@@ -853,3 +871,159 @@ def test_groupby_sum_timedelta_with_nat():
     res = gb["b"].sum(min_count=2)
     expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index)
     tm.assert_series_equal(res, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]
+)
+@pytest.mark.parametrize(
+    "method,data",
+    [
+        ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
+        ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
+        ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
+        ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
+        ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}),
+    ],
+)
+def test_groupby_non_arithmetic_agg_types(dtype, method, data):
+    # GH9311, GH6620
+    # These aggregations are expected to keep the dtype of "b" unless the
+    # case supplies an explicit "out_type" (as "count" does).
+    df = DataFrame(
+        [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}]
+    )
+
+    df["b"] = df.b.astype(dtype)
+
+    # Read the optional keys without writing to ``data``: parametrize passes
+    # the same dict object for every dtype, so a mutation here would leak
+    # into the other parametrized cases.
+    args = data.get("args", [])
+    out_type = data.get("out_type", dtype)
+
+    # Expected frame, indexed by the group key.
+    df_out = DataFrame(data["df"])
+    df_out["b"] = df_out.b.astype(out_type)
+    df_out = df_out.set_index("a")
+
+    # Run the aggregation under test.
+    grpd = df.groupby("a")
+    t = getattr(grpd, method)(*args)
+    tm.assert_frame_equal(t, df_out)
+
+
+def scipy_sem(*args, **kwargs):
+    # Local import: scipy may be missing (the "sem" case is skipped without it).
+    from scipy.stats import sem
+    return sem(*args, ddof=1, **kwargs)
+
+
+@pytest.mark.parametrize(
+    "op,targop",
+    [
+        ("mean", np.mean),
+        ("median", np.median),
+        ("std", np.std),
+        ("var", np.var),
+        ("sum", np.sum),
+        ("prod", np.prod),
+        ("min", np.min),
+        ("max", np.max),
+        ("first", lambda x: x.iloc[0]),
+        ("last", lambda x: x.iloc[-1]),
+        ("count", np.size),
+        pytest.param("sem", scipy_sem, marks=td.skip_if_no_scipy),
+    ],
+)
+def test_ops_general(op, targop):
+    df = DataFrame(np.random.default_rng(2).standard_normal(1000))
+    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)
+    # <op>() must equal agg(targop); agg warns except for first/last/count/sem
+    result = getattr(df.groupby(labels), op)()
+    warn = None if op in ("first", "last", "count", "sem") else FutureWarning
+    msg = f"using DataFrameGroupBy.{op}"
+    with tm.assert_produces_warning(warn, match=msg):
+        expected = df.groupby(labels).agg(targop)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values",
+    [
+        {
+            "a": [1, 1, 1, 2, 2, 2, 3, 3, 3],
+            "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2],
+        },
+        {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]},
+    ],
+)
+@pytest.mark.parametrize("function", ["mean", "median", "var"])
+def test_apply_to_nullable_integer_returns_float(values, function):
+    # https://github.com/pandas-dev/pandas/issues/32219
+    output = 0.5 if function == "var" else 1.5
+    arr = np.array([output] * 3, dtype=float)
+    idx = pd.Index([1, 2, 3], name="a", dtype="Int64")
+    expected = DataFrame({"b": arr}, index=idx).astype("Float64")
+
+    groups = DataFrame(values, dtype="Int64").groupby("a")
+
+    result = getattr(groups, function)()
+    tm.assert_frame_equal(result, expected)
+
+    result = groups.agg(function)
+    tm.assert_frame_equal(result, expected)
+    # a list of functions adds a second column level holding the function name
+    result = groups.agg([function])
+    expected.columns = MultiIndex.from_tuples([("b", function)])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "op",
+    [
+        "sum",
+        "prod",
+        "min",
+        "max",
+        "median",
+        "mean",
+        "skew",
+        "std",
+        "var",
+        "sem",
+    ],
+)
+@pytest.mark.parametrize("axis", [0, 1])
+@pytest.mark.parametrize("skipna", [True, False])
+@pytest.mark.parametrize("sort", [True, False])
+def test_regression_allowlist_methods(op, axis, skipna, sort):
+    # GH6944
+    # GH 17537
+    # explicitly test the allowlist methods against a per-group apply of the same op
+    raw_frame = DataFrame([0])
+    if axis == 0:
+        frame = raw_frame
+        msg = "The 'axis' keyword in DataFrame.groupby is deprecated and will be"
+    else:
+        frame = raw_frame.T
+        msg = "DataFrame.groupby with axis=1 is deprecated"
+
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        grouped = frame.groupby(level=0, axis=axis, sort=sort)
+
+    if op == "skew":
+        # skew is the only op here that is called with skipna
+        result = getattr(grouped, op)(skipna=skipna)
+        expected = frame.groupby(level=0).apply(
+            lambda h: getattr(h, op)(axis=axis, skipna=skipna)
+        )
+        if sort:
+            expected = expected.sort_index(axis=axis)
+        tm.assert_frame_equal(result, expected)
+    else:
+        result = getattr(grouped, op)()
+        expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(axis=axis))
+        if sort:
+            expected = expected.sort_index(axis=axis)
+        tm.assert_frame_equal(result, expected)