ENH: Add engine="numba" to groupby mean (#43731)

mroeschke · web-flow · commit e87ad05a09b5 · 2021-11-04T20:59:13.000-04:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -182,7 +182,7 @@ Other enhancements
 - Added :meth:`.ExponentialMovingWindow.sum` (:issue:`13297`)
 - :meth:`Series.str.split` now supports a ``regex`` argument that explicitly specifies whether the pattern is a regular expression. Default is ``None`` (:issue:`43563`, :issue:`32835`, :issue:`25549`)
 - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
--
+- :meth:`.GroupBy.mean` now supports `Numba <http://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`43731`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -79,6 +79,7 @@ class providing the base-class of operations.
 )
 
 from pandas.core import nanops
+from pandas.core._numba import executor
 import pandas.core.algorithms as algorithms
 from pandas.core.arrays import (
     BaseMaskedArray,
@@ -1259,6 +1260,44 @@ def _numba_prep(self, func, data):
             sorted_data,
         )
 
+    def _numba_agg_general(
+        self,
+        func: Callable,
+        engine_kwargs: dict[str, bool] | None,
+        numba_cache_key_str: str,
+    ):
+        """
+        Perform groupby with a standard numerical aggregation function (e.g. mean)
+        with Numba; ``as_index=False`` and ``axis=1`` raise NotImplementedError.
+        """
+        if not self.as_index:
+            raise NotImplementedError(
+                "as_index=False is not supported. Use .reset_index() instead."
+            )
+        if self.axis == 1:
+            raise NotImplementedError("axis=1 is not supported.")
+
+        with self._group_selection_context():
+            data = self._selected_obj
+        df = data if data.ndim == 2 else data.to_frame()  # 1D input is promoted to 2D
+        starts, ends, sorted_index, sorted_data = self._numba_prep(func, df)
+        aggregator = executor.generate_shared_aggregator(
+            func, engine_kwargs, numba_cache_key_str
+        )
+        result = aggregator(sorted_data, starts, ends, 0)  # TODO confirm meaning of 0
+
+        cache_key = (func, numba_cache_key_str)  # store aggregator only once per key
+        if cache_key not in NUMBA_FUNC_CACHE:
+            NUMBA_FUNC_CACHE[cache_key] = aggregator
+
+        index = self.grouper.result_index
+        if data.ndim == 1:
+            result_kwargs = {"name": data.name}
+            result = result.ravel()  # flatten values for the 1D result
+        else:
+            result_kwargs = {"columns": data.columns}
+        return data._constructor(result, index=index, **result_kwargs)
+
     @final
     def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs):
         """
@@ -1827,7 +1866,12 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike:
     @final
     @Substitution(name="groupby")
     @Substitution(see_also=_common_see_also)
-    def mean(self, numeric_only: bool | lib.NoDefault = lib.no_default):
+    def mean(
+        self,
+        numeric_only: bool | lib.NoDefault = lib.no_default,
+        engine: str = "cython",
+        engine_kwargs: dict[str, bool] | None = None,
+    ):
         """
         Compute mean of groups, excluding missing values.
 
@@ -1837,6 +1881,23 @@ def mean(self, numeric_only: bool | lib.NoDefault = lib.no_default):
             Include only float, int, boolean columns. If None, will attempt to use
             everything, then use only numeric data.
 
+        engine : str, default 'cython'
+            * ``'cython'`` : Runs the operation through C-extensions from cython.
+            * ``'numba'`` : Runs the operation through JIT compiled code from numba.
+            * ``None`` : Defaults to ``'cython'`` or globally setting
+              ``compute.use_numba``
+
+            .. versionadded:: 1.4.0
+
+        engine_kwargs : dict, default None
+            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
+            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
+              and ``parallel`` dictionary keys. The values must either be ``True`` or
+              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
+              ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
+
+            .. versionadded:: 1.4.0
+
         Returns
         -------
         pandas.Series or pandas.DataFrame
@@ -1877,12 +1938,17 @@ def mean(self, numeric_only: bool | lib.NoDefault = lib.no_default):
         """
         numeric_only = self._resolve_numeric_only(numeric_only)
 
-        result = self._cython_agg_general(
-            "mean",
-            alt=lambda x: Series(x).mean(numeric_only=numeric_only),
-            numeric_only=numeric_only,
-        )
-        return result.__finalize__(self.obj, method="groupby")
+        if maybe_use_numba(engine):
+            from pandas.core._numba.kernels import sliding_mean
+
+            return self._numba_agg_general(sliding_mean, engine_kwargs, "groupby_mean")
+        else:
+            result = self._cython_agg_general(
+                "mean",
+                alt=lambda x: Series(x).mean(numeric_only=numeric_only),
+                numeric_only=numeric_only,
+            )
+            return result.__finalize__(self.obj, method="groupby")
 
     @final
     @Substitution(name="groupby")
diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py
@@ -12,6 +12,11 @@
 )
 
 
+@pytest.fixture(params=[True, False])
+def sort(request):
+    return request.param  # current value taken from params=[True, False]
+
+
 @pytest.fixture(params=[True, False])
 def as_index(request):
     return request.param
diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py
@@ -0,0 +1,51 @@
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import (
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+
+
+@td.skip_if_no("numba")
+@pytest.mark.filterwarnings("ignore:\\nThe keyword argument")
+# Ignore the warning emitted when parallel=True but Numba cannot parallelize the function
+class TestEngine:
+    def test_cython_vs_numba_frame(self, sort, nogil, parallel, nopython):
+        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+        result = df.groupby("a", sort=sort).mean(
+            engine="numba", engine_kwargs=engine_kwargs
+        )
+        expected = df.groupby("a", sort=sort).mean()  # default engine as reference
+        tm.assert_frame_equal(result, expected)
+
+    def test_cython_vs_numba_getitem(self, sort, nogil, parallel, nopython):
+        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+        result = df.groupby("a", sort=sort)["c"].mean(
+            engine="numba", engine_kwargs=engine_kwargs
+        )
+        expected = df.groupby("a", sort=sort)["c"].mean()  # default engine
+        tm.assert_series_equal(result, expected)
+
+    def test_cython_vs_numba_series(self, sort, nogil, parallel, nopython):
+        ser = Series(range(3), index=[1, 2, 1], name="foo")
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+        result = ser.groupby(level=0, sort=sort).mean(
+            engine="numba", engine_kwargs=engine_kwargs
+        )
+        expected = ser.groupby(level=0, sort=sort).mean()  # default engine
+        tm.assert_series_equal(result, expected)
+
+    def test_as_index_false_unsupported(self):  # numba path must raise
+        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
+        with pytest.raises(NotImplementedError, match="as_index=False"):
+            df.groupby("a", as_index=False).mean(engine="numba")
+
+    def test_axis_1_unsupported(self):  # numba path must raise
+        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
+        with pytest.raises(NotImplementedError, match="axis=1"):
+            df.groupby("a", axis=1).mean(engine="numba")