From 2226c84b0f258560e2bca032d6d1ee6598de2fc1 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Sun, 12 Dec 2021 16:59:58 -0800
Subject: [PATCH 1/2] ENH: Add numba engine to groupby.var/std

---
 doc/source/whatsnew/v1.4.0.rst     |  2 +-
 pandas/core/groupby/groupby.py     | 95 ++++++++++++++++++++++++------
 pandas/tests/groupby/conftest.py   | 14 +++++
 pandas/tests/groupby/test_numba.py | 48 ++++++++++-----
 4 files changed, 125 insertions(+), 34 deletions(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index 7cf8c07683514..f9e2b9713d074 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -219,7 +219,7 @@ Other enhancements
 - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
 - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`)
 - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`)
-- :meth:`.GroupBy.mean` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`)
+- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, and :meth:`.GroupBy.var` now support `Numba `_ execution with the ``engine`` keyword (:issue:`43731`)
 - :meth:`Timestamp.isoformat`, now handles the ``timespec`` argument from the base :class:``datetime`` class (:issue:`26131`)
 - :meth:`NaT.to_numpy` ``dtype`` argument is now respected, so ``np.timedelta64`` can be returned (:issue:`44460`)
 - New option ``display.max_dir_items`` customizes the number of columns added to :meth:`Dataframe.__dir__` and suggested for tab completion (:issue:`37996`)
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 2876ec1cb5a0d..e11d420ada29f 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -1272,6 +1272,7 @@ def _numba_agg_general(
         func: Callable,
         engine_kwargs: dict[str, bool] | None,
         numba_cache_key_str: str,
+        *aggregator_args,
     ):
         """
         Perform groupby with a standard numerical aggregation function (e.g. mean)
@@ -1291,7 +1292,7 @@ def _numba_agg_general(
         aggregator = executor.generate_shared_aggregator(
             func, engine_kwargs, numba_cache_key_str
         )
-        result = aggregator(sorted_data, starts, ends, 0)
+        result = aggregator(sorted_data, starts, ends, 0, *aggregator_args)
 
         cache_key = (func, numba_cache_key_str)
         if cache_key not in NUMBA_FUNC_CACHE:
@@ -1989,7 +1990,12 @@ def median(self, numeric_only: bool | lib.NoDefault = lib.no_default):
     @final
     @Substitution(name="groupby")
     @Appender(_common_see_also)
-    def std(self, ddof: int = 1):
+    def std(
+        self,
+        ddof: int = 1,
+        engine: str | None = None,
+        engine_kwargs: dict[str, bool] | None = None,
+    ):
         """
         Compute standard deviation of groups, excluding missing values.
 
@@ -2000,23 +2006,52 @@ def std(self, ddof: int = 1):
         ddof : int, default 1
             Degrees of freedom.
 
+        engine : str, default None
+            * ``'cython'`` : Runs the operation through C-extensions from cython.
+            * ``'numba'`` : Runs the operation through JIT compiled code from numba.
+            * ``None`` : Defaults to ``'cython'`` or globally setting
+              ``compute.use_numba``
+
+            .. versionadded:: 1.4.0
+
+        engine_kwargs : dict, default None
+            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
+            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
+              and ``parallel`` dictionary keys. The values must either be ``True`` or
+              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
+              ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
+
+            .. versionadded:: 1.4.0
+
         Returns
         -------
         Series or DataFrame
             Standard deviation of values within each group.
         """
-        return self._get_cythonized_result(
-            libgroupby.group_var,
-            needs_counts=True,
-            cython_dtype=np.dtype(np.float64),
-            post_processing=lambda vals, inference: np.sqrt(vals),
-            ddof=ddof,
-        )
+        if maybe_use_numba(engine):
+            from pandas.core._numba.kernels import sliding_var
+
+            return np.sqrt(
+                self._numba_agg_general(sliding_var, engine_kwargs, "groupby_std", ddof)
+            )
+        else:
+            return self._get_cythonized_result(
+                libgroupby.group_var,
+                needs_counts=True,
+                cython_dtype=np.dtype(np.float64),
+                post_processing=lambda vals, inference: np.sqrt(vals),
+                ddof=ddof,
+            )
 
     @final
     @Substitution(name="groupby")
     @Appender(_common_see_also)
-    def var(self, ddof: int = 1):
+    def var(
+        self,
+        ddof: int = 1,
+        engine: str | None = None,
+        engine_kwargs: dict[str, bool] | None = None,
+    ):
         """
         Compute variance of groups, excluding missing values.
 
@@ -2027,20 +2062,46 @@ def var(self, ddof: int = 1):
         ddof : int, default 1
             Degrees of freedom.
 
+        engine : str, default None
+            * ``'cython'`` : Runs the operation through C-extensions from cython.
+            * ``'numba'`` : Runs the operation through JIT compiled code from numba.
+            * ``None`` : Defaults to ``'cython'`` or globally setting
+              ``compute.use_numba``
+
+            .. versionadded:: 1.4.0
+
+        engine_kwargs : dict, default None
+            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
+            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
+              and ``parallel`` dictionary keys. The values must either be ``True`` or
+              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
+              ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
+
+            .. versionadded:: 1.4.0
+
         Returns
         -------
         Series or DataFrame
             Variance of values within each group.
         """
-        if ddof == 1:
-            numeric_only = self._resolve_numeric_only(lib.no_default)
-            return self._cython_agg_general(
-                "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only
+        if maybe_use_numba(engine):
+            from pandas.core._numba.kernels import sliding_var
+
+            return self._numba_agg_general(
+                sliding_var, engine_kwargs, "groupby_var", ddof
             )
         else:
-            func = lambda x: x.var(ddof=ddof)
-            with self._group_selection_context():
-                return self._python_agg_general(func)
+            if ddof == 1:
+                numeric_only = self._resolve_numeric_only(lib.no_default)
+                return self._cython_agg_general(
+                    "var",
+                    alt=lambda x: Series(x).var(ddof=ddof),
+                    numeric_only=numeric_only,
+                )
+            else:
+                func = lambda x: x.var(ddof=ddof)
+                with self._group_selection_context():
+                    return self._python_agg_general(func)
 
     @final
     @Substitution(name="groupby")
diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py
index b61f6700f8d1d..596e9e0a4de77 100644
--- a/pandas/tests/groupby/conftest.py
+++ b/pandas/tests/groupby/conftest.py
@@ -174,3 +174,17 @@ def nogil(request):
 def nopython(request):
     """nopython keyword argument for numba.jit"""
     return request.param
+
+
+@pytest.fixture(
+    params=[
+        ("mean", {}),
+        ("var", {"ddof": 1}),
+        ("var", {"ddof": 0}),
+        ("std", {"ddof": 1}),
+        ("std", {"ddof": 0}),
+    ]
+)
+def numba_supported_reductions(request):
+    """reductions supported with engine='numba'"""
+    return request.param
diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py
index 20fd02b21a744..6fc1f0d808bb2 100644
--- a/pandas/tests/groupby/test_numba.py
+++ b/pandas/tests/groupby/test_numba.py
@@ -13,39 +13,55 @@
 @pytest.mark.filterwarnings("ignore:\n")
 # Filter warnings when parallel=True and the function can't be parallelized by Numba
 class TestEngine:
-    def test_cython_vs_numba_frame(self, sort, nogil, parallel, nopython):
+    def test_cython_vs_numba_frame(
+        self, sort, nogil, parallel, nopython, numba_supported_reductions
+    ):
+        func, kwargs = numba_supported_reductions
         df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
-        result = df.groupby("a", sort=sort).mean(
-            engine="numba", engine_kwargs=engine_kwargs
+        gb = df.groupby("a", sort=sort)
+        result = getattr(gb, func)(
+            engine="numba", engine_kwargs=engine_kwargs, **kwargs
         )
-        expected = df.groupby("a", sort=sort).mean()
+        expected = getattr(gb, func)(**kwargs)
         tm.assert_frame_equal(result, expected)
 
-    def test_cython_vs_numba_getitem(self, sort, nogil, parallel, nopython):
+    def test_cython_vs_numba_getitem(
+        self, sort, nogil, parallel, nopython, numba_supported_reductions
+    ):
+        func, kwargs = numba_supported_reductions
         df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
-        result = df.groupby("a", sort=sort)["c"].mean(
-            engine="numba", engine_kwargs=engine_kwargs
+        gb = df.groupby("a", sort=sort)["c"]
+        result = getattr(gb, func)(
+            engine="numba", engine_kwargs=engine_kwargs, **kwargs
         )
-        expected = df.groupby("a", sort=sort)["c"].mean()
+        expected = getattr(gb, func)(**kwargs)
         tm.assert_series_equal(result, expected)
 
-    def test_cython_vs_numba_series(self, sort, nogil, parallel, nopython):
+    def test_cython_vs_numba_series(
+        self, sort, nogil, parallel, nopython, numba_supported_reductions
+    ):
+        func, kwargs = numba_supported_reductions
         ser = Series(range(3), index=[1, 2, 1], name="foo")
         engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
-        result = ser.groupby(level=0, sort=sort).mean(
-            engine="numba", engine_kwargs=engine_kwargs
+        gb = ser.groupby(level=0, sort=sort)
+        result = getattr(gb, func)(
+            engine="numba", engine_kwargs=engine_kwargs, **kwargs
         )
-        expected = ser.groupby(level=0, sort=sort).mean()
+        expected = getattr(gb, func)(**kwargs)
         tm.assert_series_equal(result, expected)
 
-    def test_as_index_false_unsupported(self):
+    def test_as_index_false_unsupported(self, numba_supported_reductions):
+        func, kwargs = numba_supported_reductions
         df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
+        gb = df.groupby("a", as_index=False)
         with pytest.raises(NotImplementedError, match="as_index=False"):
-            df.groupby("a", as_index=False).mean(engine="numba")
+            getattr(gb, func)(engine="numba", **kwargs)
 
-    def test_axis_1_unsupported(self):
+    def test_axis_1_unsupported(self, numba_supported_reductions):
+        func, kwargs = numba_supported_reductions
         df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
+        gb = df.groupby("a", axis=1)
         with pytest.raises(NotImplementedError, match="axis=1"):
-            df.groupby("a", axis=1).mean(engine="numba")
+            getattr(gb, func)(engine="numba", **kwargs)

From 6ad4f778ebe44184159249070eb2c8198d670a43 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Sun, 12 Dec 2021 17:08:24 -0800
Subject: [PATCH 2/2] Add issue ref

---
 doc/source/whatsnew/v1.4.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
index f9e2b9713d074..e29332c761dc9 100644
--- a/doc/source/whatsnew/v1.4.0.rst
+++ b/doc/source/whatsnew/v1.4.0.rst
@@ -219,7 +219,7 @@ Other enhancements
 - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`)
 - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`)
 - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`)
-- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, and :meth:`.GroupBy.var` now support `Numba `_ execution with the ``engine`` keyword (:issue:`43731`)
+- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, and :meth:`.GroupBy.var` now support `Numba `_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`)
 - :meth:`Timestamp.isoformat`, now handles the ``timespec`` argument from the base :class:``datetime`` class (:issue:`26131`)
 - :meth:`NaT.to_numpy` ``dtype`` argument is now respected, so ``np.timedelta64`` can be returned (:issue:`44460`)
 - New option ``display.max_dir_items`` customizes the number of columns added to :meth:`Dataframe.__dir__` and suggested for tab completion (:issue:`37996`)
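
For reference, a minimal usage sketch of the API this patch series adds (not part of the patches themselves). It assumes a pandas build that includes the change, with numba installed; the example frame mirrors the new tests, and matching results between the two engines is what those tests assert.

import pandas as pd

# Sample frame mirroring pandas/tests/groupby/test_numba.py
df = pd.DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
gb = df.groupby("a")

# Default Cython engine
expected = gb.var(ddof=1)

# Numba engine added by this series; the engine_kwargs shown are the documented defaults
result = gb.var(
    ddof=1,
    engine="numba",
    engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
)

# Both engines should produce the same values
pd.testing.assert_frame_equal(result, expected)

# std follows the same pattern; with engine="numba" it is the square root of the Numba variance
pd.testing.assert_frame_equal(gb.std(ddof=0, engine="numba"), gb.std(ddof=0))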