From 88aa9f74341cb3b8078de44fd9b60a0fd4e2e6a1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 16 Dec 2021 20:00:23 -0800 Subject: [PATCH 1/3] ENH: Add numba engine to groupby.sum --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/groupby/groupby.py | 37 ++++++++++++++++++++---------- pandas/tests/groupby/conftest.py | 1 + pandas/tests/groupby/test_numba.py | 9 +++++--- 4 files changed, 33 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f50442eb7ca46..057e7a1e9c2b7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -219,7 +219,7 @@ Other enhancements - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`) - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`) - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`) -- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, and :meth:`.GroupBy.var` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`) +- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`) - :meth:`Timestamp.isoformat`, now handles the ``timespec`` argument from the base :class:``datetime`` class (:issue:`26131`) - :meth:`NaT.to_numpy` ``dtype`` argument is now respected, so ``np.timedelta64`` can be returned (:issue:`44460`) - New option ``display.max_dir_items`` customizes the number of columns added to :meth:`Dataframe.__dir__` and suggested for tab completion (:issue:`37996`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e11d420ada29f..a1866e3bdc9f6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2163,22 +2163,35 @@ def size(self) -> DataFrame | Series: @final @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) def sum( - self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 + self, + numeric_only: bool | lib.NoDefault = lib.no_default, + min_count: int = 0, + engine: str | None = None, + engine_kwargs: dict[str, bool] | None = None, ): - numeric_only = self._resolve_numeric_only(numeric_only) + if maybe_use_numba(engine): + from pandas.core._numba.kernels import sliding_sum - # If we are grouping on categoricals we want unobserved categories to - # return zero, rather than the default of NaN which the reindexing in - # _agg_general() returns. GH #31422 - with com.temp_setattr(self, "observed", True): - result = self._agg_general( - numeric_only=numeric_only, - min_count=min_count, - alias="add", - npfunc=np.sum, + return self._numba_agg_general( + sliding_sum, + engine_kwargs, + "groupby_sum", ) + else: + numeric_only = self._resolve_numeric_only(numeric_only) - return self._reindex_output(result, fill_value=0) + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _agg_general() returns. GH #31422 + with com.temp_setattr(self, "observed", True): + result = self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="add", + npfunc=np.sum, + ) + + return self._reindex_output(result, fill_value=0) @final @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 596e9e0a4de77..2be680d7a4ccd 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -183,6 +183,7 @@ def nopython(request): ("var", {"ddof": 0}), ("std", {"ddof": 1}), ("std", {"ddof": 0}), + ("sum", {}), ] ) def numba_supported_reductions(request): diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index 6fc1f0d808bb2..645f2ebddf63d 100644 --- a/pandas/tests/groupby/test_numba.py +++ b/pandas/tests/groupby/test_numba.py @@ -24,7 +24,8 @@ def test_cython_vs_numba_frame( engine="numba", engine_kwargs=engine_kwargs, **kwargs ) expected = getattr(gb, func)(**kwargs) - tm.assert_frame_equal(result, expected) + check_dtype = func != "sum" + tm.assert_frame_equal(result, expected, check_dtype=check_dtype) def test_cython_vs_numba_getitem( self, sort, nogil, parallel, nopython, numba_supported_reductions @@ -37,7 +38,8 @@ def test_cython_vs_numba_getitem( engine="numba", engine_kwargs=engine_kwargs, **kwargs ) expected = getattr(gb, func)(**kwargs) - tm.assert_series_equal(result, expected) + check_dtype = func != "sum" + tm.assert_series_equal(result, expected, check_dtype=check_dtype) def test_cython_vs_numba_series( self, sort, nogil, parallel, nopython, numba_supported_reductions @@ -50,7 +52,8 @@ def test_cython_vs_numba_series( engine="numba", engine_kwargs=engine_kwargs, **kwargs ) expected = getattr(gb, func)(**kwargs) - tm.assert_series_equal(result, expected) + check_dtype = func != "sum" + tm.assert_series_equal(result, expected, check_dtype=check_dtype) def test_as_index_false_unsupported(self, numba_supported_reductions): func, kwargs = numba_supported_reductions From 318ab087b7b9289298240343e100994c38ec8e51 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 16 Dec 2021 20:05:47 -0800 Subject: [PATCH 2/3] Add issue number --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 057e7a1e9c2b7..d7ea4c553f8e0 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -219,7 +219,7 @@ Other enhancements - :meth:`DataFrame.dropna` now accepts a single label as ``subset`` along with array-like (:issue:`41021`) - :class:`ExcelWriter` argument ``if_sheet_exists="overlay"`` option added (:issue:`40231`) - :meth:`read_excel` now accepts a ``decimal`` argument that allow the user to specify the decimal point when parsing string columns to numeric (:issue:`14403`) -- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`) +- :meth:`.GroupBy.mean`, :meth:`.GroupBy.std`, :meth:`.GroupBy.var`, :meth:`.GroupBy.sum` now supports `Numba `_ execution with the ``engine`` keyword (:issue:`43731`, :issue:`44862`, :issue:`44939`) - :meth:`Timestamp.isoformat`, now handles the ``timespec`` argument from the base :class:``datetime`` class (:issue:`26131`) - :meth:`NaT.to_numpy` ``dtype`` argument is now respected, so ``np.timedelta64`` can be returned (:issue:`44460`) - New option ``display.max_dir_items`` customizes the number of columns added to :meth:`Dataframe.__dir__` and suggested for tab completion (:issue:`37996`) From 26826d5525766fd7c00767d6eadfbc0906541202 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 17 Dec 2021 17:55:30 -0800 Subject: [PATCH 3/3] Reference dtype gh issue --- pandas/tests/groupby/test_numba.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index 645f2ebddf63d..6554993c140a1 100644 --- a/pandas/tests/groupby/test_numba.py +++ b/pandas/tests/groupby/test_numba.py @@ -24,6 +24,7 @@ def test_cython_vs_numba_frame( engine="numba", engine_kwargs=engine_kwargs, **kwargs ) expected = getattr(gb, func)(**kwargs) + # check_dtype can be removed if GH 44952 is addressed check_dtype = func != "sum" tm.assert_frame_equal(result, expected, check_dtype=check_dtype) @@ -38,6 +39,7 @@ def test_cython_vs_numba_getitem( engine="numba", engine_kwargs=engine_kwargs, **kwargs ) expected = getattr(gb, func)(**kwargs) + # check_dtype can be removed if GH 44952 is addressed check_dtype = func != "sum" tm.assert_series_equal(result, expected, check_dtype=check_dtype) @@ -52,6 +54,7 @@ def test_cython_vs_numba_series( engine="numba", engine_kwargs=engine_kwargs, **kwargs ) expected = getattr(gb, func)(**kwargs) + # check_dtype can be removed if GH 44952 is addressed check_dtype = func != "sum" tm.assert_series_equal(result, expected, check_dtype=check_dtype)