diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index cd1a31d4eaf34..7886b63e9983e 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -43,6 +43,7 @@ "matplotlib": [], "sqlalchemy": [], "scipy": [], + "numba": [], "numexpr": [], "pytables": [null, ""], // platform dependent, see excludes below "tables": [null, ""], diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 7a72622fd5fe3..f7e1e395a76bc 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -44,6 +44,27 @@ def time_rolling(self, constructor, window, dtype, function, raw): self.roll.apply(function, raw=raw) +class Engine: + params = ( + ["DataFrame", "Series"], + ["int", "float"], + [np.sum, lambda x: np.sum(x) + 5], + ["cython", "numba"], + ) + param_names = ["constructor", "dtype", "function", "engine"] + + def setup(self, constructor, dtype, function, engine): + N = 10 ** 3 + arr = (100 * np.random.random(N)).astype(dtype) + self.data = getattr(pd, constructor)(arr) + + def time_rolling_apply(self, constructor, dtype, function, engine): + self.data.rolling(10).apply(function, raw=True, engine=engine) + + def time_expanding_apply(self, constructor, dtype, function, engine): + self.data.expanding().apply(function, raw=True, engine=engine) + + class ExpandingMethods: params = ( diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index a2150c207c0b0..aeb32db639ffb 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -348,6 +348,7 @@ Numba will be applied in potentially two routines: 1. If ``func`` is a standard Python function, the engine will `JIT <https://numba.pydata.org/numba-doc/latest/user/overview.html>`__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. + 2. The engine will JIT the for loop where the apply function is applied to each window. 
The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c423933d4c438..fa562838c8f7c 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -159,14 +159,14 @@ You can use the alias ``"boolean"`` as well. .. _whatsnew_100.numba_rolling_apply: -Using Numba in ``rolling.apply`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Using Numba in ``rolling.apply`` and ``expanding.apply`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` that allows the user to execute the -routine using `Numba <https://numba.pydata.org/>`__ instead of Cython. Using the Numba engine -can yield significant performance gains if the apply function can operate on numpy arrays and +We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` and :meth:`~core.window.expanding.Expanding.apply` +that allows the user to execute the routine using `Numba <https://numba.pydata.org/>`__ instead of Cython. +Using the Numba engine can yield significant performance gains if the apply function can operate on numpy arrays and the data set is larger (1 million rows or greater). For more details, see -:ref:`rolling apply documentation <stats.rolling_apply>` (:issue:`28987`) +:ref:`rolling apply documentation <stats.rolling_apply>` (:issue:`28987`, :issue:`30936`) .. 
_whatsnew_100.custom_window: diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 68c3514308cbc..a0bf3376d2352 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -1,4 +1,5 @@ from textwrap import dedent +from typing import Dict, Optional from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution @@ -148,8 +149,23 @@ def count(self, **kwargs): @Substitution(name="expanding") @Appender(_shared_docs["apply"]) - def apply(self, func, raw=False, args=(), kwargs={}): - return super().apply(func, raw=raw, args=args, kwargs=kwargs) + def apply( + self, + func, + raw: bool = False, + engine: str = "cython", + engine_kwargs: Optional[Dict[str, bool]] = None, + args=None, + kwargs=None, + ): + return super().apply( + func, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) @Substitution(name="expanding") @Appender(_shared_docs["sum"]) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index bdc94c7402eb5..f7efa69778c44 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1203,7 +1203,7 @@ def count(self): _shared_docs["apply"] = dedent( r""" - The %(name)s function's apply function. + Apply an arbitrary function to each %(name)s window. 
Parameters ---------- diff --git a/pandas/tests/window/moments/test_moments_expanding.py b/pandas/tests/window/moments/test_moments_expanding.py index 4596552d8f255..322082187f531 100644 --- a/pandas/tests/window/moments/test_moments_expanding.py +++ b/pandas/tests/window/moments/test_moments_expanding.py @@ -13,15 +13,17 @@ class TestExpandingMomentsConsistency(ConsistencyBase): def setup_method(self, method): self._create_data() - def test_expanding_apply_args_kwargs(self, raw): + def test_expanding_apply_args_kwargs(self, engine_and_raw): def mean_w_arg(x, const): return np.mean(x) + const + engine, raw = engine_and_raw + df = DataFrame(np.random.rand(20, 3)) - expected = df.expanding().apply(np.mean, raw=raw) + 20.0 + expected = df.expanding().apply(np.mean, engine=engine, raw=raw) + 20.0 - result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) + result = df.expanding().apply(mean_w_arg, engine=engine, raw=raw, args=(20,)) tm.assert_frame_equal(result, expected) result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) @@ -190,11 +192,14 @@ def expanding_func(x, min_periods=1, center=False, axis=0): ) @pytest.mark.parametrize("has_min_periods", [True, False]) - def test_expanding_apply(self, raw, has_min_periods): + def test_expanding_apply(self, engine_and_raw, has_min_periods): + + engine, raw = engine_and_raw + def expanding_mean(x, min_periods=1): exp = x.expanding(min_periods=min_periods) - result = exp.apply(lambda x: x.mean(), raw=raw) + result = exp.apply(lambda x: x.mean(), raw=raw, engine=engine) return result # TODO(jreback), needed to add preserve_nan=False @@ -202,14 +207,20 @@ def expanding_mean(x, min_periods=1): self._check_expanding(expanding_mean, np.mean, preserve_nan=False) self._check_expanding_has_min_periods(expanding_mean, np.mean, has_min_periods) - def test_expanding_apply_empty_series(self, raw): + def test_expanding_apply_empty_series(self, engine_and_raw): + engine, raw = engine_and_raw ser = Series([], 
dtype=np.float64) - tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) + tm.assert_series_equal( + ser, ser.expanding().apply(lambda x: x.mean(), raw=raw, engine=engine) + ) - def test_expanding_apply_min_periods_0(self, raw): + def test_expanding_apply_min_periods_0(self, engine_and_raw): # GH 8080 + engine, raw = engine_and_raw s = Series([None, None, None]) - result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) + result = s.expanding(min_periods=0).apply( + lambda x: len(x), raw=raw, engine=engine + ) expected = Series([1.0, 2.0, 3.0]) tm.assert_series_equal(result, expected)