From ad5e12e735788c4ad8dcc504e8d9e08540d0e6e9 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sat, 11 Jan 2020 20:35:05 -0800 Subject: [PATCH 1/7] Add engine keyword to expanding.apply to utilize Numba --- doc/source/user_guide/computation.rst | 1 + pandas/core/window/expanding.py | 19 ++++++++++-- pandas/core/window/rolling.py | 2 +- .../window/moments/test_moments_expanding.py | 29 +++++++++++++------ 4 files changed, 39 insertions(+), 12 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index a2150c207c0b0..aeb32db639ffb 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -348,6 +348,7 @@ Numba will be applied in potentially two routines: 1. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. + 2. The engine will JIT the for loop where the apply function is applied to each window. The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 68c3514308cbc..279af82b2bad3 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -148,8 +148,23 @@ def count(self, **kwargs): @Substitution(name="expanding") @Appender(_shared_docs["apply"]) - def apply(self, func, raw=False, args=(), kwargs={}): - return super().apply(func, raw=raw, args=args, kwargs=kwargs) + def apply( + self, + func, + raw=False, + engine="cython", + engine_kwargs=None, + args=None, + kwargs=None, + ): + return super().apply( + func, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) @Substitution(name="expanding") @Appender(_shared_docs["sum"]) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f612826132fd7..2b327ec406923 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1203,7 +1203,7 @@ def count(self): _shared_docs["apply"] = dedent( r""" - The %(name)s function's apply function. + Apply an arbitrary function to each %(name)s window. Parameters ---------- diff --git a/pandas/tests/window/moments/test_moments_expanding.py b/pandas/tests/window/moments/test_moments_expanding.py index 4596552d8f255..322082187f531 100644 --- a/pandas/tests/window/moments/test_moments_expanding.py +++ b/pandas/tests/window/moments/test_moments_expanding.py @@ -13,15 +13,17 @@ class TestExpandingMomentsConsistency(ConsistencyBase): def setup_method(self, method): self._create_data() - def test_expanding_apply_args_kwargs(self, raw): + def test_expanding_apply_args_kwargs(self, engine_and_raw): def mean_w_arg(x, const): return np.mean(x) + const + engine, raw = engine_and_raw + df = DataFrame(np.random.rand(20, 3)) - expected = df.expanding().apply(np.mean, raw=raw) + 20.0 + expected = df.expanding().apply(np.mean, engine=engine, raw=raw) + 20.0 - result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) + result = df.expanding().apply(mean_w_arg, engine=engine, raw=raw, args=(20,)) tm.assert_frame_equal(result, expected) result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) @@ -190,11 +192,14 @@ def expanding_func(x, min_periods=1, center=False, axis=0): ) @pytest.mark.parametrize("has_min_periods", [True, False]) - def test_expanding_apply(self, raw, has_min_periods): + def test_expanding_apply(self, engine_and_raw, has_min_periods): + + engine, raw = engine_and_raw + def expanding_mean(x, min_periods=1): exp = x.expanding(min_periods=min_periods) - result = exp.apply(lambda x: x.mean(), raw=raw) + result = exp.apply(lambda x: x.mean(), raw=raw, engine=engine) return result # TODO(jreback), needed to add preserve_nan=False @@ -202,14 +207,20 @@ def expanding_mean(x, min_periods=1): self._check_expanding(expanding_mean, np.mean, preserve_nan=False) self._check_expanding_has_min_periods(expanding_mean, np.mean, has_min_periods) - def test_expanding_apply_empty_series(self, raw): + def test_expanding_apply_empty_series(self, engine_and_raw): + engine, raw = engine_and_raw ser = Series([], dtype=np.float64) - tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) + tm.assert_series_equal( + ser, ser.expanding().apply(lambda x: x.mean(), raw=raw, engine=engine) + ) - def test_expanding_apply_min_periods_0(self, raw): + def test_expanding_apply_min_periods_0(self, engine_and_raw): # GH 8080 + engine, raw = engine_and_raw s = Series([None, None, None]) - result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) + result = s.expanding(min_periods=0).apply( + lambda x: len(x), raw=raw, engine=engine + ) expected = Series([1.0, 2.0, 3.0]) tm.assert_series_equal(result, expected) From 36bbead14a38a2ce37000ef523e5bb51459ff03a Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 13 Jan 2020 23:51:11 -0800 Subject: [PATCH 2/7] Add asvs and typing --- asv_bench/asv.conf.json | 1 + asv_bench/benchmarks/rolling.py | 21 +++++++++++++++++++++ pandas/core/window/expanding.py | 7 ++++--- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index cd1a31d4eaf34..7886b63e9983e 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -43,6 +43,7 @@ "matplotlib": [], "sqlalchemy": [], "scipy": [], + "numba": [], "numexpr": [], "pytables": [null, ""], // platform dependent, see excludes below "tables": [null, ""], diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 7a72622fd5fe3..3ab0633813a50 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -44,6 +44,27 @@ def time_rolling(self, constructor, window, dtype, function, raw): self.roll.apply(function, raw=raw) +class Engine: + params = ( + ["DataFrame", "Series"], + ["int", "float"], + [sum, np.sum, lambda x: np.sum(x) + 5], + ['cython', 'numba'], + ) + param_names = ["constructor", "dtype", "function", "engine"] + + def setup(self, constructor, dtype, function, engine): + N = 10 ** 3 + arr = (100 * np.random.random(N)).astype(dtype) + self.data = getattr(pd, constructor)(arr) + + def time_rolling_apply(self, constructor, dtype, function, engine): + self.data.rolling(10).apply(function, raw=False, engine=engine) + + def time_expanding_apply(self, constructor, dtype, function, engine): + self.data.expanding().apply(function, raw=False, engine=engine) + + class ExpandingMethods: params = ( diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 279af82b2bad3..a0bf3376d2352 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -1,4 +1,5 @@ from textwrap import dedent +from typing import Dict, Optional from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution @@ -151,9 +152,9 @@ def count(self, **kwargs): def apply( self, func, - raw=False, - engine="cython", - engine_kwargs=None, + raw: bool = False, + engine: str = "cython", + engine_kwargs: Optional[Dict[str, bool]] = None, args=None, kwargs=None, ): From 970778b8d02d213aea98124697955a40741f5547 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 14 Jan 2020 08:55:30 -0800 Subject: [PATCH 3/7] Black benchmarks --- asv_bench/benchmarks/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 3ab0633813a50..25f263539ce19 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -49,7 +49,7 @@ class Engine: ["DataFrame", "Series"], ["int", "float"], [sum, np.sum, lambda x: np.sum(x) + 5], - ['cython', 'numba'], + ["cython", "numba"], ) param_names = ["constructor", "dtype", "function", "engine"] From 6442f153c259037d027579728bb2120f0b751d96 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 14 Jan 2020 09:57:07 -0800 Subject: [PATCH 4/7] Change raw=True in benchmarks --- asv_bench/benchmarks/rolling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 25f263539ce19..6e936e5e179a3 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -49,7 +49,7 @@ class Engine: ["DataFrame", "Series"], ["int", "float"], [sum, np.sum, lambda x: np.sum(x) + 5], - ["cython", "numba"], + ['cython', 'numba'], ) param_names = ["constructor", "dtype", "function", "engine"] @@ -59,10 +59,10 @@ def setup(self, constructor, dtype, function, engine): self.data = getattr(pd, constructor)(arr) def time_rolling_apply(self, constructor, dtype, function, engine): - self.data.rolling(10).apply(function, raw=False, engine=engine) + self.data.rolling(10).apply(function, raw=True, engine=engine) def time_expanding_apply(self, constructor, dtype, function, engine): - self.data.expanding().apply(function, raw=False, engine=engine) + self.data.expanding().apply(function, raw=True, engine=engine) class ExpandingMethods: From 42b60648c72c9ba0e00eacf326bee90b1eb59d7d Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 14 Jan 2020 11:42:20 -0800 Subject: [PATCH 5/7] Black benchmarks again --- asv_bench/benchmarks/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 6e936e5e179a3..60e61cf89368c 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -49,7 +49,7 @@ class Engine: ["DataFrame", "Series"], ["int", "float"], [sum, np.sum, lambda x: np.sum(x) + 5], - ['cython', 'numba'], + ["cython", "numba"], ) param_names = ["constructor", "dtype", "function", "engine"] From cb7bf1c907625de09d40cbc6389210f71d3f6c60 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 14 Jan 2020 22:25:48 -0800 Subject: [PATCH 6/7] Add 1.0.0 whatsnew --- doc/source/whatsnew/v1.0.0.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index c423933d4c438..fa562838c8f7c 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -159,14 +159,14 @@ You can use the alias ``"boolean"`` as well. .. _whatsnew_100.numba_rolling_apply: -Using Numba in ``rolling.apply`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Using Numba in ``rolling.apply`` and ``expanding.apply`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` that allows the user to execute the -routine using `Numba `__ instead of Cython. Using the Numba engine -can yield significant performance gains if the apply function can operate on numpy arrays and +We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` and :meth:`~core.window.expanding.Expanding.apply` +that allows the user to execute the routine using `Numba `__ instead of Cython. +Using the Numba engine can yield significant performance gains if the apply function can operate on numpy arrays and the data set is larger (1 million rows or greater). For more details, see -:ref:`rolling apply documentation ` (:issue:`28987`) +:ref:`rolling apply documentation ` (:issue:`28987`, :issue:`30936`) .. _whatsnew_100.custom_window: From 7ab92299677f6fc3e7b510fbe73aea2ba309d1b8 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 14 Jan 2020 22:36:38 -0800 Subject: [PATCH 7/7] Simplify benchmarks --- asv_bench/benchmarks/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 60e61cf89368c..f7e1e395a76bc 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -48,7 +48,7 @@ class Engine: params = ( ["DataFrame", "Series"], ["int", "float"], - [sum, np.sum, lambda x: np.sum(x) + 5], + [np.sum, lambda x: np.sum(x) + 5], ["cython", "numba"], ) param_names = ["constructor", "dtype", "function", "engine"]