ENH: Add compute.use_numba configuration for automatically using numba (pandas-dev#35182)

mroeschke · fangchenli · commit 7dc0edb96089 · 2020-07-16T18:04:00.000-05:00
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -338,6 +338,7 @@ Other enhancements
 - :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable boolean dtype (:issue:`34859`)
 - :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`)
 - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`)
+- ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -52,6 +52,20 @@ def use_numexpr_cb(key):
     expressions.set_use_numexpr(cf.get_option(key))
 
 
+use_numba_doc = """
+: bool
+    Use the numba engine option for select operations if it is installed,
+    the default is False
+    Valid values: False,True
+"""
+
+
+def use_numba_cb(key):
+    from pandas.core.util import numba_
+
+    numba_.set_use_numba(cf.get_option(key))
+
+
 with cf.config_prefix("compute"):
     cf.register_option(
         "use_bottleneck",
@@ -63,6 +77,9 @@ def use_numexpr_cb(key):
     cf.register_option(
         "use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb
     )
+    cf.register_option(
+        "use_numba", False, use_numba_doc, validator=is_bool, cb=use_numba_cb
+    )
 #
 # options from the "display" namespace
 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
@@ -80,6 +80,7 @@
 from pandas.core.util.numba_ import (
     NUMBA_FUNC_CACHE,
     generate_numba_func,
+    maybe_use_numba,
     split_for_numba,
 )
 
@@ -227,9 +228,7 @@ def apply(self, func, *args, **kwargs):
     @doc(
         _agg_template, examples=_agg_examples_doc, klass="Series",
     )
-    def aggregate(
-        self, func=None, *args, engine="cython", engine_kwargs=None, **kwargs
-    ):
+    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
 
         relabeling = func is None
         columns = None
@@ -483,7 +482,7 @@ def _aggregate_named(self, func, *args, **kwargs):
 
     @Substitution(klass="Series")
     @Appender(_transform_template)
-    def transform(self, func, *args, engine="cython", engine_kwargs=None, **kwargs):
+    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
         func = self._get_cython_func(func) or func
 
         if not isinstance(func, str):
@@ -515,7 +514,7 @@ def _transform_general(
         Transform with a non-str `func`.
         """
 
-        if engine == "numba":
+        if maybe_use_numba(engine):
             numba_func, cache_key = generate_numba_func(
                 func, engine_kwargs, kwargs, "groupby_transform"
             )
@@ -525,7 +524,7 @@ def _transform_general(
         results = []
         for name, group in self:
             object.__setattr__(group, "name", name)
-            if engine == "numba":
+            if maybe_use_numba(engine):
                 values, index = split_for_numba(group)
                 res = numba_func(values, index, *args)
                 if cache_key not in NUMBA_FUNC_CACHE:
@@ -934,13 +933,11 @@ class DataFrameGroupBy(GroupBy[DataFrame]):
     @doc(
         _agg_template, examples=_agg_examples_doc, klass="DataFrame",
     )
-    def aggregate(
-        self, func=None, *args, engine="cython", engine_kwargs=None, **kwargs
-    ):
+    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
 
         relabeling, func, columns, order = reconstruct_func(func, **kwargs)
 
-        if engine == "numba":
+        if maybe_use_numba(engine):
             return self._python_agg_general(
                 func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
             )
@@ -1385,7 +1382,7 @@ def _transform_general(
         applied = []
         obj = self._obj_with_exclusions
         gen = self.grouper.get_iterator(obj, axis=self.axis)
-        if engine == "numba":
+        if maybe_use_numba(engine):
             numba_func, cache_key = generate_numba_func(
                 func, engine_kwargs, kwargs, "groupby_transform"
             )
@@ -1395,7 +1392,7 @@ def _transform_general(
         for name, group in gen:
             object.__setattr__(group, "name", name)
 
-            if engine == "numba":
+            if maybe_use_numba(engine):
                 values, index = split_for_numba(group)
                 res = numba_func(values, index, *args)
                 if cache_key not in NUMBA_FUNC_CACHE:
@@ -1446,7 +1443,7 @@ def _transform_general(
 
     @Substitution(klass="DataFrame")
     @Appender(_transform_template)
-    def transform(self, func, *args, engine="cython", engine_kwargs=None, **kwargs):
+    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
 
         # optimized transforms
         func = self._get_cython_func(func) or func
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -65,6 +65,7 @@ class providing the base-class of operations.
 from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex
 from pandas.core.series import Series
 from pandas.core.sorting import get_group_index_sorter
+from pandas.core.util.numba_ import maybe_use_numba
 
 _common_see_also = """
         See Also
@@ -286,9 +287,10 @@ class providing the base-class of operations.
     .. versionchanged:: 1.1.0
 *args
     Positional arguments to pass to func
-engine : str, default 'cython'
+engine : str, default None
     * ``'cython'`` : Runs the function through C-extensions from cython.
     * ``'numba'`` : Runs the function through JIT compiled code from numba.
+    * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``
 
     .. versionadded:: 1.1.0
 engine_kwargs : dict, default None
@@ -393,9 +395,10 @@ class providing the base-class of operations.
     .. versionchanged:: 1.1.0
 *args
     Positional arguments to pass to func
-engine : str, default 'cython'
+engine : str, default None
     * ``'cython'`` : Runs the function through C-extensions from cython.
     * ``'numba'`` : Runs the function through JIT compiled code from numba.
+    * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``
 
     .. versionadded:: 1.1.0
 engine_kwargs : dict, default None
@@ -1063,7 +1066,7 @@ def _python_agg_general(
                 # agg_series below assumes ngroups > 0
                 continue
 
-            if engine == "numba":
+            if maybe_use_numba(engine):
                 result, counts = self.grouper.agg_series(
                     obj,
                     func,
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -58,6 +58,7 @@
 from pandas.core.util.numba_ import (
     NUMBA_FUNC_CACHE,
     generate_numba_func,
+    maybe_use_numba,
     split_for_numba,
 )
 
@@ -620,7 +621,7 @@ def agg_series(
         # Caller is responsible for checking ngroups != 0
         assert self.ngroups != 0
 
-        if engine == "numba":
+        if maybe_use_numba(engine):
             return self._aggregate_series_pure_python(
                 obj, func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
             )
@@ -678,7 +679,7 @@ def _aggregate_series_pure_python(
         **kwargs,
     ):
 
-        if engine == "numba":
+        if maybe_use_numba(engine):
             numba_func, cache_key = generate_numba_func(
                 func, engine_kwargs, kwargs, "groupby_agg"
             )
@@ -691,7 +692,7 @@ def _aggregate_series_pure_python(
         splitter = get_splitter(obj, group_index, ngroups, axis=0)
 
         for label, group in splitter:
-            if engine == "numba":
+            if maybe_use_numba(engine):
                 values, index = split_for_numba(group)
                 res = numba_func(values, index, *args)
                 if cache_key not in NUMBA_FUNC_CACHE:
diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py
@@ -10,9 +10,22 @@
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import NumbaUtilError
 
+GLOBAL_USE_NUMBA: bool = False
 NUMBA_FUNC_CACHE: Dict[Tuple[Callable, str], Callable] = dict()
 
 
+def maybe_use_numba(engine: Optional[str]) -> bool:
+    """Signal whether to use numba routines."""
+    return engine == "numba" or (engine is None and GLOBAL_USE_NUMBA)
+
+
+def set_use_numba(enable: bool = False) -> None:
+    global GLOBAL_USE_NUMBA
+    if enable:
+        import_optional_dependency("numba")
+    GLOBAL_USE_NUMBA = enable
+
+
 def check_kwargs_and_nopython(
     kwargs: Optional[Dict] = None, nopython: Optional[bool] = None
 ) -> None:
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py
@@ -39,7 +39,7 @@
 import pandas.core.common as com
 from pandas.core.construction import extract_array
 from pandas.core.indexes.api import Index, MultiIndex, ensure_index
-from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
+from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba
 from pandas.core.window.common import (
     WindowGroupByMixin,
     _doc_template,
@@ -1298,10 +1298,11 @@ def count(self):
           objects instead.
           If you are just applying a NumPy reduction function this will
           achieve much better performance.
-    engine : str, default 'cython'
+    engine : str, default None
         * ``'cython'`` : Runs rolling apply through C-extensions from cython.
         * ``'numba'`` : Runs rolling apply through JIT compiled code from numba.
           Only available when ``raw`` is set to ``True``.
+        * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba``
 
           .. versionadded:: 1.0.0
 
@@ -1357,18 +1358,7 @@ def apply(
         if not is_bool(raw):
             raise ValueError("raw parameter must be `True` or `False`")
 
-        if engine == "cython":
-            if engine_kwargs is not None:
-                raise ValueError("cython engine does not accept engine_kwargs")
-            # Cython apply functions handle center, so don't need to use
-            # _apply's center handling
-            window = self._get_window()
-            offset = calculate_center_offset(window) if self.center else 0
-            apply_func = self._generate_cython_apply_func(
-                args, kwargs, raw, offset, func
-            )
-            center = False
-        elif engine == "numba":
+        if maybe_use_numba(engine):
             if raw is False:
                 raise ValueError("raw must be `True` when using the numba engine")
             cache_key = (func, "rolling_apply")
@@ -1380,6 +1370,17 @@ def apply(
                     args, kwargs, func, engine_kwargs
                 )
             center = self.center
+        elif engine in ("cython", None):
+            if engine_kwargs is not None:
+                raise ValueError("cython engine does not accept engine_kwargs")
+            # Cython apply functions handle center, so don't need to use
+            # _apply's center handling
+            window = self._get_window()
+            offset = calculate_center_offset(window) if self.center else 0
+            apply_func = self._generate_cython_apply_func(
+                args, kwargs, raw, offset, func
+            )
+            center = False
         else:
             raise ValueError("engine must be either 'numba' or 'cython'")
 
@@ -2053,13 +2054,7 @@ def count(self):
     @Substitution(name="rolling")
     @Appender(_shared_docs["apply"])
     def apply(
-        self,
-        func,
-        raw=False,
-        engine="cython",
-        engine_kwargs=None,
-        args=None,
-        kwargs=None,
+        self, func, raw=False, engine=None, engine_kwargs=None, args=None, kwargs=None,
     ):
         return super().apply(
             func,
diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py
@@ -4,7 +4,7 @@
 from pandas.errors import NumbaUtilError
 import pandas.util._test_decorators as td
 
-from pandas import DataFrame
+from pandas import DataFrame, option_context
 import pandas._testing as tm
 from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
 
@@ -113,3 +113,18 @@ def func_2(values, index):
     result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs)
     expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
     tm.assert_equal(result, expected)
+
+
+@td.skip_if_no("numba", "0.46.0")
+def test_use_global_config():
+    def func_1(values, index):
+        return np.mean(values) - 3.4
+
+    data = DataFrame(
+        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1],
+    )
+    grouped = data.groupby(0)
+    expected = grouped.agg(func_1, engine="numba")
+    with option_context("compute.use_numba", True):
+        result = grouped.agg(func_1, engine=None)
+    tm.assert_frame_equal(expected, result)
diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py
@@ -3,7 +3,7 @@
 from pandas.errors import NumbaUtilError
 import pandas.util._test_decorators as td
 
-from pandas import DataFrame
+from pandas import DataFrame, option_context
 import pandas._testing as tm
 from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
 
@@ -112,3 +112,18 @@ def func_2(values, index):
     result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs)
     expected = grouped.transform(lambda x: x + 1, engine="cython")
     tm.assert_equal(result, expected)
+
+
+@td.skip_if_no("numba", "0.46.0")
+def test_use_global_config():
+    def func_1(values, index):
+        return values + 1
+
+    data = DataFrame(
+        {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1],
+    )
+    grouped = data.groupby(0)
+    expected = grouped.transform(func_1, engine="numba")
+    with option_context("compute.use_numba", True):
+        result = grouped.transform(func_1, engine=None)
+    tm.assert_frame_equal(expected, result)
diff --git a/pandas/tests/util/test_numba.py b/pandas/tests/util/test_numba.py
@@ -0,0 +1,12 @@
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import option_context
+
+
+@td.skip_if_installed("numba")
+def test_numba_not_installed_option_context():
+    with pytest.raises(ImportError, match="Missing optional"):
+        with option_context("compute.use_numba", True):
+            pass
diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py
@@ -3,7 +3,7 @@
 
 import pandas.util._test_decorators as td
 
-from pandas import Series
+from pandas import Series, option_context
 import pandas._testing as tm
 from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
 
@@ -75,3 +75,15 @@ def func_2(x):
         )
         expected = roll.apply(func_1, engine="cython", raw=True)
         tm.assert_series_equal(result, expected)
+
+
+@td.skip_if_no("numba", "0.46.0")
+def test_use_global_config():
+    def f(x):
+        return np.mean(x) + 2
+
+    s = Series(range(10))
+    with option_context("compute.use_numba", True):
+        result = s.rolling(2).apply(f, engine=None, raw=True)
+    expected = s.rolling(2).apply(f, engine="numba", raw=True)
+    tm.assert_series_equal(expected, result)