Better execution engine API

datapythonista · datapythonista · commit 6a9ee5aad0f5 · 2025-03-09T17:24:45.000+07:00
diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py
@@ -1,6 +1,7 @@
 """public toolkit API"""
 
 from pandas.api import (
+    executors,
     extensions,
     indexers,
     interchange,
@@ -9,6 +10,7 @@
 )
 
 __all__ = [
+    "executors",
     "extensions",
     "indexers",
     "interchange",
diff --git a/pandas/api/executors/__init__.py b/pandas/api/executors/__init__.py
@@ -0,0 +1,7 @@
+"""
+Public API for function executor engines to be used with ``map`` and ``apply``.
+"""
+
+from pandas.core.apply import BaseExecutionEngine
+
+__all__ = ["BaseExecutionEngine"]
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -74,6 +74,110 @@
 ResType = dict[int, Any]
 
 
+class BaseExecutionEngine(abc.ABC):
+    """
+    Base class for execution engines for map and apply methods.
+
+    An execution engine receives all the parameters of a call to
+    ``apply`` or ``map``, such as the data container, the function,
+    etc. and takes care of running the execution.
+
+    Supporting different engines allows functions to be JIT compiled,
+    run in parallel, and more, in addition to the default executor, which
+    simply runs the code with the Python interpreter and pandas.
+    """
+
+    @staticmethod
+    @abc.abstractmethod
+    def map(
+        data: Series | DataFrame | np.ndarray,
+        func: AggFuncType,
+        args: tuple,
+        kwargs: dict[str, Any],
+        decorator: Callable | None,
+        skip_na: bool,
+    ):
+        """
+        Executor method to run functions elementwise.
+
+        In general, pandas uses ``map`` for running functions elementwise,
+        but ``Series.apply`` with the default ``by_row='compat'`` will also
+        call this executor function.
+
+        Parameters
+        ----------
+        data : Series, DataFrame or NumPy ndarray
+            The object to use for the data. Some methods implement a ``raw``
+            parameter which will convert the original pandas object to a
+            NumPy array, which will then be passed here to the executor.
+        func : function or NumPy ufunc
+            The function to execute.
+        args : tuple
+            Positional arguments to be passed to ``func``.
+        kwargs : dict
+            Keyword arguments to be passed to ``func``.
+        decorator : function, optional
+            For JIT compilers and other engines that need to decorate the
+            function ``func``, this is the decorator to use. While the
+            executor may already know which is the decorator to use, this
+            is useful as for a single executor the user can specify, for
+            example, ``numba.jit`` or ``numba.njit(nogil=True)``, and this
+            decorator parameter will contain the exact decorator from the
+            executor the user wants to use.
+        skip_na : bool
+            Whether the function should be called for missing values or not.
+            This is specified by the pandas user as ``map(na_action=None)``
+            or ``map(na_action='ignore')``.
+        """
+
+    @staticmethod
+    @abc.abstractmethod
+    def apply(
+        data: Series | DataFrame | np.ndarray,
+        func: AggFuncType,
+        args: tuple,
+        kwargs: dict[str, Any],
+        decorator: Callable,
+        axis: Axis,
+    ):
+        """
+        Executor method to run functions by an axis.
+
+        While we can see ``map`` as executing the function for each cell
+        in a ``DataFrame`` (or ``Series``), ``apply`` will execute the
+        function for each column (or row).
+
+        Parameters
+        ----------
+        data : Series, DataFrame or NumPy ndarray
+            The object to use for the data. Some methods implement a ``raw``
+            parameter which will convert the original pandas object to a
+            NumPy array, which will then be passed here to the executor.
+        func : function or NumPy ufunc
+            The function to execute.
+        args : tuple
+            Positional arguments to be passed to ``func``.
+        kwargs : dict
+            Keyword arguments to be passed to ``func``.
+        decorator : function, optional
+            For JIT compilers and other engines that need to decorate the
+            function ``func``, this is the decorator to use. While the
+            executor may already know which is the decorator to use, this
+            is useful as for a single executor the user can specify, for
+            example, ``numba.jit`` or ``numba.njit(nogil=True)``, and this
+            decorator parameter will contain the exact decorator from the
+            executor the user wants to use.
+        axis : {0 or 'index', 1 or 'columns'}
+            0 or 'index' should execute the function passing each column as
+            parameter. 1 or 'columns' should execute the function passing
+            each row as parameter. The default executor engine passes rows
+            as pandas ``Series``. Other executor engines should probably
+            expect functions to be implemented this way for compatibility.
+            But passing rows as other data structures is technically possible
+            as long as the function ``func`` is implemented accordingly.
+        """
+
+
 def frame_apply(
     obj: DataFrame,
     func: AggFuncType,
diff --git a/pandas/core/bodo_patched.py b/pandas/core/bodo_patched.py
@@ -8,10 +8,10 @@
 from typing import (
     TYPE_CHECKING,
     Any,
-    Literal,
 )
 
 import bodo
+import numpy as np
 
 import pandas as pd
 
@@ -24,45 +24,50 @@
     )
 
 
-def __pandas_udf__(
-    jit_decorator: Callable,
-    obj: pd.Series | pd.DataFrame,
-    method: Literal["apply", "map"],
-    func: AggFuncType,
-    axis: Axis,
-    raw: bool,
-    result_type: Literal["expand", "reduce", "broadcast"] | None,
-    args: tuple,
-    kwargs: dict[str, Any],
-    by_row: Literal[False, "compat"],
-):
-    if isinstance(obj, pd.DataFrame) and method == "apply":
-        if result_type is not None:
+class BodoExecutionEngine(pd.api.executors.BaseExecutionEngine):
+    @staticmethod
+    def map(
+        data: pd.Series | pd.DataFrame | np.ndarray,
+        func: AggFuncType,
+        args: tuple,
+        kwargs: dict[str, Any],
+        decorator: Callable,
+        skip_na: bool,
+    ):
+        raise NotImplementedError("engine='bodo' not supported for map")
+
+    @staticmethod
+    def apply(
+        data: pd.Series | pd.DataFrame | np.ndarray,
+        func: AggFuncType,
+        args: tuple,
+        kwargs: dict[str, Any],
+        decorator: Callable,
+        axis: Axis,
+    ):
+        if isinstance(data, pd.Series):
+            raise NotImplementedError("engine='bodo' not supported for Series.apply")
+
+        if isinstance(data, np.ndarray):
+            raise NotImplementedError("engine='bodo' not supported when raw=True")
+
+        if args or kwargs:
             raise NotImplementedError(
-                "engine='bodo' not supported when result_type is not None"
+                "engine='bodo' not supported when args or kwargs are specified"
             )
 
-        if raw:
-            raise NotImplementedError("engine='bodo' not supported when raw=True")
         if isinstance(func, str) and axis != 1:
             raise NotImplementedError(
                 "engine='bodo' only supports axis=1 when func is the name of a "
                 "user-defined function"
             )
-        if args or kwargs:
-            raise NotImplementedError(
-                "engine='bodo' not supported when args or kwargs are specified"
-            )
 
-        @jit_decorator
         def jit_func(df, func, axis):
             return df.apply(func, axis=axis)
 
-        return jit_func(obj, func, axis)
-    else:
-        raise NotImplementedError(
-            f"engine='bodo' not supported for {obj.__name__}.{method}"
-        )
+        jit_func = decorator(jit_func)
+
+        return jit_func(data, func, axis)
 
 
-bodo.jit.__pandas_udf__ = __pandas_udf__
+bodo.jit.__pandas_udf__ = BodoExecutionEngine
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -10254,9 +10254,8 @@ def apply(
         result_type: Literal["expand", "reduce", "broadcast"] | None = None,
         args=(),
         by_row: Literal[False, "compat"] = "compat",
-        engine: Literal["python", "numba"] = "python",
+        engine: Callable | None | Literal["python", "numba"] = None,
         engine_kwargs: dict[str, bool] | None = None,
-        jit: Callable | None = None,
         **kwargs,
     ):
         """
@@ -10317,28 +10316,24 @@ def apply(
 
             .. versionadded:: 2.1.0
 
-        engine : {'python', 'numba'}, default 'python'
-            Choose between the python (default) engine or the numba engine in apply.
+        engine : decorator or {'python', 'numba'}, optional
+            Choose the execution engine to use. If not provided the function
+            will be executed by the regular Python interpreter.
 
-            The numba engine will attempt to JIT compile the passed function,
-            which may result in speedups for large DataFrames.
-            It also supports the following engine_kwargs :
+            Other options include JIT compilers such as Numba and Bodo, which in some
+            cases can speed up the execution. To use an executor you can provide
+            the decorators ``numba.jit``, ``numba.njit`` or ``bodo.jit``. You can
+            also provide the decorator with parameters, like ``numba.jit(nogil=True)``.
 
-            - nopython (compile the function in nopython mode)
-            - nogil (release the GIL inside the JIT compiled function)
-            - parallel (try to apply the function in parallel over the DataFrame)
+            Not all functions can be executed with all execution engines. In general,
+            JIT compilers will require type stability in the function (no variable
+            should change data type during the execution). And not all pandas and
+            NumPy APIs are supported. Check the engine documentation [1]_ and [2]_
+            for limitations.
 
-              Note: Due to limitations within numba/how pandas interfaces with numba,
-              you should only use this if raw=True
-
-            Note: The numba compiler only supports a subset of
-            valid Python/numpy operations.
+            .. warning::
 
-            Please read more about the `supported python features
-            <https://numba.pydata.org/numba-doc/dev/reference/pysupported.html>`_
-            and `supported numpy features
-            <https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_
-            in numba to learn what you can or cannot use in the passed function.
+                String parameters will stop being supported in a future pandas version.
 
             .. versionadded:: 2.2.0
 
@@ -10347,14 +10342,6 @@ def apply(
             This is currently only used by the numba engine,
             see the documentation for the engine argument for more information.
 
-        jit : function, optional
-            Decorator to JIT compile the execution. The main available options are
-            ``numba.jit``, ``numba.njit`` or ``bodo.jit``. Parameters can be used in
-            the same way as the decorators, for example ``numba.jit(parallel=True)``.
-
-            Refer to the the [1]_ and [2]_ documentation to learn about limitations
-            on what code can be JIT compiled.
-
         **kwargs
             Additional keyword arguments to pass as keywords arguments to
             `func`.
@@ -10460,41 +10447,52 @@ def apply(
         type during the execution).
 
         >>> import bodo
-        >>> df.apply(lambda x: x.A + x.B, axis=1, jit=bodo.jit(parallel=True))
+        >>> df.apply(lambda x: x.A + x.B, axis=1, engine=bodo.jit(parallel=True))
 
         Note that JIT compilation is only recommended for functions that take a
         significant amount of time to run. Fast functions are unlikely to run faster
         with JIT compilation.
         """
-        if hasattr(jit, "__pandas_udf__"):
-            return jit.__pandas_udf__(
-                jit_decorator=jit,
-                obj=self,
-                method="apply",
+        if engine is None or isinstance(engine, str):
+            from pandas.core.apply import frame_apply
+
+            if engine is None:
+                engine = "python"
+
+            op = frame_apply(
+                self,
                 func=func,
-                args=args,
-                kwargs=kwargs,
                 axis=axis,
                 raw=raw,
                 result_type=result_type,
                 by_row=by_row,
+                engine=engine,
+                engine_kwargs=engine_kwargs,
+                args=args,
+                kwargs=kwargs,
             )
+            return op.apply().__finalize__(self, method="apply")
+        elif hasattr(engine, "__pandas_udf__"):
+            if result_type is not None:
+                raise NotImplementedError(
+                    f"{result_type=} only implemented for the default engine"
+                )
 
-        from pandas.core.apply import frame_apply
-
-        op = frame_apply(
-            self,
-            func=func,
-            axis=axis,
-            raw=raw,
-            result_type=result_type,
-            by_row=by_row,
-            engine=engine,
-            engine_kwargs=engine_kwargs,
-            args=args,
-            kwargs=kwargs,
-        )
-        return op.apply().__finalize__(self, method="apply")
+            data = self
+            if raw:
+                # This will upcast the whole DataFrame to the same type,
+                # and likely result in an object 2D array.
+                # We should probably pass a list of 1D arrays instead, at
+                # least for ``axis=0``
+                data = data.values
+            return engine.__pandas_udf__.apply(
+                data=data,
+                func=func,
+                args=args,
+                kwargs=kwargs,
+                decorator=engine,
+                axis=axis,
+            )
 
     def map(
         self, func: PythonFuncType, na_action: Literal["ignore"] | None = None, **kwargs