Skip to content

Commit 6a9ee5a

Browse files
Better execution engine API
1 parent 8b420cc commit 6a9ee5a

File tree

5 files changed

+196
-80
lines changed

5 files changed

+196
-80
lines changed

pandas/api/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""public toolkit API"""
22

33
from pandas.api import (
4+
executors,
45
extensions,
56
indexers,
67
interchange,
@@ -9,6 +10,7 @@
910
)
1011

1112
__all__ = [
13+
"executors",
1214
"extensions",
1315
"indexers",
1416
"interchange",

pandas/api/executors/__init__.py

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
"""
2+
Public API for function executor engines to be used with ``map`` and ``apply``.
3+
"""
4+
5+
from pandas.core.apply import BaseExecutionEngine
6+
7+
__all__ = ["BaseExecutionEngine"]

pandas/core/apply.py

+104
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,110 @@
7474
ResType = dict[int, Any]
7575

7676

77+
class BaseExecutionEngine(abc.ABC):
78+
"""
79+
Base class for execution engines for map and apply methods.
80+
81+
An execution engine receives all the parameters of a call to
82+
``apply`` or ``map``, such as the data container, the function,
83+
etc. and takes care of running the execution.
84+
85+
Supporting different engines allows functions to be JIT compiled,
86+
run in parallel, and others. Besides the default executor which
87+
simply runs the code with the Python interpreter and pandas.
88+
"""
89+
90+
@staticmethod
91+
@abc.abstractmethod
92+
def map(
93+
data: Series | DataFrame | np.ndarray,
94+
func: AggFuncType,
95+
args: tuple,
96+
kwargs: dict[str, Any],
97+
decorator: Callable | None,
98+
skip_na: bool,
99+
):
100+
"""
101+
Executor method to run functions elementwise.
102+
103+
In general, pandas uses ``map`` for running functions elementwise,
104+
but ``Series.apply`` with the default ``by_row='compat'`` will also
105+
call this executor function.
106+
107+
Parameters
108+
----------
109+
data : Series, DataFrame or NumPy ndarray
110+
The object to use for the data. Some methods implement a ``raw``
111+
parameter which will convert the original pandas object to a
112+
NumPy array, which will then be passed here to the executor.
113+
func : function or NumPy ufunc
114+
The function to execute.
115+
args : tuple
116+
Positional arguments to be passed to ``func``.
117+
kwargs : dict
118+
Keyword arguments to be passed to ``func``.
119+
decorator : function, optional
120+
For JIT compilers and other engines that need to decorate the
121+
function ``func``, this is the decorator to use. While the
122+
executor may already know which is the decorator to use, this
123+
is useful as for a single executor the user can specify for a
124+
example ``numba.jit`` or ``numba.njit(nogil=True)``, and this
125+
decorator parameter will contain the exact decortor from the
126+
executor the user wants to use.
127+
skip_na : bool
128+
Whether the function should be called for missing values or not.
129+
This is specified by the pandas user as ``map(na_action=None)``
130+
or ``map(na_action='ignore')``.
131+
"""
132+
133+
@staticmethod
134+
@abc.abstractmethod
135+
def apply(
136+
data: Series | DataFrame | np.ndarray,
137+
func: AggFuncType,
138+
args: tuple,
139+
kwargs: dict[str, Any],
140+
decorator: Callable,
141+
axis: Axis,
142+
):
143+
"""
144+
Executor method to run functions by an axis.
145+
146+
While we can see ``map`` as executing the function for each cell
147+
in a ``DataFrame`` (or ``Series``), ``apply`` will execute the
148+
function for each column (or row).
149+
150+
Parameters
151+
----------
152+
data : Series, DataFrame or NumPy ndarray
153+
The object to use for the data. Some methods implement a ``raw``
154+
parameter which will convert the original pandas object to a
155+
NumPy array, which will then be passed here to the executor.
156+
func : function or NumPy ufunc
157+
The function to execute.
158+
args : tuple
159+
Positional arguments to be passed to ``func``.
160+
kwargs : dict
161+
Keyword arguments to be passed to ``func``.
162+
decorator : function, optional
163+
For JIT compilers and other engines that need to decorate the
164+
function ``func``, this is the decorator to use. While the
165+
executor may already know which is the decorator to use, this
166+
is useful as for a single executor the user can specify for a
167+
example ``numba.jit`` or ``numba.njit(nogil=True)``, and this
168+
decorator parameter will contain the exact decortor from the
169+
executor the user wants to use.
170+
axis : {0 or 'index', 1 or 'columns'}
171+
0 or 'index' should execute the function passing each column as
172+
parameter. 1 or 'columns' should execute the function passing
173+
each row as parameter. The default executor engine passes rows
174+
as pandas ``Series``. Other executor engines should probably
175+
expect functions to be implemented this way for compatibility.
176+
But passing rows as other data structures is technically possible
177+
as far as the function ``func`` is implemented accordingly.
178+
"""
179+
180+
77181
def frame_apply(
78182
obj: DataFrame,
79183
func: AggFuncType,

pandas/core/bodo_patched.py

+34-29
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@
88
from typing import (
99
TYPE_CHECKING,
1010
Any,
11-
Literal,
1211
)
1312

1413
import bodo
14+
import numpy as np
1515

1616
import pandas as pd
1717

@@ -24,45 +24,50 @@
2424
)
2525

2626

27-
def __pandas_udf__(
28-
jit_decorator: Callable,
29-
obj: pd.Series | pd.DataFrame,
30-
method: Literal["apply", "map"],
31-
func: AggFuncType,
32-
axis: Axis,
33-
raw: bool,
34-
result_type: Literal["expand", "reduce", "broadcast"] | None,
35-
args: tuple,
36-
kwargs: dict[str, Any],
37-
by_row: Literal[False, "compat"],
38-
):
39-
if isinstance(obj, pd.DataFrame) and method == "apply":
40-
if result_type is not None:
27+
class BodoExecutionEngine(pd.api.executors.BaseExecutionEngine):
28+
@staticmethod
29+
def map(
30+
data: pd.Series | pd.DataFrame | np.ndarray,
31+
func: AggFuncType,
32+
args: tuple,
33+
kwargs: dict[str, Any],
34+
decorator: Callable,
35+
skip_na: bool,
36+
):
37+
raise NotImplementedError("engine='bodo' not supported for map")
38+
39+
@staticmethod
40+
def apply(
41+
data: pd.Series | pd.DataFrame | np.ndarray,
42+
func: AggFuncType,
43+
args: tuple,
44+
kwargs: dict[str, Any],
45+
decorator: Callable,
46+
axis: Axis,
47+
):
48+
if isinstance(data, pd.Series):
49+
raise NotImplementedError("engine='bodo' not supported for Series.apply")
50+
51+
if isinstance(data, np.ndarray):
52+
raise NotImplementedError("engine='bodo' not supported when raw=True")
53+
54+
if args or kwargs:
4155
raise NotImplementedError(
42-
"engine='bodo' not supported when result_type is not None"
56+
"engine='bodo' not supported when args or kwargs are specified"
4357
)
4458

45-
if raw:
46-
raise NotImplementedError("engine='bodo' not supported when raw=True")
4759
if isinstance(func, str) and axis != 1:
4860
raise NotImplementedError(
4961
"engine='bodo' only supports axis=1 when func is the name of a "
5062
"user-defined function"
5163
)
52-
if args or kwargs:
53-
raise NotImplementedError(
54-
"engine='bodo' not supported when args or kwargs are specified"
55-
)
5664

57-
@jit_decorator
5865
def jit_func(df, func, axis):
5966
return df.apply(func, axis=axis)
6067

61-
return jit_func(obj, func, axis)
62-
else:
63-
raise NotImplementedError(
64-
f"engine='bodo' not supported for {obj.__name__}.{method}"
65-
)
68+
jit_func = decorator(jit_func)
69+
70+
return jit_func(data, func, axis)
6671

6772

68-
bodo.jit.__pandas_udf__ = __pandas_udf__
73+
bodo.jit.__pandas_udf__ = BodoExecutionEngine

pandas/core/frame.py

+49-51
Original file line numberDiff line numberDiff line change
@@ -10254,9 +10254,8 @@ def apply(
1025410254
result_type: Literal["expand", "reduce", "broadcast"] | None = None,
1025510255
args=(),
1025610256
by_row: Literal[False, "compat"] = "compat",
10257-
engine: Literal["python", "numba"] = "python",
10257+
engine: Callable | None | Literal["python", "numba"] = None,
1025810258
engine_kwargs: dict[str, bool] | None = None,
10259-
jit: Callable | None = None,
1026010259
**kwargs,
1026110260
):
1026210261
"""
@@ -10317,28 +10316,24 @@ def apply(
1031710316
1031810317
.. versionadded:: 2.1.0
1031910318
10320-
engine : {'python', 'numba'}, default 'python'
10321-
Choose between the python (default) engine or the numba engine in apply.
10319+
engine : decorator or {'python', 'numba'}, optional
10320+
Choose the execution engine to use. If not provided the function
10321+
will be executed by the regular Python interpreter.
1032210322
10323-
The numba engine will attempt to JIT compile the passed function,
10324-
which may result in speedups for large DataFrames.
10325-
It also supports the following engine_kwargs :
10323+
Other options include JIT compilers such Numba and Bodo, which in some
10324+
cases can speed up the execution. To use an executor you can provide
10325+
the decorators ``numba.jit``, ``numba.njit`` or ``bodo.jit``. You can
10326+
also provide the decorator with parameters, like ``numba.jit(nogit=True)``.
1032610327
10327-
- nopython (compile the function in nopython mode)
10328-
- nogil (release the GIL inside the JIT compiled function)
10329-
- parallel (try to apply the function in parallel over the DataFrame)
10328+
Not all functions can be executed with all execution engines. In general,
10329+
JIT compilers will require type stability in the function (no variable
10330+
should change data type during the execution). And not all pandas and
10331+
NumPy APIs are supported. Check the engine documentation [1]_ and [2]_
10332+
for limitations.
1033010333
10331-
Note: Due to limitations within numba/how pandas interfaces with numba,
10332-
you should only use this if raw=True
10333-
10334-
Note: The numba compiler only supports a subset of
10335-
valid Python/numpy operations.
10334+
.. warning::
1033610335
10337-
Please read more about the `supported python features
10338-
<https://numba.pydata.org/numba-doc/dev/reference/pysupported.html>`_
10339-
and `supported numpy features
10340-
<https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_
10341-
in numba to learn what you can or cannot use in the passed function.
10336+
String parameters will stop being supported in a future pandas version.
1034210337
1034310338
.. versionadded:: 2.2.0
1034410339
@@ -10347,14 +10342,6 @@ def apply(
1034710342
This is currently only used by the numba engine,
1034810343
see the documentation for the engine argument for more information.
1034910344
10350-
jit : function, optional
10351-
Decorator to JIT compile the execution. The main available options are
10352-
``numba.jit``, ``numba.njit`` or ``bodo.jit``. Parameters can be used in
10353-
the same way as the decorators, for example ``numba.jit(parallel=True)``.
10354-
10355-
Refer to the the [1]_ and [2]_ documentation to learn about limitations
10356-
on what code can be JIT compiled.
10357-
1035810345
**kwargs
1035910346
Additional keyword arguments to pass as keywords arguments to
1036010347
`func`.
@@ -10460,41 +10447,52 @@ def apply(
1046010447
type during the execution).
1046110448
1046210449
>>> import bodo
10463-
>>> df.apply(lambda x: x.A + x.B, axis=1, jit=bodo.jit(parallel=True))
10450+
>>> df.apply(lambda x: x.A + x.B, axis=1, engine=bodo.jit(parallel=True))
1046410451
1046510452
Note that JIT compilation is only recommended for functions that take a
1046610453
significant amount of time to run. Fast functions are unlikely to run faster
1046710454
with JIT compilation.
1046810455
"""
10469-
if hasattr(jit, "__pandas_udf__"):
10470-
return jit.__pandas_udf__(
10471-
jit_decorator=jit,
10472-
obj=self,
10473-
method="apply",
10456+
if engine is None or isinstance(engine, str):
10457+
from pandas.core.apply import frame_apply
10458+
10459+
if engine is None:
10460+
engine = "python"
10461+
10462+
op = frame_apply(
10463+
self,
1047410464
func=func,
10475-
args=args,
10476-
kwargs=kwargs,
1047710465
axis=axis,
1047810466
raw=raw,
1047910467
result_type=result_type,
1048010468
by_row=by_row,
10469+
engine=engine,
10470+
engine_kwargs=engine_kwargs,
10471+
args=args,
10472+
kwargs=kwargs,
1048110473
)
10474+
return op.apply().__finalize__(self, method="apply")
10475+
elif hasattr(engine, "__pandas_udf__"):
10476+
if result_type is not None:
10477+
raise NotImplementedError(
10478+
f"{result_type=} only implemented for the default engine"
10479+
)
1048210480

10483-
from pandas.core.apply import frame_apply
10484-
10485-
op = frame_apply(
10486-
self,
10487-
func=func,
10488-
axis=axis,
10489-
raw=raw,
10490-
result_type=result_type,
10491-
by_row=by_row,
10492-
engine=engine,
10493-
engine_kwargs=engine_kwargs,
10494-
args=args,
10495-
kwargs=kwargs,
10496-
)
10497-
return op.apply().__finalize__(self, method="apply")
10481+
data = self
10482+
if raw:
10483+
# This will upcast the whole DataFrame to the same type,
10484+
# and likely result in an object 2D array.
10485+
# We should probably pass a list of 1D arrays instead, at
10486+
# lest for ``axis=0``
10487+
data = data.values
10488+
return engine.__pandas_udf__.apply(
10489+
data=data,
10490+
func=func,
10491+
args=args,
10492+
kwargs=kwargs,
10493+
decorator=engine,
10494+
axis=axis,
10495+
)
1049810496

1049910497
def map(
1050010498
self, func: PythonFuncType, na_action: Literal["ignore"] | None = None, **kwargs

0 commit comments

Comments
 (0)