Skip to content

PERF/REF: Check use of numexpr earlier in the DataFrame operation #41122

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

44 changes: 44 additions & 0 deletions pandas/core/computation/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,50 @@ def _evaluate_standard(op, op_str, a, b):
return op(a, b)


def can_use_numexpr(op, size=None, dtypes=None, scalar=None):
    """
    Cheap up-front check of whether numexpr is still a candidate.

    A ``False`` result means numexpr is ruled out for this operation. A
    ``True`` result only means that nothing checked here rules it out; it
    does not guarantee that numexpr ends up being used.

    Parameters
    ----------
    op : operator
        The operator to apply; it is looked up in ``_op_str_mapping``.
    size : int or None, default None
        Number of elements involved. When given, a value smaller than
        ``_MIN_ELEMENTS`` rules out numexpr.
    dtypes : list or None, default None
        List of dtypes involved in the operation. When given, the result is
        whether ``_ALLOWED_DTYPES["evaluate"]`` is a superset of them.
        NOTE(review): presumably ``list[DtypeObj]`` -- confirm the elements
        compare equal to the entries stored in ``_ALLOWED_DTYPES``.
    scalar : object or None, default None
        Optionally a scalar, e.g. if the right operand is a scalar. A
        ``str`` scalar rules out numexpr.

    Returns
    -------
    bool
    """
    if not USE_NUMEXPR:
        return False

    # below the minimum element count
    below_minimum = size is not None and size < _MIN_ELEMENTS
    if below_minimum:
        return False

    # no string form registered for this operator
    if _op_str_mapping.get(op) is None:
        return False

    # None is never a str, so a missing scalar simply falls through here
    if isinstance(scalar, str):
        return False

    if dtypes is None:
        # safe fallback if dtypes were not specified
        return True

    # allowed are a superset
    allowed = _ALLOWED_DTYPES["evaluate"]
    return allowed >= set(dtypes)


def _can_use_numexpr(op, op_str, a, b, dtype_check):
""" return a boolean if we WILL be using numexpr """
if op_str is not None:
Expand Down
22 changes: 22 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6827,11 +6827,23 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None):
-------
DataFrame
"""
import pandas.core.computation.expressions as expressions

# Get the appropriate array-op to apply to each column/block's values.
array_op = ops.get_array_op(func)

right = lib.item_from_zerodim(right)
if not is_list_like(right):

if isinstance(self._mgr, ArrayManager):
use_numexpr = expressions.can_use_numexpr(
func, self.shape[0], None, right
)
else:
use_numexpr = expressions.can_use_numexpr(func, None, None, right)

array_op = ops.get_array_op(func, use_numexpr=use_numexpr)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the `get_array_op` call on L6832 still needed?


# i.e. scalar, faster than checking np.ndim(right) == 0
with np.errstate(all="ignore"):
bm = self._mgr.apply(array_op, right=right)
Expand All @@ -6844,6 +6856,16 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None):
# fails in cases with empty columns reached via
# _frame_arith_method_with_reindex

if isinstance(self._mgr, ArrayManager):
use_numexpr = expressions.can_use_numexpr(
func, self.shape[0], None, None
)
else:
use_numexpr = expressions.USE_NUMEXPR

# breakpoint()
array_op = ops.get_array_op(func, use_numexpr=use_numexpr)

# TODO operate_blockwise expects a manager of the same type
with np.errstate(all="ignore"):
bm = self._mgr.operate_blockwise(
Expand Down
24 changes: 13 additions & 11 deletions pandas/core/ops/array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def _masked_arith_op(x: np.ndarray, y, op):
return result


def _na_arithmetic_op(left, right, op, is_cmp: bool = False):
def _na_arithmetic_op(left, right, op, is_cmp: bool = False, use_numexpr=True):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

annotate, docstring

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can `use_numexpr` be keyword-only?

"""
Return the result of evaluating op on the passed in values.

Expand All @@ -155,7 +155,7 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False):
TypeError : invalid operation
"""
try:
result = expressions.evaluate(op, left, right)
result = expressions.evaluate(op, left, right, use_numexpr=use_numexpr)
except TypeError:
if is_object_dtype(left) or is_object_dtype(right) and not is_cmp:
# For object dtype, fallback to a masked operation (only operating
Expand All @@ -174,7 +174,7 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False):
return missing.dispatch_fill_zeros(op, left, right, result)


def arithmetic_op(left: ArrayLike, right: Any, op):
def arithmetic_op(left: ArrayLike, right: Any, op, use_numexpr=True):
"""
Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ...

Expand Down Expand Up @@ -205,12 +205,12 @@ def arithmetic_op(left: ArrayLike, right: Any, op):
# Timedelta is included because numexpr will fail on it, see GH#31457
res_values = op(left, right)
else:
res_values = _na_arithmetic_op(left, right, op)
res_values = _na_arithmetic_op(left, right, op, use_numexpr=use_numexpr)

return res_values


def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
def comparison_op(left: ArrayLike, right: Any, op, use_numexpr=True) -> ArrayLike:
"""
Evaluate a comparison operation `=`, `!=`, `>=`, `>`, `<=`, or `<`.

Expand Down Expand Up @@ -265,7 +265,9 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)

else:
res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True)
res_values = _na_arithmetic_op(
lvalues, rvalues, op, is_cmp=True, use_numexpr=use_numexpr
)

return res_values

Expand Down Expand Up @@ -311,7 +313,7 @@ def na_logical_op(x: np.ndarray, y, op):
return result.reshape(x.shape)


def logical_op(left: ArrayLike, right: Any, op) -> ArrayLike:
def logical_op(left: ArrayLike, right: Any, op, use_numexpr=True) -> ArrayLike:
"""
Evaluate a logical operation `|`, `&`, or `^`.

Expand Down Expand Up @@ -377,7 +379,7 @@ def fill_bool(x, left=None):
return res_values


def get_array_op(op):
def get_array_op(op, use_numexpr=True):
"""
Return a binary array operation corresponding to the given operator op.

Expand All @@ -401,9 +403,9 @@ def get_array_op(op):
return op

if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}:
return partial(comparison_op, op=op)
return partial(comparison_op, op=op, use_numexpr=use_numexpr)
elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}:
return partial(logical_op, op=op)
return partial(logical_op, op=op, use_numexpr=use_numexpr)
elif op_name in {
"add",
"sub",
Expand All @@ -414,7 +416,7 @@ def get_array_op(op):
"divmod",
"pow",
}:
return partial(arithmetic_op, op=op)
return partial(arithmetic_op, op=op, use_numexpr=use_numexpr)
else:
raise NotImplementedError(op_name)

Expand Down