Skip to content

PERF/REF: Check use of numexpr earlier in the DataFrame operation #41122

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

47 changes: 47 additions & 0 deletions pandas/core/computation/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,59 @@ def _evaluate_standard(op, op_str, a, b):
return op(a, b)


def can_use_numexpr(op, size: Optional[int] = None, dtypes=None, scalar=None):
"""
Initial check whether numexpr can be used with the given size and
involved data types and/or scalar operand.

Returns False if it definitely cannot use numexpr, otherwise returns
True (which doesn't mean we always end up using numexpr)

Parameters
----------
op : operator
size : int
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int or None?

dtypes : list
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

list[DtypeObj]?

List of dtypes involved in the operation
scalar :
Optionally a scalar, eg if the right operand is a scalar.

Returns
-------
bool
"""
if not USE_NUMEXPR:
return False

if size is not None:
if size < _MIN_ELEMENTS:
return False

op_str = _op_str_mapping.get(op, None)
if op_str is None:
return False

if scalar is not None:
if isinstance(scalar, str):
return False

# allowed are a superset
if dtypes is not None:
return _ALLOWED_DTYPES["evaluate"] >= set(dtypes)

# safe fallback if dtypes were not specified
return True


def _can_use_numexpr(op, op_str, a, b, dtype_check):
""" return a boolean if we WILL be using numexpr """
if op_str is not None:

# required min elements (otherwise we are adding overhead)
if a.size > _MIN_ELEMENTS:
if isinstance(b, str):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The place this was moved from had the comment `# can never use numexpr`. Not a huge loss, since that comment isn't very informative, but at some point we should add a helpful comment explaining why numexpr can't be used here.

return False

# check for dtype compatibility
dtypes: Set[str] = set()
for o in [a, b]:
Expand Down
42 changes: 42 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseFrameAccessor
import pandas.core.computation.expressions as expressions
from pandas.core.construction import (
extract_array,
sanitize_array,
Expand Down Expand Up @@ -6832,6 +6833,16 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None):

right = lib.item_from_zerodim(right)
if not is_list_like(right):

if isinstance(self._mgr, ArrayManager):
use_numexpr = expressions.can_use_numexpr(
func, self.shape[0], None, right
)
else:
use_numexpr = expressions.can_use_numexpr(func, None, None, right)

array_op = ops.get_array_op(func, use_numexpr=use_numexpr)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the get_array_op on L6832 still needed?


# i.e. scalar, faster than checking np.ndim(right) == 0
with np.errstate(all="ignore"):
bm = self._mgr.apply(array_op, right=right)
Expand All @@ -6844,6 +6855,15 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None):
# fails in cases with empty columns reached via
# _frame_arith_method_with_reindex

if isinstance(self._mgr, ArrayManager):
use_numexpr = expressions.can_use_numexpr(
func, self.shape[0], None, None
)
else:
use_numexpr = expressions.USE_NUMEXPR

array_op = ops.get_array_op(func, use_numexpr=use_numexpr)

# TODO operate_blockwise expects a manager of the same type
with np.errstate(all="ignore"):
bm = self._mgr.operate_blockwise(
Expand All @@ -6866,6 +6886,17 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None):
# maybe_align_as_frame ensures we do not have an ndarray here
assert not isinstance(right, np.ndarray)

if isinstance(self._mgr, ArrayManager):
use_numexpr = expressions.can_use_numexpr(
func, self.shape[1], (right.dtype,), None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does self.shape[1] correspond to the size of some array?

)
else:
use_numexpr = expressions.can_use_numexpr(
func, None, (right.dtype,), None
)

array_op = ops.get_array_op(func, use_numexpr=use_numexpr)

with np.errstate(all="ignore"):
arrays = [
array_op(_left, _right)
Expand All @@ -6876,6 +6907,17 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None):
assert right.index.equals(self.index) # Handle other cases later
right = right._values

if isinstance(self._mgr, ArrayManager):
use_numexpr = expressions.can_use_numexpr(
func, self.shape[0], (right.dtype,), None
)
else:
use_numexpr = expressions.can_use_numexpr(
func, None, (right.dtype,), None
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These ~8 lines (x4) seem like a reasonable scope for an AM/BM method.


array_op = ops.get_array_op(func, use_numexpr=use_numexpr)

with np.errstate(all="ignore"):
arrays = [array_op(left, right) for left in self._iter_column_arrays()]

Expand Down
30 changes: 13 additions & 17 deletions pandas/core/ops/array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def _masked_arith_op(x: np.ndarray, y, op):
return result


def _na_arithmetic_op(left, right, op, is_cmp: bool = False):
def _na_arithmetic_op(left, right, op, is_cmp: bool = False, use_numexpr=True):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

annotate, docstring

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can `use_numexpr` be keyword-only?

"""
Return the result of evaluating op on the passed in values.

Expand All @@ -156,14 +156,8 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False):
------
TypeError : invalid operation
"""
if isinstance(right, str):
# can never use numexpr
func = op
else:
func = partial(expressions.evaluate, op)

try:
result = func(left, right)
result = expressions.evaluate(op, left, right, use_numexpr=use_numexpr)
except TypeError:
if is_object_dtype(left) or is_object_dtype(right) and not is_cmp:
# For object dtype, fallback to a masked operation (only operating
Expand All @@ -182,7 +176,7 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False):
return missing.dispatch_fill_zeros(op, left, right, result)


def arithmetic_op(left: ArrayLike, right: Any, op):
def arithmetic_op(left: ArrayLike, right: Any, op, use_numexpr=True):
"""
Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ...

Expand Down Expand Up @@ -222,12 +216,12 @@ def arithmetic_op(left: ArrayLike, right: Any, op):
# (https://github.com/pandas-dev/pandas/issues/41165)
_bool_arith_check(op, left, right)

res_values = _na_arithmetic_op(left, right, op)
res_values = _na_arithmetic_op(left, right, op, use_numexpr=use_numexpr)

return res_values


def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
def comparison_op(left: ArrayLike, right: Any, op, use_numexpr=True) -> ArrayLike:
"""
Evaluate a comparison operation `=`, `!=`, `>=`, `>`, `<=`, or `<`.

Expand Down Expand Up @@ -285,7 +279,9 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)

else:
res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True)
res_values = _na_arithmetic_op(
lvalues, rvalues, op, is_cmp=True, use_numexpr=use_numexpr
)

return res_values

Expand Down Expand Up @@ -331,7 +327,7 @@ def na_logical_op(x: np.ndarray, y, op):
return result.reshape(x.shape)


def logical_op(left: ArrayLike, right: Any, op) -> ArrayLike:
def logical_op(left: ArrayLike, right: Any, op, use_numexpr=True) -> ArrayLike:
"""
Evaluate a logical operation `|`, `&`, or `^`.

Expand Down Expand Up @@ -397,7 +393,7 @@ def fill_bool(x, left=None):
return res_values


def get_array_op(op):
def get_array_op(op, use_numexpr=True):
"""
Return a binary array operation corresponding to the given operator op.

Expand All @@ -421,9 +417,9 @@ def get_array_op(op):
return op

if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}:
return partial(comparison_op, op=op)
return partial(comparison_op, op=op, use_numexpr=use_numexpr)
elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}:
return partial(logical_op, op=op)
return partial(logical_op, op=op, use_numexpr=use_numexpr)
elif op_name in {
"add",
"sub",
Expand All @@ -434,7 +430,7 @@ def get_array_op(op):
"divmod",
"pow",
}:
return partial(arithmetic_op, op=op)
return partial(arithmetic_op, op=op, use_numexpr=use_numexpr)
else:
raise NotImplementedError(op_name)

Expand Down