-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF/REF: Check use of numexpr earlier in the DataFrame operation #41122
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1bb21fa
9496e19
ef0202a
3fce895
f0f6bb6
54a84d4
b252e7c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -72,12 +72,59 @@ def _evaluate_standard(op, op_str, a, b): | |
return op(a, b) | ||
|
||
|
||
def can_use_numexpr(op, size: Optional[int] = None, dtypes=None, scalar=None): | ||
""" | ||
Initial check whether numexpr can be used with the given size and | ||
involved data types and/or scalar operand. | ||
|
||
Returns False if it definitely cannot use numexpr, otherwise returns | ||
True (which doesn't mean we always end up using numexpr) | ||
|
||
Parameters | ||
---------- | ||
op : operator | ||
size : int | ||
dtypes : list | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. list[DtypeObj]? |
||
List of dtypes involved in the operation | ||
scalar : | ||
Optionally a scalar, eg if the right operand is a scalar. | ||
|
||
Returns | ||
------- | ||
bool | ||
""" | ||
if not USE_NUMEXPR: | ||
return False | ||
|
||
if size is not None: | ||
if size < _MIN_ELEMENTS: | ||
return False | ||
|
||
op_str = _op_str_mapping.get(op, None) | ||
if op_str is None: | ||
return False | ||
|
||
if scalar is not None: | ||
if isinstance(scalar, str): | ||
return False | ||
|
||
# allowed are a superset | ||
if dtypes is not None: | ||
return _ALLOWED_DTYPES["evaluate"] >= set(dtypes) | ||
|
||
# safe fallback if dtypes were not specified | ||
return True | ||
|
||
|
||
def _can_use_numexpr(op, op_str, a, b, dtype_check): | ||
""" return a boolean if we WILL be using numexpr """ | ||
if op_str is not None: | ||
|
||
# required min elements (otherwise we are adding overhead) | ||
if a.size > _MIN_ELEMENTS: | ||
if isinstance(b, str): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. the place where this is moved from has a comment |
||
return False | ||
|
||
# check for dtype compatibility | ||
dtypes: Set[str] = set() | ||
for o in [a, b]: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -139,6 +139,7 @@ | |
from pandas.core.arraylike import OpsMixin | ||
from pandas.core.arrays import ExtensionArray | ||
from pandas.core.arrays.sparse import SparseFrameAccessor | ||
import pandas.core.computation.expressions as expressions | ||
from pandas.core.construction import ( | ||
extract_array, | ||
sanitize_array, | ||
|
@@ -6832,6 +6833,16 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): | |
|
||
right = lib.item_from_zerodim(right) | ||
if not is_list_like(right): | ||
|
||
if isinstance(self._mgr, ArrayManager): | ||
use_numexpr = expressions.can_use_numexpr( | ||
func, self.shape[0], None, right | ||
) | ||
else: | ||
use_numexpr = expressions.can_use_numexpr(func, None, None, right) | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
array_op = ops.get_array_op(func, use_numexpr=use_numexpr) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. is the get_array_op on L6832 still needed? |
||
|
||
# i.e. scalar, faster than checking np.ndim(right) == 0 | ||
with np.errstate(all="ignore"): | ||
bm = self._mgr.apply(array_op, right=right) | ||
|
@@ -6844,6 +6855,15 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): | |
# fails in cases with empty columns reached via | ||
# _frame_arith_method_with_reindex | ||
|
||
if isinstance(self._mgr, ArrayManager): | ||
use_numexpr = expressions.can_use_numexpr( | ||
func, self.shape[0], None, None | ||
) | ||
else: | ||
use_numexpr = expressions.USE_NUMEXPR | ||
|
||
array_op = ops.get_array_op(func, use_numexpr=use_numexpr) | ||
|
||
# TODO operate_blockwise expects a manager of the same type | ||
with np.errstate(all="ignore"): | ||
bm = self._mgr.operate_blockwise( | ||
|
@@ -6866,6 +6886,17 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): | |
# maybe_align_as_frame ensures we do not have an ndarray here | ||
assert not isinstance(right, np.ndarray) | ||
|
||
if isinstance(self._mgr, ArrayManager): | ||
use_numexpr = expressions.can_use_numexpr( | ||
func, self.shape[1], (right.dtype,), None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. does self.shape[1] correspond to the size of some array? |
||
) | ||
else: | ||
use_numexpr = expressions.can_use_numexpr( | ||
func, None, (right.dtype,), None | ||
) | ||
|
||
array_op = ops.get_array_op(func, use_numexpr=use_numexpr) | ||
|
||
with np.errstate(all="ignore"): | ||
arrays = [ | ||
array_op(_left, _right) | ||
|
@@ -6876,6 +6907,17 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): | |
assert right.index.equals(self.index) # Handle other cases later | ||
right = right._values | ||
|
||
if isinstance(self._mgr, ArrayManager): | ||
use_numexpr = expressions.can_use_numexpr( | ||
func, self.shape[0], (right.dtype,), None | ||
) | ||
else: | ||
use_numexpr = expressions.can_use_numexpr( | ||
func, None, (right.dtype,), None | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. these ~8 lines (x4) seem like a reasonable scope for an AM/BM method |
||
|
||
array_op = ops.get_array_op(func, use_numexpr=use_numexpr) | ||
|
||
with np.errstate(all="ignore"): | ||
arrays = [array_op(left, right) for left in self._iter_column_arrays()] | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -135,7 +135,7 @@ def _masked_arith_op(x: np.ndarray, y, op): | |
return result | ||
|
||
|
||
def _na_arithmetic_op(left, right, op, is_cmp: bool = False): | ||
def _na_arithmetic_op(left, right, op, is_cmp: bool = False, use_numexpr=True): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. annotate, docstring. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can use_numexpr be keyword-only? |
||
""" | ||
Return the result of evaluating op on the passed in values. | ||
|
||
|
@@ -156,14 +156,8 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False): | |
------ | ||
TypeError : invalid operation | ||
""" | ||
if isinstance(right, str): | ||
# can never use numexpr | ||
func = op | ||
else: | ||
func = partial(expressions.evaluate, op) | ||
|
||
try: | ||
result = func(left, right) | ||
result = expressions.evaluate(op, left, right, use_numexpr=use_numexpr) | ||
except TypeError: | ||
if is_object_dtype(left) or is_object_dtype(right) and not is_cmp: | ||
# For object dtype, fallback to a masked operation (only operating | ||
|
@@ -182,7 +176,7 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False): | |
return missing.dispatch_fill_zeros(op, left, right, result) | ||
|
||
|
||
def arithmetic_op(left: ArrayLike, right: Any, op): | ||
def arithmetic_op(left: ArrayLike, right: Any, op, use_numexpr=True): | ||
""" | ||
Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ... | ||
|
||
|
@@ -222,12 +216,12 @@ def arithmetic_op(left: ArrayLike, right: Any, op): | |
# (https://github.com/pandas-dev/pandas/issues/41165) | ||
_bool_arith_check(op, left, right) | ||
|
||
res_values = _na_arithmetic_op(left, right, op) | ||
res_values = _na_arithmetic_op(left, right, op, use_numexpr=use_numexpr) | ||
|
||
return res_values | ||
|
||
|
||
def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: | ||
def comparison_op(left: ArrayLike, right: Any, op, use_numexpr=True) -> ArrayLike: | ||
""" | ||
Evaluate a comparison operation `=`, `!=`, `>=`, `>`, `<=`, or `<`. | ||
|
||
|
@@ -285,7 +279,9 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: | |
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) | ||
|
||
else: | ||
res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True) | ||
res_values = _na_arithmetic_op( | ||
lvalues, rvalues, op, is_cmp=True, use_numexpr=use_numexpr | ||
) | ||
|
||
return res_values | ||
|
||
|
@@ -331,7 +327,7 @@ def na_logical_op(x: np.ndarray, y, op): | |
return result.reshape(x.shape) | ||
|
||
|
||
def logical_op(left: ArrayLike, right: Any, op) -> ArrayLike: | ||
def logical_op(left: ArrayLike, right: Any, op, use_numexpr=True) -> ArrayLike: | ||
""" | ||
Evaluate a logical operation `|`, `&`, or `^`. | ||
|
||
|
@@ -397,7 +393,7 @@ def fill_bool(x, left=None): | |
return res_values | ||
|
||
|
||
def get_array_op(op): | ||
def get_array_op(op, use_numexpr=True): | ||
""" | ||
Return a binary array operation corresponding to the given operator op. | ||
|
||
|
@@ -421,9 +417,9 @@ def get_array_op(op): | |
return op | ||
|
||
if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}: | ||
return partial(comparison_op, op=op) | ||
return partial(comparison_op, op=op, use_numexpr=use_numexpr) | ||
elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}: | ||
return partial(logical_op, op=op) | ||
return partial(logical_op, op=op, use_numexpr=use_numexpr) | ||
elif op_name in { | ||
"add", | ||
"sub", | ||
|
@@ -434,7 +430,7 @@ def get_array_op(op): | |
"divmod", | ||
"pow", | ||
}: | ||
return partial(arithmetic_op, op=op) | ||
return partial(arithmetic_op, op=op, use_numexpr=use_numexpr) | ||
else: | ||
raise NotImplementedError(op_name) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
int or None?