From 1bb21fa252befb743aea4767aed4ccc9476ebe0b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Apr 2021 14:05:37 +0200 Subject: [PATCH 1/5] PERF/REF: Check use of numexpr earlier in the DataFrame operation --- pandas/core/computation/expressions.py | 41 ++++++++++++++++++++++++++ pandas/core/frame.py | 24 +++++++++++++++ pandas/core/ops/array_ops.py | 29 +++++++++++------- 3 files changed, 83 insertions(+), 11 deletions(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 8205534c9d48b..e62357abc2360 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -72,6 +72,47 @@ def _evaluate_standard(op, op_str, a, b): return op(a, b) +def can_use_numexpr(op, size=None, dtypes=None, scalar=None): + """ + Initial check whether numexpr can be used with the given size and + involved data types and/or scalar operand. + + Returns False if it definitely cannot use numexpr, otherwise returns + True (which doesn't mean we always end up using numexpr) + + Parameters + ---------- + op : operator + size : int + dtypes : list + List of dtypes involved in the operation + scalar : + Optionally a scalar, eg if the right operand is a scalar. + + Returns + ------- + bool + """ + if size is not None: + if size < _MIN_ELEMENTS: + return False + + op_str = _op_str_mapping.get(op, None) + if op_str is None: + return False + + if scalar is not None: + if isinstance(scalar, str): + return False + + # allowed are a superset + if dtypes is not None: + return _ALLOWED_DTYPES["evaluate"] >= set(dtypes) + + # safe fallback if dtypes were not specified + return True + + def _can_use_numexpr(op, op_str, a, b, dtype_check): """ return a boolean if we WILL be using numexpr """ if op_str is not None: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b22fcbd9229e7..e47de2a208c9e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6678,11 +6678,25 @@ def _dispatch_frame_op(self, right, func: Callable, axis: Optional[int] = None): ------- DataFrame """ + import pandas.core.computation.expressions as expressions + # Get the appropriate array-op to apply to each column/block's values. array_op = ops.get_array_op(func) right = lib.item_from_zerodim(right) if not is_list_like(right): + + if isinstance(self._mgr, ArrayManager): + use_numexpr = expressions.USE_NUMEXPR and expressions.can_use_numexpr( + func, self.shape[0], None, right + ) + else: + use_numexpr = expressions.USE_NUMEXPR and expressions.can_use_numexpr( + func, None, None, right + ) + + array_op = ops.get_array_op(func, use_numexpr=use_numexpr) + # i.e. scalar, faster than checking np.ndim(right) == 0 with np.errstate(all="ignore"): bm = self._mgr.apply(array_op, right=right) @@ -6695,6 +6709,16 @@ def _dispatch_frame_op(self, right, func: Callable, axis: Optional[int] = None): # fails in cases with empty columns reached via # _frame_arith_method_with_reindex + if isinstance(self._mgr, ArrayManager): + use_numexpr = expressions.USE_NUMEXPR and expressions.can_use_numexpr( + func, self.shape[0], None, None + ) + else: + use_numexpr = expressions.USE_NUMEXPR + + # breakpoint() + array_op = ops.get_array_op(func, use_numexpr=use_numexpr) + # TODO operate_blockwise expects a manager of the same type with np.errstate(all="ignore"): bm = self._mgr.operate_blockwise( diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 1d7c16de0c05d..d2b6dad00c0e6 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -84,6 +84,7 @@ def _masked_arith_op(x: np.ndarray, y, op): y : np.ndarray, Series, Index op : binary operator """ + print("n\n!!!!!!!!!! ", op, x.dtype, type(y), getattr(y, "dtype", "")) # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes # the logic valid for both Series and DataFrame ops. xrav = x.ravel() @@ -134,7 +135,7 @@ def _masked_arith_op(x: np.ndarray, y, op): return result -def _na_arithmetic_op(left, right, op, is_cmp: bool = False): +def _na_arithmetic_op(left, right, op, is_cmp: bool = False, use_numexpr=True): """ Return the result of evaluating op on the passed in values. @@ -156,14 +157,18 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False): TypeError : invalid operation """ try: - result = expressions.evaluate(op, left, right) + result = expressions.evaluate(op, left, right, use_numexpr=use_numexpr) except TypeError: if is_cmp: # numexpr failed on comparison op, e.g. ndarray[float] > datetime # In this case we do not fall back to the masked op, as that # will handle complex numbers incorrectly, see GH#32047 raise - result = _masked_arith_op(left, right, op) + # breakpoint() + if is_object_dtype(left) or is_object_dtype(right): + result = _masked_arith_op(left, right, op) + else: + raise if is_cmp and (is_scalar(result) or result is NotImplemented): # numpy returned a scalar instead of operating element-wise @@ -173,7 +178,7 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False): return missing.dispatch_fill_zeros(op, left, right, result) -def arithmetic_op(left: ArrayLike, right: Any, op): +def arithmetic_op(left: ArrayLike, right: Any, op, use_numexpr=True): """ Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ... @@ -212,7 +217,7 @@ def arithmetic_op(left: ArrayLike, right: Any, op): return res_values -def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: +def comparison_op(left: ArrayLike, right: Any, op, use_numexpr=True) -> ArrayLike: """ Evaluate a comparison operation `=`, `!=`, `>=`, `>`, `<=`, or `<`. @@ -267,7 +272,9 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: - res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True) + res_values = _na_arithmetic_op( + lvalues, rvalues, op, is_cmp=True, use_numexpr=use_numexpr + ) return res_values @@ -313,7 +320,7 @@ def na_logical_op(x: np.ndarray, y, op): return result.reshape(x.shape) -def logical_op(left: ArrayLike, right: Any, op) -> ArrayLike: +def logical_op(left: ArrayLike, right: Any, op, use_numexpr=True) -> ArrayLike: """ Evaluate a logical operation `|`, `&`, or `^`. @@ -379,7 +386,7 @@ def fill_bool(x, left=None): return res_values -def get_array_op(op): +def get_array_op(op, use_numexpr=True): """ Return a binary array operation corresponding to the given operator op. @@ -403,9 +410,9 @@ def get_array_op(op): return op if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}: - return partial(comparison_op, op=op) + return partial(comparison_op, op=op, use_numexpr=use_numexpr) elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}: - return partial(logical_op, op=op) + return partial(logical_op, op=op, use_numexpr=use_numexpr) elif op_name in { "add", "sub", @@ -416,7 +423,7 @@ def get_array_op(op): "divmod", "pow", }: - return partial(arithmetic_op, op=op) + return partial(arithmetic_op, op=op, use_numexpr=use_numexpr) else: raise NotImplementedError(op_name) From ef0202abacbb55060c6810258f0566c8ebcf50c2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 23 Apr 2021 14:24:59 +0200 Subject: [PATCH 2/5] check option within can_use_numexpr --- pandas/core/computation/expressions.py | 3 +++ pandas/core/frame.py | 8 +++----- pandas/core/ops/array_ops.py | 1 - 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 7e44c1ef4265a..bcd5f99e3e39f 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -93,6 +93,9 @@ def can_use_numexpr(op, size=None, dtypes=None, scalar=None): ------- bool """ + if not USE_NUMEXPR: + return False + if size is not None: if size < _MIN_ELEMENTS: return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 79dead80c78cb..df08956f3c109 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6836,13 +6836,11 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): if not is_list_like(right): if isinstance(self._mgr, ArrayManager): - use_numexpr = expressions.USE_NUMEXPR and expressions.can_use_numexpr( + use_numexpr = expressions.can_use_numexpr( func, self.shape[0], None, right ) else: - use_numexpr = expressions.USE_NUMEXPR and expressions.can_use_numexpr( - func, None, None, right - ) + use_numexpr = expressions.can_use_numexpr(func, None, None, right) array_op = ops.get_array_op(func, use_numexpr=use_numexpr) @@ -6859,7 +6857,7 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): # _frame_arith_method_with_reindex if isinstance(self._mgr, ArrayManager): - use_numexpr = expressions.USE_NUMEXPR and expressions.can_use_numexpr( + use_numexpr = expressions.can_use_numexpr( func, self.shape[0], None, None ) else: diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 4d9551235d271..abb1e033947fb 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -83,7 +83,6 @@ def _masked_arith_op(x: np.ndarray, y, op): y : np.ndarray, Series, Index op : binary operator """ - print("n\n!!!!!!!!!! ", op, x.dtype, type(y), getattr(y, "dtype", "")) # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes # the logic valid for both Series and DataFrame ops. xrav = x.ravel() From 3fce895b93e1ee3c8ba1fdb415cddc24d9778487 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 23 Apr 2021 14:39:56 +0200 Subject: [PATCH 3/5] passthrough for arithmetic_op as well --- pandas/core/ops/array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index abb1e033947fb..34a4e0eaa65e4 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -205,7 +205,7 @@ def arithmetic_op(left: ArrayLike, right: Any, op, use_numexpr=True): # Timedelta is included because numexpr will fail on it, see GH#31457 res_values = op(left, right) else: - res_values = _na_arithmetic_op(left, right, op) + res_values = _na_arithmetic_op(left, right, op, use_numexpr=use_numexpr) return res_values From 54a84d4a3805a7c7fb34ecb466f49725ae9c35e8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 28 Apr 2021 12:42:37 +0200 Subject: [PATCH 4/5] feedback --- pandas/core/computation/expressions.py | 2 +- pandas/core/frame.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 9cffd9fc11637..2569837142827 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -72,7 +72,7 @@ def _evaluate_standard(op, op_str, a, b): return op(a, b) -def can_use_numexpr(op, size=None, dtypes=None, scalar=None): +def can_use_numexpr(op, size: Optional[int] = None, dtypes=None, scalar=None): """ Initial check whether numexpr can be used with the given size and involved data types and/or scalar operand. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index df08956f3c109..005b2b68d508c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -139,6 +139,7 @@ from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseFrameAccessor +import pandas.core.computation.expressions as expressions from pandas.core.construction import ( extract_array, sanitize_array, @@ -6827,8 +6828,6 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): ------- DataFrame """ - import pandas.core.computation.expressions as expressions - # Get the appropriate array-op to apply to each column/block's values. array_op = ops.get_array_op(func) From b252e7cb179bae52fb377437a17c550e8836a317 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 28 Apr 2021 14:57:00 +0200 Subject: [PATCH 5/5] add check for frame / series ops --- pandas/core/computation/expressions.py | 3 +++ pandas/core/frame.py | 23 ++++++++++++++++++++++- pandas/core/ops/array_ops.py | 8 +------- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 2569837142827..fb8d3eb3ab2df 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -122,6 +122,9 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): # required min elements (otherwise we are adding overhead) if a.size > _MIN_ELEMENTS: + if isinstance(b, str): + return False + # check for dtype compatibility dtypes: Set[str] = set() for o in [a, b]: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 005b2b68d508c..c50ae68a7a5fd 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6862,7 +6862,6 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): else: use_numexpr = expressions.USE_NUMEXPR - # breakpoint() array_op = ops.get_array_op(func, use_numexpr=use_numexpr) # TODO operate_blockwise expects a manager of the same type @@ -6887,6 +6886,17 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): # maybe_align_as_frame ensures we do not have an ndarray here assert not isinstance(right, np.ndarray) + if isinstance(self._mgr, ArrayManager): + use_numexpr = expressions.can_use_numexpr( + func, self.shape[1], (right.dtype,), None + ) + else: + use_numexpr = expressions.can_use_numexpr( + func, None, (right.dtype,), None + ) + + array_op = ops.get_array_op(func, use_numexpr=use_numexpr) + with np.errstate(all="ignore"): arrays = [ array_op(_left, _right) @@ -6897,6 +6907,17 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): assert right.index.equals(self.index) # Handle other cases later right = right._values + if isinstance(self._mgr, ArrayManager): + use_numexpr = expressions.can_use_numexpr( + func, self.shape[0], (right.dtype,), None + ) + else: + use_numexpr = expressions.can_use_numexpr( + func, None, (right.dtype,), None + ) + + array_op = ops.get_array_op(func, use_numexpr=use_numexpr) + with np.errstate(all="ignore"): arrays = [array_op(left, right) for left in self._iter_column_arrays()] diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 84a10af651c1e..4deb0cd9d5a7b 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -156,14 +156,8 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False, use_numexpr=True): ------ TypeError : invalid operation """ - if isinstance(right, str): - # can never use numexpr - func = op - else: - func = partial(expressions.evaluate, op, use_numexpr=use_numexpr) - try: - result = func(left, right) + result = expressions.evaluate(op, left, right, use_numexpr=use_numexpr) except TypeError: if is_object_dtype(left) or is_object_dtype(right) and not is_cmp: # For object dtype, fallback to a masked operation (only operating