From 14b455ec7ded420f2554355fdf46ad2fb94f1df0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 16 Feb 2020 19:29:05 -0800 Subject: [PATCH 1/4] use numexpr for Series comparisons --- pandas/core/ops/__init__.py | 7 +++---- pandas/core/ops/array_ops.py | 18 ++++++++++++++---- pandas/tests/arithmetic/test_numeric.py | 7 ++++++- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index f3c1a609d50a1..b7d1e93aaec52 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -374,7 +374,6 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): """ # Note: we use iloc to access columns for compat with cases # with non-unique columns. - import pandas.core.computation.expressions as expressions right = lib.item_from_zerodim(right) if lib.is_scalar(right) or np.ndim(right) == 0: @@ -419,8 +418,7 @@ def column_op(a, b): # Remaining cases have less-obvious dispatch rules raise NotImplementedError(right) - new_data = expressions.evaluate(column_op, str_rep, left, right) - return new_data + return column_op(left, right) # ----------------------------------------------------------------------------- @@ -515,6 +513,7 @@ def _comp_method_SERIES(cls, op, special): Wrapper function for Series arithmetic operations, to avoid code duplication. """ + str_rep = _get_opstr(op) op_name = _get_op_name(op, special) @unpack_zerodim_and_defer(op_name) @@ -528,7 +527,7 @@ def wrapper(self, other): lvalues = extract_array(self, extract_numpy=True) rvalues = extract_array(other, extract_numpy=True) - res_values = comparison_op(lvalues, rvalues, op) + res_values = comparison_op(lvalues, rvalues, op, str_rep) return _construct_result(self, res_values, index=self.index, name=res_name) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 37a4a6eddaebe..c4d80e7c13bc4 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -125,7 +125,7 @@ def na_op(x, y): return na_op -def na_arithmetic_op(left, right, op, str_rep: str): +def na_arithmetic_op(left, right, op, str_rep: Optional[str], is_cmp: bool = False): """ Return the result of evaluating op on the passed in values. @@ -136,6 +136,8 @@ def na_arithmetic_op(left, right, op, str_rep: str): left : np.ndarray right : np.ndarray or scalar str_rep : str or None + is_cmp : bool, default False + If this a comparison operation. Returns ------- @@ -150,6 +152,8 @@ def na_arithmetic_op(left, right, op, str_rep: str): try: result = expressions.evaluate(op, str_rep, left, right) except TypeError: + if is_cmp: + raise result = masked_arith_op(left, right, op) return missing.dispatch_fill_zeros(op, left, right, result) @@ -201,7 +205,10 @@ def arithmetic_op( def comparison_op( - left: Union[np.ndarray, ABCExtensionArray], right: Any, op + left: Union[np.ndarray, ABCExtensionArray], + right: Any, + op, + str_rep: Optional[str] = None, ) -> Union[np.ndarray, ABCExtensionArray]: """ Evaluate a comparison operation `=`, `!=`, `>=`, `>`, `<=`, or `<`. @@ -250,7 +257,10 @@ def comparison_op( op_name = f"__{op.__name__}__" method = getattr(lvalues, op_name) with np.errstate(all="ignore"): - res_values = method(rvalues) + res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep, is_cmp=True) + if is_scalar(res_values): + # numexpr choked + res_values = method(rvalues) if res_values is NotImplemented: res_values = invalid_comparison(lvalues, rvalues, op) @@ -385,7 +395,7 @@ def get_array_op(op, str_rep: Optional[str] = None): """ op_name = op.__name__.strip("_") if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}: - return partial(comparison_op, op=op) + return partial(comparison_op, op=op, str_rep=str_rep) elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}: return partial(logical_op, op=op) else: diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 51d09a92773b1..e0de2e5068d65 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -66,7 +66,12 @@ def test_df_numeric_cmp_dt64_raises(self): ts = pd.Timestamp.now() df = pd.DataFrame({"x": range(5)}) - msg = "Invalid comparison between dtype=int64 and Timestamp" + msg = "|".join( + [ + "Invalid comparison between dtype=int64 and Timestamp", + "'[<>]' not supported between instances of 'Timestamp' and 'int'", + ] + ) with pytest.raises(TypeError, match=msg): df > ts From 6273e7d5813690432fa556cef0a65d784d55d7c2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 17 Feb 2020 07:24:24 -0800 Subject: [PATCH 2/4] revert --- pandas/core/ops/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index b7d1e93aaec52..219625ae40056 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -374,6 +374,7 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): """ # Note: we use iloc to access columns for compat with cases # with non-unique columns. + import pandas.core.computation.expressions as expressions right = lib.item_from_zerodim(right) if lib.is_scalar(right) or np.ndim(right) == 0: @@ -418,7 +419,8 @@ def column_op(a, b): # Remaining cases have less-obvious dispatch rules raise NotImplementedError(right) - return column_op(left, right) + new_data = expressions.evaluate(column_op, str_rep, left, right) + return new_data # ----------------------------------------------------------------------------- From 3a55dc11e9760ced92ef4f785228f35476a3c3c0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 17 Feb 2020 07:26:18 -0800 Subject: [PATCH 3/4] update exception message --- pandas/tests/arithmetic/test_numeric.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index e0de2e5068d65..d4baf2f374cdf 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -66,13 +66,7 @@ def test_df_numeric_cmp_dt64_raises(self): ts = pd.Timestamp.now() df = pd.DataFrame({"x": range(5)}) - msg = "|".join( - [ - "Invalid comparison between dtype=int64 and Timestamp", - "'[<>]' not supported between instances of 'Timestamp' and 'int'", - ] - ) - + msg = "'[<>]' not supported between instances of 'Timestamp' and 'int'" with pytest.raises(TypeError, match=msg): df > ts with pytest.raises(TypeError, match=msg): From 23976f7520c50f7d10f636823224b368a360a243 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 24 Feb 2020 14:14:47 -0800 Subject: [PATCH 4/4] handle it all inside na_arithmetic_op --- pandas/core/ops/array_ops.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index e0ade8e7b189c..b216a927f65b3 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -154,9 +154,17 @@ def na_arithmetic_op(left, right, op, str_rep: Optional[str], is_cmp: bool = Fal result = expressions.evaluate(op, str_rep, left, right) except TypeError: if is_cmp: + # numexpr failed on comparison op, e.g. ndarray[float] > datetime + # In this case we do not fall back to the masked op, as that + # will handle complex numbers incorrectly, see GH#32047 raise result = masked_arith_op(left, right, op) + if is_cmp and (is_scalar(result) or result is NotImplemented): + # numpy returned a scalar instead of operating element-wise + # e.g. numeric array vs str + return invalid_comparison(left, right, op) + return missing.dispatch_fill_zeros(op, left, right, result) @@ -204,10 +212,7 @@ def arithmetic_op(left: ArrayLike, right: Any, op, str_rep: str): def comparison_op( - left: ArrayLike, - right: Any, - op, - str_rep: Optional[str] = None, + left: ArrayLike, right: Any, op, str_rep: Optional[str] = None, ) -> ArrayLike: """ Evaluate a comparison operation `=`, `!=`, `>=`, `>`, `<=`, or `<`. @@ -253,19 +258,8 @@ def comparison_op( res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: - op_name = f"__{op.__name__}__" - method = getattr(lvalues, op_name) with np.errstate(all="ignore"): res_values = na_arithmetic_op(lvalues, rvalues, op, str_rep, is_cmp=True) - if is_scalar(res_values): - # numexpr choked - res_values = method(rvalues) - - if res_values is NotImplemented: - res_values = invalid_comparison(lvalues, rvalues, op) - if is_scalar(res_values): - typ = type(rvalues) - raise TypeError(f"Could not compare {typ} type with Series") return res_values