From a4dea4aa71ca9d0de5dbdb73be8e1d97be959633 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 16 Mar 2021 11:37:21 +0100 Subject: [PATCH 1/8] BUG/TST: run and fix all arithmetic tests with+without numexpr --- pandas/core/ops/array_ops.py | 26 +++++++++++++++++++------ pandas/tests/arithmetic/conftest.py | 12 ++++++++++++ pandas/tests/arithmetic/test_numeric.py | 15 ++++++++++++-- 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 9153eb25032e7..4d125b873a7d0 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -2,7 +2,7 @@ Functions for arithmetic and comparison operations on NumPy arrays and ExtensionArrays. """ -from datetime import timedelta +import datetime from functools import partial import operator from typing import Any @@ -11,11 +11,13 @@ import numpy as np from pandas._libs import ( + NaTType, Timedelta, Timestamp, lib, ops as libops, ) +from pandas._libs.tslibs import BaseOffset from pandas._typing import ( ArrayLike, Shape, @@ -157,8 +159,14 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False): """ import pandas.core.computation.expressions as expressions + if isinstance(right, str): + # can never use numexpr + func = op + else: + func = partial(expressions.evaluate, op) + try: - result = expressions.evaluate(op, left, right) + result = func(left, right) except TypeError: if is_cmp: # numexpr failed on comparison op, e.g. ndarray[float] > datetime @@ -199,7 +207,9 @@ def arithmetic_op(left: ArrayLike, right: Any, op): rvalues = ensure_wrapped_if_datetimelike(right) rvalues = _maybe_upcast_for_op(rvalues, lvalues.shape) - if should_extension_dispatch(lvalues, rvalues) or isinstance(rvalues, Timedelta): + if should_extension_dispatch(lvalues, rvalues) or isinstance( + rvalues, (Timedelta, BaseOffset, Timestamp, NaTType) + ): # Timedelta is included because numexpr will fail on it, see GH#31457 res_values = op(lvalues, rvalues) @@ -243,7 +253,9 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: "Lengths must match to compare", lvalues.shape, rvalues.shape ) - if should_extension_dispatch(lvalues, rvalues): + if should_extension_dispatch(lvalues, rvalues) or isinstance( + rvalues, (Timedelta, BaseOffset, Timestamp, NaTType) + ): # Call the method on lvalues res_values = op(lvalues, rvalues) @@ -258,7 +270,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: # GH#36377 going through the numexpr path would incorrectly raise return invalid_comparison(lvalues, rvalues, op) - elif is_object_dtype(lvalues.dtype): + elif is_object_dtype(lvalues.dtype) or isinstance(rvalues, str): res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: @@ -444,11 +456,13 @@ def _maybe_upcast_for_op(obj, shape: Shape): TimedeltaArray, ) - if type(obj) is timedelta: + if type(obj) is datetime.timedelta: # GH#22390 cast up to Timedelta to rely on Timedelta # implementation; otherwise operation against numeric-dtype # raises TypeError return Timedelta(obj) + elif type(obj) is datetime.datetime: + return Timestamp(obj) elif isinstance(obj, np.datetime64): # GH#28080 numpy casts integer-dtype to datetime64 when doing # array[int] + datetime64, which we do not allow diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index d90592c68e351..1e97db152c294 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -9,6 +9,18 @@ UInt64Index, ) import pandas._testing as tm +from pandas.core.computation import expressions as expr + + +@pytest.fixture( + autouse=True, scope="module", params=[0, 1000000], ids=["numexpr", "python"] +) +def switch_numexpr_min_elements(request): + _MIN_ELEMENTS = expr._MIN_ELEMENTS + expr._MIN_ELEMENTS = request.param + yield request.param + expr._MIN_ELEMENTS = _MIN_ELEMENTS + # ------------------------------------------------------------------ # Helper Functions diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 1e2622d6a8fcd..fb7f43c24ddfe 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -27,6 +27,7 @@ ) import pandas._testing as tm from pandas.core import ops +from pandas.core.computation import expressions as expr @pytest.fixture(params=[Index, Series, tm.to_array]) @@ -127,7 +128,12 @@ def test_numeric_cmp_string_numexpr_path(self, box_with_array): result = obj != "a" tm.assert_equal(result, ~expected) - msg = "Invalid comparison between dtype=float64 and str" + msg = "|".join( + [ + "Invalid comparison between dtype=float64 and str", + "'<' not supported between instances of 'numpy.ndarray' and 'str'", + ] + ) with pytest.raises(TypeError, match=msg): obj < "a" @@ -390,7 +396,7 @@ def test_div_negative_zero(self, zero, numeric_idx, op): # ------------------------------------------------------------------ @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64]) - def test_ser_div_ser(self, dtype1, any_real_dtype): + def test_ser_div_ser(self, switch_numexpr_min_elements, dtype1, any_real_dtype): # no longer do integer div for any ops, but deal with the 0's dtype2 = any_real_dtype @@ -404,6 +410,11 @@ def test_ser_div_ser(self, dtype1, any_real_dtype): name=None, ) expected.iloc[0:3] = np.inf + if first.dtype == "int64" and second.dtype == "float32": + # when using numexpr, the casting rules are slightly different + # and int64/float32 combo results in float32 instead of float64 + if expr.USE_NUMEXPR and switch_numexpr_min_elements == 0: + expected = expected.astype("float32") result = first / second tm.assert_series_equal(result, expected) From 6851089c827d7e2ada84ed5385b4aa6e5d97aedb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 1 Apr 2021 10:52:27 +0200 Subject: [PATCH 2/8] fix test for comparison with NaT --- pandas/_libs/tslibs/nattype.pyx | 4 ++++ pandas/tests/frame/test_arithmetic.py | 16 +++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index d86d3261d404e..0c598beb6ad16 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -127,6 +127,10 @@ cdef class _NaT(datetime): result.fill(_nat_scalar_rules[op]) elif other.dtype.kind == "O": result = np.array([PyObject_RichCompare(self, x, op) for x in other]) + elif op == Py_EQ: + result = np.zeros(other.shape, dtype=bool) + elif op == Py_NE: + result = np.ones(other.shape, dtype=bool) else: return NotImplemented return result diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index c6816fa6481f4..4c31d15541412 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -174,9 +174,19 @@ def test_timestamp_compare(self): with pytest.raises(TypeError, match=msg): right_f(pd.Timestamp("20010109"), df) # nats - expected = left_f(df, pd.Timestamp("nat")) - result = right_f(pd.Timestamp("nat"), df) - tm.assert_frame_equal(result, expected) + if left in ["eq", "ne"]: + expected = left_f(df, pd.Timestamp("nat")) + result = right_f(pd.Timestamp("nat"), df) + tm.assert_frame_equal(result, expected) + else: + msg = ( + "'(<|>)=?' not supported between " + "instances of 'numpy.ndarray' and 'NaTType'" + ) + with pytest.raises(TypeError, match=msg): + left_f(df, pd.Timestamp("nat")) + with pytest.raises(TypeError, match=msg): + right_f(pd.Timestamp("nat"), df) def test_mixed_comparison(self): # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False, From 51226755004cc4ee84ee35701c9c57203c5bfd59 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 20 Apr 2021 15:42:48 +0200 Subject: [PATCH 3/8] revert change error message test --- pandas/tests/arithmetic/test_numeric.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 6e39366f269b3..7bd69cfae7964 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -128,12 +128,7 @@ def test_numeric_cmp_string_numexpr_path(self, box_with_array): result = obj != "a" tm.assert_equal(result, ~expected) - msg = "|".join( - [ - "Invalid comparison between dtype=float64 and str", - "'<' not supported between instances of 'numpy.ndarray' and 'str'", - ] - ) + msg = "Invalid comparison between dtype=float64 and str" with pytest.raises(TypeError, match=msg): obj < "a" From 82a7247ce0b92a9caaeb72eb66155fc7ae2a6dec Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 20 Apr 2021 15:49:43 +0200 Subject: [PATCH 4/8] fix case with object dtype and scalar right operand --- pandas/core/ops/array_ops.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 9db536dbd1e40..66c6c1c0ec1b5 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -259,8 +259,9 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: "Lengths must match to compare", lvalues.shape, rvalues.shape ) - if should_extension_dispatch(lvalues, rvalues) or isinstance( - rvalues, (Timedelta, BaseOffset, Timestamp, NaTType) + if should_extension_dispatch(lvalues, rvalues) or ( + isinstance(rvalues, (Timedelta, BaseOffset, Timestamp, NaTType)) + and not is_object_dtype(lvalues.dtype) ): # Call the method on lvalues res_values = op(lvalues, rvalues) From e294267752971ac89f5fd12e470e7ff249b8b71c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 22 Apr 2021 13:47:09 +0200 Subject: [PATCH 5/8] fix for wrong error message with Timestamp scalar --- pandas/tests/arithmetic/test_numeric.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 7bd69cfae7964..9e1d13eac5039 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -896,7 +896,13 @@ def test_series_frame_radd_bug(self): # really raise this time now = pd.Timestamp.now().to_pydatetime() - msg = "unsupported operand type" + msg = "|".join( + [ + "unsupported operand type", + # wrong error message, see https://github.com/numpy/numpy/issues/18832 + "Concatenation operation", + ] + ) with pytest.raises(TypeError, match=msg): now + ts From 0804e6328d44f4d97c8dd43afa936bef11e49624 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 22 Apr 2021 13:54:55 +0200 Subject: [PATCH 6/8] add comment --- pandas/core/ops/array_ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 66c6c1c0ec1b5..bd8738660894c 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -460,6 +460,7 @@ def _maybe_upcast_for_op(obj, shape: Shape): # raises TypeError return Timedelta(obj) elif type(obj) is datetime.datetime: + # cast up to Timestamp to rely on Timestamp implementation, see Timedelta above return Timestamp(obj) elif isinstance(obj, np.datetime64): # GH#28080 numpy casts integer-dtype to datetime64 when doing From dcf38cf16a769593def95963a720ab262dacf7e3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 23 Apr 2021 14:08:42 +0200 Subject: [PATCH 7/8] check right is NaT --- pandas/core/ops/array_ops.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index b557bd3b8cb0b..471612131d154 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -10,7 +10,7 @@ import numpy as np from pandas._libs import ( - NaTType, + NaT, Timedelta, Timestamp, lib, @@ -210,9 +210,10 @@ def arithmetic_op(left: ArrayLike, right: Any, op): right = _maybe_upcast_for_op(right, left.shape) if should_extension_dispatch(left, right) or isinstance( - right, (Timedelta, BaseOffset, Timestamp, NaTType) + right, (Timedelta, BaseOffset, Timestamp) or right is NaT ): - # Timedelta is included because numexpr will fail on it, see GH#31457 + # Timedelta/Timestamp and other custom scalars are included in the check + # because numexpr will fail on it, see GH#31457 res_values = op(left, right) else: res_values = _na_arithmetic_op(left, right, op) @@ -257,7 +258,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: ) if should_extension_dispatch(lvalues, rvalues) or ( - isinstance(rvalues, (Timedelta, BaseOffset, Timestamp, NaTType)) + (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT) and not is_object_dtype(lvalues.dtype) ): # Call the method on lvalues From 036cf62a89bf0485a8d24e1ff9f742e6ea4982bc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 23 Apr 2021 16:40:53 +0200 Subject: [PATCH 8/8] wrong bracket --- pandas/core/ops/array_ops.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 471612131d154..67b68ce7365cc 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -209,8 +209,10 @@ def arithmetic_op(left: ArrayLike, right: Any, op): # casts integer dtypes to timedelta64 when operating with timedelta64 - GH#22390) right = _maybe_upcast_for_op(right, left.shape) - if should_extension_dispatch(left, right) or isinstance( - right, (Timedelta, BaseOffset, Timestamp) or right is NaT + if ( + should_extension_dispatch(left, right) + or isinstance(right, (Timedelta, BaseOffset, Timestamp)) + or right is NaT ): # Timedelta/Timestamp and other custom scalars are included in the check # because numexpr will fail on it, see GH#31457