Skip to content

BUG/TST: run and fix all arithmetic tests with+without numexpr #40463

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Apr 26, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pandas/_libs/tslibs/nattype.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ cdef class _NaT(datetime):
result.fill(_nat_scalar_rules[op])
elif other.dtype.kind == "O":
result = np.array([PyObject_RichCompare(self, x, op) for x in other])
elif op == Py_EQ:
result = np.zeros(other.shape, dtype=bool)
elif op == Py_NE:
result = np.ones(other.shape, dtype=bool)
else:
return NotImplemented
return result
Expand Down
26 changes: 20 additions & 6 deletions pandas/core/ops/array_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,21 @@
Functions for arithmetic and comparison operations on NumPy arrays and
ExtensionArrays.
"""
from datetime import timedelta
import datetime
from functools import partial
import operator
from typing import Any

import numpy as np

from pandas._libs import (
NaTType,
Timedelta,
Timestamp,
lib,
ops as libops,
)
from pandas._libs.tslibs import BaseOffset
from pandas._typing import (
ArrayLike,
Shape,
Expand Down Expand Up @@ -155,8 +157,14 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False):
------
TypeError : invalid operation
"""
if isinstance(right, str):
# can never use numexpr
func = op
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Basically, with a string argument, numexpr will fail with a "wrong" error message. Alternatively, _can_use_numexpr in expressions.py could be updated to check for this and avoid the numexpr path (currently it only inspects objects that have a dtype, not scalars).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's make an effort to keep numexpr-specific logic in _can_use_numexpr/expressions.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jbrockmendel would you be OK with leaving the check here as is, short term? I have a next PR that moves this check inside a can_use_numexpr function inside expressions.py (#41122), so that will clean this up.
But I would like to merge this PR before #41122 since this one is adding a lot of test coverage for with/without numexpr.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah ok for now, but let's for sure move later

else:
func = partial(expressions.evaluate, op)

try:
result = expressions.evaluate(op, left, right)
result = func(left, right)
except TypeError:
if is_cmp:
# numexpr failed on comparison op, e.g. ndarray[float] > datetime
Expand Down Expand Up @@ -202,7 +210,9 @@ def arithmetic_op(left: ArrayLike, right: Any, op):
rvalues = ensure_wrapped_if_datetimelike(right)
rvalues = _maybe_upcast_for_op(rvalues, lvalues.shape)

if should_extension_dispatch(lvalues, rvalues) or isinstance(rvalues, Timedelta):
if should_extension_dispatch(lvalues, rvalues) or isinstance(
rvalues, (Timedelta, BaseOffset, Timestamp, NaTType)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this additional check could maybe be moved into should_extension_dispatch (although it is not necessarily related to "is extension array", but rather to "don't take the numexpr path")

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yah, IIRC should_extension_dispatch is only used here, so might as well refactor/rename/move/whatever is most convenient.

There is a comment below about why Timedelta is included; can you update it to cover the other types as well?

I think we should check `rvalues is NaT` rather than using an isinstance check.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed to right is NaT and updated the comment.

):
# Timedelta is included because numexpr will fail on it, see GH#31457
res_values = op(lvalues, rvalues)

Expand Down Expand Up @@ -248,7 +258,9 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
"Lengths must match to compare", lvalues.shape, rvalues.shape
)

if should_extension_dispatch(lvalues, rvalues):
if should_extension_dispatch(lvalues, rvalues) or isinstance(
rvalues, (Timedelta, BaseOffset, Timestamp, NaTType)
):
# Call the method on lvalues
res_values = op(lvalues, rvalues)

Expand All @@ -263,7 +275,7 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
# GH#36377 going through the numexpr path would incorrectly raise
return invalid_comparison(lvalues, rvalues, op)

elif is_object_dtype(lvalues.dtype):
elif is_object_dtype(lvalues.dtype) or isinstance(rvalues, str):
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)

else:
Expand Down Expand Up @@ -440,11 +452,13 @@ def _maybe_upcast_for_op(obj, shape: Shape):
Be careful to call this *after* determining the `name` attribute to be
attached to the result of the arithmetic operation.
"""
if type(obj) is timedelta:
if type(obj) is datetime.timedelta:
# GH#22390 cast up to Timedelta to rely on Timedelta
# implementation; otherwise operation against numeric-dtype
# raises TypeError
return Timedelta(obj)
elif type(obj) is datetime.datetime:
return Timestamp(obj)
elif isinstance(obj, np.datetime64):
# GH#28080 numpy casts integer-dtype to datetime64 when doing
# array[int] + datetime64, which we do not allow
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/arithmetic/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,18 @@
UInt64Index,
)
import pandas._testing as tm
from pandas.core.computation import expressions as expr


@pytest.fixture(
    autouse=True, scope="module", params=[0, 1000000], ids=["numexpr", "python"]
)
def switch_numexpr_min_elements(request):
    """
    Parametrized, module-scoped, autouse fixture toggling ``expr._MIN_ELEMENTS``.

    The threshold is set to ``request.param`` (``0`` for the "numexpr" id,
    ``1000000`` for the "python" id) for the duration of the module's tests
    and reset to its previous value on teardown.

    Yields
    ------
    int
        The threshold value that is active for the current parametrization.
    """
    # Remember the value in place before we override it so that teardown
    # can put it back.
    saved_threshold = expr._MIN_ELEMENTS
    expr._MIN_ELEMENTS = request.param
    yield request.param
    # Teardown: undo the override.
    expr._MIN_ELEMENTS = saved_threshold
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we get rid of some of the setup/teardown in test_expressions with this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potentially something similar could be used there as well, yes. But this PR is focusing on the tests/arithmetic/ tests; there is #40497 as a general issue to modernize test_expressions.py.



# ------------------------------------------------------------------
# Helper Functions
Expand Down
15 changes: 13 additions & 2 deletions pandas/tests/arithmetic/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
)
import pandas._testing as tm
from pandas.core import ops
from pandas.core.computation import expressions as expr


@pytest.fixture(params=[Index, Series, tm.to_array])
Expand Down Expand Up @@ -127,7 +128,12 @@ def test_numeric_cmp_string_numexpr_path(self, box_with_array):
result = obj != "a"
tm.assert_equal(result, ~expected)

msg = "Invalid comparison between dtype=float64 and str"
msg = "|".join(
[
"Invalid comparison between dtype=float64 and str",
"'<' not supported between instances of 'numpy.ndarray' and 'str'",
]
)
with pytest.raises(TypeError, match=msg):
obj < "a"

Expand Down Expand Up @@ -390,7 +396,7 @@ def test_div_negative_zero(self, zero, numeric_idx, op):
# ------------------------------------------------------------------

@pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64])
def test_ser_div_ser(self, dtype1, any_real_dtype):
def test_ser_div_ser(self, switch_numexpr_min_elements, dtype1, any_real_dtype):
# no longer do integer div for any ops, but deal with the 0's
dtype2 = any_real_dtype

Expand All @@ -404,6 +410,11 @@ def test_ser_div_ser(self, dtype1, any_real_dtype):
name=None,
)
expected.iloc[0:3] = np.inf
if first.dtype == "int64" and second.dtype == "float32":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the reverse excluded as well?

Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Apr 26, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reverse (float32 + int64) is not tested, as the first.dtype is always int64/float64/uint64

(but yeah, the reverse order would also result in float32 instead of float64 when numexpr is used)

# when using numexpr, the casting rules are slightly different
# and int64/float32 combo results in float32 instead of float64
if expr.USE_NUMEXPR and switch_numexpr_min_elements == 0:
expected = expected.astype("float32")

result = first / second
tm.assert_series_equal(result, expected)
Expand Down
16 changes: 13 additions & 3 deletions pandas/tests/frame/test_arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,9 +174,19 @@ def test_timestamp_compare(self):
with pytest.raises(TypeError, match=msg):
right_f(pd.Timestamp("20010109"), df)
# nats
expected = left_f(df, pd.Timestamp("nat"))
result = right_f(pd.Timestamp("nat"), df)
tm.assert_frame_equal(result, expected)
if left in ["eq", "ne"]:
expected = left_f(df, pd.Timestamp("nat"))
result = right_f(pd.Timestamp("nat"), df)
tm.assert_frame_equal(result, expected)
else:
msg = (
"'(<|>)=?' not supported between "
"instances of 'numpy.ndarray' and 'NaTType'"
)
with pytest.raises(TypeError, match=msg):
left_f(df, pd.Timestamp("nat"))
with pytest.raises(TypeError, match=msg):
right_f(pd.Timestamp("nat"), df)

def test_mixed_comparison(self):
# GH#13128, GH#22163 != datetime64 vs non-dt64 should be False,
Expand Down