Skip to content

PERF: no need to check for DataFrame in pandas.core.computation.expressions #40445

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
12 changes: 1 addition & 11 deletions pandas/core/computation/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@

from pandas._typing import FuncType

from pandas.core.dtypes.generic import ABCDataFrame

from pandas.core.computation.check import NUMEXPR_INSTALLED
from pandas.core.ops import roperator

Expand Down Expand Up @@ -84,14 +82,8 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check):
# check for dtype compatibility
dtypes: Set[str] = set()
for o in [a, b]:
# Series implements dtypes, check for dimension count as well
if hasattr(o, "dtypes") and o.ndim > 1:
s = o.dtypes.value_counts()
if len(s) > 1:
return False
dtypes |= set(s.index.astype(str))
# ndarray and Series Case
elif hasattr(o, "dtype"):
if hasattr(o, "dtype"):
dtypes |= {o.dtype.name}

# allowed are a superset
Expand Down Expand Up @@ -191,8 +183,6 @@ def _where_numexpr(cond, a, b):


def _has_bool_dtype(x):
if isinstance(x, ABCDataFrame):
return "bool" in x.dtypes
try:
return x.dtype == bool
except AttributeError:
Expand Down
46 changes: 19 additions & 27 deletions pandas/tests/test_expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
)
from pandas.core.computation import expressions as expr

_frame = DataFrame(np.random.randn(10000, 4), columns=list("ABCD"), dtype="float64")
_frame = DataFrame(np.random.randn(10001, 4), columns=list("ABCD"), dtype="float64")
_frame2 = DataFrame(np.random.randn(100, 4), columns=list("ABCD"), dtype="float64")
_mixed = DataFrame(
{
Expand All @@ -36,14 +36,21 @@
_integer2 = DataFrame(
np.random.randint(1, 100, size=(101, 4)), columns=list("ABCD"), dtype="int64"
)
_array = _frame["A"].values.copy()
_array2 = _frame2["A"].values.copy()

_array_mixed = _mixed["D"].values.copy()
_array_mixed2 = _mixed2["D"].values.copy()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we really need to use fixtures for this module.....can you create an issue

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, opened #40497 for this



@pytest.mark.skipif(not expr.USE_NUMEXPR, reason="not using numexpr")
class TestExpressions:
def setup_method(self, method):

self.frame = _frame.copy()
self.array = _array.copy()
self.frame2 = _frame2.copy()
self.array2 = _array2.copy()
self.mixed = _mixed.copy()
self.mixed2 = _mixed2.copy()
self._MIN_ELEMENTS = expr._MIN_ELEMENTS
Expand Down Expand Up @@ -134,33 +141,29 @@ def test_invalid(self):

# no op
result = expr._can_use_numexpr(
operator.add, None, self.frame, self.frame, "evaluate"
)
assert not result

# mixed
result = expr._can_use_numexpr(
operator.add, "+", self.mixed, self.frame, "evaluate"
operator.add, None, self.array, self.array, "evaluate"
)
assert not result

# min elements
result = expr._can_use_numexpr(
operator.add, "+", self.frame2, self.frame2, "evaluate"
operator.add, "+", self.array2, self.array2, "evaluate"
)
assert not result

# ok, we only check on first part of expression
result = expr._can_use_numexpr(
operator.add, "+", self.frame, self.frame2, "evaluate"
operator.add, "+", self.array, self.array2, "evaluate"
)
assert result

@pytest.mark.parametrize(
"opname,op_str",
[("add", "+"), ("sub", "-"), ("mul", "*"), ("truediv", "/"), ("pow", "**")],
)
@pytest.mark.parametrize("left,right", [(_frame, _frame2), (_mixed, _mixed2)])
@pytest.mark.parametrize(
"left,right", [(_array, _array2), (_array_mixed, _array_mixed2)]
)
def test_binary_ops(self, opname, op_str, left, right):
def testit():

Expand All @@ -170,16 +173,9 @@ def testit():

op = getattr(operator, opname)

result = expr._can_use_numexpr(op, op_str, left, left, "evaluate")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so this will just raise now? if so can you add a test

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, it seems we just return False. So I could leave that as a test, but note that this is a file-internal helper function (leading underscore) that's within pandas never called with a DataFrame. So I would basically be testing something that is not expected to have any specific behaviour.

assert result != left._is_mixed_type

result = expr.evaluate(op, left, left, use_numexpr=True)
expected = expr.evaluate(op, left, left, use_numexpr=False)

if isinstance(result, DataFrame):
tm.assert_frame_equal(result, expected)
else:
tm.assert_numpy_array_equal(result, expected.values)
tm.assert_numpy_array_equal(result, expected)

result = expr._can_use_numexpr(op, op_str, right, right, "evaluate")
assert not result
Expand All @@ -203,23 +199,19 @@ def testit():
("ne", "!="),
],
)
@pytest.mark.parametrize("left,right", [(_frame, _frame2), (_mixed, _mixed2)])
@pytest.mark.parametrize(
"left,right", [(_array, _array2), (_array_mixed, _array_mixed2)]
)
def test_comparison_ops(self, opname, op_str, left, right):
def testit():
f12 = left + 1
f22 = right + 1

op = getattr(operator, opname)

result = expr._can_use_numexpr(op, op_str, left, f12, "evaluate")
assert result != left._is_mixed_type

result = expr.evaluate(op, left, f12, use_numexpr=True)
expected = expr.evaluate(op, left, f12, use_numexpr=False)
if isinstance(result, DataFrame):
tm.assert_frame_equal(result, expected)
else:
tm.assert_numpy_array_equal(result, expected.values)
tm.assert_numpy_array_equal(result, expected)

result = expr._can_use_numexpr(op, op_str, right, f22, "evaluate")
assert not result
Expand Down