Skip to content

ENH: evaluate datetime ops in python with eval #4924

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Sep 27, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,9 @@ Experimental Features
- A :meth:`~pandas.DataFrame.query` method has been added that allows
you to select elements of a ``DataFrame`` using a natural query syntax nearly
identical to Python syntax.
- ``pd.eval`` and friends now evaluate operations involving ``datetime64``
objects in Python space because ``numexpr`` cannot handle ``NaT`` values
(:issue:`4897`).

.. _release.bug_fixes-0.13.0:

Expand Down
18 changes: 12 additions & 6 deletions pandas/computation/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,20 @@ def _align_core(terms):
typ = biggest._constructor
axes = biggest.axes
naxes = len(axes)
gt_than_one_axis = naxes > 1

for term in (terms[i] for i in term_index):
for axis, items in enumerate(term.value.axes):
if isinstance(term.value, pd.Series) and naxes > 1:
ax, itm = naxes - 1, term.value.index
for value in (terms[i].value for i in term_index):
is_series = isinstance(value, pd.Series)
is_series_and_gt_one_axis = is_series and gt_than_one_axis

for axis, items in enumerate(value.axes):
if is_series_and_gt_one_axis:
ax, itm = naxes - 1, value.index
else:
ax, itm = axis, items
axes[ax] = axes[ax].join(itm, how='outer')

if not axes[ax].is_(itm):
axes[ax] = axes[ax].join(itm, how='outer')

for i, ndim in compat.iteritems(ndims):
for axis, items in zip(range(ndim), axes):
Expand All @@ -136,7 +142,7 @@ def _align_core(terms):
warnings.warn("Alignment difference on axis {0} is larger"
" than an order of magnitude on term {1!r}, "
"by more than {2:.4g}; performance may suffer"
"".format(axis, term.name, ordm),
"".format(axis, terms[i].name, ordm),
category=pd.io.common.PerformanceWarning)

if transpose:
Expand Down
9 changes: 8 additions & 1 deletion pandas/computation/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,8 +493,15 @@ def _possibly_evaluate_binop(self, op, op_class, lhs, rhs,
maybe_eval_in_python=('==', '!=')):
res = op(lhs, rhs)

# "in"/"not in" ops are always evaluated in python
if (res.op in _cmp_ops_syms and
lhs.is_datetime or rhs.is_datetime and
self.engine != 'pytables'):
# all date ops must be done in python because numexpr doesn't work well
# with NaT
return self._possibly_eval(res, self.binary_ops)

if res.op in eval_in_python:
# "in"/"not in" ops are always evaluated in python
return self._possibly_eval(res, eval_in_python)
elif (lhs.return_type == object or rhs.return_type == object and
self.engine != 'pytables'):
Expand Down
40 changes: 19 additions & 21 deletions pandas/computation/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import operator as op
from functools import partial
from itertools import product, islice, chain
from datetime import datetime

import numpy as np

Expand Down Expand Up @@ -161,24 +162,16 @@ def raw(self):
self.type))

@property
def kind(self):
def is_datetime(self):
try:
return self.type.__name__
t = self.type.type
except AttributeError:
return self.type.type.__name__
t = self.type

return issubclass(t, (datetime, np.datetime64))

@property
def value(self):
kind = self.kind.lower()
if kind == 'datetime64':
try:
return self._value.asi8
except AttributeError:
return self._value.view('i8')
elif kind == 'datetime':
return pd.Timestamp(self._value)
elif kind == 'timestamp':
return self._value.asm8.view('i8')
return self._value

@value.setter
Expand Down Expand Up @@ -248,6 +241,15 @@ def return_type(self):
def isscalar(self):
return all(operand.isscalar for operand in self.operands)

@property
def is_datetime(self):
    """Whether this operation's return type is a datetime type.

    Returns
    -------
    bool
        True when the return type (or, for dtype-like objects exposing a
        ``.type`` attribute, that attribute) is a subclass of
        ``datetime.datetime`` or ``numpy.datetime64``.
    """
    datetime_like = (datetime, np.datetime64)

    try:
        # dtype-like objects carry their scalar class on ``.type``
        candidate = self.return_type.type
    except AttributeError:
        # no ``.type`` attribute: use the return type itself
        candidate = self.return_type

    return issubclass(candidate, datetime_like)


def _in(x, y):
"""Compute the vectorized membership of ``x in y`` if possible, otherwise
Expand Down Expand Up @@ -424,24 +426,20 @@ def stringify(value):

lhs, rhs = self.lhs, self.rhs

if (is_term(lhs) and lhs.kind.startswith('datetime') and is_term(rhs)
and rhs.isscalar):
if is_term(lhs) and lhs.is_datetime and is_term(rhs) and rhs.isscalar:
v = rhs.value
if isinstance(v, (int, float)):
v = stringify(v)
v = _ensure_decoded(v)
v = pd.Timestamp(v)
v = pd.Timestamp(_ensure_decoded(v))
if v.tz is not None:
v = v.tz_convert('UTC')
self.rhs.update(v)

if (is_term(rhs) and rhs.kind.startswith('datetime') and
is_term(lhs) and lhs.isscalar):
if is_term(rhs) and rhs.is_datetime and is_term(lhs) and lhs.isscalar:
v = lhs.value
if isinstance(v, (int, float)):
v = stringify(v)
v = _ensure_decoded(v)
v = pd.Timestamp(v)
v = pd.Timestamp(_ensure_decoded(v))
if v.tz is not None:
v = v.tz_convert('UTC')
self.lhs.update(v)
Expand Down
2 changes: 1 addition & 1 deletion pandas/computation/tests/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1003,7 +1003,7 @@ def check_performance_warning_for_poor_alignment(self, engine, parser):
expected = ("Alignment difference on axis {0} is larger"
" than an order of magnitude on term {1!r}, "
"by more than {2:.4g}; performance may suffer"
"".format(1, 's', np.log10(s.size - df.shape[1])))
"".format(1, 'df', np.log10(s.size - df.shape[1])))
assert_equal(msg, expected)

def test_performance_warning_for_poor_alignment(self):
Expand Down
26 changes: 1 addition & 25 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1894,29 +1894,6 @@ def _getitem_frame(self, key):
raise ValueError('Must pass DataFrame with boolean values only')
return self.where(key)

def _get_index_resolvers(self, axis):
# index or columns
axis_index = getattr(self, axis)
d = dict()

for i, name in enumerate(axis_index.names):
if name is not None:
key = level = name
else:
# prefix with 'i' or 'c' depending on the input axis
# e.g., you must do ilevel_0 for the 0th level of an unnamed
# multiiindex
level_string = '{prefix}level_{i}'.format(prefix=axis[0], i=i)
key = level_string
level = i

d[key] = Series(axis_index.get_level_values(level).values,
index=axis_index, name=level)

# put the index/columns itself in the dict
d[axis] = axis_index
return d

def query(self, expr, **kwargs):
"""Query the columns of a frame with a boolean expression.

Expand Down Expand Up @@ -2037,8 +2014,7 @@ def eval(self, expr, **kwargs):
"""
resolvers = kwargs.pop('resolvers', None)
if resolvers is None:
index_resolvers = self._get_index_resolvers('index')
index_resolvers.update(self._get_index_resolvers('columns'))
index_resolvers = self._get_resolvers()
resolvers = [self, index_resolvers]
kwargs['local_dict'] = _ensure_scope(resolvers=resolvers, **kwargs)
return _eval(expr, **kwargs)
Expand Down
36 changes: 36 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,42 @@ def _get_block_manager_axis(self, axis):
return m - axis
return axis

def _get_axis_resolvers(self, axis):
# index or columns
axis_index = getattr(self, axis)
d = dict()
prefix = axis[0]

for i, name in enumerate(axis_index.names):
if name is not None:
key = level = name
else:
# prefix with 'i' or 'c' depending on the input axis
# e.g., you must do ilevel_0 for the 0th level of an unnamed
# multiiindex
key = '{prefix}level_{i}'.format(prefix=prefix, i=i)
level = i

level_values = axis_index.get_level_values(level)
s = level_values.to_series()
s.index = axis_index
d[key] = s

# put the index/columns itself in the dict
if isinstance(axis_index, MultiIndex):
dindex = axis_index
else:
dindex = axis_index.to_series()

d[axis] = dindex
return d

def _get_resolvers(self):
d = {}
for axis_name in self._AXIS_ORDERS:
d.update(self._get_axis_resolvers(axis_name))
return d

@property
def _info_axis(self):
return getattr(self, self._info_axis_name)
Expand Down
Loading