diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 34166343817a4..00c76632ce17b 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -300,7 +300,7 @@ Expression Evaluation via :func:`~pandas.eval` (Experimental) .. versionadded:: 0.13 -The top-level function :func:`~pandas.eval` implements expression evaluation of +The top-level function :func:`pandas.eval` implements expression evaluation of :class:`~pandas.Series` and :class:`~pandas.DataFrame` objects. .. note:: @@ -336,11 +336,11 @@ engine in addition to some extensions available only in pandas. Supported Syntax ~~~~~~~~~~~~~~~~ -These operations are supported by :func:`~pandas.eval`: +These operations are supported by :func:`pandas.eval`: - Arithmetic operations except for the left shift (``<<``) and right shift (``>>``) operators, e.g., ``df + 2 * pi / s ** 4 % 42 - the_golden_ratio`` -- Comparison operations, e.g., ``2 < df < df2`` +- Comparison operations, including chained comparisons, e.g., ``2 < df < df2`` - Boolean operations, e.g., ``df < df2 and df3 < df4 or not df_bool`` - ``list`` and ``tuple`` literals, e.g., ``[1, 2]`` or ``(1, 2)`` - Attribute access, e.g., ``df.a`` @@ -373,9 +373,9 @@ This Python syntax is **not** allowed: :func:`~pandas.eval` Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`~pandas.eval` works wonders for expressions containing large arrays +:func:`pandas.eval` works well with expressions containing large arrays -First let's create 4 decent-sized arrays to play with: +First let's create a few decent-sized arrays to play with: .. ipython:: python @@ -441,8 +441,10 @@ Now let's do the same thing but with comparisons: The ``DataFrame.eval`` method (Experimental) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In addition to the top level :func:`~pandas.eval` function you can also -evaluate an expression in the "context" of a ``DataFrame``. +.. 
versionadded:: 0.13 + +In addition to the top level :func:`pandas.eval` function you can also +evaluate an expression in the "context" of a :class:`~pandas.DataFrame`. .. ipython:: python :suppress: @@ -462,10 +464,10 @@ evaluate an expression in the "context" of a ``DataFrame``. df = DataFrame(randn(5, 2), columns=['a', 'b']) df.eval('a + b') -Any expression that is a valid :func:`~pandas.eval` expression is also a valid -``DataFrame.eval`` expression, with the added benefit that *you don't have to -prefix the name of the* ``DataFrame`` *to the column(s) you're interested in -evaluating*. +Any expression that is a valid :func:`pandas.eval` expression is also a valid +:meth:`DataFrame.eval` expression, with the added benefit that you don't have to +prefix the name of the :class:`~pandas.DataFrame` to the column(s) you're +interested in evaluating. In addition, you can perform assignment of columns within an expression. This allows for *formulaic evaluation*. Only a single assignment is permitted. @@ -480,55 +482,75 @@ it must be a valid Python identifier. df.eval('a = 1') df +The equivalent in standard Python would be + +.. ipython:: python + + df = DataFrame(dict(a=range(5), b=range(5, 10))) + df['c'] = df.a + df.b + df['d'] = df.a + df.b + df.c + df['a'] = 1 + df + Local Variables ~~~~~~~~~~~~~~~ -You can refer to local variables the same way you would in vanilla Python +In pandas version 0.14 the local variable API has changed. In pandas 0.13.x, +you could refer to local variables the same way you would in standard Python. +For example, -.. ipython:: python +.. code-block:: python df = DataFrame(randn(5, 2), columns=['a', 'b']) newcol = randn(len(df)) df.eval('b + newcol') -.. note:: + UndefinedVariableError: name 'newcol' is not defined - The one exception is when you have a local (or global) with the same name as - a column in the ``DataFrame`` +As you can see from the exception generated, this syntax is no longer allowed. 
+You must *explicitly reference* any local variable that you want to use in an +expression by placing the ``@`` character in front of the name. For example, - .. code-block:: python +.. ipython:: python - df = DataFrame(randn(5, 2), columns=['a', 'b']) - a = randn(len(df)) - df.eval('a + b') - NameResolutionError: resolvers and locals overlap on names ['a'] + df = DataFrame(randn(5, 2), columns=list('ab')) + newcol = randn(len(df)) + df.eval('b + @newcol') + df.query('b < @newcol') +If you don't prefix the local variable with ``@``, pandas will raise an +exception telling you the variable is undefined. - To deal with these conflicts, a special syntax exists for referring - variables with the same name as a column +When using :meth:`DataFrame.eval` and :meth:`DataFrame.query`, this allows you +to have a local variable and a :class:`~pandas.DataFrame` column with the same +name in an expression. - .. ipython:: python - :suppress: - a = randn(len(df)) +.. ipython:: python - .. ipython:: python + a = randn() + df.query('@a < a') + df.loc[a < df.a] # same as the previous expression - df.eval('@a + b') +With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it +isn't defined in that context. ``pandas`` will let you know this if you try to +use ``@`` in a top-level call to :func:`pandas.eval`. For example, - The same is true for :meth:`~pandas.DataFrame.query` +.. ipython:: python + :okexcept: - .. ipython:: python + a, b = 1, 2 + pd.eval('@a + b') - df.query('@a < b') +In this case, you should simply refer to the variables like you would in +standard Python. - .. ipython:: python - :suppress: +.. ipython:: python - del a + pd.eval('a + b') -:func:`~pandas.eval` Parsers +:func:`pandas.eval` Parsers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are two different parsers and and two different engines you can use as @@ -568,7 +590,7 @@ The ``and`` and ``or`` operators here have the same precedence that they would in vanilla Python. 
-:func:`~pandas.eval` Backends +:func:`pandas.eval` Backends ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There's also the option to make :func:`~pandas.eval` operate identical to plain @@ -577,12 +599,12 @@ ol' Python. .. note:: Using the ``'python'`` engine is generally *not* useful, except for testing - other :func:`~pandas.eval` engines against it. You will acheive **no** - performance benefits using :func:`~pandas.eval` with ``engine='python'``. + other evaluation engines against it. You will achieve **no** performance + benefits using :func:`~pandas.eval` with ``engine='python'`` and in fact may + incur a performance hit. -You can see this by using :func:`~pandas.eval` with the ``'python'`` engine is -actually a bit slower (not by much) than evaluating the same expression in -Python: +You can see this by using :func:`pandas.eval` with the ``'python'`` engine. It +is a bit slower (not by much) than evaluating the same expression in Python .. ipython:: python @@ -593,15 +615,15 @@ Python: %timeit pd.eval('df1 + df2 + df3 + df4', engine='python') -:func:`~pandas.eval` Performance +:func:`pandas.eval` Performance ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :func:`~pandas.eval` is intended to speed up certain kinds of operations. In particular, those operations involving complex expressions with large -``DataFrame``/``Series`` objects should see a significant performance benefit. -Here is a plot showing the running time of :func:`~pandas.eval` as function of -the size of the frame involved in the computation. The two lines are two -different engines. +:class:`~pandas.DataFrame`/:class:`~pandas.Series` objects should see a +significant performance benefit. Here is a plot showing the running time of +:func:`pandas.eval` as function of the size of the frame involved in the +computation. The two lines are two different engines. .. image:: _static/eval-perf.png @@ -618,19 +640,31 @@ different engines. 
This plot was created using a ``DataFrame`` with 3 columns each containing floating point values generated using ``numpy.random.randn()``. -Technical Minutia -~~~~~~~~~~~~~~~~~ -- Expressions that would result in an object dtype (including simple - variable evaluation) have to be evaluated in Python space. The main reason - for this behavior is to maintain backwards compatbility with versions of - numpy < 1.7. In those versions of ``numpy`` a call to ``ndarray.astype(str)`` - will truncate any strings that are more than 60 characters in length. Second, - we can't pass ``object`` arrays to ``numexpr`` thus string comparisons must - be evaluated in Python space. -- The upshot is that this *only* applies to object-dtype'd expressions. So, - if you have an expression--for example--that's a string comparison - ``and``-ed together with another boolean expression that's from a numeric - comparison, the numeric comparison will be evaluated by ``numexpr``. In fact, - in general, :func:`~pandas.query`/:func:`~pandas.eval` will "pick out" the - subexpressions that are ``eval``-able by ``numexpr`` and those that must be - evaluated in Python space transparently to the user. +Technical Minutia Regarding Expression Evaluation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Expressions that would result in an object dtype or involve datetime operations +(because of ``NaT``) must be evaluated in Python space. The main reason for +this behavior is to maintain backwards compatibility with versions of numpy < +1.7. In those versions of ``numpy`` a call to ``ndarray.astype(str)`` will +truncate any strings that are more than 60 characters in length. Second, we +can't pass ``object`` arrays to ``numexpr`` thus string comparisons must be +evaluated in Python space. + +The upshot is that this *only* applies to object-dtype'd expressions. So, if +you have an expression--for example + +.. 
ipython:: python + + df = DataFrame({'strings': np.repeat(list('cba'), 3), + 'nums': np.repeat(range(3), 3)}) + df + df.query('strings == "a" and nums == 1') + +the numeric part of the comparison (``nums == 1``) will be evaluated by +``numexpr``. + +In general, :meth:`DataFrame.query`/:func:`pandas.eval` will +evaluate the subexpressions that *can* be evaluated by ``numexpr`` and those +that must be evaluated in Python space transparently to the user. This is done +by inferring the result type of an expression from its arguments and operators. diff --git a/doc/source/release.rst b/doc/source/release.rst index 80274c74c0f87..af8c4bdf381e7 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -83,9 +83,26 @@ API Changes - ``pd.infer_freq()`` - ``pd.infer_freq()`` will now raise a ``TypeError`` if given an invalid ``Series/Index`` type (:issue:`6407`) +- Local variable usage has changed in + :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` + (:issue:`5987`). For the :class:`~pandas.DataFrame` methods, two things have + changed + + - Column names are now given precedence over locals + - Local variables must be referred to explicitly. This means that even if + you have a local variable that is *not* a column you must still refer to + it with the ``'@'`` prefix. + - You can have an expression like ``df.query('@a < a')`` with no complaints + from ``pandas`` about ambiguity of the name ``a``. + +- The top-level :func:`pandas.eval` function does not allow you to use the + ``'@'`` prefix and provides you with an error message telling you so. +- ``NameResolutionError`` was removed because it isn't necessary anymore. 
+ Experimental Features ~~~~~~~~~~~~~~~~~~~~~ + Improvements to existing features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -144,6 +161,8 @@ Bug Fixes - Bug in DataFrame.dropna with duplicate indices (:issue:`6355`) - Regression in chained getitem indexing with embedded list-like from 0.12 (:issue:`6394`) - ``Float64Index`` with nans not comparing correctly +- ``eval``/``query`` expressions with strings containing the ``@`` character + will now work (:issue:`6366`). pandas 0.13.1 ------------- diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 949de3f674028..76ba2dafd69d6 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -51,6 +51,22 @@ API changes s.year s.index.year +- Local variable usage has changed in + :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` + (:issue:`5987`). For the :class:`~pandas.DataFrame` methods, two things have + changed + + - Column names are now given precedence over locals + - Local variables must be referred to explicitly. This means that even if + you have a local variable that is *not* a column you must still refer to + it with the ``'@'`` prefix. + - You can have an expression like ``df.query('@a < a')`` with no complaints + from ``pandas`` about ambiguity of the name ``a``. + +- The top-level :func:`pandas.eval` function does not allow you to use the + ``'@'`` prefix and provides you with an error message telling you so. +- ``NameResolutionError`` was removed because it isn't necessary anymore. 
+ MultiIndexing Using Slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 8ec3adcdffd6f..bff6eb1f95abc 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -54,6 +54,8 @@ import pickle as cPickle import http.client as httplib +from pandas.compat.chainmap import DeepChainMap + if PY3: def isidentifier(s): diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py new file mode 100644 index 0000000000000..9edd2ef056a52 --- /dev/null +++ b/pandas/compat/chainmap.py @@ -0,0 +1,26 @@ +try: + from collections import ChainMap +except ImportError: + from pandas.compat.chainmap_impl import ChainMap + + +class DeepChainMap(ChainMap): + def __setitem__(self, key, value): + for mapping in self.maps: + if key in mapping: + mapping[key] = value + return + self.maps[0][key] = value + + def __delitem__(self, key): + for mapping in self.maps: + if key in mapping: + del mapping[key] + return + raise KeyError(key) + + # override because the m parameter is introduced in Python 3.4 + def new_child(self, m=None): + if m is None: + m = {} + return self.__class__(m, *self.maps) diff --git a/pandas/compat/chainmap_impl.py b/pandas/compat/chainmap_impl.py new file mode 100644 index 0000000000000..92d2424057f83 --- /dev/null +++ b/pandas/compat/chainmap_impl.py @@ -0,0 +1,136 @@ +from collections import MutableMapping + +try: + from thread import get_ident +except ImportError: + from _thread import get_ident + + +def recursive_repr(fillvalue='...'): + 'Decorator to make a repr function return fillvalue for a recursive call' + + def decorating_function(user_function): + repr_running = set() + + def wrapper(self): + key = id(self), get_ident() + if key in repr_running: + return fillvalue + repr_running.add(key) + try: + result = user_function(self) + finally: + repr_running.discard(key) + return result + + # Can't use functools.wraps() here because of bootstrap issues + wrapper.__module__ = 
getattr(user_function, '__module__') + wrapper.__doc__ = getattr(user_function, '__doc__') + wrapper.__name__ = getattr(user_function, '__name__') + return wrapper + + return decorating_function + + +class ChainMap(MutableMapping): + ''' A ChainMap groups multiple dicts (or other mappings) together + to create a single, updateable view. + + The underlying mappings are stored in a list. That list is public and can + accessed or updated using the *maps* attribute. There is no other state. + + Lookups search the underlying mappings successively until a key is found. + In contrast, writes, updates, and deletions only operate on the first + mapping. + + ''' + + def __init__(self, *maps): + '''Initialize a ChainMap by setting *maps* to the given mappings. + If no mappings are provided, a single empty dictionary is used. + + ''' + self.maps = list(maps) or [{}] # always at least one map + + def __missing__(self, key): + raise KeyError(key) + + def __getitem__(self, key): + for mapping in self.maps: + try: + return mapping[key] # can't use 'key in mapping' with defaultdict + except KeyError: + pass + return self.__missing__(key) # support subclasses that define __missing__ + + def get(self, key, default=None): + return self[key] if key in self else default + + def __len__(self): + return len(set().union(*self.maps)) # reuses stored hash values if possible + + def __iter__(self): + return iter(set().union(*self.maps)) + + def __contains__(self, key): + return any(key in m for m in self.maps) + + def __bool__(self): + return any(self.maps) + + @recursive_repr() + def __repr__(self): + return '{0.__class__.__name__}({1})'.format( + self, ', '.join(repr(m) for m in self.maps)) + + @classmethod + def fromkeys(cls, iterable, *args): + 'Create a ChainMap with a single dict created from the iterable.' 
+ return cls(dict.fromkeys(iterable, *args)) + + def copy(self): + 'New ChainMap or subclass with a new copy of maps[0] and refs to maps[1:]' + return self.__class__(self.maps[0].copy(), *self.maps[1:]) + + __copy__ = copy + + def new_child(self, m=None): # like Django's Context.push() + ''' + New ChainMap with a new map followed by all previous maps. If no + map is provided, an empty dict is used. + ''' + if m is None: + m = {} + return self.__class__(m, *self.maps) + + @property + def parents(self): # like Django's Context.pop() + 'New ChainMap from maps[1:].' + return self.__class__(*self.maps[1:]) + + def __setitem__(self, key, value): + self.maps[0][key] = value + + def __delitem__(self, key): + try: + del self.maps[0][key] + except KeyError: + raise KeyError('Key not found in the first mapping: {!r}'.format(key)) + + def popitem(self): + 'Remove and return an item pair from maps[0]. Raise KeyError is maps[0] is empty.' + try: + return self.maps[0].popitem() + except KeyError: + raise KeyError('No keys found in the first mapping.') + + def pop(self, key, *args): + 'Remove *key* from maps[0] and return its value. Raise KeyError if *key* not in maps[0].' + try: + return self.maps[0].pop(key, *args) + except KeyError: + raise KeyError('Key not found in the first mapping: {!r}'.format(key)) + + def clear(self): + 'Clear maps[0], leaving maps[1:] intact.' 
+ self.maps[0].clear() diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 9738cac58fb2d..120e190736516 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -4,9 +4,34 @@ import abc from pandas import compat +from pandas.compat import DeepChainMap, map from pandas.core import common as com from pandas.computation.align import _align, _reconstruct_object -from pandas.computation.ops import UndefinedVariableError +from pandas.computation.ops import UndefinedVariableError, _mathops, _reductions + + +_ne_builtins = frozenset(_mathops + _reductions) + + +class NumExprClobberingError(NameError): + pass + + +def _check_ne_builtin_clash(expr): + """Attempt to prevent foot-shooting in a helpful way. + + Parameters + ---------- + terms : Term + Terms can contain + """ + names = expr.names + overlap = names & _ne_builtins + + if overlap: + s = ', '.join(map(repr, overlap)) + raise NumExprClobberingError('Variables in expression "%s" overlap with ' + 'numexpr builtins: (%s)' % (expr, s)) class AbstractEngine(object): @@ -29,9 +54,6 @@ def convert(self): """ return com.pprint_thing(self.expr) - def pre_evaluate(self): - self.expr.check_name_clashes() - def evaluate(self): """Run the engine on the expression @@ -47,7 +69,6 @@ def evaluate(self): self.result_type, self.aligned_axes = _align(self.expr.terms) # make sure no names in resolvers and locals/globals clash - self.pre_evaluate() res = self._evaluate() return _reconstruct_object(self.result_type, res, self.aligned_axes, self.expr.terms.return_type) @@ -87,16 +108,15 @@ def convert(self): def _evaluate(self): import numexpr as ne - # add the resolvers to locals - self.expr.add_resolvers_to_locals() - # convert the expression to a valid numexpr expression s = self.convert() try: - return ne.evaluate(s, local_dict=self.expr.env.locals, - global_dict=self.expr.env.globals, - truediv=self.expr.truediv) + env = self.expr.env + scope = env.full_scope + truediv = 
scope['truediv'] + _check_ne_builtin_clash(self.expr) + return ne.evaluate(s, local_dict=scope, truediv=truediv) except KeyError as e: # python 3 compat kludge try: @@ -106,6 +126,7 @@ def _evaluate(self): raise UndefinedVariableError(msg) + class PythonEngine(AbstractEngine): """Evaluate an expression in Python space. @@ -118,7 +139,6 @@ def __init__(self, expr): super(PythonEngine, self).__init__(expr) def evaluate(self): - self.pre_evaluate() return self.expr() def _evaluate(self): diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 4cc68ac4770b3..f628a788b7147 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -3,9 +3,11 @@ """Top level ``eval`` module. """ - +import tokenize from pandas.core import common as com -from pandas.computation.expr import Expr, _parsers, _ensure_scope +from pandas.computation.expr import Expr, _parsers, tokenize_string +from pandas.computation.scope import _ensure_scope +from pandas.compat import DeepChainMap, builtins from pandas.computation.engines import _engines from distutils.version import LooseVersion @@ -116,8 +118,26 @@ def _convert_expression(expr): return s +def _check_for_locals(expr, stack_level, parser): + at_top_of_stack = stack_level == 0 + not_pandas_parser = parser != 'pandas' + + if not_pandas_parser: + msg = "The '@' prefix is only supported by the pandas parser" + elif at_top_of_stack: + msg = ("The '@' prefix is not allowed in " + "top-level eval calls, \nplease refer to " + "your variables by name without the '@' " + "prefix") + + if at_top_of_stack or not_pandas_parser: + for toknum, tokval, _, _, _ in tokenize_string(expr): + if toknum == tokenize.OP and tokval == '@': + raise SyntaxError(msg) + + def eval(expr, parser='pandas', engine='numexpr', truediv=True, - local_dict=None, global_dict=None, resolvers=None, level=2, + local_dict=None, global_dict=None, resolvers=(), level=0, target=None): """Evaluate a Python expression as a string using various backends. 
@@ -198,10 +218,13 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, _check_engine(engine) _check_parser(parser) _check_resolvers(resolvers) + _check_for_locals(expr, level, parser) # get our (possibly passed-in) scope - env = _ensure_scope(global_dict=global_dict, local_dict=local_dict, - resolvers=resolvers, level=level, target=target) + level += 1 + env = _ensure_scope(level, global_dict=global_dict, + local_dict=local_dict, resolvers=resolvers, + target=target) parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, truediv=truediv) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 71501d5079c4c..454a28fd82362 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -7,7 +7,6 @@ import inspect import tokenize import datetime -import struct from functools import partial @@ -16,225 +15,16 @@ from pandas.compat import StringIO, zip, reduce, string_types from pandas.core.base import StringMixin from pandas.core import common as com -from pandas.computation.common import NameResolutionError from pandas.computation.ops import (_cmp_ops_syms, _bool_ops_syms, _arith_ops_syms, _unary_ops_syms, is_term) from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG from pandas.computation.ops import Op, BinOp, UnaryOp, Term, Constant, Div from pandas.computation.ops import UndefinedVariableError +from pandas.computation.scope import Scope, _ensure_scope -def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None, - target=None, **kwargs): - """Ensure that we are grabbing the correct scope.""" - return Scope(gbls=global_dict, lcls=local_dict, level=level, - resolvers=resolvers, target=target) - - -def _check_disjoint_resolver_names(resolver_keys, local_keys, global_keys): - """Make sure that variables in resolvers don't overlap with locals or - globals. 
- """ - res_locals = list(com.intersection(resolver_keys, local_keys)) - if res_locals: - msg = "resolvers and locals overlap on names {0}".format(res_locals) - raise NameResolutionError(msg) - - res_globals = list(com.intersection(resolver_keys, global_keys)) - if res_globals: - msg = "resolvers and globals overlap on names {0}".format(res_globals) - raise NameResolutionError(msg) - - -def _replacer(x, pad_size): - """Replace a number with its padded hexadecimal representation. Used to tag - temporary variables with their calling scope's id. - """ - # get the hex repr of the binary char and remove 0x and pad by pad_size - # zeros - try: - hexin = ord(x) - except TypeError: - # bytes literals masquerade as ints when iterating in py3 - hexin = x - - return hex(hexin).replace('0x', '').rjust(pad_size, '0') - - -def _raw_hex_id(obj, pad_size=2): - """Return the padded hexadecimal id of ``obj``.""" - # interpret as a pointer since that's what really what id returns - packed = struct.pack('@P', id(obj)) - - return ''.join(_replacer(x, pad_size) for x in packed) - - -class Scope(StringMixin): - - """Object to hold scope, with a few bells to deal with some custom syntax - added by pandas. 
- - Parameters - ---------- - gbls : dict or None, optional, default None - lcls : dict or Scope or None, optional, default None - level : int, optional, default 1 - resolvers : list-like or None, optional, default None - - Attributes - ---------- - globals : dict - locals : dict - level : int - resolvers : tuple - resolver_keys : frozenset - """ - __slots__ = ('globals', 'locals', 'resolvers', '_global_resolvers', - 'resolver_keys', '_resolver', 'level', 'ntemps', 'target') - - def __init__(self, gbls=None, lcls=None, level=1, resolvers=None, - target=None): - self.level = level - self.resolvers = tuple(resolvers or []) - self.globals = dict() - self.locals = dict() - self.target = target - self.ntemps = 1 # number of temporary variables in this scope - - if isinstance(lcls, Scope): - ld, lcls = lcls, dict() - self.locals.update(ld.locals.copy()) - self.globals.update(ld.globals.copy()) - self.resolvers += ld.resolvers - if ld.target is not None: - self.target = ld.target - self.update(ld.level) - - frame = sys._getframe(level) - try: - self.globals.update(gbls or frame.f_globals) - self.locals.update(lcls or frame.f_locals) - finally: - del frame - - # add some useful defaults - self.globals['Timestamp'] = pd.lib.Timestamp - self.globals['datetime'] = datetime - - # SUCH a hack - self.globals['True'] = True - self.globals['False'] = False - - # function defs - self.globals['list'] = list - self.globals['tuple'] = tuple - - res_keys = (list(o.keys()) for o in self.resolvers) - self.resolver_keys = frozenset(reduce(operator.add, res_keys, [])) - self._global_resolvers = self.resolvers + (self.locals, self.globals) - self._resolver = None - - self.resolver_dict = {} - for o in self.resolvers: - self.resolver_dict.update(dict(o)) - - def __unicode__(self): - return com.pprint_thing( - 'locals: {0}\nglobals: {0}\nresolvers: ' - '{0}\ntarget: {0}'.format(list(self.locals.keys()), - list(self.globals.keys()), - list(self.resolver_keys), - self.target)) - - def 
__getitem__(self, key): - return self.resolve(key, globally=False) - - def resolve(self, key, globally=False): - resolvers = self.locals, self.globals - if globally: - resolvers = self._global_resolvers - - for resolver in resolvers: - try: - return resolver[key] - except KeyError: - pass - - def update(self, level=None): - """Update the current scope by going back `level` levels. - - Parameters - ---------- - level : int or None, optional, default None - """ - # we are always 2 levels below the caller - # plus the caller may be below the env level - # in which case we need addtl levels - sl = 2 - if level is not None: - sl += level - - # add sl frames to the scope starting with the - # most distant and overwritting with more current - # makes sure that we can capture variable scope - frame = inspect.currentframe() - try: - frames = [] - while sl >= 0: - frame = frame.f_back - sl -= 1 - if frame is None: - break - frames.append(frame) - for f in frames[::-1]: - self.locals.update(f.f_locals) - self.globals.update(f.f_globals) - finally: - del frame, frames - - def add_tmp(self, value, where='locals'): - """Add a temporary variable to the scope. - - Parameters - ---------- - value : object - An arbitrary object to be assigned to a temporary variable. - where : basestring, optional, default 'locals', {'locals', 'globals'} - What scope to add the value to. - - Returns - ------- - name : basestring - The name of the temporary variable created. 
- """ - d = getattr(self, where, None) - - if d is None: - raise AttributeError("Cannot add value to non-existent scope " - "{0!r}".format(where)) - if not isinstance(d, dict): - raise TypeError("Cannot add value to object of type {0!r}, " - "scope must be a dictionary" - "".format(type(d).__name__)) - name = 'tmp_var_{0}_{1}_{2}'.format(type(value).__name__, self.ntemps, - _raw_hex_id(self)) - d[name] = value - - # only increment if the variable gets put in the scope - self.ntemps += 1 - return name - - def remove_tmp(self, name, where='locals'): - d = getattr(self, where, None) - if d is None: - raise AttributeError("Cannot remove value from non-existent scope " - "{0!r}".format(where)) - if not isinstance(d, dict): - raise TypeError("Cannot remove value from object of type {0!r}, " - "scope must be a dictionary" - "".format(type(d).__name__)) - del d[name] - self.ntemps -= 1 +def tokenize_string(s): + return tokenize.generate_tokens(StringIO(s).readline) def _rewrite_assign(source): @@ -242,8 +32,7 @@ def _rewrite_assign(source): ``=`` as a substitute for ``==``. """ res = [] - g = tokenize.generate_tokens(StringIO(source).readline) - for toknum, tokval, _, _, _ in g: + for toknum, tokval, _, _, _ in tokenize_string(source): res.append((toknum, '==' if tokval == '=' else tokval)) return tokenize.untokenize(res) @@ -253,8 +42,7 @@ def _replace_booleans(source): precedence is changed to boolean precedence. 
""" res = [] - g = tokenize.generate_tokens(StringIO(source).readline) - for toknum, tokval, _, _, _ in g: + for toknum, tokval, _, _, _ in tokenize_string(source): if toknum == tokenize.OP: if tokval == '&': res.append((tokenize.NAME, 'and')) @@ -268,7 +56,7 @@ def _replace_booleans(source): def _replace_locals(source, local_symbol='@'): - """Replace local variables with a syntacticall valid name.""" + """Replace local variables with a syntactically valid name.""" return source.replace(local_symbol, _LOCAL_TAG) @@ -549,8 +337,8 @@ def visit_BinOp(self, node, **kwargs): return self._possibly_evaluate_binop(op, op_class, left, right) def visit_Div(self, node, **kwargs): - return lambda lhs, rhs: Div(lhs, rhs, - truediv=self.env.locals['truediv']) + truediv = self.env.scope['truediv'] + return lambda lhs, rhs: Div(lhs, rhs, truediv) def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) @@ -768,21 +556,20 @@ class Expr(StringMixin): """ def __init__(self, expr, engine='numexpr', parser='pandas', env=None, - truediv=True, level=2): + truediv=True, level=0): self.expr = expr - self.env = _ensure_scope(level=level, local_dict=env) + self.env = env or Scope(level=level + 1) self.engine = engine self.parser = parser + self.env.scope['truediv'] = truediv self._visitor = _parsers[parser](self.env, self.engine, self.parser) self.terms = self.parse() - self.truediv = truediv @property def assigner(self): return getattr(self._visitor, 'assigner', None) def __call__(self): - self.env.locals['truediv'] = self.truediv return self.terms(self.env) def __unicode__(self): @@ -806,34 +593,5 @@ def names(self): return frozenset([self.terms.name]) return frozenset(term.name for term in com.flatten(self.terms)) - def check_name_clashes(self): - env = self.env - names = self.names - res_keys = frozenset(env.resolver_dict.keys()) & names - lcl_keys = frozenset(env.locals.keys()) & names - gbl_keys = frozenset(env.globals.keys()) & names - _check_disjoint_resolver_names(res_keys, 
lcl_keys, gbl_keys) - - def add_resolvers_to_locals(self): - """Add the extra scope (resolvers) to local scope - - Notes - ----- - This should be done after parsing and pre-evaluation, otherwise - unnecessary name clashes will occur. - """ - self.env.locals.update(self.env.resolver_dict) - - -def isexpr(s, check_names=True): - """Strict checking for a valid expression.""" - try: - Expr(s, env=_ensure_scope() if check_names else None) - except SyntaxError: - return False - except NameError: - return not check_names - return True - _parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor} diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 270ba92d4483a..93c10fc42ee36 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -23,7 +23,6 @@ _LOCAL_TAG = '__pd_eval_local_' -_TAG_RE = re.compile('^{0}'.format(_LOCAL_TAG)) class UndefinedVariableError(NameError): @@ -32,26 +31,13 @@ class UndefinedVariableError(NameError): def __init__(self, *args): msg = 'name {0!r} is not defined' - subbed = _TAG_RE.sub('', args[0]) + subbed = args[0].replace(_LOCAL_TAG, '') if subbed != args[0]: subbed = '@' + subbed msg = 'local variable {0!r} is not defined' super(UndefinedVariableError, self).__init__(msg.format(subbed)) -def _possibly_update_key(d, value, old_key, new_key=None): - if new_key is None: - new_key = old_key - - try: - del d[old_key] - except KeyError: - return False - else: - d[new_key] = value - return True - - class Term(StringMixin): def __new__(cls, name, env, side=None, encoding=None): @@ -65,13 +51,13 @@ def __init__(self, name, env, side=None, encoding=None): self._name = name self.env = env self.side = side - self.local = _TAG_RE.search(text_type(name)) is not None + self.is_local = text_type(name).startswith(_LOCAL_TAG) self._value = self._resolve_name() self.encoding = encoding @property def local_name(self): - return _TAG_RE.sub('', self.name) + return self.name.replace(_LOCAL_TAG, '') def __unicode__(self): 
return com.pprint_thing(self.name) @@ -83,9 +69,8 @@ def evaluate(self, *args, **kwargs): return self def _resolve_name(self): - env = self.env key = self.name - res = env.resolve(self.local_name, globally=not self.local) + res = self.env.resolve(self.local_name, is_local=self.is_local) self.update(res) if res is None: @@ -94,8 +79,8 @@ def _resolve_name(self): raise UndefinedVariableError(key) if hasattr(res, 'ndim') and res.ndim > 2: - raise NotImplementedError("N-dimensional objects, where N > 2, are" - " not supported with eval") + raise NotImplementedError("N-dimensional objects, where N > 2," + " are not supported with eval") return res def update(self, value): @@ -108,34 +93,14 @@ def update(self, value): ('locals', 'key'), ('globals', 'key')] """ - env = self.env key = self.name # if it's a variable name (otherwise a constant) if isinstance(key, string_types): - if self.local: - # get it's name WITHOUT the local tag (defined above) - local_name = self.local_name - - # search for the local in the above specified order - scope_pairs = product([env.locals, env.globals], - [local_name, key]) - - # a[::2] + a[1::2] but iterators - scope_iter = chain(islice(scope_pairs, None, None, 2), - islice(scope_pairs, 1, None, 2)) - for d, k in scope_iter: - if _possibly_update_key(d, value, k, key): - break - else: - raise UndefinedVariableError(key) - else: - # otherwise we look in resolvers -> locals -> globals - for r in (env.resolver_dict, env.locals, env.globals): - if _possibly_update_key(r, value, key): - break - else: - raise UndefinedVariableError(key) + try: + self.env.swapkey(self.local_name, key, new_value=value) + except KeyError: + raise UndefinedVariableError(key) self.value = value @@ -374,7 +339,7 @@ def __call__(self, env): The result of an evaluated expression. 
""" # handle truediv - if self.op == '/' and env.locals['truediv']: + if self.op == '/' and env.scope['truediv']: self.func = op.truediv # recurse over the left/right nodes @@ -472,7 +437,7 @@ class Div(BinOp): regardless of the value of ``truediv``. """ - def __init__(self, lhs, rhs, truediv=True, *args, **kwargs): + def __init__(self, lhs, rhs, truediv, *args, **kwargs): super(Div, self).__init__('/', lhs, rhs, *args, **kwargs) if truediv or PY3: diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index c5b0785fe6f72..b995909ed15ad 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -7,25 +7,24 @@ from datetime import datetime, timedelta import numpy as np import pandas as pd -from pandas.compat import u, string_types, PY3 +from pandas.compat import u, string_types, PY3, DeepChainMap from pandas.core.base import StringMixin import pandas.core.common as com from pandas.computation import expr, ops -from pandas.computation.ops import is_term +from pandas.computation.ops import is_term, UndefinedVariableError +from pandas.computation.scope import _ensure_scope from pandas.computation.expr import BaseExprVisitor from pandas.computation.common import _ensure_decoded from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type class Scope(expr.Scope): - __slots__ = 'globals', 'locals', 'queryables' - - def __init__(self, gbls=None, lcls=None, queryables=None, level=1): - super( - Scope, - self).__init__(gbls=gbls, - lcls=lcls, - level=level) + __slots__ = 'queryables', + + def __init__(self, level, global_dict=None, local_dict=None, + queryables=None): + super(Scope, self).__init__(level + 1, global_dict=global_dict, + local_dict=local_dict) self.queryables = queryables or dict() @@ -48,9 +47,11 @@ def _resolve_name(self): raise NameError('name {0!r} is not defined'.format(self.name)) return self.name - # resolve the rhs (and allow to be None) - return self.env.locals.get(self.name, - 
self.env.globals.get(self.name, self.name)) + # resolve the rhs (and allow it to be None) + try: + return self.env.resolve(self.name, is_local=False) + except UndefinedVariableError: + return self.name @property def value(self): @@ -478,7 +479,7 @@ class Expr(expr.Expr): """ def __init__(self, where, op=None, value=None, queryables=None, - encoding=None, scope_level=None): + encoding=None, scope_level=0): # try to be back compat where = self.parse_back_compat(where, op, value) @@ -488,25 +489,25 @@ def __init__(self, where, op=None, value=None, queryables=None, self.filter = None self.terms = None self._visitor = None - # capture the environement if needed - lcls = dict() - if isinstance(where, Expr): - lcls.update(where.env.locals) + # capture the environment if needed + local_dict = DeepChainMap() + + if isinstance(where, Expr): + local_dict = where.env.scope where = where.expr elif isinstance(where, (list, tuple)): for idx, w in enumerate(where): if isinstance(w, Expr): - lcls.update(w.env.locals) + local_dict = w.env.scope else: w = self.parse_back_compat(w) where[idx] = w where = ' & ' .join(["(%s)" % w for w in where]) self.expr = where - self.env = Scope(lcls=lcls) - self.env.update(scope_level) + self.env = Scope(scope_level + 1, local_dict=local_dict) if queryables is not None and isinstance(self.expr, string_types): self.env.queryables.update(queryables) @@ -535,7 +536,7 @@ def parse_back_compat(self, w, op=None, value=None): warnings.warn("passing a tuple into Expr is deprecated, " "pass the where as a single string", DeprecationWarning) - + if op is not None: if not isinstance(w, string_types): raise TypeError( diff --git a/pandas/computation/scope.py b/pandas/computation/scope.py new file mode 100644 index 0000000000000..eaeba86a0e946 --- /dev/null +++ b/pandas/computation/scope.py @@ -0,0 +1,310 @@ +"""Module for scope operations +""" + +import sys +import operator +import struct +import inspect +import datetime +import itertools +import pprint + 
+import pandas as pd +from pandas.compat import DeepChainMap, map, StringIO +from pandas.core import common as com +from pandas.core.base import StringMixin +from pandas.computation.ops import UndefinedVariableError, _LOCAL_TAG + + +def _ensure_scope(level, global_dict=None, local_dict=None, resolvers=(), + target=None, **kwargs): + """Ensure that we are grabbing the correct scope.""" + return Scope(level + 1, global_dict=global_dict, local_dict=local_dict, + resolvers=resolvers, target=target) + + +def _replacer(x): + """Replace a number with its hexadecimal representation. Used to tag + temporary variables with their calling scope's id. + """ + # get the hex repr of the binary char and remove 0x and pad by pad_size + # zeros + try: + hexin = ord(x) + except TypeError: + # bytes literals masquerade as ints when iterating in py3 + hexin = x + + return hex(hexin) + + +def _raw_hex_id(obj): + """Return the padded hexadecimal id of ``obj``.""" + # interpret as a pointer since that's what really what id returns + packed = struct.pack('@P', id(obj)) + return ''.join(map(_replacer, packed)) + + + +_DEFAULT_GLOBALS = { + 'Timestamp': pd.lib.Timestamp, + 'datetime': datetime.datetime, + 'True': True, + 'False': False, + 'list': list, + 'tuple': tuple +} + + +def _get_pretty_string(obj): + """Return a prettier version of obj + + Parameters + ---------- + obj : object + Object to pretty print + + Returns + ------- + s : str + Pretty print object repr + """ + sio = StringIO() + pprint.pprint(obj, stream=sio) + return sio.getvalue() + + +class Scope(StringMixin): + + """Object to hold scope, with a few bells to deal with some custom syntax + and contexts added by pandas. 
+
+    Parameters
+    ----------
+    level : int
+    global_dict : dict or None, optional, default None
+    local_dict : dict or Scope or None, optional, default None
+    resolvers : list-like or None, optional, default None
+    target : object
+
+    Attributes
+    ----------
+    level : int
+    scope : DeepChainMap
+    target : object
+    temps : dict
+    """
+    __slots__ = 'level', 'scope', 'target', 'temps'
+
+    def __init__(self, level, global_dict=None, local_dict=None, resolvers=(),
+                 target=None):
+        self.level = level + 1
+
+        # shallow copy because we don't want to keep filling this up with what
+        # was there before if there are multiple calls to Scope/_ensure_scope
+        self.scope = DeepChainMap(_DEFAULT_GLOBALS.copy())
+        self.target = target
+
+        if isinstance(local_dict, Scope):
+            self.scope.update(local_dict.scope)
+            if local_dict.target is not None:
+                self.target = local_dict.target
+            self.update(local_dict.level)
+
+        frame = sys._getframe(self.level)
+
+        try:
+            # shallow copy here because we don't want to replace what's in
+            # scope when we align terms (alignment accesses the underlying
+            # numpy array of pandas objects)
+            self.scope = self.scope.new_child((global_dict or
+                                               frame.f_globals).copy())
+            if not isinstance(local_dict, Scope):
+                self.scope = self.scope.new_child((local_dict or
+                                                   frame.f_locals).copy())
+        finally:
+            del frame
+
+        # assumes that resolvers are going from outermost scope to inner
+        if isinstance(local_dict, Scope):
+            resolvers += tuple(local_dict.resolvers.maps)
+        self.resolvers = DeepChainMap(*resolvers)
+        self.temps = {}
+
+    def __unicode__(self):
+        scope_keys = _get_pretty_string(list(self.scope.keys()))
+        res_keys = _get_pretty_string(list(self.resolvers.keys()))
+        return '%s(scope=%s, resolvers=%s)' % (type(self).__name__, scope_keys,
+                                               res_keys)
+
+    @property
+    def has_resolvers(self):
+        """Return whether we have any extra scope.
+
+        For example, DataFrames pass their columns as resolvers during calls to
+        ``DataFrame.eval()`` and ``DataFrame.query()``.
+ + Returns + ------- + hr : bool + """ + return bool(len(self.resolvers)) + + def resolve(self, key, is_local): + """Resolve a variable name in a possibly local context + + Parameters + ---------- + key : text_type + A variable name + is_local : bool + Flag indicating whether the variable is local or not (prefixed with + the '@' symbol) + + Returns + ------- + value : object + The value of a particular variable + """ + try: + # only look for locals in outer scope + if is_local: + return self.scope[key] + + # not a local variable so check in resolvers if we have them + if self.has_resolvers: + return self.resolvers[key] + + # if we're here that means that we have no locals and we also have + # no resolvers + assert not is_local and not self.has_resolvers + return self.scope[key] + except KeyError: + try: + # last ditch effort we look in temporaries + # these are created when parsing indexing expressions + # e.g., df[df > 0] + return self.temps[key] + except KeyError: + raise UndefinedVariableError(key) + + def swapkey(self, old_key, new_key, new_value=None): + """Replace a variable name, with a potentially new value. + + Parameters + ---------- + old_key : str + Current variable name to replace + new_key : str + New variable name to replace `old_key` with + new_value : object + Value to be replaced along with the possible renaming + """ + if self.has_resolvers: + maps = self.resolvers.maps + self.scope.maps + else: + maps = self.scope.maps + + maps.append(self.temps) + + for mapping in maps: + if old_key in mapping: + if new_value is None: + mapping[new_key] = mapping.pop(old_key) + else: + mapping[new_key] = new_value + return + raise KeyError(old_key) + + def _get_vars(self, stack, scopes): + """Get specifically scoped variables from a list of stack frames. 
+
+        Parameters
+        ----------
+        stack : list
+            A list of stack frames as returned by ``inspect.stack()``
+        scopes : sequence of strings
+            A sequence containing valid stack frame attribute names that
+            evaluate to a dictionary. For example, ('locals', 'globals')
+        """
+        variables = itertools.product(scopes, stack)
+        for scope, (frame, _, _, _, _, _) in variables:
+            try:
+                d = getattr(frame, 'f_' + scope)
+                self.scope = self.scope.new_child(d)
+            finally:
+                # won't remove it, but DECREF it
+                # in Py3 this probably isn't necessary since frame won't be
+                # in scope after the loop
+                del frame
+
+    def update(self, level):
+        """Update the current scope by going back `level` levels.
+
+        Parameters
+        ----------
+        level : int
+        """
+        sl = level + 1
+
+        # add sl frames to the scope starting with the
+        # most distant and overwriting with more current
+        # makes sure that we can capture variable scope
+        stack = inspect.stack()
+
+        try:
+            self._get_vars(stack[:sl], scopes=['locals'])
+        finally:
+            del stack[:], stack
+
+    def add_tmp(self, value):
+        """Add a temporary variable to the scope.
+
+        Parameters
+        ----------
+        value : object
+            An arbitrary object to be assigned to a temporary variable.
+
+        Returns
+        -------
+        name : basestring
+            The name of the temporary variable created.
+ """ + name = '{0}_{1}_{2}'.format(type(value).__name__, self.ntemps, + _raw_hex_id(self)) + + # add to inner most scope + assert name not in self.temps + self.temps[name] = value + assert name in self.temps + + # only increment if the variable gets put in the scope + return name + + def remove_tmp(self, name): + """Remove a temporary variable from this scope + + Parameters + ---------- + name : str + The name of a temporary to be removed + """ + del self.temps[name] + + @property + def ntemps(self): + """The number of temporary variables in this scope""" + return len(self.temps) + + @property + def full_scope(self): + """Return the full scope for use with passing to engines transparently + as a mapping. + + Returns + ------- + vars : DeepChainMap + All variables in this scope. + """ + maps = [self.temps] + self.resolvers.maps + self.scope.maps + return DeepChainMap(*maps) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index bb700c0d594e8..099e8b0412134 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -23,7 +23,6 @@ from pandas.computation.ops import (_binary_ops_dict, _special_case_arith_ops_syms, _arith_ops_syms, _bool_ops_syms) -from pandas.computation.common import NameResolutionError import pandas.computation.expr as expr import pandas.util.testing as tm @@ -1043,6 +1042,7 @@ def tearDownClass(cls): def eval(self, *args, **kwargs): kwargs['engine'] = self.engine kwargs['parser'] = self.parser + kwargs['level'] = kwargs.pop('level', 0) + 1 return pd.eval(*args, **kwargs) def test_simple_arith_ops(self): @@ -1114,10 +1114,10 @@ def test_truediv(self): d = {'s': s} if PY3: - res = self.eval(ex, truediv=False, local_dict=d) + res = self.eval(ex, truediv=False) assert_array_equal(res, np.array([1.0])) - res = self.eval(ex, truediv=True, local_dict=d) + res = self.eval(ex, truediv=True) assert_array_equal(res, np.array([1.0])) res = self.eval('1 / 2', truediv=True) @@ 
-1128,18 +1128,18 @@ def test_truediv(self): expec = 0.5 self.assertEqual(res, expec) - res = self.eval('s / 2', truediv=False, local_dict={'s': s}) + res = self.eval('s / 2', truediv=False) expec = 0.5 self.assertEqual(res, expec) - res = self.eval('s / 2', truediv=True, local_dict={'s': s}) + res = self.eval('s / 2', truediv=True) expec = 0.5 self.assertEqual(res, expec) else: - res = self.eval(ex, truediv=False, local_dict=d) + res = self.eval(ex, truediv=False) assert_array_equal(res, np.array([1])) - res = self.eval(ex, truediv=True, local_dict=d) + res = self.eval(ex, truediv=True) assert_array_equal(res, np.array([1.0])) res = self.eval('1 / 2', truediv=True) @@ -1150,18 +1150,18 @@ def test_truediv(self): expec = 0 self.assertEqual(res, expec) - res = self.eval('s / 2', truediv=False, local_dict={'s': s}) + res = self.eval('s / 2', truediv=False) expec = 0 self.assertEqual(res, expec) - res = self.eval('s / 2', truediv=True, local_dict={'s': s}) + res = self.eval('s / 2', truediv=True) expec = 0.5 self.assertEqual(res, expec) def test_failing_subscript_with_name_error(self): df = DataFrame(np.random.randn(5, 3)) - self.assertRaises(NameError, self.eval, 'df[x > 2] > 2', - local_dict={'df': df}) + with tm.assertRaises(NameError): + self.eval('df[x > 2] > 2') def test_lhs_expression_subscript(self): df = DataFrame(np.random.randn(5, 3)) @@ -1232,8 +1232,9 @@ def f(): def f(): a = 1 - df.eval('a=a+b') - self.assertRaises(NameResolutionError, f) + old_a = df.a.copy() + df.eval('a = a + b') + assert_frame_equal(old_a + df.b, df.a) # multiple assignment df = orig_df.copy() @@ -1486,34 +1487,6 @@ def test_invalid_parser(): parser='asdf') -def check_is_expr_syntax(engine): - tm.skip_if_no_ne(engine) - s = 1 - valid1 = 's + 1' - valid2 = '__y + _xx' - assert_true(expr.isexpr(valid1, check_names=False)) - assert_true(expr.isexpr(valid2, check_names=False)) - - -def check_is_expr_names(engine): - tm.skip_if_no_ne(engine) - r, s = 1, 2 - valid = 's + r' - invalid = 
'__y + __x' - assert_true(expr.isexpr(valid, check_names=True)) - assert_false(expr.isexpr(invalid, check_names=True)) - - -def test_is_expr_syntax(): - for engine in _engines: - yield check_is_expr_syntax, engine - - -def test_is_expr_names(): - for engine in _engines: - yield check_is_expr_names, engine - - _parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor, 'pandas': PandasExprVisitor} @@ -1547,7 +1520,8 @@ def test_syntax_error_exprs(): def check_name_error_exprs(engine, parser): tm.skip_if_no_ne(engine) e = 's + t' - assert_raises(NameError, pd.eval, e, engine=engine, parser=parser) + with tm.assertRaises(NameError): + pd.eval(e, engine=engine, parser=parser) def test_name_error_exprs(): @@ -1582,6 +1556,25 @@ def test_invalid_numexpr_version(): yield check_invalid_numexpr_version, engine, parser +def check_invalid_local_variable_reference(engine, parser): + tm.skip_if_no_ne(engine) + + a, b = 1, 2 + exprs = 'a + @b', '@a + b', '@a + @b' + for expr in exprs: + if parser != 'pandas': + with tm.assertRaisesRegexp(SyntaxError, "The '@' prefix is only"): + pd.eval(exprs, engine=engine, parser=parser) + else: + with tm.assertRaisesRegexp(SyntaxError, "The '@' prefix is not"): + pd.eval(exprs, engine=engine, parser=parser) + + +def test_invalid_local_variable_reference(): + for engine, parser in ENGINES_PARSERS: + yield check_invalid_local_variable_reference, engine, parser + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e448c96682084..fad348aed0c7d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -36,7 +36,7 @@ from pandas.core.series import Series import pandas.computation.expressions as expressions from pandas.computation.eval import eval as _eval -from pandas.computation.expr import _ensure_scope +from pandas.computation.scope import _ensure_scope from pandas.compat.scipy import scoreatpercentile 
as _quantile from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) @@ -1738,26 +1738,30 @@ def _getitem_frame(self, key): def query(self, expr, **kwargs): """Query the columns of a frame with a boolean expression. + .. versionadded:: 0.13 + Parameters ---------- expr : string - The query string to evaluate. The result of the evaluation of this - expression is first passed to :attr:`~pandas.DataFrame.loc` and if - that fails because of a multidimensional key (e.g., a DataFrame) - then the result will be passed to - :meth:`~pandas.DataFrame.__getitem__`. + The query string to evaluate. You can refer to variables + in the environment by prefixing them with an '@' character like + ``@a + b``. kwargs : dict - See the documentation for :func:`~pandas.eval` for complete details - on the keyword arguments accepted by - :meth:`~pandas.DataFrame.query`. + See the documentation for :func:`pandas.eval` for complete details + on the keyword arguments accepted by :meth:`DataFrame.query`. Returns ------- - q : DataFrame or Series + q : DataFrame Notes ----- - This method uses the top-level :func:`~pandas.eval` function to + The result of the evaluation of this expression is first passed to + :attr:`DataFrame.loc` and if that fails because of a + multidimensional key (e.g., a DataFrame) then the result will be passed + to :meth:`DataFrame.__getitem__`. + + This method uses the top-level :func:`pandas.eval` function to evaluate the passed query. The :meth:`~pandas.DataFrame.query` method uses a slightly @@ -1773,12 +1777,12 @@ def query(self, expr, **kwargs): recommended as it is inefficient compared to using ``numexpr`` as the engine. 
- The :attr:`~pandas.DataFrame.index` and - :attr:`~pandas.DataFrame.columns` attributes of the - :class:`~pandas.DataFrame` instance is placed in the namespace by - default, which allows you to treat both the index and columns of the + The :attr:`DataFrame.index` and + :attr:`DataFrame.columns` attributes of the + :class:`~pandas.DataFrame` instance are placed in the query namespace + by default, which allows you to treat both the index and columns of the frame as a column in the frame. - The identifier ``index`` is used for this variable, and you can also + The identifier ``index`` is used for the frame index; you can also use the name of the index to identify it in a query. For further details and examples see the ``query`` documentation in @@ -1797,18 +1801,7 @@ def query(self, expr, **kwargs): >>> df.query('a > b') >>> df[df.a > df.b] # same result as the previous expression """ - # need to go up at least 4 stack frames - # 4 expr.Scope - # 3 expr._ensure_scope - # 2 self.eval - # 1 self.query - # 0 self.query caller (implicit) - level = kwargs.setdefault('level', 4) - if level < 4: - raise ValueError("Going up fewer than 4 stack frames will not" - " capture the necessary variable scope for a " - "query expression") - + kwargs['level'] = kwargs.pop('level', 0) + 1 res = self.eval(expr, **kwargs) try: @@ -1852,14 +1845,15 @@ def eval(self, expr, **kwargs): >>> from pandas import DataFrame >>> df = DataFrame(randn(10, 2), columns=list('ab')) >>> df.eval('a + b') - >>> df.eval('c=a + b') + >>> df.eval('c = a + b') """ resolvers = kwargs.pop('resolvers', None) + kwargs['level'] = kwargs.pop('level', 0) + 1 if resolvers is None: - index_resolvers = self._get_resolvers() - resolvers = [self, index_resolvers] - kwargs['local_dict'] = _ensure_scope(resolvers=resolvers, **kwargs) + index_resolvers = self._get_index_resolvers() + resolvers = index_resolvers, dict(self.iteritems()) kwargs['target'] = self + kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers 
return _eval(expr, **kwargs) def _slice(self, slobj, axis=0, raise_on_error=False, typ=None): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bf682f7c50252..3251e59e53603 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -350,7 +350,7 @@ def _get_axis_resolvers(self, axis): d[axis] = dindex return d - def _get_resolvers(self): + def _get_index_resolvers(self): d = {} for axis_name in self._AXIS_ORDERS: d.update(self._get_axis_resolvers(axis_name)) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 85a9cf4ea0f9f..76f630082aa15 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -30,7 +30,7 @@ import pandas.core.common as com from pandas.tools.merge import concat from pandas import compat -from pandas.compat import u_safe as u, PY3, range, lrange, string_types +from pandas.compat import u_safe as u, PY3, range, lrange, string_types, filter from pandas.io.common import PerformanceWarning from pandas.core.config import get_option from pandas.computation.pytables import Expr, maybe_expression @@ -66,7 +66,7 @@ def _ensure_encoding(encoding): Term = Expr -def _ensure_term(where): +def _ensure_term(where, scope_level): """ ensure that the where is a Term or a list of Term this makes sure that we are capturing the scope of variables @@ -76,11 +76,17 @@ def _ensure_term(where): # only consider list/tuple here as an ndarray is automaticaly a coordinate # list + level = scope_level + 1 if isinstance(where, (list, tuple)): - where = [w if not maybe_expression(w) else Term(w, scope_level=2) - for w in where if w is not None] + wlist = [] + for w in filter(lambda x: x is not None, where): + if not maybe_expression(w): + wlist.append(w) + else: + wlist.append(Term(w, scope_level=level)) + where = wlist elif maybe_expression(where): - where = Term(where, scope_level=2) + where = Term(where, scope_level=level) return where @@ -311,7 +317,7 @@ def read_hdf(path_or_buf, key, **kwargs): # grab the scope if 'where' in 
kwargs: - kwargs['where'] = _ensure_term(kwargs['where']) + kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1) f = lambda store, auto_close: store.select( key, auto_close=auto_close, **kwargs) @@ -643,7 +649,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None, raise KeyError('No object named %s in the file' % key) # create the storer and axes - where = _ensure_term(where) + where = _ensure_term(where, scope_level=1) s = self._create_storer(group) s.infer_axes() @@ -675,7 +681,7 @@ def select_as_coordinates( start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection """ - where = _ensure_term(where) + where = _ensure_term(where, scope_level=1) return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs) @@ -730,7 +736,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, """ # default to single select - where = _ensure_term(where) + where = _ensure_term(where, scope_level=1) if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, string_types): @@ -776,8 +782,8 @@ def func(_start, _stop): c = s.read_coordinates(where=where, start=_start, stop=_stop, **kwargs) else: c = None - - objs = [t.read(where=c, start=_start, stop=_stop, + + objs = [t.read(where=c, start=_start, stop=_stop, columns=columns, **kwargs) for t in tbls] # concat and return @@ -838,7 +844,7 @@ def remove(self, key, where=None, start=None, stop=None): raises KeyError if key is not a valid store """ - where = _ensure_term(where) + where = _ensure_term(where, scope_level=1) try: s = self.get_storer(key) except: diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index dcdd5408c3376..ad02cdb4f0c72 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -59,6 +59,7 @@ def create_tempfile(path): """ create an unopened named temporary file """ return 
os.path.join(tempfile.gettempdir(),path) + @contextmanager def ensure_clean_store(path, mode='a', complevel=None, complib=None, fletcher32=False): @@ -77,6 +78,7 @@ def ensure_clean_store(path, mode='a', complevel=None, complib=None, if mode == 'w' or mode == 'a': safe_remove(path) + @contextmanager def ensure_clean_path(path): """ @@ -95,6 +97,7 @@ def ensure_clean_path(path): for f in filenames: safe_remove(f) + # set these parameters so we don't have file sharing tables.parameters.MAX_NUMEXPR_THREADS = 1 tables.parameters.MAX_BLOSC_THREADS = 1 @@ -256,7 +259,6 @@ def test_api(self): self.assertRaises(TypeError, df.to_hdf, path,'df',append=True,format='foo') self.assertRaises(TypeError, df.to_hdf, path,'df',append=False,format='bar') - def test_api_default_format(self): # default_format option @@ -2257,7 +2259,6 @@ def test_remove_startstop(self): expected = wp.reindex(major_axis=wp.major_axis-wp.major_axis[np.arange(0,20,3)]) assert_panel_equal(result, expected) - def test_remove_crit(self): with ensure_clean_store(self.path) as store: @@ -2517,7 +2518,7 @@ def test_backwards_compat_without_term_object(self): result = store.select('wp', [('minor_axis','=',['A','B'])]) expected = wp.loc[:,:,['A','B']] assert_panel_equal(result, expected) - + def test_same_name_scoping(self): with ensure_clean_store(self.path) as store: @@ -3323,6 +3324,8 @@ def test_frame_select(self): date = df.index[len(df) // 2] crit1 = Term('index>=date') + self.assertEqual(crit1.env.scope['date'], date) + crit2 = ("columns=['A', 'D']") crit3 = ('columns=A') @@ -3776,7 +3779,6 @@ def test_select_as_multiple(self): self.assertRaises(ValueError, store.select_as_multiple, ['df1','df3'], where=['A>0', 'B>0'], selector='df1') - def test_nan_selection_bug_4858(self): # GH 4858; nan selection bug, only works for pytables >= 3.1 @@ -4227,6 +4229,7 @@ def test_query_with_nested_special_character(self): result = store.select('test', 'a = "test & test"') tm.assert_frame_equal(expected, result) + def 
_test_sort(obj): if isinstance(obj, DataFrame): return obj.reindex(sorted(obj.index)) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index af7dc780e88fe..b5b13dc3190e2 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -12116,7 +12116,6 @@ def test_isin_dupe_self(self): expected.iloc[1, 1] = True assert_frame_equal(result, expected) - def test_isin_against_series(self): df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}, index=['a', 'b', 'c', 'd']) @@ -12249,6 +12248,7 @@ def test_empty_frame_dtypes_ftypes(self): ('b', 'bool:dense'), ('c', 'float64:dense')]))) + def skip_if_no_ne(engine='numexpr'): if engine == 'numexpr': try: @@ -12325,6 +12325,7 @@ def check_query_with_unnamed_multiindex(self, parser, engine): df = DataFrame(randn(10, 2), index=index) ind = Series(df.index.get_level_values(0).values, index=index) + #import ipdb; ipdb.set_trace() res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine) res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine) exp = df[ind == 'red'] @@ -12448,7 +12449,7 @@ def test_query_multiindex_get_index_resolvers(self): def check_query_multiindex_get_index_resolvers(self, parser, engine): df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=['spam', 'eggs']) - resolvers = df._get_resolvers() + resolvers = df._get_index_resolvers() def to_series(mi, level): level_values = mi.get_level_values(level) @@ -12508,17 +12509,29 @@ def tearDownClass(cls): super(TestDataFrameQueryNumExprPandas, cls).tearDownClass() del cls.engine, cls.parser - def test_date_query_method(self): + def test_date_query_with_attribute_access(self): engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) df = DataFrame(randn(5, 3)) df['dates1'] = date_range('1/1/2012', periods=5) df['dates2'] = date_range('1/1/2013', periods=5) df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('dates1 < 20130101 < dates3', engine=engine, + res = df.query('@df.dates1 < 
20130101 < @df.dates3', engine=engine, parser=parser) expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] assert_frame_equal(res, expec) + def test_date_query_no_attribute_access(self): + engine, parser = self.engine, self.parser + df = DataFrame(randn(5, 3)) + df['dates1'] = date_range('1/1/2012', periods=5) + df['dates2'] = date_range('1/1/2013', periods=5) + df['dates3'] = date_range('1/1/2014', periods=5) + res = df.query('dates1 < 20130101 < dates3', engine=engine, + parser=parser) + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + tm.assert_frame_equal(res, expec) + def test_date_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 @@ -12576,7 +12589,7 @@ def test_date_query_with_non_date(self): n = 10 df = DataFrame({'dates': date_range('1/1/2012', periods=n), - 'nondate': np.arange(n)}) + 'nondate': np.arange(n)}) ops = '==', '!=', '<', '>', '<=', '>=' @@ -12584,32 +12597,61 @@ def test_date_query_with_non_date(self): with tm.assertRaises(TypeError): df.query('dates %s nondate' % op, parser=parser, engine=engine) - def test_query_scope(self): + def test_query_syntax_error(self): engine, parser = self.engine, self.parser - from pandas.computation.common import NameResolutionError - df = DataFrame({"i": lrange(10), "+": lrange(3, 13), "r": lrange(4, 14)}) - i, s = 5, 6 - self.assertRaises(NameResolutionError, df.query, 'i < 5', - engine=engine, parser=parser, local_dict={'i': i}) - self.assertRaises(SyntaxError, df.query, 'i - +', engine=engine, - parser=parser) - self.assertRaises(NameResolutionError, df.query, 'i == s', - engine=engine, parser=parser, local_dict={'i': i, - 's': s}) - - def test_query_scope_index(self): + with tm.assertRaises(SyntaxError): + df.query('i - +', engine=engine, parser=parser) + + def test_query_scope(self): + from pandas.computation.ops import UndefinedVariableError engine, parser = self.engine, self.parser - from pandas.computation.common import NameResolutionError - df = 
DataFrame(np.random.randint(10, size=(10, 3)), - index=Index(range(10), name='blob'), - columns=['a', 'b', 'c']) + skip_if_no_pandas_parser(parser) + + df = DataFrame(np.random.randn(20, 2), columns=list('ab')) + + a, b = 1, 2 + res = df.query('a > b', engine=engine, parser=parser) + expected = df[df.a > df.b] + tm.assert_frame_equal(res, expected) + + res = df.query('@a > b', engine=engine, parser=parser) + expected = df[a > df.b] + tm.assert_frame_equal(res, expected) + + # no local variable c + with tm.assertRaises(UndefinedVariableError): + df.query('@a > b > @c', engine=engine, parser=parser) + + # no column named 'c' + with tm.assertRaises(UndefinedVariableError): + df.query('@a > b > c', engine=engine, parser=parser) + + def test_query_doesnt_pickup_local(self): + from pandas.computation.ops import UndefinedVariableError + + engine, parser = self.engine, self.parser + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + from numpy import sin + + # we don't pick up the local 'sin' + with tm.assertRaises(UndefinedVariableError): + df.query('sin > 5', engine=engine, parser=parser) + + def test_query_builtin(self): + from pandas.computation.engines import NumExprClobberingError + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + df.index.name = 'sin' - self.assertRaises(NameResolutionError, df.query, 'sin > 5', - engine=engine, parser=parser, local_dict={'sin': - sin}) + with tm.assertRaisesRegexp(NumExprClobberingError, + 'Variables in expression.+'): + df.query('sin > 5', engine=engine, parser=parser) def test_query(self): engine, parser = self.engine, self.parser @@ -12621,16 +12663,6 @@ def test_query(self): parser=parser), df[df.a + df.b > df.b * df.c]) - local_dict = dict(df.iteritems()) - local_dict.update({'df': df}) - self.assertRaises(NameError, df.query, 'a < d & b < f', - local_dict=local_dict, engine=engine, parser=parser) - - # make 
sure that it's not just because we didn't pass the locals in - self.assertRaises(AssertionError, self.assertRaises, NameError, - df.query, 'a < b', local_dict={'df': df}, - engine=engine, parser=parser) - def test_query_index_with_name(self): engine, parser = self.engine, self.parser df = DataFrame(np.random.randint(10, size=(10, 3)), @@ -12663,36 +12695,41 @@ def test_query_index_without_name(self): def test_nested_scope(self): engine = self.engine parser = self.parser - # smoke test - x = 1 - result = pd.eval('x + 1', engine=engine, parser=parser) - self.assertEqual(result, 2) - df = DataFrame(np.random.randn(5, 3)) + skip_if_no_pandas_parser(parser) + + df = DataFrame(np.random.randn(5, 3)) df2 = DataFrame(np.random.randn(5, 3)) - expected = df[(df>0) & (df2>0)] + expected = df[(df > 0) & (df2 > 0)] - result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + result = df.query('(@df > 0) & (@df2 > 0)', engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('df[(df > 0) and (df2 > 0)]', engine=engine, + result = pd.eval('df[df > 0 and df2 > 0]', engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('df[(df > 0) and (df2 > 0) and df[df > 0] > 0]', + result = pd.eval('df[df > 0 and df2 > 0 and df[df > 0] > 0]', engine=engine, parser=parser) expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] assert_frame_equal(result, expected) result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) - expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + expected = df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) assert_frame_equal(result, expected) + def test_nested_raises_on_local_self_reference(self): + from pandas.computation.ops import UndefinedVariableError + + df = DataFrame(np.random.randn(5, 3)) + + # can't reference ourself b/c we're a local so @ is necessary + with tm.assertRaises(UndefinedVariableError): + df.query('df > 0', engine=self.engine, 
parser=self.parser) + def test_local_syntax(self): skip_if_no_pandas_parser(self.parser) - from pandas.computation.common import NameResolutionError - engine, parser = self.engine, self.parser df = DataFrame(randn(100, 10), columns=list('abcdefghij')) b = 1 @@ -12700,13 +12737,6 @@ def test_local_syntax(self): result = df.query('a < @b', engine=engine, parser=parser) assert_frame_equal(result, expect) - # scope issue with self.assertRaises so just catch it and let it pass - try: - df.query('a < @b', engine=engine, parser=parser) - except NameResolutionError: - pass - - del b expect = df[df.a < df.b] result = df.query('a < b', engine=engine, parser=parser) assert_frame_equal(result, expect) @@ -12722,6 +12752,22 @@ def test_chained_cmp_and_in(self): expec = df[ind] assert_frame_equal(res, expec) + def test_local_variable_with_in(self): + engine, parser = self.engine, self.parser + skip_if_no_pandas_parser(parser) + a = Series(np.random.randint(3, size=15), name='a') + b = Series(np.random.randint(10, size=15), name='b') + df = DataFrame({'a': a, 'b': b}) + + expected = df.loc[(df.b - 1).isin(a)] + result = df.query('b - 1 in a', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + + b = Series(np.random.randint(10, size=15), name='b') + expected = df.loc[(b - 1).isin(a)] + result = df.query('@b - 1 in a', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): @@ -12733,17 +12779,16 @@ def setUpClass(cls): tm.skip_if_no_ne(cls.engine) cls.frame = _frame.copy() - def test_date_query_method(self): + def test_date_query_no_attribute_access(self): engine, parser = self.engine, self.parser df = DataFrame(randn(5, 3)) df['dates1'] = date_range('1/1/2012', periods=5) df['dates2'] = date_range('1/1/2013', periods=5) df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('(df.dates1 < 20130101) & (20130101 < df.dates3)', + res = df.query('(dates1 < 
20130101) & (20130101 < dates3)', engine=engine, parser=parser) expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] - assert_frame_equal(res, expec) - + tm.assert_frame_equal(res, expec) def test_date_query_with_NaT(self): engine, parser = self.engine, self.parser n = 10 @@ -12792,10 +12837,10 @@ def test_date_index_query_with_NaT_duplicates(self): df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT df.set_index('dates1', inplace=True, drop=True) with tm.assertRaises(NotImplementedError): - res = df.query('index < 20130101 < dates3', engine=engine, - parser=parser) + df.query('index < 20130101 < dates3', engine=engine, parser=parser) def test_nested_scope(self): + from pandas.computation.ops import UndefinedVariableError engine = self.engine parser = self.parser # smoke test @@ -12805,23 +12850,23 @@ def test_nested_scope(self): df = DataFrame(np.random.randn(5, 3)) df2 = DataFrame(np.random.randn(5, 3)) - expected = df[(df>0) & (df2>0)] - result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) - assert_frame_equal(result, expected) + # don't have the pandas parser + with tm.assertRaises(SyntaxError): + df.query('(@df>0) & (@df2>0)', engine=engine, parser=parser) + with tm.assertRaises(UndefinedVariableError): + df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + + expected = df[(df > 0) & (df2 > 0)] result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine, parser=parser) - assert_frame_equal(result, expected) + tm.assert_frame_equal(expected, result) + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]', engine=engine, parser=parser) - expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] - assert_frame_equal(result, expected) - - result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) - expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) - assert_frame_equal(result, expected) + tm.assert_frame_equal(expected, result) class 
TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas): @@ -12833,6 +12878,17 @@ def setUpClass(cls): cls.parser = 'pandas' cls.frame = _frame.copy() + def test_query_builtin(self): + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + + df.index.name = 'sin' + expected = df[df.index > 5] + result = df.query('sin > 5', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython): @@ -12842,6 +12899,17 @@ def setUpClass(cls): cls.engine = cls.parser = 'python' cls.frame = _frame.copy() + def test_query_builtin(self): + engine, parser = self.engine, self.parser + + n = m = 10 + df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list('abc')) + + df.index.name = 'sin' + expected = df[df.index > 5] + result = df.query('sin > 5', engine=engine, parser=parser) + tm.assert_frame_equal(expected, result) + PARSERS = 'python', 'pandas' ENGINES = 'python', 'numexpr' @@ -12910,8 +12979,8 @@ def check_str_list_query_method(self, parser, engine): for lhs, op, rhs in zip(lhs, ops, rhs): ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) - assertRaises(NotImplementedError, df.query, ex, engine=engine, - parser=parser, local_dict={'strings': df.strings}) + with tm.assertRaises(NotImplementedError): + df.query(ex, engine=engine, parser=parser) else: res = df.query('strings == ["a", "b"]', engine=engine, parser=parser)