From 90423398bf86aed982ca4a946e9ddc45b7a4b09b Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 15 Jun 2013 21:34:56 -0400 Subject: [PATCH 01/16] ENH: add new computation module and toplevel eval function --- pandas/__init__.py | 1 + pandas/computation/__init__.py | 0 pandas/computation/align.py | 220 +++++++ pandas/computation/api.py | 2 + pandas/computation/common.py | 11 + pandas/computation/engines.py | 80 +++ pandas/computation/eval.py | 85 +++ pandas/computation/expr.py | 150 +++++ pandas/{core => computation}/expressions.py | 96 +-- pandas/computation/ops.py | 255 ++++++++ pandas/computation/tests/__init__.py | 0 pandas/computation/tests/test_eval.py | 648 ++++++++++++++++++++ pandas/core/base.py | 1 + pandas/core/common.py | 24 + pandas/core/frame.py | 2 +- pandas/core/internals.py | 3 +- pandas/io/pytables.py | 193 +++--- pandas/io/tests/test_pytables.py | 32 +- pandas/tests/test_common.py | 65 ++ pandas/tseries/index.py | 3 +- pandas/util/testing.py | 5 +- setup.py | 3 +- vb_suite/binary_ops.py | 12 +- vb_suite/indexing.py | 2 +- 24 files changed, 1732 insertions(+), 161 deletions(-) create mode 100644 pandas/computation/__init__.py create mode 100644 pandas/computation/align.py create mode 100644 pandas/computation/api.py create mode 100644 pandas/computation/common.py create mode 100644 pandas/computation/engines.py create mode 100644 pandas/computation/eval.py create mode 100644 pandas/computation/expr.py rename pandas/{core => computation}/expressions.py (67%) create mode 100644 pandas/computation/ops.py create mode 100644 pandas/computation/tests/__init__.py create mode 100644 pandas/computation/tests/test_eval.py diff --git a/pandas/__init__.py b/pandas/__init__.py index 03681d3fa5a3f..c4c012d6c5095 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -42,6 +42,7 @@ from pandas.stats.api import * from pandas.tseries.api import * from pandas.io.api import * +from pandas.computation.api import * from pandas.util.testing import debug diff 
def _align_core_single_unary_op(term):
    """Return the ``(type, axes)`` pair for an expression with one operand.

    Parameters
    ----------
    term : Term
        The only term of the expression; only ``term.value`` is inspected.

    Returns
    -------
    typ : type or functools.partial
        ``np.asanyarray`` bound to the value's dtype when the value is an
        ndarray that is not a Series, otherwise ``type(term.value)``.
    axes : dict or None
        Axis name -> axis labels, or None when the value has no ``axes``
        attribute.
    """
    value = term.value
    if isinstance(value, np.ndarray) and not com.is_series(value):
        typ = partial(np.asanyarray, dtype=value.dtype)
    else:
        typ = type(value)

    if not hasattr(value, 'axes'):
        return typ, None
    return typ, _zip_axes_from_type(typ, value.axes)


def _zip_axes_from_type(typ, new_axes):
    """Map every axis name declared in ``typ._AXIS_NAMES`` to its new axis.

    ``typ._AXIS_NAMES`` maps axis positions to axis names; ``new_axes`` is
    indexed with those positions.
    """
    # .items() (not .iteritems()) so the helper also runs on Python 3
    return dict((ax_name, new_axes[ax_ind])
                for ax_ind, ax_name in typ._AXIS_NAMES.items())


def _maybe_promote_shape(values, naxes):
    """Append length-1 axes to ``values`` until it has ``naxes`` dimensions.

    Parameters
    ----------
    values : object
        Anything; only ``np.ndarray`` instances are reshaped, every other
        object is returned untouched.
    naxes : int
        Number of dimensions the result must have.

    Returns
    -------
    promoted : object
        ``values`` itself when it is not an ndarray or already has ``naxes``
        dimensions, otherwise a view with ``np.newaxis`` appended for each
        missing trailing dimension.

    Raises
    ------
    AssertionError
        If ``values`` has more dimensions than ``naxes``.
    """
    # anything that is not an array is left alone
    if not isinstance(values, np.ndarray):
        return values

    ndims = values.ndim
    if ndims > naxes:
        raise AssertionError('cannot have more dims than axes, '
                             '{0} > {1}'.format(ndims, naxes))
    if ndims == naxes:
        return values

    # here ndims < naxes: keep the existing axes and add the missing ones at
    # the end (axes ndims .. naxes - 1)
    indexer = (slice(None),) * ndims + (np.newaxis,) * (naxes - ndims)
    return values[indexer]


def _any_pandas_objects(terms):
    """Check a sequence of terms for instances of PandasObject."""
    return any(com.is_pd_obj(term.value) for term in terms)
def _filter_special_cases(f):
    """Decorate an alignment function so trivial inputs skip alignment.

    The wrapper returns early for a single operand, and returns
    ``(np.result_type(values...), None)`` when every term is a scalar, when
    every value has ``size == 1``, or when no term holds a pandas object.
    Only the remaining inputs are passed on to ``f``.
    """
    @wraps(f)
    def wrapper(terms):
        # single unary operand
        if len(terms) == 1:
            return _align_core_single_unary_op(terms[0])

        only_scalars = all(term.isscalar for term in terms)
        # getattr(..., None) == 1 is False for values without a ``size``
        single_elements = only_scalars or all(
            getattr(term.value, 'size', None) == 1 for term in terms)

        # nothing to align: just punt to the evaluator with a common dtype
        if single_elements or not _any_pandas_objects(terms):
            return np.result_type(*(term.value for term in terms)), None

        return f(terms)
    return wrapper


@_filter_special_cases
def _align_core(terms):
    """Align the pandas objects held by ``terms`` on the union of their axes.

    Every term whose value has an ``axes`` attribute is reindexed (outer
    join) to common axes and then replaced, via ``term.update``, by its
    underlying values promoted to the dimensionality of the largest term.

    Returns
    -------
    typ : type
        ``_constructor`` of the term with the most dimensions.
    axes : dict
        Axis name -> joined axis, as built by ``_zip_axes_from_type``.
    """
    term_index = [i for i, term in enumerate(terms)
                  if hasattr(term.value, 'axes')]
    term_dims = [terms[i].value.ndim for i in term_index]
    ndims = pd.Series(dict(zip(term_index, term_dims)))

    # initial axes are the axes of the largest-axis'd term
    biggest = terms[ndims.idxmax()].value
    typ = biggest._constructor
    # work on a copy in case ``.axes`` hands back the object's own list; the
    # joins below assign into this list
    axes = list(biggest.axes)
    naxes = len(axes)

    for term in (terms[i] for i in term_index):
        for axis, items in enumerate(term.value.axes):
            # a Series combined with a higher-dimensional object is joined
            # against the last axis
            if com.is_series(term.value) and naxes > 1:
                ax, itm = naxes - 1, term.value.index
            else:
                ax, itm = axis, items
            axes[ax] = axes[ax].join(itm, how='outer')

    for i, ndim in ndims.iteritems():
        # reset per term: previously this was only bound inside the
        # ``reindex_axis`` branch, so it could be undefined or left over from
        # the previous term
        transpose = False

        for axis, items in zip(range(ndim), axes):
            ti = terms[i].value

            if hasattr(ti, 'reindex_axis'):
                transpose = com.is_series(ti) and naxes > 1

                if transpose:
                    f = partial(ti.reindex, index=axes[naxes - 1], copy=False)
                else:
                    f = partial(ti.reindex_axis, items, axis=axis, copy=False)

                # boolean data is filled with True instead of NaN
                if pd.lib.is_bool_array(ti.values):
                    r = f(fill_value=True)
                else:
                    r = f()

                terms[i].update(r)

        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # patch -- confirm the promotion below runs once per term, after all
        # of its axes have been reindexed
        res = _maybe_promote_shape(terms[i].value.T if transpose else
                                   terms[i].value, naxes)
        res = res.T if transpose else res

        try:
            v = res.values
        except AttributeError:
            v = res
        terms[i].update(v)

    return typ, _zip_axes_from_type(typ, axes)
def _filter_terms(flat):
    """Split a flat sequence of terms into names and literals.

    Returns
    -------
    names : set
        The terms for which ``is_const`` is false.
    literals : set
        The terms for which ``is_const`` is true.
    """
    # numeric literals
    literals = set(filter(is_const, flat))

    # everything else is a named variable; the two sets are disjoint by
    # construction, so no overlap check is needed
    names = set(flat) - literals
    return names, literals


def _align(terms, env):
    """Compute the result type and aligned axes for a parse tree.

    Parameters
    ----------
    terms : iterable
        The parse tree; it is flattened with ``flatten`` before use.
    env : object
        Not used by this function.

    Returns
    -------
    typ : type
        A NumPy scalar type when every term is a literal or a scalar,
        otherwise whatever ``_align_core`` returns as its first element.
    axes : dict or None
        None for the scalar cases, otherwise the axes from ``_align_core``.
    """
    # flatten the parse tree (a nested list)
    terms = list(flatten(terms))

    # separate names and literals
    names, literals = _filter_terms(terms)

    if not names:  # only literals so just promote to a common type
        # promote the literal *values*: the literals themselves are term
        # objects, which np.result_type cannot interpret
        return np.result_type(*(term.value for term in literals)).type, None

    # if all resolved variables are numeric scalars
    if all(term.isscalar for term in terms):
        return np.result_type(*(term.value for term in terms)).type, None

    # perform the main alignment
    typ, axes = _align_core(terms)
    return typ, axes


def _reconstruct_object(typ, obj, axes, dtype):
    """Reconstruct an object given its type, raw value, and possibly empty
    (None) axes.

    Parameters
    ----------
    typ : object
        A type, a NumPy dtype (its ``.type`` is used), or a
        ``functools.partial`` constructor
    obj : object
        The value to use in the type constructor
    axes : dict
        The axes to use to construct the resulting pandas object
    dtype : object
        The dtype of the result: passed to the pandas constructor, or used
        with ``astype`` for everything else

    Returns
    -------
    reconst : typ
        An object of type ``typ`` with the value `obj` and possible axes
        `axes`. Non-pandas results that ``.item()`` can convert are returned
        as Python scalars.
    """
    # a dtype carries its scalar type in ``.type``; anything else is used
    # as given
    typ = getattr(typ, 'type', typ)

    if (not isinstance(typ, partial) and
            issubclass(typ, pd.core.generic.PandasObject)):
        return typ(obj, dtype=dtype, **axes)

    ret_value = typ(obj).astype(dtype)

    try:
        # collapses to a Python scalar when ``.item()`` can convert the value
        return ret_value.item()
    except ValueError:
        return ret_value
try:  # Python 3.3+: the ABCs live in collections.abc
    from collections.abc import Iterable as _Iterable
except ImportError:  # Python 2
    from collections import Iterable as _Iterable


def flatten(l):
    """Yield the leaves of an arbitrarily nested iterable, depth first.

    Strings (as decided by ``is_string``) are treated as leaves rather than
    being iterated character by character.
    """
    for el in l:
        if isinstance(el, _Iterable) and not is_string(el):
            for s in flatten(el):
                yield s
        else:
            yield el
class AbstractEngine(object):
    """Base class for expression evaluation engines.

    Subclasses implement ``convert`` and ``_evaluate``; ``evaluate`` aligns
    the expression's terms, runs ``_evaluate`` and rebuilds the result with
    ``_reconstruct_object``.
    """
    __metaclass__ = abc.ABCMeta

    has_neg_frac = False

    def __init__(self, expr):
        self.expr = expr
        self.aligned_axes = None
        self.result_type = None

    @abc.abstractmethod
    def convert(self):
        """Convert an expression for evaluation."""
        pass

    def evaluate(self):
        """Align the terms if needed, evaluate, and rebuild the result."""
        if not self._is_aligned:
            self.result_type, self.aligned_axes = _align(self.expr.terms,
                                                         self.expr.env)

        res = self._evaluate(self.expr.env)
        return _reconstruct_object(self.result_type, res, self.aligned_axes,
                                   self.expr.terms.return_type)

    @property
    def _is_aligned(self):
        return self.aligned_axes is not None and self.result_type is not None

    @abc.abstractmethod
    def _evaluate(self, env):
        """Return an evaluated expression."""
        pass


class NumExprEngine(AbstractEngine):
    """NumExpr engine class"""
    has_neg_frac = True

    def convert(self):
        """Return a string"""
        return '%s' % self.expr

    def _evaluate(self, env):
        import numexpr as ne

        try:
            return ne.evaluate(self.convert(), local_dict=env.locals,
                               global_dict=env.globals,
                               truediv=self.expr.truediv)
        except KeyError as e:
            # the missing name is the KeyError's first argument; ``e.message``
            # does not exist on Python 3
            name = e.args[0] if e.args else e
            raise NameError('{0!r} is not defined'.format(name))


class PythonEngine(AbstractEngine):
    """Use NumPy even if numexpr is installed"""
    has_neg_frac = False

    def convert(self):
        pass

    def evaluate(self):
        # the expression object evaluates itself; no alignment step
        return self.expr(self.expr.env)

    def _evaluate(self, env):
        pass


_engines = {'numexpr': NumExprEngine, 'python': PythonEngine}


def eval(expr, engine='numexpr', truediv=True, local_dict=None,
         global_dict=None):
    """Evaluate a Python expression as a string using various backends.

    The following arithmetic operations are supported: +, -, *, /, **, %, //
    (python engine only) along with the following boolean operations: | (or), &
    (and), and ~ (not). All Pandas objects are supported and behave as they
    would with in-Python evaluation.

    Parameters
    ----------
    expr : string or Expr object
        The expression to evaluate. This can be either a string or an ``Expr``
        object.
    engine : string, optional, default 'numexpr', {'python', 'numexpr'}
        The engine used to evaluate the expression. Supported engines are

        - 'numexpr': This default engine evaluates pandas objects using numexpr
                     for large speed ups in complex expressions with large
                     frames.
        - 'python': Performs operations as if you had eval'd in top level
                    python

    truediv : bool, optional, default True
        Whether to use true division, like in Python >= 3
    local_dict : dict or None, optional, default None
        A dictionary of local variables, taken from locals() by default.
    global_dict : dict or None, optional, default None
        A dictionary of global variables, taken from globals() by default.

    Returns
    -------
    obj : ndarray, scalar, DataFrame, Series, or Panel

    Raises
    ------
    KeyError
        If ``engine`` is not a registered engine.
    TypeError
        If ``expr`` is neither a string nor an ``Expr``, or if a scalar
        result is neither numeric nor boolean.

    Notes
    -----
    * The benefits of using ``eval`` are that very large frames that are terms in
      long expressions are sped up, sometimes by as much as 10x.

    See :ref:`Enhancing performance <enhancingperf.eval>` for more details.
    """
    # make sure we're passed a valid engine
    if engine not in _engines:
        raise KeyError('Invalid engine {0!r} passed, valid engines are'
                       ' {1}'.format(engine, sorted(_engines)))

    eng = _engines[engine]

    if isinstance(expr, six.string_types):
        # need to go 2 up in the call stack from the constructor since we want
        # the calling scope's variables
        env = Scope(global_dict, local_dict, frame_level=2)
        parsed_expr = Expr(expr, engine, env, truediv)
    elif isinstance(expr, Expr):
        parsed_expr = expr
    else:
        raise TypeError("eval only accepts strings and Expr objects, you "
                        "passed a {0!r}".format(expr.__class__.__name__))

    # construct the engine and evaluate
    ret = eng(parsed_expr).evaluate()

    # sanity check for a number
    # TODO: eventually take out
    # TODO: pytables engine will probably need a string check
    if np.isscalar(ret):
        if not isinstance(ret, (np.number, np.bool_, numbers.Number)):
            raise TypeError('scalar result must be numeric or bool, passed '
                            'type is {0!r}'.format(ret.__class__.__name__))
    return ret
try:  # Python 2
    _string_types = (basestring,)
except NameError:  # Python 3
    _string_types = (str,)


class Scope(object):
    """Hold the global and local namespaces an expression is resolved in.

    Parameters
    ----------
    gbls, lcls : dict or None
        Namespaces to use. A falsy value is replaced by a copy of the
        corresponding namespace of the frame ``frame_level`` levels above
        this constructor.
    frame_level : int, default 1
        How many frames to go up from ``__init__`` when a namespace has to
        be captured (1 is the direct caller).
    """
    __slots__ = 'globals', 'locals'

    def __init__(self, gbls=None, lcls=None, frame_level=1):
        frame = sys._getframe(frame_level)

        try:
            self.globals = gbls or frame.f_globals.copy()
            self.locals = lcls or frame.f_locals.copy()
        finally:
            # drop the frame reference as soon as the namespaces are copied
            del frame


class ExprParserError(Exception):
    pass


class ExprVisitor(ast.NodeVisitor):
    """Custom ast walker that builds ``BinOp``/``UnaryOp``/``Term`` objects.

    For every supported operator a ``visit_<NodeName>`` callable is attached
    to the instance; it returns the matching op class with the operator
    symbol already bound.
    """
    # (symbol, ast node name) pairs, spelled out so that a symbol without an
    # ast node of its own (such as '=') cannot shift the table
    _bin_op_pairs = (('>', 'Gt'), ('<', 'Lt'), ('>=', 'GtE'), ('<=', 'LtE'),
                     ('==', 'Eq'), ('!=', 'NotEq'),
                     ('&', 'BitAnd'), ('|', 'BitOr'),
                     ('+', 'Add'), ('-', 'Sub'), ('*', 'Mult'), ('/', 'Div'),
                     ('**', 'Pow'), ('//', 'FloorDiv'), ('%', 'Mod'))
    bin_ops, bin_op_nodes = zip(*_bin_op_pairs)
    bin_op_nodes_map = dict(_bin_op_pairs)

    _unary_op_pairs = ('+', 'UAdd'), ('-', 'USub'), ('~', 'Invert')
    unary_ops, unary_op_nodes = zip(*_unary_op_pairs)
    unary_op_nodes_map = dict(_unary_op_pairs)

    def __init__(self, env):
        # the default arguments freeze the current symbol for each lambda
        for bin_op in self.bin_ops:
            setattr(self, 'visit_{0}'.format(self.bin_op_nodes_map[bin_op]),
                    lambda node, bin_op=bin_op: partial(BinOp, bin_op))

        for unary_op in self.unary_ops:
            setattr(self,
                    'visit_{0}'.format(self.unary_op_nodes_map[unary_op]),
                    lambda node, unary_op=unary_op: partial(UnaryOp, unary_op))
        self.env = env

    def visit(self, node):
        """Visit an AST node, parsing ``node`` first when it is a string."""
        if isinstance(node, _string_types):
            node = ast.fix_missing_locations(ast.parse(node))
        elif not isinstance(node, ast.AST):
            raise TypeError('"node" must be an AST node or a string, you'
                            ' passed a(n) {0}'.format(node.__class__))
        return super(ExprVisitor, self).visit(node)

    def visit_Module(self, node):
        if len(node.body) != 1:
            raise ExprParserError('only a single expression is allowed')

        expr = node.body[0]
        if not isinstance(expr, ast.Expr):
            raise SyntaxError('only expressions are allowed')

        return self.visit(expr)

    def visit_Expr(self, node):
        return self.visit(node.value)

    def visit_BinOp(self, node):
        op = self.visit(node.op)
        left = self.visit(node.left)
        right = self.visit(node.right)
        return op(left, right)

    def visit_UnaryOp(self, node):
        if isinstance(node.op, ast.Not):
            raise NotImplementedError("not operator not yet supported")
        op = self.visit(node.op)
        return op(self.visit(node.operand))

    def visit_Name(self, node):
        return Term(node.id, self.env)

    def visit_Num(self, node):
        return Constant(node.n, self.env)

    def visit_Compare(self, node):
        ops = node.ops
        comps = node.comparators
        if len(ops) != 1:
            raise ExprParserError('chained comparisons not supported')
        return self.visit(ops[0])(self.visit(node.left), self.visit(comps[0]))

    def visit_Call(self, node):
        if not isinstance(node.func, ast.Name):
            raise TypeError("Only named functions are supported")

        valid_ops = _reductions + _mathops

        if node.func.id not in valid_ops:
            raise ValueError("Only {0} are supported".format(valid_ops))

        raise NotImplementedError("function calls not yet supported")

    def visit_Attribute(self, node):
        raise NotImplementedError("attribute access is not yet supported")

    def visit_BoolOp(self, node):
        raise NotImplementedError("boolean operators are not yet supported")
class Expr(StringMixin):
    """A parsed expression together with the scope it is evaluated in.

    Parameters
    ----------
    expr : str
        The expression source; it is parsed immediately.
    engine : str, default 'numexpr'
        Name of the engine the expression is meant for (stored only).
    env : Scope or None
        Namespaces used to resolve names. When falsy, the namespaces of the
        caller of this constructor are captured.
    truediv : bool, default True
        Stored, and published as ``env.locals['truediv']`` on evaluation.
    """
    def __init__(self, expr, engine='numexpr', env=None, truediv=True):
        self.expr = expr
        # frame_level=2: skip Scope.__init__ and this constructor
        self.env = env or Scope(frame_level=2)
        self._visitor = ExprVisitor(self.env)
        self.terms = self.parse()
        self.engine = engine
        self.truediv = truediv

    def __call__(self, env):
        """Evaluate the parsed expression in ``env``."""
        env.locals['truediv'] = self.truediv
        return self.terms(env)

    def __unicode__(self):
        return unicode(self.terms)

    def parse(self):
        """Return the object built by the visitor for ``self.expr``."""
        return self._visitor.visit(self.expr)

    def align(self):
        """align a set of Terms"""
        # NOTE(review): none of the op/term classes visible in this patch
        # defines ``align`` -- confirm this method is still reachable
        return self.terms.align(self.env)


def isexpr(s, check_names=True):
    """Return whether ``s`` parses as a supported expression.

    Parameters
    ----------
    s : str
        Candidate expression source.
    check_names : bool, default True
        When true, an expression that uses an undefined name is rejected.

    Returns
    -------
    bool
        False on ``SyntaxError``, ``not check_names`` on ``NameError``,
        otherwise True.
    """
    try:
        # capture the *caller's* namespaces (frame_level=2 skips
        # Scope.__init__ and isexpr); letting Expr build its own scope would
        # capture isexpr's frame instead, so the caller's variables would
        # never resolve
        Expr(s, env=Scope(frame_level=2))
    except SyntaxError:
        return False
    except NameError:
        return not check_names
    else:
        return True
= None +_evaluate = None +_where = None # the set of dtypes that we will allow pass to numexpr -_ALLOWED_DTYPES = dict(evaluate = set(['int64','int32','float64','float32','bool']), - where = set(['int64','float64','bool'])) +_ALLOWED_DTYPES = dict( + evaluate=set(['int64', 'int32', 'float64', 'float32', 'bool']), + where=set(['int64', 'float64', 'bool'])) # the minimum prod shape that we will use numexpr -_MIN_ELEMENTS = 10000 +_MIN_ELEMENTS = 10000 + -def set_use_numexpr(v = True): +def set_use_numexpr(v=True): # set/unset to use numexpr global _USE_NUMEXPR if _NUMEXPR_INSTALLED: @@ -35,26 +38,25 @@ def set_use_numexpr(v = True): global _evaluate, _where if not _USE_NUMEXPR: _evaluate = _evaluate_standard - _where = _where_standard + _where = _where_standard else: _evaluate = _evaluate_numexpr - _where = _where_numexpr + _where = _where_numexpr -def set_numexpr_threads(n = None): + +def set_numexpr_threads(n=None): # if we are using numexpr, set the threads to n # otherwise reset - try: - if _NUMEXPR_INSTALLED and _USE_NUMEXPR: - if n is None: - n = ne.detect_number_of_cores() - ne.set_num_threads(n) - except: - pass + if _NUMEXPR_INSTALLED and _USE_NUMEXPR: + if n is None: + n = ne.detect_number_of_cores() + ne.set_num_threads(n) def _evaluate_standard(op, op_str, a, b, raise_on_error=True, **eval_kwargs): """ standard evaluation """ - return op(a,b) + return op(a, b) + def _can_use_numexpr(op, op_str, a, b, dtype_check): """ return a boolean if we WILL be using numexpr """ @@ -65,13 +67,13 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): # check for dtype compatiblity dtypes = set() - for o in [ a, b ]: - if hasattr(o,'get_dtype_counts'): + for o in [a, b]: + if hasattr(o, 'get_dtype_counts'): s = o.get_dtype_counts() if len(s) > 1: return False dtypes |= set(s.index) - elif isinstance(o,np.ndarray): + elif isinstance(o, np.ndarray): dtypes |= set([o.dtype.name]) # allowed are a superset @@ -80,52 +82,54 @@ def _can_use_numexpr(op, op_str, a, b, 
dtype_check): return False -def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False, **eval_kwargs): + +def _evaluate_numexpr(op, op_str, a, b, raise_on_error=False, **eval_kwargs): result = None if _can_use_numexpr(op, op_str, a, b, 'evaluate'): try: a_value, b_value = a, b - if hasattr(a_value,'values'): + if hasattr(a_value, 'values'): a_value = a_value.values - if hasattr(b_value,'values'): + if hasattr(b_value, 'values'): b_value = b_value.values result = ne.evaluate('a_value %s b_value' % op_str, - local_dict={ 'a_value' : a_value, - 'b_value' : b_value }, + local_dict={'a_value': a_value, + 'b_value': b_value}, casting='safe', **eval_kwargs) except (ValueError) as detail: if 'unknown type object' in str(detail): pass except (Exception) as detail: if raise_on_error: - raise TypeError(str(detail)) + raise if result is None: - result = _evaluate_standard(op,op_str,a,b,raise_on_error) + result = _evaluate_standard(op, op_str, a, b, raise_on_error) return result def _where_standard(cond, a, b, raise_on_error=True): - return np.where(_values_from_object(cond), _values_from_object(a), _values_from_object(b)) + return np.where(_values_from_object(cond), _values_from_object(a), + _values_from_object(b)) -def _where_numexpr(cond, a, b, raise_on_error = False): +def _where_numexpr(cond, a, b, raise_on_error=False): result = None if _can_use_numexpr(None, 'where', a, b, 'where'): try: cond_value, a_value, b_value = cond, a, b - if hasattr(cond_value,'values'): + if hasattr(cond_value, 'values'): cond_value = cond_value.values - if hasattr(a_value,'values'): + if hasattr(a_value, 'values'): a_value = a_value.values - if hasattr(b_value,'values'): + if hasattr(b_value, 'values'): b_value = b_value.values - result = ne.evaluate('where(cond_value,a_value,b_value)', - local_dict={ 'cond_value' : cond_value, - 'a_value' : a_value, - 'b_value' : b_value }, + result = ne.evaluate('where(cond_value, a_value, b_value)', + local_dict={'cond_value': cond_value, + 'a_value': 
a_value, + 'b_value': b_value}, casting='safe') except (ValueError) as detail: if 'unknown type object' in str(detail): @@ -135,7 +139,7 @@ def _where_numexpr(cond, a, b, raise_on_error = False): raise TypeError(str(detail)) if result is None: - result = _where_standard(cond,a,b,raise_on_error) + result = _where_standard(cond, a, b, raise_on_error) return result @@ -143,7 +147,9 @@ def _where_numexpr(cond, a, b, raise_on_error = False): # turn myself on set_use_numexpr(True) -def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kwargs): + +def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, + **eval_kwargs): """ evaluate and return the expression of the op on a and b Parameters @@ -153,15 +159,18 @@ def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kw op_str: the string version of the op a : left operand b : right operand - raise_on_error : pass the error to the higher level if indicated (default is False), - otherwise evaluate the op with and return the results + raise_on_error : pass the error to the higher level if indicated + (default is False), otherwise evaluate the op with and + return the results use_numexpr : whether to try to use numexpr (default True) """ if use_numexpr: - return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, **eval_kwargs) + return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, + **eval_kwargs) return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error) + def where(cond, a, b, raise_on_error=False, use_numexpr=True): """ evaluate the where condition cond on a and b @@ -171,8 +180,9 @@ def where(cond, a, b, raise_on_error=False, use_numexpr=True): cond : a boolean array a : return if cond is True b : return if cond is False - raise_on_error : pass the error to the higher level if indicated (default is False), - otherwise evaluate the op with and return the results + raise_on_error : pass the error to the higher level if 
_reductions = 'sum', 'prod'
_mathops = ('sin', 'cos', 'exp', 'log', 'expm1', 'log1p', 'pow', 'div', 'sqrt',
            'inv', 'sinh', 'cosh', 'tanh', 'arcsin', 'arccos', 'arctan',
            'arccosh', 'arcsinh', 'arctanh', 'arctan2', 'abs')

try:  # Python 2
    _string_types = (basestring,)
except NameError:  # Python 3
    _string_types = (str,)

# marks "not found" so that a variable bound to None still counts as defined
_MISSING = object()


class OperatorError(Exception):
    pass


class UnaryOperatorError(OperatorError):
    pass


class BinaryOperatorError(OperatorError):
    pass


def _resolve_name(env, key):
    """Look ``key`` up in ``env.locals`` and then in ``env.globals``.

    Parameters
    ----------
    env : object
        Anything with dict-like ``locals`` and ``globals`` attributes.
    key : object
        A variable name, or a literal value.

    Returns
    -------
    The value bound to ``key``; a non-string ``key`` that is not bound
    anywhere is returned as is (it is a literal).

    Raises
    ------
    NameError
        If ``key`` is a string that is bound in neither namespace.
    """
    res = env.locals.get(key, _MISSING)
    if res is _MISSING:
        res = env.globals.get(key, _MISSING)

    if res is not _MISSING:
        return res

    if not isinstance(key, _string_types):
        return key

    raise NameError('name {0!r} is not defined'.format(key))


def _update_name(env, key, value):
    """Rebind the existing name ``key`` to ``value``, locals before globals.

    Non-string keys (literals) are ignored.

    Raises
    ------
    NameError
        If ``key`` is a string that is bound in neither namespace.
    """
    if not isinstance(key, _string_types):
        return

    for namespace in (env.locals, env.globals):
        if key in namespace:
            namespace[key] = value
            return

    raise NameError('name {0!r} is not defined'.format(key))
class Term(StringMixin):
    """A named variable of an expression, resolved against a scope.

    Attributes
    ----------
    name : the name the term was created with
    env : the scope used for lookups and updates
    value : what ``_resolve_name`` returned for ``name``
    type : ``value.values.dtype`` when available, else ``value.dtype``,
        else ``type(value)``
    """
    def __init__(self, name, env):
        self.name = name
        self.env = env
        self.value = _resolve_name(self.env, self.name)

        try:
            # ndframe potentially very slow for large, mixed dtype frames
            self.type = self.value.values.dtype
        except AttributeError:
            try:
                # ndarray
                self.type = self.value.dtype
            except AttributeError:
                # scalar
                self.type = type(self.value)

    def __unicode__(self):
        return com.pprint_thing(self.name)

    def update(self, value):
        """Rebind the term's name in its scope and remember ``value``."""
        _update_name(self.env, self.name, value)
        self.value = value

    @property
    def isscalar(self):
        return np.isscalar(self.value)


class Constant(Term):
    """A literal; its "name" is the literal value itself."""
    def __init__(self, value, env):
        super(Constant, self).__init__(value, env)


def _print_operand(opr):
    return opr.name if is_term(opr) else unicode(opr)


class Op(StringMixin):
    """Hold an operator of unknown arity
    """
    def __init__(self, op, operands):
        self.op = op
        self.operands = operands

    def __iter__(self):
        return iter(self.operands)

    def __unicode__(self):
        """Print a generic n-ary operator and its operands using infix
        notation"""
        # recurse over the operands
        parened = ('({0})'.format(_print_operand(opr))
                   for opr in self.operands)
        return com.pprint_thing(' {0} '.format(self.op).join(parened))

    @property
    def return_type(self):
        # clobber types to bool if the op is a boolean operator
        if self.op in (_cmp_ops_syms + _bool_ops_syms):
            return np.bool_
        return np.result_type(*(term.type for term in flatten(self)))


# '=' is accepted as a second spelling of '=='; both map to op.eq
_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', '='
_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, op.eq
_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs))

_bool_ops_syms = '&', '|'
_bool_ops_funcs = op.and_, op.or_
_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs))

_arith_ops_syms = '+', '-', '*', '/', '**', '//', '%'
_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv if PY3 else op.div,
                    op.pow, op.floordiv, op.mod)
_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs))

_binary_ops_dict = {}

for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict):
    _binary_ops_dict.update(d)


def _cast_inplace(terms, dtype):
    """Cast the value of every leaf term under ``terms`` to ``dtype``.

    Objects that have an ``operands`` attribute are descended into; every
    other object is treated as a leaf whose ``value`` is cast and stored
    back through ``update``.
    """
    dt = np.dtype(dtype)
    for term in terms:
        operands = getattr(term, 'operands', None)
        if operands is not None:
            # cast all the way down the tree; kept outside any try block so
            # that an AttributeError raised further down is reported where
            # it happened
            _cast_inplace(operands, dtype)
            continue

        # we've bottomed out so actually do the cast
        try:
            new_value = term.value.astype(dt)
        except AttributeError:
            new_value = dt.type(term.value)
        term.update(new_value)
def is_term(obj):
    return isinstance(obj, Term)


def is_const(obj):
    return isinstance(obj, Constant)


def _operand_value(operand, env):
    """Evaluate ``operand`` when it is callable and unwrap terms.

    Callable operands (the op classes below define ``__call__``) are called
    with ``env``; a ``Term`` is replaced by its ``value``; anything else is
    returned unchanged.
    """
    result = operand(env) if callable(operand) else operand
    return result.value if is_term(result) else result


class BinOp(Op):
    """Hold a binary operator and its operands

    Parameters
    ----------
    op : str
        The operator symbol, a key of ``_binary_ops_dict``
    lhs : Term or Op
    rhs : Term or Op

    Raises
    ------
    BinaryOperatorError
        If ``op`` is not a known binary operator.
    """
    def __init__(self, op, lhs, rhs):
        super(BinOp, self).__init__(op, (lhs, rhs))
        self.lhs = lhs
        self.rhs = rhs

        try:
            self.func = _binary_ops_dict[op]
        except KeyError:
            keys = _binary_ops_dict.keys()
            raise BinaryOperatorError('Invalid binary operator {0}, valid'
                                      ' operators are {1}'.format(op, keys))

    def __call__(self, env):
        """Evaluate both operands in ``env`` and apply the operator."""
        # handle truediv with a local so that one evaluation with truediv
        # enabled does not change this node for later evaluations
        func = self.func
        if self.op == '/' and env.locals.get('truediv'):
            func = op.truediv

        # left before right; errors raised while evaluating an operand
        # propagate unchanged
        left = _operand_value(self.lhs, env)
        right = _operand_value(self.rhs, env)
        return func(left, right)


class Mod(BinOp):
    """The ``%`` operator; its operands are cast to float64 up front."""
    def __init__(self, lhs, rhs):
        super(Mod, self).__init__('%', lhs, rhs)
        # np.float64 is what the np.float_ alias referred to; the alias was
        # removed in NumPy 2.0
        _cast_inplace(self.operands, np.float64)


_unary_ops_syms = '+', '-', '~'
_unary_ops_funcs = op.pos, op.neg, op.invert
_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs))


class UnaryOp(Op):
    """Hold a unary operator and its operands

    Raises
    ------
    UnaryOperatorError
        If ``op`` is not a known unary operator.
    """
    def __init__(self, op, operand):
        super(UnaryOp, self).__init__(op, (operand,))
        self.operand = operand

        try:
            self.func = _unary_ops_dict[op]
        except KeyError:
            raise UnaryOperatorError('Invalid unary operator {0}, valid '
                                     'operators are '
                                     '{1}'.format(op, _unary_ops_syms))

    def __call__(self, env):
        """Evaluate the operand in ``env`` and apply the operator."""
        v = _operand_value(self.operand, env)

        try:
            res = self.func(v)
        except TypeError:
            # retry on the underlying array when the object itself does not
            # support the operator
            res = self.func(v.values)

        return res

    def __unicode__(self):
        return com.pprint_thing('{0}({1})'.format(self.op, self.operand))
def skip_numexpr_engine(engine):
    """Skip the running test when numexpr is requested but not in use."""
    if not _USE_NUMEXPR and engine == 'numexpr':
        raise nose.SkipTest


def engine_has_neg_frac(engine):
    return _engines[engine].has_neg_frac


def fractional(x):
    """Return the fractional part(s) of ``x`` as computed by ``np.modf``."""
    frac, _ = np.modf(np.asanyarray(x))
    return frac


def hasfractional(x):
    """Return whether any element of ``x`` has a non-zero fractional part."""
    return np.any(fractional(x))


def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2):
    """Compute ``(lhs cmp1 rhs) binop (lhs cmp2 rhs)`` on aligned operands.

    The parameters must keep the names ``lhs`` and ``rhs``: ``Scope()``
    captures this function's locals and the terms are looked up by name.
    """
    f1 = _binary_ops_dict[cmp1]
    f2 = _binary_ops_dict[cmp2]
    bf = _binary_ops_dict[binop]
    env = Scope()
    typ, axes = _align_core((Term('lhs', env), Term('rhs', env)))
    # the aligned values are read back from the scope the terms updated
    lhs, rhs = env.locals['lhs'], env.locals['rhs']
    # _reconstruct_object requires a dtype; comparisons combined with a
    # boolean operator give a boolean result
    return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes,
                               np.bool_)


def _eval_single_bin(lhs, cmp1, rhs, has_neg_frac):
    """Apply the binary operator ``cmp1``; with ``has_neg_frac`` a
    ``ValueError`` is turned into NaN."""
    c = _binary_ops_dict[cmp1]
    if not has_neg_frac:
        return c(lhs, rhs)

    try:
        return c(lhs, rhs)
    except ValueError:
        return np.nan


def isframe(x):
    return isinstance(x, pd.DataFrame)


def isseries(x):
    return isinstance(x, pd.Series)


def are_compatible_types(op, lhs, rhs):
    """Return False only for ``&``/``|`` between a DataFrame and a Series."""
    if op in ('&', '|'):
        if isframe(lhs) and isseries(rhs) or isframe(rhs) and isseries(lhs):
            return False
    return True


def _eval_bin_and_unary(unary, lhs, arith1, rhs):
    """Compute ``unary(lhs arith1 rhs)`` with the plain operator functions."""
    binop = _binary_ops_dict[arith1]
    # use the table imported from pandas.computation.ops; the ``expr`` module
    # does not import ``_unary_ops_dict``
    unop = _unary_ops_dict[unary]
    return unop(binop(lhs, rhs))


def _series_and_2d_ndarray(lhs, rhs):
    """Return whether one operand is a Series and the other an ndarray with
    more than one dimension."""
    return (com.is_series(lhs) and isinstance(rhs, np.ndarray) and rhs.ndim > 1
            or com.is_series(rhs) and isinstance(lhs, np.ndarray) and lhs.ndim
            > 1)
Series([1, 2, np.nan, np.nan, 5]), nan_df) + + def setUp(self): + try: + import numexpr as ne + self.ne = ne + except ImportError: + raise nose.SkipTest + self.set_current_engine() + self.setup_data() + self.current_engines = filter(lambda x: x != self.engine, + _engines.iterkeys()) + + @slow + def test_complex_cmp_ops(self): + self.setUp() + lhses, rhses = self.lhses, self.rhses + args = itertools.product(lhses, self.cmp_ops, rhses, self.bin_ops, + self.cmp2_ops) + for lhs, cmp1, rhs, binop, cmp2 in args: + self._create_cmp_op_t(lhs, cmp1, rhs, binop, cmp2) + + def test_simple_cmp_ops(self): + bool_lhses = (DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), randbool()) + bool_rhses = (DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), randbool()) + args = itertools.product(bool_lhses, bool_rhses, self.cmp_ops) + for lhs, rhs, cmp_op in args: + self._create_simple_cmp_op_t(lhs, rhs, cmp_op) + + def test_binary_arith_ops(self): + self.setUp() + lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + args = itertools.product(lhses, self.arith_ops, rhses) + for lhs, op, rhs in args: + self._create_arith_op_t(lhs, op, rhs) + + def test_unary_arith_ops(self): + self.setUp() + lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + aops = tuple(aop for aop in self.arith_ops if aop not in '+-') + args = itertools.product(self.unary_ops, lhses, aops, rhses) + for unary_op, lhs, arith_op, rhs in args: + self._create_unary_arith_op_t(unary_op, lhs, arith_op, rhs) + + def test_invert(self): + self.setUp() + lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + args = itertools.product(lhses, self.cmp_ops, rhses) + for lhs, op, rhs in args: + self._create_invert_op_t(lhs, op, rhs) + + def _create_cmp_op_t(self, lhs, cmp1, rhs, binop, cmp2): + ex = '(lhs {cmp1} 
rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, + binop=binop, + cmp2=cmp2) + if _series_and_2d_ndarray(lhs, rhs): + self.assertRaises(Exception, _eval_from_expr, lhs, cmp1, rhs, + binop, cmp2) + self.assertRaises(Exception, pd.eval, ex, engine=self.engine) + else: + expected = _eval_from_expr(lhs, cmp1, rhs, binop, cmp2) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + + def _create_simple_cmp_op_t(self, lhs, rhs, cmp1): + ex = 'lhs {0} rhs'.format(cmp1) + + if are_compatible_types(cmp1, lhs, rhs): + expected = _eval_single_bin(lhs, cmp1, rhs, + engine_has_neg_frac(self.engine)) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + else: + assert_raises(TypeError, _eval_single_bin, lhs, cmp1, rhs, + engine_has_neg_frac(self.engine)) + + def _create_arith_op_t(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + nan_frac_neg = (arith1 == '**' and np.any(lhs < 0) and + hasfractional(rhs) and np.isscalar(lhs) and + np.isscalar(rhs) and + not (isinstance(lhs, tuple(np.typeDict.values())) + or isinstance(rhs, tuple(np.typeDict.values())))) + if nan_frac_neg and not engine_has_neg_frac(self.engine): + assert_raises(ValueError, pd.eval, ex, engine=self.engine, + local_dict=locals(), global_dict=globals()) + else: + result = pd.eval(ex, engine=self.engine) + + if arith1 != '//': + expected = _eval_single_bin(lhs, arith1, rhs, + engine_has_neg_frac(self.engine)) + # roundoff error with modulus + if arith1 == '%': + assert_allclose(result, expected) + else: + assert_array_equal(result, expected) + + # sanity check on recursive parsing + try: + ghs = rhs.copy() + except AttributeError: + ghs = rhs + + if nan_frac_neg and not engine_has_neg_frac(self.engine): + assert_raises(ValueError, pd.eval, ex, engine=self.engine, + local_dict=locals(), global_dict=globals()) + else: + if arith1 == '**': + ex = '(lhs {0} rhs) {0} ghs'.format(arith1) + else: + ex = 'lhs {0} rhs {0} ghs'.format(arith1) + result = 
pd.eval(ex, engine=self.engine) + + try: + nlhs = _eval_single_bin(lhs, arith1, rhs, + engine_has_neg_frac(self.engine)) + except ValueError: + assert_raises(ValueError, _eval_single_bin, lhs, arith1, rhs, + engine_has_neg_frac(self.engine)) + else: + try: + nlhs, ghs = nlhs.align(ghs) + except: + pass + if arith1 != '//': + expected = self.ne.evaluate('nlhs {0} ghs'.format(arith1)) + + # roundoff error with modulus + if arith1 == '%': + assert_allclose(result, expected) + else: + assert_array_equal(result, expected) + + def _create_invert_op_t(self, lhs, cmp1, rhs): + # simple + for el in (lhs, rhs): + try: + elb = el.astype(bool) + except AttributeError: + elb = np.array([bool(el)]) + expected = ~elb + result = pd.eval('~elb', engine=self.engine) + assert_array_equal(expected, result) + + for engine in self.current_engines: + assert_array_equal(result, pd.eval('~elb', engine=engine)) + + # compound + ex = '~(lhs {0} rhs)'.format(cmp1) + if np.isscalar(lhs) and np.isscalar(rhs): + lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) + expected = ~_eval_single_bin(lhs, cmp1, rhs, + engine_has_neg_frac(self.engine)) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(expected, result) + + # make sure the other engines work + for engine in self.current_engines: + ev = pd.eval(ex, engine=self.engine) + assert_array_equal(ev, result) + + def _create_unary_arith_op_t(self, unary_op, lhs, arith1, rhs): + # simple + ex = '{0}lhs'.format(unary_op, arith1) + f = _unary_ops_dict[unary_op] + bad_types = tuple(np.typeDict.values()) + + nan_frac_neg = (arith1 == '**' and + np.any(lhs < 0) and + hasfractional(rhs) and + np.isscalar(lhs) and np.isscalar(rhs) and + not (isinstance(lhs, bad_types) or + isinstance(rhs, bad_types)) + and not engine_has_neg_frac(self.engine)) + try: + expected = f(lhs.values) + except AttributeError: + expected = f(lhs) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + + for engine in self.current_engines: 
+ assert_array_equal(result, pd.eval(ex, engine=engine)) + + ex = '{0}(lhs {1} rhs)'.format(unary_op, arith1) + + if nan_frac_neg: + assert_raises(ValueError, pd.eval, ex, engine=self.engine, + local_dict=locals(), global_dict=globals()) + else: + # compound + result = pd.eval(ex, engine=self.engine) + + #(lhs, rhs), _ = _align((lhs, rhs)) + #if arith1 != '//': + #expected = self.ne.evaluate(ex) + #assert_array_equal(result, expected) + #else: + #assert_raises(TypeError, self.ne.evaluate, ex) + + #for engine in self.current_engines: + #if arith1 != '//': + #if engine_has_neg_frac(engine): + #assert_array_equal(result, pd.eval(ex, engine=engine)) + #else: + #assert_raises(TypeError, pd.eval, ex, engine=engine, + #local_dict=locals(), global_dict=globals()) + + +class TestBasicEvalPython(TestBasicEval): + + @classmethod + def setUpClass(cls): + cls.cmp_ops = expr._cmp_ops_syms + cls.cmp2_ops = cls.cmp_ops[::-1] + cls.bin_ops = expr._bool_ops_syms + cls.arith_ops = expr._arith_ops_syms + cls.unary_ops = '+', '-' + + def set_current_engine(self): + self.engine = 'python' + + +def test_syntax_error_exprs(): + for engine in _engines: + e = 's +' + assert_raises(SyntaxError, pd.eval, e, engine=engine) + + +def test_name_error_exprs(): + for engine in _engines: + e = 's + t' + assert_raises(NameError, pd.eval, e, engine=engine) + + +def test_align_nested_unary_op(): + for engine in _engines: + yield check_align_nested_unary_op, engine + + +f = lambda *args, **kwargs: np.random.randn() + + +def check_align_nested_unary_op(engine): + skip_numexpr_engine(engine) + s = 'df * ~2' + df = mkdf(10, 10, data_gen_f=f) + res = pd.eval(s, engine) + assert_frame_equal(res, df * ~2) + + +def check_basic_frame_alignment(engine): + df = mkdf(10, 10, data_gen_f=f) + df2 = mkdf(20, 10, data_gen_f=f) + res = pd.eval('df + df2', engine=engine) + assert_frame_equal(res, df + df2) + + +def test_basic_frame_alignment(): + for engine in _engines: + yield check_basic_frame_alignment, engine + + 
+def check_medium_complex_frame_alignment(engine, r1, r2, c1, c2): + skip_numexpr_engine(engine) + df = mkdf(5, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(10, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + df3 = mkdf(15, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + res = pd.eval('df + df2 + df3', engine=engine) + assert_frame_equal(res, df + df2 + df3) + + +@slow +def test_medium_complex_frame_alignment(): + args = product(_engines, *([INDEX_TYPES[:4]] * 4)) + for engine, r1, r2, c1, c2 in args: + check_medium_complex_frame_alignment(engine, r1, r2, c1, c2) + + +def check_basic_frame_series_alignment(engine, r_idx_type, c_idx_type, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 'df + s', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('df + s', engine=engine) + expected = df + s + assert_frame_equal(res, expected) + + +def check_not_both_period_fails_otherwise_succeeds(lhs, rhs, r_idx_type, + c_idx_type, index_name, s, + df, *terms): + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, lhs, local_dict=locals()) + assert_raises(ValueError, pd.eval, rhs, local_dict=locals()) + else: + a, b = pd.eval(lhs), pd.eval(rhs) + assert_frame_equal(a, b) + + +def check_basic_series_frame_alignment(engine, r_idx_type, c_idx_type, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 's + df', local_dict=locals()) + 
assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('s + df', engine=engine) + expected = s + df + assert_frame_equal(res, expected) + + +@slow +def check_basic_series_frame_alignment_datetime(engine, r_idx_type, c_idx_type, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 's + df', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('s + df', engine=engine) + expected = s + df + assert_frame_equal(res, expected) + + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 'df + s', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('df + s', engine=engine) + expected = df + s + assert_frame_equal(res, expected) + + +def check_series_frame_commutativity(engine, r_idx_type, c_idx_type, op, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + lhs = 's {0} df'.format(op) + rhs = 'df {0} s'.format(op) + check_not_both_period_fails_otherwise_succeeds(lhs, rhs, r_idx_type, + c_idx_type, index_name, s, + df) + + +INDEX_TYPES = 'i', 'f', 's', 'u', # 'dt', # 'p' + + +@slow +def test_series_frame_commutativity(): + args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('+', '*'), ('index', + 'columns')) + for engine, r_idx_type, c_idx_type, op, index_name in args: + check_series_frame_commutativity(engine, r_idx_type, c_idx_type, op, + index_name) + + +def test_basic_frame_series_alignment(): + args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in 
args: + check_basic_frame_series_alignment(engine, r_idx_type, c_idx_type, + index_name) + + +@slow +def test_basic_series_frame_alignment_datetime(): + idx_types = INDEX_TYPES + args = product(_engines, idx_types, idx_types, ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + check_basic_series_frame_alignment_datetime(engine, r_idx_type, + c_idx_type, index_name) + + +def test_basic_series_frame_alignment(): + args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + check_basic_series_frame_alignment(engine, r_idx_type, c_idx_type, + index_name) + + +def check_complex_series_frame_alignment(engine, index_name, obj, r1, r2, c1, + c2): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + index = getattr(locals()[obj], index_name) + s = Series(np.random.randn(5), index[:5]) + if engine != 'python': + expected = df2.add(s, axis=1).add(df) + else: + expected = df2 + s + df + res = pd.eval('df2 + s + df', engine=engine) + expected = df2 + s + df + assert_tuple_equal(res.shape, expected.shape) + assert_frame_equal(res, expected) + + +@slow +def test_complex_series_frame_alignment(): + args = product(_engines, ('index', 'columns'), ('df', 'df2'), + *([INDEX_TYPES[:4]] * 4)) + for engine, index_name, obj, r1, r2, c1, c2 in args: + check_complex_series_frame_alignment(engine, index_name, obj, r1, r2, + c1, c2) + + +def check_datetime_index_rows_punts_to_python(engine): + df = mkdf(10, 10, data_gen_f=f, r_idx_type='dt', c_idx_type='dt') + index = getattr(df, 'index') + s = Series(np.random.randn(5), index[:5]) + env = Scope(globals(), locals()) + + +def test_datetime_index_rows_punts_to_python(): + for engine in _engines: + check_datetime_index_rows_punts_to_python(engine) + + +def test_truediv(): + for engine in _engines: + check_truediv(engine) + + +def 
check_truediv(engine): + s = np.array([1]) + ex = 's / 1' + + if PY3: + res = pd.eval(ex, truediv=False) + assert_array_equal(res, np.array([1.0])) + + res = pd.eval(ex, truediv=True) + assert_array_equal(res, np.array([1.0])) + else: + res = pd.eval(ex, truediv=False) + assert_array_equal(res, np.array([1])) + + res = pd.eval(ex, truediv=True) + assert_array_equal(res, np.array([1.0])) + + +__var_s = randn(10) + + +def check_global_scope(engine): + e = '__var_s * 2' + assert_array_equal(__var_s * 2, pd.eval(e, engine=engine)) + + +def test_global_scope(): + for engine in _engines: + yield check_global_scope, engine + + +def check_is_expr(engine): + s = 1 + valid = 's + 1' + invalid = 's +' + assert_true(expr.isexpr(valid, check_names=True)) + assert_true(expr.isexpr(valid, check_names=False)) + assert_false(expr.isexpr(invalid, check_names=False)) + assert_false(expr.isexpr(invalid, check_names=True)) + + +def test_is_expr(): + for engine in _engines: + check_is_expr(engine) + + +def check_not_fails(engine): + x = True + assert_raises(NotImplementedError, pd.eval, 'not x', engine=engine, + local_dict={'x': x}) + + +def test_not_fails(): + for engine in _engines: + check_not_fails(engine) + + +def check_and_fails(engine): + x, y = False, True + assert_raises(NotImplementedError, pd.eval, 'x and y', engine=engine, + local_dict={'x': x, 'y': y}) + + +def test_and_fails(): + for engine in _engines: + check_and_fails(engine) + + +def check_or_fails(engine): + x, y = True, False + assert_raises(NotImplementedError, pd.eval, 'x or y', engine=engine, + local_dict={'x': x, 'y': y}) + + +def test_or_fails(): + for engine in _engines: + check_or_fails(engine) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/core/base.py b/pandas/core/base.py index a2f7f04053b9f..fb0d56113ede9 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -48,6 +48,7 @@ def __repr__(self): """ return 
str(self) + class PandasObject(StringMixin): """baseclass for various pandas objects""" diff --git a/pandas/core/common.py b/pandas/core/common.py index 34aaa08b57171..89407121f959a 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -18,6 +18,7 @@ from pandas.core.config import get_option from pandas.core import array as pa +import pandas as pd class PandasError(Exception): pass @@ -1656,6 +1657,29 @@ def is_bool(obj): return isinstance(obj, (bool, np.bool_)) +def is_string(obj): + return isinstance(obj, (basestring, np.str_, np.unicode_)) + + +def is_series(obj): + return isinstance(obj, pd.Series) + + +def is_frame(obj): + return isinstance(obj, pd.DataFrame) + + +def is_panel(obj): + return isinstance(obj, pd.Panel) + + +def is_pd_obj(obj): + return isinstance(obj, pd.core.generic.PandasObject) + + +def is_ndframe(obj): + return isinstance(obj, pd.core.generic.NDFrame) + def is_integer(obj): return isinstance(obj, (int, long, np.integer)) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f56b6bc00cf15..c957ec9d331b9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -35,8 +35,8 @@ create_block_manager_from_arrays, create_block_manager_from_blocks) from pandas.core.series import Series, _radd_compat -import pandas.core.expressions as expressions from pandas.sparse.array import SparseArray +import pandas.computation.expressions as expressions from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index c265d1590af95..11ce27b078b18 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -18,8 +18,7 @@ from pandas.sparse.array import _maybe_to_sparse, SparseArray import pandas.lib as lib import pandas.tslib as tslib -import pandas.core.expressions as expressions -from pandas.util.decorators import cache_readonly +import 
pandas.computation.expressions as expressions from pandas.tslib import Timestamp from pandas import compat diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index c8224f761ce17..6e7f721950a15 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -32,7 +32,6 @@ from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type import pandas.core.common as com from pandas.tools.merge import concat -from pandas import compat from pandas.io.common import PerformanceWarning from pandas.core.config import get_option @@ -222,9 +221,12 @@ def get_store(path, **kwargs): Examples -------- + >>> from pandas import DataFrame + >>> from numpy.random import randn + >>> bar = DataFrame(randn(10, 4)) >>> with get_store('test.h5') as store: - >>> store['foo'] = bar # write to HDF5 - >>> bar = store['foo'] # retrieve + ... store['foo'] = bar # write to HDF5 + ... bar = store['foo'] # retrieve """ store = None try: @@ -237,7 +239,8 @@ def get_store(path, **kwargs): # interface to/from ### -def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, append=None, **kwargs): +def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, + append=None, **kwargs): """ store this object, close it if we opened it """ if append: f = lambda store: store.append(key, value, **kwargs) @@ -332,6 +335,9 @@ class HDFStore(StringMixin): Examples -------- + >>> from pandas import DataFrame + >>> from numpy.random import randn + >>> bar = DataFrame(randn(10, 4)) >>> store = HDFStore('test.h5') >>> store['foo'] = bar # write to HDF5 >>> bar = store['foo'] # retrieve @@ -341,9 +347,9 @@ class HDFStore(StringMixin): def __init__(self, path, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs): try: - import tables as _ + import tables except ImportError: # pragma: no cover - raise Exception('HDFStore requires PyTables') + raise ImportError('HDFStore requires PyTables') self._path = path if mode is None: @@ -523,7 +529,8 @@ def 
get(self, key): raise KeyError('No object named %s in the file' % key) return self._read_group(group) - def select(self, key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, auto_close=False, **kwargs): + def select(self, key, where=None, start=None, stop=None, columns=None, + iterator=False, chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas object stored in file, optionally based on where criteria @@ -554,17 +561,22 @@ def select(self, key, where=None, start=None, stop=None, columns=None, iterator= # what we are actually going to do for a chunk def func(_start, _stop): - return s.read(where=where, start=_start, stop=_stop, columns=columns, **kwargs) + return s.read(where=where, start=_start, stop=_stop, + columns=columns, **kwargs) if iterator or chunksize is not None: if not s.is_table: raise TypeError( "can only use an iterator or chunksize on a table") - return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close) + return TableIterator(self, func, nrows=s.nrows, start=start, + stop=stop, chunksize=chunksize, + auto_close=auto_close) - return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, auto_close=auto_close).get_values() + return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, + auto_close=auto_close).get_values() - def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): + def select_as_coordinates( + self, key, where=None, start=None, stop=None, **kwargs): """ return the selection as an Index @@ -599,7 +611,9 @@ def select_column(self, key, column, **kwargs): """ return self.get_storer(key).read_column(column=column, **kwargs) - def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=None, stop=None, iterator=False, chunksize=None, auto_close=False, **kwargs): + def select_as_multiple(self, keys, where=None, selector=None, columns=None, + start=None, stop=None, 
iterator=False, + chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas objects from multiple tables Parameters @@ -624,10 +638,10 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star return self.select(key=keys, where=where, columns=columns, start=start, stop=stop, iterator=iterator, chunksize=chunksize, **kwargs) if not isinstance(keys, (list, tuple)): - raise Exception("keys must be a list/tuple") + raise TypeError("keys must be a list/tuple") - if len(keys) == 0: - raise Exception("keys must have a non-zero length") + if not len(keys): + raise ValueError("keys must have a non-zero length") if selector is None: selector = keys[0] @@ -642,7 +656,8 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star raise TypeError("Invalid table [%s]" % k) if not t.is_table: raise TypeError( - "object [%s] is not a table, and cannot be used in all select as multiple" % t.pathname) + "object [%s] is not a table, and cannot be used in all select as multiple" % + t.pathname) if nrows is None: nrows = t.nrows @@ -655,7 +670,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star c = self.select_as_coordinates( selector, where, start=start, stop=stop) nrows = len(c) - except (Exception) as detail: + except Exception: raise ValueError("invalid selector [%s]" % selector) def func(_start, _stop): @@ -777,8 +792,8 @@ def append(self, key, value, format=None, append=True, columns=None, dropna=None data in the table, so be careful """ if columns is not None: - raise Exception( - "columns is not a supported keyword in append, try data_columns") + raise TypeError("columns is not a supported keyword in append, " + "try data_columns") if dropna is None: dropna = get_option("io.hdf.dropna_table") @@ -809,8 +824,9 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, d """ if axes is not None: - raise Exception( - "axes is currently not accepted as a paremter to 
append_to_multiple; you can create the tables indepdently instead") + raise TypeError("axes is currently not accepted as a parameter to" + " append_to_multiple; you can create the " + "tables indepdently instead") if not isinstance(d, dict): raise ValueError( @@ -876,7 +892,7 @@ def create_table_index(self, key, **kwargs): # version requirements _tables() if not _table_supports_index: - raise Exception("PyTables >= 2.3 is required for table indexing") + raise ValueError("PyTables >= 2.3 is required for table indexing") s = self.get_storer(key) if s is None: @@ -930,7 +946,11 @@ def copy( """ new_store = HDFStore( - file, mode=mode, complib=complib, complevel=complevel, fletcher32 = fletcher32) + file, + mode=mode, + complib=complib, + complevel=complevel, + fletcher32=fletcher32) if keys is None: keys = list(self.keys()) if not isinstance(keys, (tuple, list)): @@ -1142,7 +1162,8 @@ class TableIterator(object): kwargs : the passed kwargs """ - def __init__(self, store, func, nrows, start=None, stop=None, chunksize=None, auto_close=False): + def __init__(self, store, func, nrows, start=None, stop=None, + chunksize=None, auto_close=False): self.store = store self.func = func self.nrows = nrows or 0 @@ -1251,7 +1272,12 @@ def set_table(self, table): def __unicode__(self): temp = tuple( - map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))) + map(pprint_thing, + (self.name, + self.cname, + self.axis, + self.pos, + self.kind))) return "name->%s,cname->%s,axis->%s,pos->%s,kind->%s" % temp def __eq__(self, other): @@ -1361,9 +1387,7 @@ def validate_col(self, itemsize=None): """ validate this column: return the compared against itemsize """ # validate this column for string truncation (or reset to the max size) - dtype = getattr(self, 'dtype', None) if _ensure_decoded(self.kind) == u('string'): - c = self.col if c is not None: if itemsize is None: @@ -1467,7 +1491,8 @@ class DataCol(IndexCol): _info_fields = ['tz'] @classmethod - def 
create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs): + def create_for_block( + cls, i=None, name=None, cname=None, version=None, **kwargs): """ return a new datacol with the block i """ if cname is None: @@ -1487,7 +1512,8 @@ def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs) return cls(name=name, cname=cname, **kwargs) - def __init__(self, values=None, kind=None, typ=None, cname=None, data=None, block=None, **kwargs): + def __init__(self, values=None, kind=None, typ=None, + cname=None, data=None, block=None, **kwargs): super(DataCol, self).__init__( values=values, kind=kind, typ=typ, cname=cname, **kwargs) self.dtype = None @@ -1540,7 +1566,8 @@ def set_kind(self): if self.typ is None: self.typ = getattr(self.description, self.cname, None) - def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=None, **kwargs): + def set_atom(self, block, existing_col, min_itemsize, + nan_rep, info, encoding=None, **kwargs): """ create and setup my atom from the block b """ self.values = list(block.items) @@ -1596,7 +1623,11 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No # end up here ### elif inferred_type == 'string' or dtype == 'object': self.set_atom_string( - block, existing_col, min_itemsize, nan_rep, encoding) + block, + existing_col, + min_itemsize, + nan_rep, + encoding) else: self.set_atom_data(block) @@ -1605,7 +1636,8 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) - def set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding): + def set_atom_string( + self, block, existing_col, min_itemsize, nan_rep, encoding): # fill nan items with myself block = block.fillna(nan_rep)[0] data = block.values @@ -1701,13 +1733,13 @@ def validate_attr(self, append): if (existing_fields is not None and 
existing_fields != list(self.values)): raise ValueError("appended items do not match existing items" - " in table!") + " in table!") existing_dtype = getattr(self.attrs, self.dtype_attr, None) if (existing_dtype is not None and existing_dtype != self.dtype): raise ValueError("appended items dtype do not match existing items dtype" - " in table!") + " in table!") def convert(self, values, nan_rep, encoding): """ set the data from this selection (and convert to the correct dtype if we can) """ @@ -1855,6 +1887,9 @@ def __unicode__(self): return "%-12.12s (shape->%s)" % (self.pandas_type, s) return self.pandas_type + def __str__(self): + return self.__repr__() + def set_object_info(self): """ set my pandas type & version """ self.attrs.pandas_type = str(self.pandas_kind) @@ -2058,7 +2093,7 @@ def read_index(self, key): _, index = self.read_index_node(getattr(self.group, key)) return index else: # pragma: no cover - raise Exception('unrecognized index variety: %s' % variety) + raise TypeError('unrecognized index variety: %s' % variety) def write_index(self, key, index): if isinstance(index, MultiIndex): @@ -2241,7 +2276,7 @@ def write_array(self, key, value, items=None): warnings.warn(ws, PerformanceWarning) vlarr = self._handle.createVLArray(self.group, key, - _tables().ObjectAtom()) + _tables().ObjectAtom()) vlarr.append(value) elif value.dtype.type == np.datetime64: self._handle.createArray(self.group, key, value.view('i8')) @@ -2381,8 +2416,7 @@ def read(self, **kwargs): sdict = {} for name in items: key = 'sparse_frame_%s' % name - node = getattr(self.group, key) - s = SparseFrameFixed(self.parent, getattr(self.group, key)) + s = SparseFrameStorer(self.parent, getattr(self.group, key)) s.infer_axes() sdict[name] = s.read() return SparsePanel(sdict, items=items, default_kind=self.default_kind, @@ -2574,7 +2608,8 @@ def validate(self, other): oax = ov[i] if sax != oax: raise ValueError( - "invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c, 
sax, oax)) + "invalid combinate of [%s] on appending data [%s] vs current table [%s]" % + (c, sax, oax)) # should never get here raise Exception( @@ -2706,14 +2741,14 @@ def validate_min_itemsize(self, min_itemsize): continue if k not in q: raise ValueError( - "min_itemsize has the key [%s] which is not an axis or data_column" % k) + "min_itemsize has the key [%s] which is not an axis or data_column" % + k) @property def indexables(self): """ create/cache the indexables if they don't exist """ if self._indexables is None: - d = self.description self._indexables = [] # index columns @@ -2848,7 +2883,8 @@ def validate_data_columns(self, data_columns, min_itemsize): # return valid columns in the order of our axis return [c for c in data_columns if c in axis_labels] - def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): + def create_axes(self, axes, obj, validate=True, nan_rep=None, + data_columns=None, min_itemsize=None, **kwargs): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields @@ -2869,8 +2905,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, try: axes = _AXES_MAP[type(obj)] except: - raise TypeError( - "cannot properly create the storer for: [group->%s,value->%s]" % + raise TypeError("cannot properly create the storer for: [group->%s,value->%s]" % (self.group._v_name, type(obj))) # map axes to numbers @@ -2995,8 +3030,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, try: existing_col = existing_table.values_axes[i] except: - raise ValueError( - "Incompatible appended table [%s] with existing table [%s]" % + raise ValueError("Incompatible appended table [%s] with existing table [%s]" % (blocks, existing_table.values_axes)) else: existing_col = None @@ -3070,7 +3104,8 @@ def process_filter(field, filt): return obj - def create_description(self, complib=None, complevel=None, 
fletcher32=False, expectedrows=None): + def create_description( + self, complib=None, complevel=None, fletcher32=False, expectedrows=None): """ create the description of the table from the axes & values """ # expected rows estimate @@ -3119,8 +3154,8 @@ def read_column(self, column, where=None, **kwargs): return False if where is not None: - raise Exception( - "read_column does not currently accept a where clause") + raise TypeError("read_column does not currently accept a where " + "clause") # find the axes for a in self.axes: @@ -3128,7 +3163,8 @@ def read_column(self, column, where=None, **kwargs): if not a.is_data_indexable: raise ValueError( - "column [%s] can not be extracted individually; it is not data indexable" % column) + "column [%s] can not be extracted individually; it is not data indexable" % + column) # column must be an indexable or a data column c = getattr(self.table.cols, column) @@ -3174,7 +3210,7 @@ class LegacyTable(Table): ndim = 3 def write(self, **kwargs): - raise Exception("write operations are not allowed on legacy tables!") + raise TypeError("write operations are not allowed on legacy tables!") def read(self, where=None, columns=None, **kwargs): """ we have n indexable columns, with an arbitrary number of data axes """ @@ -3626,9 +3662,9 @@ def get_attrs(self): self.levels = [] t = self.table self.index_axes = [a.infer(t) - for a in self.indexables if a.is_an_indexable] + for a in self.indexables if a.is_an_indexable] self.values_axes = [a.infer(t) - for a in self.indexables if not a.is_an_indexable] + for a in self.indexables if not a.is_an_indexable] self.data_columns = [a.name for a in self.values_axes] @property @@ -3755,7 +3791,7 @@ def _convert_index(index, encoding=None): index_name=index_name) if isinstance(index, MultiIndex): - raise Exception('MultiIndex not supported here!') + raise TypeError('MultiIndex not supported here!') inferred_type = lib.infer_dtype(index) @@ -3904,32 +3940,13 @@ def _need_convert(kind): return False 
-class Term(StringMixin): - - """create a term object that holds a field, op, and value - - Parameters - ---------- - field : dict, string term expression, or the field to operate (must be a valid index/column type of DataFrame/Panel) - op : a valid op (defaults to '=') (optional) - >, >=, <, <=, =, != (not equal) are allowed - value : a value or list of values (required) - queryables : a kinds map (dict of column name -> kind), or None i column is non-indexable - encoding : an encoding that will encode the query terms +class Coordinates(object): - Returns - ------- - a Term object + """ holds a returned coordinates list, useful to select the same rows from different tables - Examples - -------- - >>> Term(dict(field = 'index', op = '>', value = '20121114')) - >>> Term('index', '20121114') - >>> Term('index', '>', '20121114') - >>> Term('index', ['20121114','20121114']) - >>> Term('index', datetime(2012,11,14)) - >>> Term('major_axis>20121114') - >>> Term('minor_axis', ['A','U']) + coordinates : holds the array of coordinates + group : the source group + where : the source where """ _ops = ['<=', '<', '>=', '>', '!=', '==', '='] @@ -4134,23 +4151,13 @@ def stringify(value): return TermValue(v, stringify(v), u('string')) -class TermValue(object): - - """ hold a term value the we use to construct a condition/filter """ - def __init__(self, value, converted, kind): - self.value = value - self.converted = converted - self.kind = kind + def __len__(self): + return len(self.values) - def tostring(self, encoding): - """ quote the string if not encoded - else encode and return """ - if self.kind == u('string'): - if encoding is not None: - return self.converted - return '"%s"' % self.converted - return self.converted + def __getitem__(self, key): + """ return a new coordinates object, sliced by the key """ + return Coordinates(self.values[key], self.group, self.where) class Selection(object): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py 
index 861b4dd7567a0..6a325db8aaaa9 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -2,9 +2,10 @@ from pandas.compat import range, lrange, u import nose import unittest -import os import sys +import os import warnings +from contextlib import contextmanager import datetime import numpy as np @@ -25,7 +26,6 @@ from pandas import compat, _np_version_under1p7 from pandas.core import common as com -from numpy.testing.decorators import slow try: import tables @@ -42,12 +42,12 @@ # contextmanager to ensure the file cleanup def safe_remove(path): if path is not None: - import os try: os.remove(path) except: pass + def safe_close(store): try: if store is not None: @@ -55,7 +55,6 @@ def safe_close(store): except: pass -from contextlib import contextmanager @contextmanager def ensure_clean(path, mode='a', complevel=None, complib=None, @@ -1328,6 +1327,7 @@ def test_big_table_frame(self): store.append('df', df) rows = store.root.df.table.nrows recons = store.select('df') + assert isinstance(recons, DataFrame) print("\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x)) @@ -1382,7 +1382,7 @@ def test_big_put_frame(self): with ensure_clean(self.path, mode='w') as store: start_time = time.time() - store = HDFStore(fn, mode='w') + store = HDFStore(self.path, mode='w') store.put('df', df) print(df.get_dtype_counts()) @@ -1410,6 +1410,7 @@ def test_big_table_panel(self): store.append('wp', wp) rows = store.root.wp.table.nrows recons = store.select('wp') + assert isinstance(recons, Panel) print("\nbig_table panel [%s] -> %5.2f" % (rows, time.time() - x)) @@ -1654,7 +1655,6 @@ def test_table_values_dtypes_roundtrip(self): expected.sort() tm.assert_series_equal(result,expected) - def test_table_mixed_dtypes(self): # frame @@ -2898,7 +2898,6 @@ def test_string_select(self): expected = df[df.int!=2] assert_frame_equal(result,expected) - def test_read_column(self): df = tm.makeTimeDataFrame() @@ -3190,7 +3189,6 @@ def 
_check_double_roundtrip(self, obj, comparator, compression=False, again = store['obj'] comparator(again, obj, **kwargs) - def _check_roundtrip_table(self, obj, comparator, compression=False): options = {} if compression: @@ -3296,6 +3294,7 @@ def test_pytables_native_read(self): try: store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native.h5'), 'r') d2 = store['detector/readout'] + assert isinstance(d2, DataFrame) finally: safe_close(store) @@ -3303,6 +3302,7 @@ def test_pytables_native_read(self): store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native2.h5'), 'r') str(store) d1 = store['detector'] + assert isinstance(d1, DataFrame) finally: safe_close(store) @@ -3352,11 +3352,18 @@ def test_legacy_0_10_read(self): def test_legacy_0_11_read(self): # legacy from 0.11 try: - store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table_0.11.h5'), 'r') + path = os.path.join('legacy_hdf', 'legacy_table_0.11.h5') + store = HDFStore(tm.get_data_path(path), 'r') str(store) + assert 'df' in store + assert 'df1' in store + assert 'mi' in store df = store.select('df') df1 = store.select('df1') mi = store.select('mi') + assert isinstance(df, DataFrame) + assert isinstance(df1, DataFrame) + assert isinstance(mi, DataFrame) finally: safe_close(store) @@ -3364,10 +3371,9 @@ def test_copy(self): def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): try: - import os - if f is None: - f = tm.get_data_path('legacy_hdf/legacy_0.10.h5') + f = tm.get_data_path(os.path.join('legacy_hdf', + 'legacy_0.10.h5')) store = HDFStore(f, 'r') @@ -3437,6 +3443,7 @@ def test_legacy_table_write(self): df = DataFrame(dict(A = 'foo', B = 'bar'),index=lrange(10)) store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 }) + store.append('wp', wp) store.close() @@ -3524,6 +3531,7 @@ def _test_sort(obj): else: raise ValueError('type not supported here') + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', 
'--pdb-failure'], diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index e2051eba7f42a..96131d782893f 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -17,6 +17,11 @@ import pandas.util.testing as tm import pandas.core.config as cf +import numpy as np +from numpy.random import randn + +from pandas.tslib import iNaT + _multiprocess_can_split_ = True @@ -42,6 +47,7 @@ def __getitem__(self): assert(not is_seq(A())) + def test_notnull(): assert notnull(1.) assert not notnull(None) @@ -107,6 +113,61 @@ def test_isnull_lists(): assert(not result.any()) +def test_is_string(): + class MyString(str): + pass + + class MyUnicode(unicode): + pass + + strings = ('s', np.str_('a'), np.unicode_('unicode_string'), + MyString('a _string blah'), u'asdf', MyUnicode(u'asdf')) + not_strings = [], 1, {}, set(), np.array(['1']), np.array([u'1']) + + for string in strings: + assert com.is_string(string), '{0} is not a string'.format(string) + + for not_string in not_strings: + assert not com.is_string(not_string), ('{0} is a ' + 'string'.format(not_string)) + + +def test_is_frame(): + df = DataFrame(randn(2, 1)) + assert com.is_frame(df) + assert not com.is_frame('s') + + +def test_is_series(): + s = Series(randn(2)) + assert com.is_series(s) + assert not com.is_series(s.values) + + +def test_is_panel(): + p = Panel(randn(2, 3, 4)) + assert com.is_panel(p) + assert not com.is_panel(2) + + +def test_is_pd_obj(): + df = DataFrame(randn(2, 1)) + s = Series(randn(2)) + p = Panel(randn(2, 3, 4)) + for obj in (df, s, p): + assert com.is_pd_obj(obj) + assert not com.is_pd_obj(obj.values) + + +def test_is_ndframe(): + df = DataFrame(randn(2, 1)) + p = Panel(randn(2, 3, 4)) + # should add series after @jreback's ndframe to series pr + for obj in (df, p): + assert com.is_ndframe(obj) + assert not com.is_ndframe(obj.values) + + def test_isnull_datetime(): assert (not isnull(datetime.now())) assert notnull(datetime.now()) @@ -121,11 +182,13 @@ def 
test_isnull_datetime(): assert(mask[0]) assert(not mask[1:].any()) + def test_datetimeindex_from_empty_datetime64_array(): for unit in [ 'ms', 'us', 'ns' ]: idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) assert(len(idx) == 0) + def test_nan_to_nat_conversions(): df = DataFrame(dict({ @@ -144,6 +207,7 @@ def test_nan_to_nat_conversions(): if LooseVersion(np.__version__) >= '1.7.0': assert(s[8].value == np.datetime64('NaT').astype(np.int64)) + def test_any_none(): assert(com._any_none(1, 2, 3, None)) assert(not com._any_none(1, 2, 3, 4)) @@ -308,6 +372,7 @@ def test_ensure_int32(): result = com._ensure_int32(values) assert(result.dtype == np.int32) + def test_ensure_platform_int(): # verify that when we create certain types of indices diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 1572ca481d8a4..8646d261306ca 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -926,7 +926,8 @@ def join(self, other, how='left', level=None, return_indexers=False): See Index.join """ if (not isinstance(other, DatetimeIndex) and len(other) > 0 and - other.inferred_type != 'mixed-integer'): + other.inferred_type not in ('floating', 'mixed-integer', + 'mixed-integer-float', 'mixed')): try: other = DatetimeIndex(other) except TypeError: diff --git a/pandas/util/testing.py b/pandas/util/testing.py index abc13fb2ad9ee..eeb5ca4369164 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -15,7 +15,7 @@ from contextlib import contextmanager from distutils.version import LooseVersion -from numpy.random import randn +from numpy.random import randn, rand import numpy as np from pandas.core.common import isnull, _is_sequence @@ -48,6 +48,9 @@ _RAISE_NETWORK_ERROR_DEFAULT = False +def randbool(size=(), p=0.5): + return rand(*size) <= p + def rands(n): choices = string.ascii_letters + string.digits return ''.join(random.choice(choices) for _ in range(n)) diff --git a/setup.py b/setup.py index b7df339daf75a..955dedb74c180 100755 
--- a/setup.py +++ b/setup.py @@ -83,7 +83,7 @@ except ImportError: cython = False -from os.path import splitext, basename, join as pjoin +from os.path import join as pjoin class build_ext(_build_ext): @@ -506,6 +506,7 @@ def pxd(name): maintainer=AUTHOR, packages=['pandas', 'pandas.compat', + 'pandas.computation', 'pandas.core', 'pandas.io', 'pandas.rpy', diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py index 54774344520c9..3f076f9f922a3 100644 --- a/vb_suite/binary_ops.py +++ b/vb_suite/binary_ops.py @@ -21,7 +21,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_numexpr_threads(1) @@ -32,7 +32,7 @@ start_date=datetime(2013, 2, 26)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_use_numexpr(False) @@ -53,7 +53,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_numexpr_threads(1) @@ -63,7 +63,7 @@ start_date=datetime(2013, 2, 26)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_use_numexpr(False) @@ -84,7 +84,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_numexpr_threads(1) @@ -94,7 +94,7 @@ start_date=datetime(2013, 2, 26)) setup = 
common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_use_numexpr(False) diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py index 1264ae053ffca..2fb5a22ce0cb8 100644 --- a/vb_suite/indexing.py +++ b/vb_suite/indexing.py @@ -118,7 +118,7 @@ setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(50000, 100)) df2 = DataFrame(np.random.randn(50000, 100)) expr.set_use_numexpr(False) From e10a550148c3b00e345d842238a6a31b5937c863 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 6 Jul 2013 16:25:58 -0400 Subject: [PATCH 02/16] ENH: rewrite assignment operator as equal comparison for PyTables --- pandas/computation/expr.py | 47 ++++++++++++++++++++++++--- pandas/computation/tests/test_eval.py | 4 ++- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 6d33f6ac50a0d..9a9cd226278bc 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -1,7 +1,12 @@ import ast import sys +import itertools +import tokenize +import re +from cStringIO import StringIO from functools import partial + from pandas.core.base import StringMixin from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms @@ -26,12 +31,38 @@ class ExprParserError(Exception): pass +def _rewrite_assign(source): + res = [] + g = tokenize.generate_tokens(StringIO(source).readline) + for toknum, tokval, _, _, _ in g: + res.append((toknum, '==' if tokval == '=' else tokval)) + return tokenize.untokenize(res) + + +def _parenthesize_booleans(source, ops='|&'): + res = source + for op in ops: + terms = res.split(op) + + t = [] + for term in terms: + t.append('({0})'.format(term)) + + res = op.join(t) + return 
res + + +def preparse(source): + return _parenthesize_booleans(_rewrite_assign(source)) + + class ExprVisitor(ast.NodeVisitor): """Custom ast walker """ bin_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms - bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', 'BitOr', - 'Add', 'Sub', 'Mult', 'Div', 'Pow', 'FloorDiv', 'Mod') + bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', None, + 'BitAnd', 'BitOr', 'Add', 'Sub', 'Mult', 'Div', 'Pow', + 'FloorDiv', 'Mod') bin_op_nodes_map = dict(zip(bin_ops, bin_op_nodes)) unary_ops = _unary_ops_syms @@ -39,7 +70,7 @@ class ExprVisitor(ast.NodeVisitor): unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) def __init__(self, env): - for bin_op in self.bin_ops: + for bin_op in itertools.ifilter(lambda x: x is not None, self.bin_ops): setattr(self, 'visit_{0}'.format(self.bin_op_nodes_map[bin_op]), lambda node, bin_op=bin_op: partial(BinOp, bin_op)) @@ -54,7 +85,7 @@ def visit(self, node): raise TypeError('"node" must be an AST node or a string, you' ' passed a(n) {0}'.format(node.__class__)) if isinstance(node, basestring): - node = ast.fix_missing_locations(ast.parse(node)) + node = ast.fix_missing_locations(ast.parse(preparse(node))) return super(ExprVisitor, self).visit(node) def visit_Module(self, node): @@ -62,7 +93,7 @@ def visit_Module(self, node): raise ExprParserError('only a single expression is allowed') expr = node.body[0] - if not isinstance(expr, ast.Expr): + if not isinstance(expr, (ast.Expr, ast.Assign)): raise SyntaxError('only expressions are allowed') return self.visit(expr) @@ -95,6 +126,12 @@ def visit_Compare(self, node): raise ExprParserError('chained comparisons not supported') return self.visit(ops[0])(self.visit(node.left), self.visit(comps[0])) + def visit_Assign(self, node): + cmpr = ast.copy_location(ast.Compare(ops=[ast.Eq()], + left=node.targets[0], + comparators=[node.value]), node) + return self.visit(cmpr) + def visit_Call(self, node): if not isinstance(node.func, 
ast.Name): raise TypeError("Only named functions are supported") diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index fc1cccf320201..6ec630b80614d 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -52,7 +52,9 @@ def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2): env = Scope() typ, axes = _align_core((Term('lhs', env), Term('rhs', env))) lhs, rhs = env.locals['lhs'], env.locals['rhs'] - return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes) + ret_type = np.result_type(lhs, rhs) + return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes, + ret_type) def _eval_single_bin(lhs, cmp1, rhs, has_neg_frac): From 0b1b2fc98c4998f286911f0a5e1d5252281bff27 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 6 Jul 2013 15:52:29 -0400 Subject: [PATCH 03/16] ENH: add Expr based terms for pytables --- doc/source/io.rst | 97 +++-- doc/source/v0.10.0.txt | 5 +- pandas/computation/align.py | 24 +- pandas/computation/engines.py | 5 +- pandas/computation/eval.py | 13 +- pandas/computation/expr.py | 361 ++++++++++++++---- pandas/computation/ops.py | 80 ++-- pandas/computation/pytables.py | 505 ++++++++++++++++++++++++++ pandas/computation/tests/test_eval.py | 88 ++++- pandas/core/frame.py | 21 +- pandas/io/pytables.py | 39 +- pandas/io/tests/test_pytables.py | 257 +++++++++---- pandas/tests/test_frame.py | 48 ++- pandas/util/testing.py | 20 +- 14 files changed, 1287 insertions(+), 276 deletions(-) create mode 100644 pandas/computation/pytables.py mode change 100644 => 100755 pandas/computation/tests/test_eval.py diff --git a/doc/source/io.rst b/doc/source/io.rst index c29af29d2e63f..a20d2a7aa51b5 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1962,7 +1962,7 @@ storing/selecting from homogeneous index DataFrames. 
store.select('df_mi') # the levels are automatically included as data columns - store.select('df_mi', Term('foo=bar')) + store.select('df_mi', 'foo=bar') .. _io.hdf5-query: @@ -1970,49 +1970,80 @@ storing/selecting from homogeneous index DataFrames. Querying a Table ~~~~~~~~~~~~~~~~ +.. warning:: + + The query capabilities have changed substantially starting in ``0.13.0``. + Queries from prior versions are still accepted, with a ``DeprecationWarning`` printed + if the query is not string-like. + ``select`` and ``delete`` operations have an optional criterion that can be specified to select/delete only a subset of the data. This allows one to have a very large on-disk table and retrieve only a portion of the data. -A query is specified using the ``Term`` class under the hood. +A query is specified using the ``Term`` class under the hood, as a boolean expression. - - 'index' and 'columns' are supported indexers of a DataFrame - - 'major_axis', 'minor_axis', and 'items' are supported indexers of + - ``index`` and ``columns`` are supported indexers of a DataFrame + - ``major_axis``, ``minor_axis``, and ``items`` are supported indexers of the Panel + - if ``data_columns`` are specified, these can be used as additional indexers + +Valid comparison operators are: + + - ``=, ==, !=, >, >=, <, <=`` + +Valid boolean expressions are combined with: + + - ``|`` : or + - ``&`` : and + - ``(`` and ``)`` : for grouping + +These rules are similar to how boolean expressions are used in pandas for indexing. + +.. note:: + + - ``=`` will be automatically expanded to the comparison operator ``==`` + - ``~`` is the not operator, but can only be used in very limited circumstances + - If a list/tuple of expressions is passed they will be combined via ``&``. 
+ +The following are valid expressions: + + - ``'index>=date'`` + - ``"columns=['A', 'D']"`` + - ``'columns=A'`` + - ``'columns==A'`` + - ``"~(columns=['A','B'])"`` + - ``'index>df.index[3] & string="bar"'`` + - ``'(index>df.index[3] & index<=df.index[6]) | string="bar"'`` + - ``"ts>=Timestamp('2012-02-01')"`` + - ``"major_axis>=20130101"`` + +The ``indexers`` are on the left-hand side of the sub-expression: -Valid terms can be created from ``dict, list, tuple, or -string``. Objects can be embeded as values. Allowed operations are: ``<, -<=, >, >=, =, !=``. ``=`` will be inferred as an implicit set operation -(e.g. if 2 or more values are provided). The following are all valid -terms. + - ``columns``, ``major_axis``, ``ts`` - - ``dict(field = 'index', op = '>', value = '20121114')`` - - ``('index', '>', '20121114')`` - - ``'index > 20121114'`` - - ``('index', '>', datetime(2012, 11, 14))`` - - ``('index', ['20121114', '20121115'])`` - - ``('major_axis', '=', Timestamp('2012/11/14'))`` - - ``('minor_axis', ['A', 'B'])`` +The right-hand side of the sub-expression (after a comparison operator), can be: -Queries are built up using a list of ``Terms`` (currently only -**anding** of terms is supported). An example query for a panel might be -specified as follows. ``['major_axis>20000102', ('minor_axis', '=', -['A', 'B']) ]``. This is roughly translated to: `major_axis must be -greater than the date 20000102 and the minor_axis must be A or B` + - functions that will be evaluated, e.g. ``Timestamp('2012-02-01')`` + - strings, e.g. ``"bar"`` + - date-like, e.g. ``20130101``, or ``"20130101"`` + - lists, e.g. ``"['A','B']"`` + - variables that are defined in the local namespace, e.g. ``date`` + +Here is an example: .. 
ipython:: python store.append('wp',wp) store - store.select('wp', [ Term('major_axis>20000102'), Term('minor_axis', '=', ['A', 'B']) ]) + store.select('wp', "major_axis>Timestamp('20000102') & minor_axis=['A', 'B']") The ``columns`` keyword can be supplied to select a list of columns to be returned, -this is equivalent to passing a ``Term('columns', list_of_columns_to_filter)``: +this is equivalent to passing a ``'columns=list_of_columns_to_filter'``: .. ipython:: python - store.select('df', columns=['A', 'B']) + store.select('df', "columns=['A', 'B']") ``start`` and ``stop`` parameters can be specified to limit the total search space. These are in terms of the total number of rows in a table. @@ -2023,8 +2054,7 @@ space. These are in terms of the total number of rows in a table. wp.to_frame() # limiting the search - store.select('wp',[ Term('major_axis>20000102'), - Term('minor_axis', '=', ['A','B']) ], + store.select('wp',"major_axis>20000102 & minor_axis=['A','B']", start=0, stop=10) .. _io.hdf5-timedelta: @@ -2057,10 +2087,13 @@ You can create/modify an index for a table with ``create_table_index`` after data is already in the table (after and ``append/put`` operation). Creating a table index is **highly** encouraged. This will speed your queries a great deal when you use a ``select`` with the -indexed dimension as the ``where``. **Indexes are automagically created -(starting 0.10.1)** on the indexables and any data columns you -specify. This behavior can be turned off by passing ``index=False`` to -``append``. +indexed dimension as the ``where``. + +.. note:: + + Indexes are automagically created (starting ``0.10.1``) on the indexables + and any data columns you specify. This behavior can be turned off by passing + ``index=False`` to ``append``. .. ipython:: python @@ -2117,7 +2150,7 @@ create a new table!) 
Iterator ~~~~~~~~ -Starting in 0.11, you can pass, ``iterator=True`` or ``chunksize=number_in_a_chunk`` +Starting in ``0.11.0``, you can pass, ``iterator=True`` or ``chunksize=number_in_a_chunk`` to ``select`` and ``select_as_multiple`` to return an iterator on the results. The default is 50,000 rows returned in a chunk. @@ -2151,7 +2184,7 @@ Advanced Queries To retrieve a single indexable or data column, use the method ``select_column``. This will, for example, enable you to get the index very quickly. These return a ``Series`` of the result, indexed by the row number. -These do not currently accept the ``where`` selector (coming soon) +These do not currently accept the ``where`` selector. .. ipython:: python diff --git a/doc/source/v0.10.0.txt b/doc/source/v0.10.0.txt index d0c0ecc148239..476760e4b1464 100644 --- a/doc/source/v0.10.0.txt +++ b/doc/source/v0.10.0.txt @@ -258,11 +258,10 @@ Updated PyTables Support store.append('wp',wp) # selecting via A QUERY - store.select('wp', - [ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ]) + store.select('wp', "major_axis>20000102 & minor_axis=['A','B']") # removing data from tables - store.remove('wp', [ 'major_axis', '>', wp.major_axis[3] ]) + store.remove('wp', 'major_axis>wp.major_axis[3]') store.select('wp') # deleting a store diff --git a/pandas/computation/align.py b/pandas/computation/align.py index 529fe84fd06a7..65840bb68b4ea 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -77,18 +77,20 @@ def wrapper(terms): if len(terms) == 1: return _align_core_single_unary_op(terms[0]) + term_values = (term.value for term in terms) # only scalars - elif all(term.isscalar for term in terms): - return np.result_type(*(term.value for term in terms)), None + if all(isinstance(term.value, pd.Index) or term.isscalar for term in + terms): + return np.result_type(*term_values), None # single element ndarrays all_has_size = all(hasattr(term.value, 'size') for term in terms) - if 
(all_has_size and all(term.value.size == 1 for term in terms)): - return np.result_type(*(term.value for term in terms)), None + if all_has_size and all(term.value.size == 1 for term in terms): + return np.result_type(*term_values), None # no pandas so just punt to the evaluator if not _any_pandas_objects(terms): - return np.result_type(*(term.value for term in terms)), None + return np.result_type(*term_values), None return f(terms) return wrapper @@ -162,17 +164,11 @@ def _filter_terms(flat): return names, literals -def _align(terms, env): - - # flatten the parse tree (a nested list) +def _align(terms): + """Align a set of terms""" + # flatten the parse tree (a nested list, really) terms = list(flatten(terms)) - # separate names and literals - names, literals = _filter_terms(terms) - - if not names: # only literals so just promote to a common type - return np.result_type(*literals).type, None - # if all resolved variables are numeric scalars if all(term.isscalar for term in terms): return np.result_type(*(term.value for term in terms)).type, None diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 7f500dccb825b..ea296ad0e4dd4 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -21,8 +21,7 @@ def convert(self): def evaluate(self): if not self._is_aligned: - self.result_type, self.aligned_axes = _align(self.expr.terms, - self.expr.env) + self.result_type, self.aligned_axes = _align(self.expr.terms) res = self._evaluate(self.expr.env) return _reconstruct_object(self.result_type, res, self.aligned_axes, @@ -77,4 +76,4 @@ def _evaluate(self, env): pass -_engines = {'numexpr': NumExprEngine, 'python': PythonEngine} +_engines = {'numexpr': NumExprEngine, 'python': PythonEngine } diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 1a681e37d6130..04e17e3e41ac2 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -11,7 +11,7 @@ def eval(expr, engine='numexpr', 
truediv=True, local_dict=None, - global_dict=None): + global_dict=None, resolvers=None): """Evaluate a Python expression as a string using various backends. The following arithmetic operations are supported: +, -, *, /, **, %, // @@ -24,7 +24,7 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, expr : string or Expr object The expression to evaluate. This can be either a string or an ``Expr`` object. - engine : string, optional, default 'numexpr', {'python', 'numexpr', 'pytables'} + engine : string, optional, default 'numexpr', {'python', 'numexpr' } The engine used to evaluate the expression. Supported engines are - 'numexpr': This default engine evaluates pandas objects using numexpr @@ -32,8 +32,6 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, frames. - 'python': Performs operations as if you had eval'd in top level python - - 'pytables': Engine used for evaluating expressions for selection of - objects from PyTables HDF5 tables. truediv : bool, optional, default True Whether to use true division, like in Python >= 3 @@ -61,9 +59,9 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, eng = _engines[engine] if isinstance(expr, six.string_types): - # need to go 2 up in the call stack from the constructor since we want - # the calling scope's variables - env = Scope(global_dict, local_dict, frame_level=2) + # need to go 2 up in the call stack from the constructor + env = Scope(global_dict, local_dict, frame_level=2, + resolvers=resolvers) parsed_expr = Expr(expr, engine, env, truediv) elif isinstance(expr, Expr): parsed_expr = expr @@ -77,7 +75,6 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, # sanity check for a number # TODO: eventually take out - # TODO: pytables engine will probably need a string check if np.isscalar(ret): if not isinstance(ret, (np.number, np.bool_, numbers.Number)): raise TypeError('scalar result must be numeric or bool, passed ' diff --git a/pandas/computation/expr.py 
b/pandas/computation/expr.py index 9a9cd226278bc..cb7b269485f95 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -1,23 +1,27 @@ import ast +import operator import sys +import inspect import itertools import tokenize -import re from cStringIO import StringIO from functools import partial - from pandas.core.base import StringMixin from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms from pandas.computation.ops import Term, Constant +import pandas.lib as lib +import datetime + class Scope(object): - __slots__ = 'globals', 'locals' + __slots__ = ('globals', 'locals', 'resolvers', '_global_resolvers', + 'resolver_keys', '_resolver') - def __init__(self, gbls=None, lcls=None, frame_level=1): + def __init__(self, gbls=None, lcls=None, frame_level=1, resolvers=None): frame = sys._getframe(frame_level) try: @@ -26,9 +30,53 @@ def __init__(self, gbls=None, lcls=None, frame_level=1): finally: del frame - -class ExprParserError(Exception): - pass + # add some useful defaults + self.globals['Timestamp'] = lib.Timestamp + self.globals['datetime'] = datetime + + self.resolvers = resolvers or [] + self.resolver_keys = set(reduce(operator.add, (list(o.keys()) for o in + self.resolvers), [])) + self._global_resolvers = self.resolvers + [self.locals, self.globals] + self._resolver = None + + @property + def resolver(self): + if self._resolver is None: + def resolve_key(key): + for resolver in self._global_resolvers: + try: + return resolver[key] + except KeyError: + pass + self._resolver = resolve_key + + return self._resolver + + def update(self, scope_level=None): + + # we are always 2 levels below the caller + # plus the caller maybe below the env level + # in which case we need addtl levels + sl = 2 + if scope_level is not None: + sl += scope_level + + # add sl frames to the scope starting with the + # 
most distant and overwritting with more current + # makes sure that we can capture variable scope + frame = inspect.currentframe() + try: + frames = [] + while sl >= 0: + frame = frame.f_back + sl -= 1 + frames.append(frame) + for f in frames[::-1]: + self.locals.update(f.f_locals) + finally: + del frame + del frames def _rewrite_assign(source): @@ -52,110 +100,279 @@ def _parenthesize_booleans(source, ops='|&'): return res -def preparse(source): +def _preparse(source): return _parenthesize_booleans(_rewrite_assign(source)) -class ExprVisitor(ast.NodeVisitor): + +# partition all AST nodes +_all_nodes = frozenset(filter(lambda x: isinstance(x, type) and + issubclass(x, ast.AST), + (getattr(ast, node) for node in dir(ast)))) + + +def _filter_nodes(superclass, all_nodes=_all_nodes): + node_names = (node.__name__ for node in all_nodes + if issubclass(node, superclass)) + return frozenset(node_names) + + +_all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes)) +_mod_nodes = _filter_nodes(ast.mod) +_stmt_nodes = _filter_nodes(ast.stmt) +_expr_nodes = _filter_nodes(ast.expr) +_expr_context_nodes = _filter_nodes(ast.expr_context) +_slice_nodes = _filter_nodes(ast.slice) +_boolop_nodes = _filter_nodes(ast.boolop) +_operator_nodes = _filter_nodes(ast.operator) +_unary_op_nodes = _filter_nodes(ast.unaryop) +_cmp_op_nodes = _filter_nodes(ast.cmpop) +_comprehension_nodes = _filter_nodes(ast.comprehension) +_handler_nodes = _filter_nodes(ast.excepthandler) +_arguments_nodes = _filter_nodes(ast.arguments) +_keyword_nodes = _filter_nodes(ast.keyword) +_alias_nodes = _filter_nodes(ast.alias) + + +# nodes that we don't support directly but are needed for parsing +_hacked_nodes = frozenset(['Assign', 'Module', 'Expr']) + + +# these nodes are low priority or won't ever be supported (e.g., AST) +_unsupported_nodes = ((_stmt_nodes | _mod_nodes | _handler_nodes | + _arguments_nodes | _keyword_nodes | _alias_nodes | + _expr_context_nodes | frozenset(['Yield', + 'GeneratorExp', 
+ 'IfExp', 'DictComp', + 'SetComp', 'Repr', + 'Lambda', 'Set', 'In', + 'NotIn', 'AST', + 'Is', 'IsNot'])) - + _hacked_nodes) + +# we're adding a different assignment in some cases to be equality comparison +# and we don't want `stmt` and friends in their so get only the class whose +# names are capitalized +_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes +_msg = 'cannot both support and not support {0}'.format(_unsupported_nodes & + _base_supported_nodes) +assert not _unsupported_nodes & _base_supported_nodes, _msg + + +def _node_not_implemented(node_name, cls): + def f(self, *args, **kwargs): + raise NotImplementedError("{0!r} nodes are not " + "implemented".format(node_name)) + return f + + +def disallow(nodes): + def disallowed(cls): + cls.unsupported_nodes = () + for node in nodes: + new_method = _node_not_implemented(node, cls) + name = 'visit_{0}'.format(node) + cls.unsupported_nodes += (name,) + setattr(cls, name, new_method) + return cls + return disallowed + + +def _op_maker(op_class, op_symbol): + def f(self, node, *args, **kwargs): + return partial(op_class, op_symbol, *args, **kwargs) + return f + + +_op_classes = {'binary': BinOp, 'unary': UnaryOp} + +def add_ops(op_classes): + def f(cls): + for op_attr_name, op_class in op_classes.iteritems(): + ops = getattr(cls, '{0}_ops'.format(op_attr_name)) + ops_map = getattr(cls, '{0}_op_nodes_map'.format(op_attr_name)) + for op in ops: + setattr(cls, 'visit_{0}'.format(ops_map[op]), + _op_maker(op_class, op)) + return cls + return f + + +@disallow(_unsupported_nodes) +@add_ops(_op_classes) +class BaseExprVisitor(ast.NodeVisitor): + """Custom ast walker """ - bin_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms - bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', None, - 'BitAnd', 'BitOr', 'Add', 'Sub', 'Mult', 'Div', 'Pow', - 'FloorDiv', 'Mod') - bin_op_nodes_map = dict(zip(bin_ops, bin_op_nodes)) + binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms + 
binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', + 'BitOr', 'Add', 'Sub', 'Mult', 'Div', 'Pow', 'FloorDiv', + 'Mod') + binary_op_nodes_map = dict(itertools.izip(binary_ops, binary_op_nodes)) unary_ops = _unary_ops_syms unary_op_nodes = 'UAdd', 'USub', 'Invert' - unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + unary_op_nodes_map = dict(itertools.izip(unary_ops, unary_op_nodes)) - def __init__(self, env): - for bin_op in itertools.ifilter(lambda x: x is not None, self.bin_ops): - setattr(self, 'visit_{0}'.format(self.bin_op_nodes_map[bin_op]), - lambda node, bin_op=bin_op: partial(BinOp, bin_op)) - - for unary_op in self.unary_ops: - setattr(self, - 'visit_{0}'.format(self.unary_op_nodes_map[unary_op]), - lambda node, unary_op=unary_op: partial(UnaryOp, unary_op)) + def __init__(self, env, preparser=_preparse): self.env = env + self.preparser = preparser - def visit(self, node): - if not (isinstance(node, ast.AST) or isinstance(node, basestring)): - raise TypeError('"node" must be an AST node or a string, you' - ' passed a(n) {0}'.format(node.__class__)) + def visit(self, node, **kwargs): if isinstance(node, basestring): - node = ast.fix_missing_locations(ast.parse(preparse(node))) - return super(ExprVisitor, self).visit(node) + node = ast.fix_missing_locations(ast.parse(self.preparser(node))) - def visit_Module(self, node): - if len(node.body) != 1: - raise ExprParserError('only a single expression is allowed') + method = 'visit_' + node.__class__.__name__ + visitor = getattr(self, method, None) + return visitor(node, **kwargs) + def visit_Module(self, node, **kwargs): + if len(node.body) != 1: + raise SyntaxError('only a single expression is allowed') expr = node.body[0] - if not isinstance(expr, (ast.Expr, ast.Assign)): - raise SyntaxError('only expressions are allowed') + return self.visit(expr, **kwargs) - return self.visit(expr) - - def visit_Expr(self, node): - return self.visit(node.value) + def visit_Expr(self, node, 
**kwargs): + return self.visit(node.value, **kwargs) - def visit_BinOp(self, node): + def visit_BinOp(self, node, **kwargs): op = self.visit(node.op) - left = self.visit(node.left) - right = self.visit(node.right) + left = self.visit(node.left, side='left') + right = self.visit(node.right, side='right') return op(left, right) - def visit_UnaryOp(self, node): - if isinstance(node.op, ast.Not): - raise NotImplementedError("not operator not yet supported") + def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) return op(self.visit(node.operand)) - def visit_Name(self, node): + def visit_Name(self, node, **kwargs): return Term(node.id, self.env) - def visit_Num(self, node): + def visit_Num(self, node, **kwargs): return Constant(node.n, self.env) - def visit_Compare(self, node): - ops = node.ops - comps = node.comparators - if len(ops) != 1: - raise ExprParserError('chained comparisons not supported') - return self.visit(ops[0])(self.visit(node.left), self.visit(comps[0])) - - def visit_Assign(self, node): - cmpr = ast.copy_location(ast.Compare(ops=[ast.Eq()], - left=node.targets[0], - comparators=[node.value]), node) + def visit_Str(self, node, **kwargs): + return Constant(node.s, self.env) + + def visit_List(self, node, **kwargs): + return Constant([self.visit(e).value for e in node.elts], self.env) + + visit_Tuple = visit_List + + def visit_Index(self, node, **kwargs): + """ df.index[4] """ + return self.visit(node.value) + + + def visit_Subscript(self, node, **kwargs): + """ df.index[4:6] """ + value = self.visit(node.value) + slobj = self.visit(node.slice) + + try: + return Constant(value[slobj], self.env) + except TypeError: + raise ValueError("cannot subscript [{0}] with " + "[{1}]".format(value, slobj)) + + def visit_Slice(self, node, **kwargs): + """ df.index[slice(4,6)] """ + lower = node.lower + if lower is not None: + lower = self.visit(lower).value + upper = node.upper + if upper is not None: + upper = self.visit(upper).value + step = node.step 
+ if step is not None: + step = self.visit(step).value + + return slice(lower, upper, step) + + def visit_Assign(self, node, **kwargs): + cmpr = ast.Compare(ops=[ast.Eq()], left=node.targets[0], + comparators=[node.value]) return self.visit(cmpr) - def visit_Call(self, node): - if not isinstance(node.func, ast.Name): + def visit_Attribute(self, node, **kwargs): + attr = node.attr + value = node.value + + ctx = node.ctx.__class__ + if ctx == ast.Load: + # resolve the value + return getattr(self.visit(value).value, attr) + raise ValueError("Invalid Attribute context {0}".format(ctx.__name__)) + + def visit_Call(self, node, **kwargs): + + # this can happen with: datetime.datetime + if isinstance(node.func, ast.Attribute): + res = self.visit_Attribute(node.func) + elif not isinstance(node.func, ast.Name): raise TypeError("Only named functions are supported") + else: + res = self.visit(node.func) + + if res is None: + raise ValueError("Invalid function call {0}".format(node.func.id)) + if hasattr(res, 'value'): + res = res.value + + args = [self.visit(targ).value for targ in node.args] + if node.starargs is not None: + args = args + self.visit(node.starargs).value + + keywords = {} + for key in node.keywords: + if not isinstance(key, ast.keyword): + raise ValueError( + "keyword error in function call '{0}'".format(node.func.id)) + keywords[key.arg] = self.visit(key.value).value + if node.kwargs is not None: + keywords.update(self.visit(node.kwargs).value) + + return Constant(res(*args, **keywords), self.env) + + def visit_Compare(self, node, **kwargs): + ops = node.ops + comps = node.comparators + for op, comp in itertools.izip(ops, comps): + vop = self.visit(op) + node = vop(self.visit(node.left, side='left'), + self.visit(comp, side='right')) + return node - valid_ops = _reductions + _mathops - if node.func.id not in valid_ops: - raise ValueError("Only {0} are supported".format(valid_ops)) +_numexpr_not_supported = frozenset(['Assign', 'BoolOp', 'Not', 'Str', 'Slice', 
+ 'Index', 'Subscript', 'Tuple', 'List', + 'Dict', 'Call']) +_numexpr_supported_calls = frozenset(_reductions + _mathops) - raise NotImplementedError("function calls not yet supported") +@disallow(_unsupported_nodes | _numexpr_not_supported) +class NumExprVisitor(BaseExprVisitor): + def __init__(self, env, preparser=None): + if preparser is not None: + raise ValueError("only strict numexpr syntax is supported") + preparser = lambda x: x + super(NumExprVisitor, self).__init__(env, preparser) - def visit_Attribute(self, node): - raise NotImplementedError("attribute access is not yet supported") - def visit_BoolOp(self, node): - raise NotImplementedError("boolean operators are not yet supported") +_python_not_supported = _numexpr_not_supported + +@disallow(_unsupported_nodes | _python_not_supported) +class PythonExprVisitor(BaseExprVisitor): + pass class Expr(StringMixin): + """Expr object""" + def __init__(self, expr, engine='numexpr', env=None, truediv=True): self.expr = expr self.env = env or Scope(frame_level=2) - self._visitor = ExprVisitor(self.env) + self._visitor = _visitors[engine](self.env) self.terms = self.parse() self.engine = engine self.truediv = truediv @@ -167,6 +384,9 @@ def __call__(self, env): def __unicode__(self): return unicode(self.terms) + def __len__(self): + return len(self.expr) + def parse(self): """return a Termset""" return self._visitor.visit(self.expr) @@ -185,3 +405,6 @@ def isexpr(s, check_names=True): return not check_names else: return True + + +_visitors = {'python': PythonExprVisitor, 'numexpr': NumExprVisitor} diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index ca5f6d4872a72..b2dd638da1ef3 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -1,6 +1,8 @@ import operator as op import numpy as np + +import pandas as pd from pandas.util.py3compat import PY3 import pandas.core.common as com from pandas.core.base import StringMixin @@ -25,36 +27,12 @@ class BinaryOperatorError(OperatorError): 
pass -def _resolve_name(env, key): - res = env.locals.get(key, env.globals.get(key)) - - if res is None: - if not isinstance(key, basestring): - return key - - raise NameError('name {0!r} is not defined'.format(key)) - - return res - - -def _update_name(env, key, value): - if isinstance(key, basestring): - try: - del env.locals[key] - env.locals[key] = value - except KeyError: - try: - del env.globals[key] - env.globals[key] = value - except KeyError: - raise NameError('name {0!r} is not defined'.format(key)) - - class Term(StringMixin): - def __init__(self, name, env): + def __init__(self, name, env, side=None): self.name = name self.env = env - self.value = _resolve_name(self.env, self.name) + self.side = side + self.value = self._resolve_name() try: # ndframe potentially very slow for large, mixed dtype frames @@ -70,8 +48,39 @@ def __init__(self, name, env): def __unicode__(self): return com.pprint_thing(self.name) + def _resolve_name(self): + env = self.env + key = self.name + res = env.resolver(key) + self.update(res) + + if res is None: + if not isinstance(key, basestring): + return key + raise NameError('name {0!r} is not defined'.format(key)) + + if isinstance(res, pd.Panel): + raise NotImplementedError("Panel objects are not supported with " + "eval") + return res + def update(self, value): - _update_name(self.env, self.name, value) + env = self.env + key = self.name + if isinstance(key, basestring): + try: + del env.locals[key] + env.locals[key] = value + except KeyError: + if key in env.resolver_keys: + env.locals[key] = value + else: + try: + del env.globals[key] + env.globals[key] = value + except KeyError: + raise NameError('{0!r} is undefined'.format(key)) + self.value = value @property @@ -83,6 +92,9 @@ class Constant(Term): def __init__(self, value, env): super(Constant, self).__init__(value, env) + def _resolve_name(self): + return self.name + def _print_operand(opr): return opr.name if is_term(opr) else unicode(opr) @@ -91,7 +103,7 @@ def 
_print_operand(opr): class Op(StringMixin): """Hold an operator of unknown arity """ - def __init__(self, op, operands): + def __init__(self, op, operands, *args, **kwargs): self.op = op self.operands = operands @@ -114,8 +126,8 @@ def return_type(self): return np.result_type(*(term.type for term in flatten(self))) -_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', '=' -_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, op.eq +_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=' +_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne _cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) _bool_ops_syms = '&', '|' @@ -165,7 +177,7 @@ class BinOp(Op): left : str or Op right : str or Op """ - def __init__(self, op, lhs, rhs): + def __init__(self, op, lhs, rhs, **kwargs): super(BinOp, self).__init__(op, (lhs, rhs)) self.lhs = lhs self.rhs = rhs @@ -208,8 +220,8 @@ def __call__(self, env): class Mod(BinOp): - def __init__(self, lhs, rhs): - super(Mod, self).__init__('%', lhs, rhs) + def __init__(self, lhs, rhs, *args, **kwargs): + super(Mod, self).__init__('%', lhs, rhs, *args, **kwargs) _cast_inplace(self.operands, np.float_) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py new file mode 100644 index 0000000000000..64a1036bbb20f --- /dev/null +++ b/pandas/computation/pytables.py @@ -0,0 +1,505 @@ +""" manage PyTables query interface via Expressions """ + +import ast +import time +import warnings +from functools import partial +from datetime import datetime + +import numpy as np + +import pandas.core.common as com +import pandas.lib as lib +from pandas.computation import expr, ops +from pandas.computation.ops import is_term, Constant +from pandas.computation.expr import BaseExprVisitor +from pandas import Index +from pandas.core.common import is_list_like + + +def _ensure_decoded(s): + """ if we have bytes, decode them to unicode """ + if isinstance(s, (np.bytes_, bytes)): + s = s.decode('UTF-8') + return s + + +class 
Scope(expr.Scope): + __slots__ = 'globals', 'locals', 'queryables' + + def __init__(self, gbls=None, lcls=None, queryables=None, frame_level=1): + super( + Scope, + self).__init__(gbls=gbls, + lcls=lcls, + frame_level=frame_level) + self.queryables = queryables or dict() + + +class Term(ops.Term): + + def __init__(self, name, env, side=None): + super(Term, self).__init__(name, env, side=side) + + def _resolve_name(self): + + # must be a queryables + if self.side == 'left': + if self.name not in self.env.queryables: + raise NameError('name {0!r} is not defined'.format(self.name)) + return self.name + + # resolve the rhs (and allow to be None) + return self.env.locals.get(self.name, + self.env.globals.get(self.name, self.name)) + + +class BinOp(ops.BinOp): + + _max_selectors = 31 + + def __init__(self, op, lhs, rhs, queryables, encoding): + super(BinOp, self).__init__(op, lhs, rhs) + self.queryables = queryables + self.encoding = encoding + self.filter = None + self.condition = None + + def prune(self, klass): + + def pr(left, right): + """ create and return a new specilized BinOp from myself """ + + if left is None: + return right + elif right is None: + return left + + k = klass + if isinstance(left, ConditionBinOp): + if (isinstance(left, ConditionBinOp) and + isinstance(right, ConditionBinOp)): + k = JointConditionBinOp + elif isinstance(left, k): + return left + elif isinstance(right, k): + return right + + elif isinstance(left, FilterBinOp): + if (isinstance(left, FilterBinOp) and + isinstance(right, FilterBinOp)): + k = JointFilterBinOp + elif isinstance(left, k): + return left + elif isinstance(right, k): + return right + + return k(self.op, left, right, queryables=self.queryables, + encoding=self.encoding).evaluate() + + left, right = self.lhs, self.rhs + + if is_term(left) and is_term(right): + res = pr(left.value, right.value) + elif not is_term(left) and is_term(right): + res = pr(left.prune(klass), right.value) + elif is_term(left) and not 
is_term(right): + res = pr(left.value, right.prune(klass)) + elif not (is_term(left) or is_term(right)): + res = pr(left.prune(klass), right.prune(klass)) + + return res + + def conform(self, rhs): + """ inplace conform rhs """ + if not is_list_like(rhs): + rhs = [rhs] + if hasattr(self.rhs, 'ravel'): + rhs = rhs.ravel() + return rhs + + @property + def is_valid(self): + """ return True if this is a valid field """ + return self.lhs in self.queryables + + @property + def is_in_table(self): + """ return True if this is a valid column name for generation (e.g. an + actual column in the table) """ + return self.queryables.get(self.lhs) is not None + + @property + def kind(self): + """ the kind of my field """ + return self.queryables.get(self.lhs) + + def generate(self, v): + """ create and return the op string for this TermValue """ + val = v.tostring(self.encoding) + return "(%s %s %s)" % (self.lhs, self.op, val) + + def convert_value(self, v): + """ convert the expression that is in the term to something that is + accepted by pytables """ + + def stringify(value): + value = str(value) + if self.encoding is not None: + value = value.encode(self.encoding) + return value + + kind = _ensure_decoded(self.kind) + if kind == u'datetime64' or kind == u'datetime': + + if isinstance(v, (int, float)): + v = stringify(v) + v = _ensure_decoded(v) + v = lib.Timestamp(v) + if v.tz is not None: + v = v.tz_convert('UTC') + return TermValue(v, v.value, kind) + elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u'date': + v = time.mktime(v.timetuple()) + return TermValue(v, lib.Timestamp(v), kind) + elif kind == u'integer': + v = int(float(v)) + return TermValue(v, v, kind) + elif kind == u'float': + v = float(v) + return TermValue(v, v, kind) + elif kind == u'bool': + if isinstance(v, basestring): + v = not v.strip().lower() in [u'false', u'f', u'no', u'n', + u'none', u'0', u'[]', u'{}', u''] + else: + v = bool(v) + return TermValue(v, v, kind) + elif not 
isinstance(v, basestring): + v = stringify(v) + return TermValue(v, stringify(v), u'string') + + # string quoting + return TermValue(v, stringify(v), u'string') + + +class FilterBinOp(BinOp): + + def __unicode__(self): + return com.pprint_thing("[Filter : [{0}] -> " + "[{1}]".format(self.filter[0], self.filter[1])) + + def invert(self): + """ invert the filter """ + if self.filter is not None: + f = list(self.filter) + f[1] = self.generate_filter_op(invert=True) + self.filter = tuple(f) + return self + + def format(self): + """ return the actual filter format """ + return [self.filter] + + def evaluate(self): + + if not isinstance(self.lhs, basestring): + return self + + if not self.is_valid: + raise ValueError("query term is not valid [%s]" % self) + + rhs = self.conform(self.rhs) + values = [TermValue(v, v, self.kind) for v in rhs] + + if self.is_in_table: + + # if too many values to create the expression, use a filter instead + if self.op in ['==', '!='] and len(values) > self._max_selectors: + + filter_op = self.generate_filter_op() + self.filter = ( + self.lhs, + filter_op, + Index([v.value for v in values])) + + return self + return None + + # equality conditions + if self.op in ['==', '!=']: + + filter_op = self.generate_filter_op() + self.filter = ( + self.lhs, + filter_op, + Index([v.value for v in values])) + + else: + raise TypeError( + "passing a filterable condition to a non-table indexer [%s]" % + self) + + return self + + def generate_filter_op(self, invert=False): + if (self.op == '!=' and not invert) or (self.op == '==' and invert): + return lambda axis, vals: ~axis.isin(vals) + else: + return lambda axis, vals: axis.isin(vals) + + +class JointFilterBinOp(FilterBinOp): + + def format(self): + raise NotImplementedError("unable to collapse Joint Filters") + + def evaluate(self): + return self + + +class ConditionBinOp(BinOp): + + def __unicode__(self): + return com.pprint_thing("[Condition : [{0}]]".format(self.condition)) + + def invert(self): + """ 
invert the condition """ + #if self.condition is not None: + # self.condition = "~(%s)" % self.condition + #return self + raise NotImplementedError("cannot use an invert condition when passing to numexpr") + + def format(self): + """ return the actual ne format """ + return self.condition + + def evaluate(self): + + if not isinstance(self.lhs, basestring): + return self + + if not self.is_valid: + raise ValueError("query term is not valid [%s]" % self) + + # convert values if we are in the table + if not self.is_in_table: + return None + + rhs = self.conform(self.rhs) + values = [self.convert_value(v) for v in rhs] + + # equality conditions + if self.op in ['==', '!=']: + + # too many values to create the expression? + if len(values) <= self._max_selectors: + vs = [self.generate(v) for v in values] + self.condition = "(%s)" % ' | '.join(vs) + + # use a filter after reading + else: + return None + else: + self.condition = self.generate(values[0]) + + return self + + +class JointConditionBinOp(ConditionBinOp): + + def evaluate(self): + self.condition = "(%s %s %s)" % ( + self.lhs.condition, + self.op, + self.rhs.condition) + return self + + +class UnaryOp(ops.UnaryOp): + + def prune(self, klass): + + if self.op != '~': + raise NotImplementedError("UnaryOp only support invert type ops") + + operand = self.operand + operand = operand.prune(klass) + + if operand is not None: + if issubclass(klass,ConditionBinOp): + if operand.condition is not None: + return operand.invert() + elif issubclass(klass,FilterBinOp): + if operand.filter is not None: + return operand.invert() + + return None + + + +_op_classes = {'unary': UnaryOp} + + +class ExprVisitor(BaseExprVisitor): + def __init__(self, env, **kwargs): + super(ExprVisitor, self).__init__(env) + for bin_op in self.binary_ops: + setattr(self, 'visit_{0}'.format(self.binary_op_nodes_map[bin_op]), + lambda node, bin_op=bin_op: partial(BinOp, bin_op, + **kwargs)) + + def visit_Name(self, node, side=None, **kwargs): + return 
Term(node.id, self.env, side=side, **kwargs) + + def visit_UnaryOp(self, node, **kwargs): + if isinstance(node.op, (ast.Not, ast.Invert)): + return UnaryOp('~', self.visit(node.operand)) + elif isinstance(node.op, ast.USub): + return Constant(-self.visit(node.operand).value, self.env) + elif isinstance(node.op, ast.UAdd): + raise NotImplementedError('Unary addition not supported') + + def visit_USub(self, node, **kwargs): + return Constant(-self.visit(node.operand).value, self.env) + + def visit_Index(self, node, **kwargs): + return self.visit(node.value).value + +class Expr(expr.Expr): + + """ hold a pytables like expression, comprised of possibly multiple 'terms' + + Parameters + ---------- + where : string term expression, Expr, or list-like of Exprs + queryables : a kinds map (dict of column name -> kind), or None i column is non-indexable + encoding : an encoding that will encode the query terms + + Returns + ------- + an Expr object + + Examples + -------- + + 'index>=date' + "columns=['A', 'D']" + 'columns=A' + 'columns==A' + "~(columns=['A','B'])" + 'index>df.index[3] & string="bar"' + '(index>df.index[3] & index<=df.index[6]) | string="bar"' + "ts>=Timestamp('2012-02-01')" + "major_axis>=20130101" + """ + + def __init__(self, where, op=None, value=None, queryables=None, + encoding=None, scope_level=None): + + # try to be back compat + where = self.parse_back_compat(where, op, value) + + self.encoding = encoding + self.condition = None + self.filter = None + self.terms = None + self._visitor = None + + # capture the environement if needed + lcls = dict() + if isinstance(where, Expr): + + lcls.update(where.env.locals) + where = str(where) + + elif isinstance(where, (list, tuple)): + + for w in where: + if isinstance(w, Expr): + lcls.update(w.env.locals) + else: + w = self.parse_back_compat(w) + + where = ' & ' .join(["(%s)" % w for w in where]) + + self.expr = where + self.env = Scope(lcls=lcls) + self.env.update(scope_level) + + if queryables is not None: + 
self.env.queryables.update(queryables) + self._visitor = ExprVisitor(self.env, queryables=queryables, + encoding=encoding) + self.terms = self.parse() + + def parse_back_compat(self, w, op=None, value=None): + """ allow backward compatibility for passed arguments """ + + if isinstance(w, dict): + w, op, value = w.get('field'), w.get('op'), w.get('value') + if not isinstance(w, basestring): + raise TypeError( + "where must be passed as a string if op/value are passed") + warnings.warn("passing a dict to Expr is deprecated, " + "pass the where as a single string", + DeprecationWarning) + + if op is not None: + if not isinstance(w, basestring): + raise TypeError( + "where must be passed as a string if op/value are passed") + + if isinstance(op, Expr): + raise TypeError("invalid op passed, must be a string") + w = "{0}{1}".format(w, op) + if value is not None: + if isinstance(value, Expr): + raise TypeError("invalid value passed, must be a string") + w = "{0}{1}".format(w, value) + + warnings.warn("passing multiple values to Expr is deprecated, " + "pass the where as a single string", + DeprecationWarning) + + return w + + def __unicode__(self): + if self.terms is not None: + return unicode(self.terms) + return self.expr + + def evaluate(self): + """ create and return the numexpr condition and filter """ + + try: + self.condition = self.terms.prune(ConditionBinOp) + except AttributeError: + raise ValueError( + "cannot process expression [{0}], [{1}] is not a valid condition".format(self.expr,self)) + try: + self.filter = self.terms.prune(FilterBinOp) + except AttributeError: + raise ValueError( + "cannot process expression [{0}], [{1}] is not a valid filter".format(self.expr,self)) + + return self.condition, self.filter + + +class TermValue(object): + + """ hold a term value the we use to construct a condition/filter """ + + def __init__(self, value, converted, kind): + self.value = value + self.converted = converted + self.kind = kind + + def tostring(self, encoding): 
+ """ quote the string if not encoded + else encode and return """ + if self.kind == u'string': + if encoding is not None: + return self.converted + return '"%s"' % self.converted + return self.converted diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py old mode 100644 new mode 100755 index 6ec630b80614d..fa96342ec9bdc --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -3,10 +3,11 @@ import unittest import itertools from itertools import product +import ast import nose from nose.tools import assert_raises, assert_tuple_equal -from nose.tools import assert_true, assert_false +from nose.tools import assert_true, assert_false, assert_equal from numpy.random import randn, rand import numpy as np @@ -15,12 +16,14 @@ import pandas as pd from pandas.core import common as com -from pandas import DataFrame, Series +from pandas import DataFrame, Series, Panel from pandas.util.testing import makeCustomDataframe as mkdf from pandas.computation.engines import _engines, _reconstruct_object from pandas.computation.align import _align_core +from pandas.computation.expr import NumExprVisitor, PythonExprVisitor from pandas.computation.ops import _binary_ops_dict, _unary_ops_dict, Term import pandas.computation.expr as expr +from pandas.computation import pytables from pandas.computation.expressions import _USE_NUMEXPR from pandas.computation.eval import Scope from pandas.util.testing import assert_frame_equal, randbool @@ -96,7 +99,6 @@ def _series_and_2d_ndarray(lhs, rhs): > 1) -# Smoke testing class TestBasicEval(unittest.TestCase): @classmethod @@ -645,6 +647,86 @@ def test_or_fails(): check_or_fails(engine) +_visitors = {'numexpr': NumExprVisitor, 'python': PythonExprVisitor, + 'pytables': pytables.ExprVisitor} + + +def check_disallowed_nodes(engine): + """make sure the disallowed decorator works""" + VisitorClass = _visitors[engine] + uns_ops = VisitorClass.unsupported_nodes + inst = 
VisitorClass('x + 1') + for ops in uns_ops: + assert_raises(NotImplementedError, getattr(inst, ops), inst, ast.AST()) + + +def test_disallowed_nodes(): + for engine in ('pytables', 'numexpr', 'python'): + check_disallowed_nodes(engine) + + +def check_simple_ops(engine): + ops = '+', '*', '/', '-', '%', '**' + + for op in ops: + expec = _eval_single_bin(1, op, 1, engine_has_neg_frac(engine)) + x = pd.eval('1 {0} 1'.format(op), engine=engine) + assert_equal(x, expec) + + expec = _eval_single_bin(x, op, 1, engine_has_neg_frac(engine)) + y = pd.eval('x {0} 1'.format(op), engine=engine) + assert_equal(y, expec) + + expec = _eval_single_bin(1, op, x + 1, engine_has_neg_frac(engine)) + y = pd.eval('1 {0} (x + 1)'.format(op), engine=engine) + assert_equal(y, expec) + + +def test_simple_ops(): + for engine in _engines: + check_simple_ops(engine) + + +def check_no_new_locals(engine): + x = 1 + lcls = locals().copy() + pd.eval('x + 1', local_dict=lcls) + lcls2 = locals().copy() + lcls2.pop('lcls') + assert_equal(lcls, lcls2) + + +def test_no_new_locals(): + for engine in _engines: + check_no_new_locals(engine) + + +def check_no_new_globals(engine): + x = 1 + gbls = globals().copy() + pd.eval('x + 1') + gbls2 = globals().copy() + assert_equal(gbls, gbls2) + + +def test_no_new_globals(): + for engine in _engines: + check_no_new_globals(engine) + + +def check_panel_fails(engine): + x = Panel(randn(3, 4, 5)) + y = Series(randn(10)) + assert_raises(NotImplementedError, pd.eval, 'x + y', local_dict={'x': x, + 'y': y}, + engine=engine) + + +def test_panel_fails(): + for engine in _engines: + check_panel_fails(engine) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c957ec9d331b9..59145cd54b360 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -28,15 +28,16 @@ _coerce_to_dtypes, _DATELIKE_DTYPES, is_list_like) from pandas.core.generic import 
NDFrame from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.indexing import (_NDFrameIndexer, _maybe_droplevels, - _convert_to_index_sliceable, _check_bool_indexer, - _maybe_convert_indices) +from pandas.core.indexing import (_maybe_droplevels, + _convert_to_index_sliceable, + _check_bool_indexer, _maybe_convert_indices) from pandas.core.internals import (BlockManager, create_block_manager_from_arrays, create_block_manager_from_blocks) from pandas.core.series import Series, _radd_compat from pandas.sparse.array import SparseArray import pandas.computation.expressions as expressions +from pandas.computation.eval import eval as _eval from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) @@ -55,7 +56,6 @@ import pandas.core.nanops as nanops import pandas.lib as lib -import pandas.tslib as tslib import pandas.algos as _algos from pandas.core.config import get_option, set_option @@ -1898,6 +1898,18 @@ def _getitem_frame(self, key): raise ValueError('Must pass DataFrame with boolean values only') return self.where(key) + def query(self, expr, **kwargs): + resolvers = kwargs.get('resolvers', None) + if resolvers is None: + index_resolvers = {} + if self.index.name is not None: + index_resolvers[self.index.name] = self.index + index_resolvers.update({'index': self.index, + 'columns': self.columns}) + resolvers = [self, index_resolvers] + kwargs.update({'resolvers': resolvers}) + return self[_eval(expr, **kwargs)] + def _slice(self, slobj, axis=0, raise_on_error=False): axis = self._get_block_manager_axis(axis) new_data = self._data.get_slice( @@ -4599,6 +4611,7 @@ def combineMult(self, other): DataFrame._setup_axes( ['index', 'columns'], info_axis=1, stat_axis=0, axes_are_reversed=True) + _EMPTY_SERIES = Series([]) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6e7f721950a15..3695a994bf0a8 100644 --- 
a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -34,6 +34,7 @@ from pandas.tools.merge import concat from pandas.io.common import PerformanceWarning from pandas.core.config import get_option +from pandas.computation.pytables import Expr import pandas.lib as lib import pandas.algos as algos @@ -62,6 +63,21 @@ def _ensure_encoding(encoding): encoding = _default_encoding return encoding +Term = Expr + +def _ensure_term(where): + """ ensure that the where is a Term or a list of Term + this makes sure that we are capturing the scope of variables + that are passed """ + + # create the terms here with a frame_level=2 (we are 2 levels down) + if isinstance(where, (list, tuple)): + where = [ w if isinstance(w, Term) else Term(w, scope_level=2) for w in where if w is not None ] + elif where is None or isinstance(where, Coordinates): + pass + elif not isinstance(where, Term): + where = Term(where, scope_level=2) + return where class PossibleDataLossError(Exception): pass @@ -556,6 +572,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None, raise KeyError('No object named %s in the file' % key) # create the storer and axes + where = _ensure_term(where) s = self._create_storer(group) s.infer_axes() @@ -587,6 +604,7 @@ def select_as_coordinates( start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection """ + where = _ensure_term(where) return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs) def unique(self, key, column, **kwargs): @@ -632,6 +650,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, """ # default to single select + where = _ensure_term(where) if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, compat.string_types): @@ -735,6 +754,7 @@ def remove(self, key, where=None, start=None, stop=None): raises KeyError if key is not a valid store """ + where = 
_ensure_term(where) try: s = self.get_storer(key) except: @@ -3070,8 +3090,8 @@ def process_axes(self, obj, columns=None): obj = _reindex_axis(obj, axis, labels, columns) # apply the selection filters (but keep in the same order) - if self.selection.filter: - for field, op, filt in self.selection.filter: + if self.selection.filter is not None: + for field, op, filt in self.selection.filter.format(): def process_filter(field, filt): @@ -4211,15 +4231,8 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): self.terms = self.generate(where) # create the numexpr & the filter - if self.terms: - terms = [t for t in self.terms if t.condition is not None] - if len(terms): - self.condition = "(%s)" % ' & '.join( - [t.condition for t in terms]) - self.filter = [] - for t in self.terms: - if t.filter is not None: - self.filter.append(t.filter) + if self.terms is not None: + self.condition, self.filter = self.terms.evaluate() def generate(self, where): """ where can be a : dict,list,tuple,string """ @@ -4245,7 +4258,7 @@ def select(self): generate the selection """ if self.condition is not None: - return self.table.table.readWhere(self.condition, start=self.start, stop=self.stop) + return self.table.table.readWhere(self.condition.format(), start=self.start, stop=self.stop) elif self.coordinates is not None: return self.table.table.readCoordinates(self.coordinates) return self.table.table.read(start=self.start, stop=self.stop) @@ -4257,7 +4270,7 @@ def select_coords(self): if self.condition is None: return np.arange(self.table.nrows) - return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort=True) + return self.table.table.getWhereList(self.condition.format(), start=self.start, stop=self.stop, sort=True) # utilities ### diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 6a325db8aaaa9..88173c001fa7e 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ 
-81,6 +81,7 @@ def _maybe_remove(store, key): except: pass + def compat_assert_produces_warning(w,f): """ don't produce a warning under PY3 """ if compat.PY3: @@ -89,6 +90,7 @@ def compat_assert_produces_warning(w,f): with tm.assert_produces_warning(expected_warning=w): f() + class TestHDFStore(unittest.TestCase): def setUp(self): @@ -328,8 +330,8 @@ def test_contains(self): self.assert_('bar' not in store) # GH 2694 - with tm.assert_produces_warning(expected_warning=tables.NaturalNameWarning): - store['node())'] = tm.makeDataFrame() + warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) + store['node())'] = tm.makeDataFrame() self.assert_('node())' in store) def test_versioning(self): @@ -886,16 +888,16 @@ def test_append_frame_column_oriented(self): expected = df.reindex(columns=['A']) tm.assert_frame_equal(expected, result) - # this isn't supported - self.assertRaises(TypeError, store.select, 'df1', ( - 'columns=A', Term('index', '>', df.index[4]))) - # selection on the non-indexable result = store.select( - 'df1', ('columns=A', Term('index', '=', df.index[0:4]))) + 'df1', ('columns=A', Term('index=df.index[0:4]'))) expected = df.reindex(columns=['A'], index=df.index[0:4]) tm.assert_frame_equal(expected, result) + # this isn't supported + self.assertRaises(TypeError, store.select, 'df1', ( + 'columns=A', Term('index>df.index[4]'))) + def test_append_with_different_block_ordering(self): #GH 4096; using same frames, but different block orderings @@ -1133,7 +1135,7 @@ def test_append_with_data_columns(self): # data column searching (with an indexable and a data_columns) result = store.select( - 'df', [Term('B>0'), Term('index', '>', df.index[3])]) + 'df', [Term('B>0'), Term('index>df.index[3]')]) df_new = df.reindex(index=df.index[4:]) expected = df_new[df_new.B > 0] tm.assert_frame_equal(result, expected) @@ -1145,7 +1147,7 @@ def test_append_with_data_columns(self): df_new['string'][5:6] = 'bar' _maybe_remove(store, 'df') store.append('df', 
df_new, data_columns=['string']) - result = store.select('df', [Term('string', '=', 'foo')]) + result = store.select('df', [Term('string=foo')]) expected = df_new[df_new.string == 'foo'] tm.assert_frame_equal(result, expected) @@ -1191,14 +1193,14 @@ def check_col(key,name,size): _maybe_remove(store, 'df') store.append( 'df', df_new, data_columns=['A', 'B', 'string', 'string2']) - result = store.select('df', [Term('string', '=', 'foo'), Term( + result = store.select('df', [Term('string=foo'), Term( 'string2=foo'), Term('A>0'), Term('B<0')]) expected = df_new[(df_new.string == 'foo') & ( df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] tm.assert_frame_equal(result, expected) # yield an empty frame - result = store.select('df', [Term('string', '=', 'foo'), Term( + result = store.select('df', [Term('string=foo'), Term( 'string2=cool')]) expected = df_new[(df_new.string == 'foo') & ( df_new.string2 == 'cool')] @@ -1759,7 +1761,7 @@ def compare(a,b): assert_frame_equal(result,df) # select with tz aware - compare(store.select('df_tz',where=Term('A','>=',df.A[3])),df[df.A>=df.A[3]]) + compare(store.select('df_tz',where=Term('A>=df.A[3]')),df[df.A>=df.A[3]]) _maybe_remove(store, 'df_tz') df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130103',tz='US/Eastern')),index=lrange(5)) @@ -1927,14 +1929,14 @@ def test_remove_where(self): with ensure_clean(self.path) as store: # non-existance - crit1 = Term('index', '>', 'foo') + crit1 = Term('index>foo') self.assertRaises(KeyError, store.remove, 'a', [crit1]) # try to remove non-table (with crit) # non-table ok (where = None) wp = tm.makePanel() - store.put('wp', wp, format='t') - store.remove('wp', [('minor_axis', ['A', 'D'])]) + store.put('wp', wp, fmt='t') + store.remove('wp', [("minor_axis=['A', 'D']")]) rs = store.select('wp') expected = wp.reindex(minor_axis=['B', 'C']) assert_panel_equal(rs, expected) @@ -1966,8 +1968,8 @@ def test_remove_crit(self): # group row removal date4 = 
wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10]) - crit4 = Term('major_axis', date4) - store.put('wp3', wp, format='table') + crit4 = Term('major_axis=date4') + store.put('wp3', wp, format='t') n = store.remove('wp3', where=[crit4]) assert(n == 36) result = store.select('wp3') @@ -1978,8 +1980,8 @@ def test_remove_crit(self): store.put('wp', wp, format='table') date = wp.major_axis[len(wp.major_axis) // 2] - crit1 = Term('major_axis', '>', date) - crit2 = Term('minor_axis', ['A', 'D']) + crit1 = Term('major_axis>date') + crit2 = Term("minor_axis=['A', 'D']") n = store.remove('wp', where=[crit1]) assert(n == 56) @@ -1995,14 +1997,14 @@ def test_remove_crit(self): store.put('wp2', wp, format='table') date1 = wp.major_axis[1:3] - crit1 = Term('major_axis', date1) + crit1 = Term('major_axis=date1') store.remove('wp2', where=[crit1]) result = store.select('wp2') expected = wp.reindex(major_axis=wp.major_axis - date1) assert_panel_equal(result, expected) date2 = wp.major_axis[5] - crit2 = Term('major_axis', date2) + crit2 = Term('major_axis=date2') store.remove('wp2', where=[crit2]) result = store['wp2'] expected = wp.reindex( @@ -2010,7 +2012,7 @@ def test_remove_crit(self): assert_panel_equal(result, expected) date3 = [wp.major_axis[7], wp.major_axis[9]] - crit3 = Term('major_axis', date3) + crit3 = Term('major_axis=date3') store.remove('wp2', where=[crit3]) result = store['wp2'] expected = wp.reindex( @@ -2020,62 +2022,94 @@ def test_remove_crit(self): # corners store.put('wp4', wp, format='table') n = store.remove( - 'wp4', where=[Term('major_axis', '>', wp.major_axis[-1])]) + 'wp4', where=[Term('major_axis>wp.major_axis[-1]')]) result = store.select('wp4') assert_panel_equal(result, wp) - def test_terms(self): + def test_invalid_terms(self): with ensure_clean(self.path) as store: + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.ix[0:4,'string'] = 'bar' wp = tm.makePanel() p4d = tm.makePanel4D() store.put('wp', wp, format='table') store.put('p4d', p4d, 
format='table') # some invalid terms - terms = [ - ['minor', ['A', 'B']], - ['index', ['20121114']], - ['index', ['20121114', '20121114']], - ] - for t in terms: - self.assertRaises(Exception, store.select, 'wp', t) + self.assertRaises(NameError, store.select, 'wp', "minor=['A', 'B']") + self.assertRaises(NameError, store.select, 'wp', ["index=['20121114']"]) + self.assertRaises(NameError, store.select, 'wp', ["index=['20121114', '20121114']"]) + + # deprecations + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + Term('index','==') - self.assertRaises(Exception, Term.__init__) - self.assertRaises(Exception, Term.__init__, 'blah') - self.assertRaises(Exception, Term.__init__, 'index') - self.assertRaises(Exception, Term.__init__, 'index', '==') - self.assertRaises(Exception, Term.__init__, 'index', '>', 5) + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + Term('index', '>', 5) + + self.assertRaises(TypeError, Term) + + # more invalid + self.assertRaises(ValueError, store.select, 'df','df.index[3]') + self.assertRaises(SyntaxError, store.select, 'df','index>') + self.assertRaises(ValueError, store.select, 'wp', "major_axis<'20000108' & minor_axis['A', 'B']") + + def test_terms(self): + + with ensure_clean(self.path) as store: + + wp = tm.makePanel() + p4d = tm.makePanel4D() + store.put('wp', wp, table=True) + store.put('p4d', p4d, table=True) # panel result = store.select('wp', [Term( - 'major_axis<20000108'), Term('minor_axis', '=', ['A', 'B'])]) + 'major_axis<"20000108"'), Term("minor_axis=['A', 'B']")]) expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) assert_panel_equal(result, expected) + # with deprecation + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [Term( + 'major_axis','<',"20000108"), Term("minor_axis=['A', 'B']")]) + expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) + tm.assert_panel_equal(result, expected) + # p4d - 
result = store.select('p4d', [Term('major_axis<20000108'), - Term('minor_axis', '=', ['A', 'B']), - Term('items', '=', ['ItemA', 'ItemB'])]) + result = store.select('p4d', [Term('major_axis<"20000108"'), + Term("minor_axis=['A', 'B']"), + Term("items=['ItemA', 'ItemB']")]) expected = p4d.truncate(after='20000108').reindex( minor=['A', 'B'], items=['ItemA', 'ItemB']) assert_panel4d_equal(result, expected) - # valid terms + # back compat invalid terms terms = [ dict(field='major_axis', op='>', value='20121114'), - ('major_axis', '20121114'), - ('major_axis', '>', '20121114'), - (('major_axis', ['20121114', '20121114']),), - ('major_axis', datetime.datetime(2012, 11, 14)), + [ dict(field='major_axis', op='>', value='20121114') ], + [ "minor_axis=['A','B']", dict(field='major_axis', op='>', value='20121114') ] + ] + for t in terms: + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + Term(t) + + # valid terms + terms = [ + ('major_axis=20121114'), + ('major_axis>20121114'), + (("major_axis=['20121114', '20121114']"),), + ('major_axis=datetime.datetime(2012, 11, 14)'), 'major_axis> 20121114', 'major_axis >20121114', 'major_axis > 20121114', - (('minor_axis', ['A', 'B']),), - (('minor_axis', ['A', 'B']),), - ((('minor_axis', ['A', 'B']),),), - (('items', ['ItemA', 'ItemB']),), + (("minor_axis=['A', 'B']"),), + (("minor_axis=['A', 'B']"),), + ((("minor_axis==['A', 'B']"),),), + (("items=['ItemA', 'ItemB']"),), ('items=ItemA'), ] @@ -2085,8 +2119,8 @@ def test_terms(self): # valid for p4d only terms = [ - (('labels', '=', ['l1', 'l2']),), - Term('labels', '=', ['l1', 'l2']), + (("labels=['l1', 'l2']"),), + Term("labels=['l1', 'l2']"), ] for t in terms: @@ -2211,7 +2245,7 @@ def test_index_types(self): self._check_roundtrip(ser, func) ser = Series(values, [datetime.datetime( - 2012, 1, 1), datetime.datetime(2012, 1, 2)]) + 2012, 1, 1), datetime.datetime(2012, 1, 2)]) self._check_roundtrip(ser, func) def test_timeseries_preepoch(self): @@ -2525,7 +2559,7 
@@ def test_select(self): _maybe_remove(store, 'wp') store.append('wp', wp) items = ['Item%03d' % i for i in range(80)] - result = store.select('wp', Term('items', items)) + result = store.select('wp', Term('items=items')) expected = wp.reindex(items=items) assert_panel_equal(expected, result) @@ -2542,7 +2576,7 @@ def test_select(self): tm.assert_frame_equal(expected, result) # equivalentsly - result = store.select('df', [('columns', ['A', 'B'])]) + result = store.select('df', [("columns=['A', 'B']")]) expected = df.reindex(columns=['A', 'B']) tm.assert_frame_equal(expected, result) @@ -2575,7 +2609,8 @@ def test_select_dtypes(self): df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), A=np.random.randn(300))) _maybe_remove(store, 'df') store.append('df', df, data_columns=['ts', 'A']) - result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01'))]) + + result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")]) expected = df[df.ts >= Timestamp('2012-02-01')] tm.assert_frame_equal(expected, result) @@ -2602,7 +2637,7 @@ def test_select_dtypes(self): _maybe_remove(store, 'df_int') store.append('df_int', df) result = store.select( - 'df_int', [Term("index<10"), Term("columns", "=", ["A"])]) + 'df_int', [Term("index<10"), Term("columns=['A']")]) expected = df.reindex(index=list(df.index)[0:10],columns=['A']) tm.assert_frame_equal(expected, result) @@ -2612,7 +2647,7 @@ def test_select_dtypes(self): _maybe_remove(store, 'df_float') store.append('df_float', df) result = store.select( - 'df_float', [Term("index<10.0"), Term("columns", "=", ["A"])]) + 'df_float', [Term("index<10.0"), Term("columns=['A']")]) expected = df.reindex(index=list(df.index)[0:10],columns=['A']) tm.assert_frame_equal(expected, result) @@ -2628,30 +2663,30 @@ def test_select_with_many_inputs(self): store.append('df', df, data_columns=['ts', 'A', 'B', 'users']) # regular select - result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01'))]) + result = 
store.select('df', [Term("ts>=Timestamp('2012-02-01')")]) expected = df[df.ts >= Timestamp('2012-02-01')] tm.assert_frame_equal(expected, result) # small selector - result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01')),Term('users',['a','b','c'])]) + result = store.select('df', [Term("ts>=Timestamp('2012-02-01') & users=['a','b','c']")]) expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(['a','b','c']) ] tm.assert_frame_equal(expected, result) # big selector along the columns selector = [ 'a','b','c' ] + [ 'a%03d' % i for i in range(60) ] - result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01')),Term('users',selector)]) + result = store.select('df', [Term("ts>=Timestamp('2012-02-01')"),Term('users=selector')]) expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(selector) ] tm.assert_frame_equal(expected, result) selector = lrange(100,200) - result = store.select('df', [Term('B', selector)]) + result = store.select('df', [Term('B=selector')]) expected = df[ df.B.isin(selector) ] tm.assert_frame_equal(expected, result) self.assert_(len(result) == 100) # big selector along the index selector = Index(df.ts[0:100].values) - result = store.select('df', [Term('ts', selector)]) + result = store.select('df', [Term('ts=selector')]) expected = df[ df.ts.isin(selector.values) ] tm.assert_frame_equal(expected, result) self.assert_(len(result) == 100) @@ -2807,15 +2842,15 @@ def test_panel_select(self): store.put('wp', wp, format='table') date = wp.major_axis[len(wp.major_axis) // 2] - crit1 = ('major_axis', '>=', date) - crit2 = ('minor_axis', '=', ['A', 'D']) + crit1 = ('major_axis>=date') + crit2 = ("minor_axis=['A', 'D']") result = store.select('wp', [crit1, crit2]) expected = wp.truncate(before=date).reindex(minor=['A', 'D']) assert_panel_equal(result, expected) result = store.select( - 'wp', ['major_axis>=20000124', ('minor_axis', '=', ['A', 'B'])]) + 'wp', ['major_axis>="20000124"', ("minor_axis=['A', 'B']")]) 
expected = wp.truncate(before='20000124').reindex(minor=['A', 'B']) assert_panel_equal(result, expected) @@ -2827,9 +2862,9 @@ def test_frame_select(self): store.put('frame', df,format='table') date = df.index[len(df) // 2] - crit1 = ('index', '>=', date) - crit2 = ('columns', ['A', 'D']) - crit3 = ('columns', 'A') + crit1 = Term('index>=date') + crit2 = ("columns=['A', 'D']") + crit3 = ('columns=A') result = store.select('frame', [crit1, crit2]) expected = df.ix[date:, ['A', 'D']] @@ -2850,6 +2885,62 @@ def test_frame_select(self): # self.assertRaises(ValueError, store.select, # 'frame', [crit1, crit2]) + def test_frame_select_complex(self): + """ select via complex criteria """ + + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.loc[df.index[0:4],'string'] = 'bar' + + with ensure_clean(self.path) as store: + store.put('df', df, table=True, data_columns=['string']) + + # empty + result = store.select('df', 'index>df.index[3] & string="bar"') + expected = df.loc[(df.index>df.index[3]) & (df.string=='bar')] + tm.assert_frame_equal(result, expected) + + result = store.select('df', 'index>df.index[3] & string="foo"') + expected = df.loc[(df.index>df.index[3]) & (df.string=='foo')] + tm.assert_frame_equal(result, expected) + + # or + result = store.select('df', 'index>df.index[3] | string="bar"') + expected = df.loc[(df.index>df.index[3]) | (df.string=='bar')] + tm.assert_frame_equal(result, expected) + + result = store.select('df', '(index>df.index[3] & index<=df.index[6]) | string="bar"') + expected = df.loc[((df.index>df.index[3]) & (df.index<=df.index[6])) | (df.string=='bar')] + tm.assert_frame_equal(result, expected) + + # invert + result = store.select('df', 'string!="bar"') + expected = df.loc[df.string!='bar'] + tm.assert_frame_equal(result, expected) + + # invert not implemented in numexpr :( + self.assertRaises(NotImplementedError, store.select, 'df', '~(string="bar")') + + # invert ok for filters + result = store.select('df', "~(columns=['A','B'])") 
+ expected = df.loc[:,df.columns-['A','B']] + tm.assert_frame_equal(result, expected) + + def test_invalid_filtering(self): + + # can't use more than one filter (atm) + + df = tm.makeTimeDataFrame() + + with ensure_clean(self.path) as store: + store.put('df', df, table=True) + + # not implemented + self.assertRaises(NotImplementedError, store.select, 'df', "columns=['A'] | columns=['B']") + + # in theory we could deal with this + self.assertRaises(NotImplementedError, store.select, 'df', "columns=['A','B'] & columns=['C']") + def test_string_select(self): # GH 2973 @@ -3121,12 +3212,17 @@ def test_select_as_multiple(self): expected = concat([df1, df2], axis=1) expected = expected[5:] tm.assert_frame_equal(result, expected) - except (Exception) as detail: - print("error in select_as_multiple %s" % str(detail)) - print("store: %s" % store) - print("df1: %s" % df1) - print("df2: %s" % df2) - + except (Exception), detail: + print ("error in select_as_multiple %s" % str(detail)) + print ("store: %s" % store) + print ("df1: %s" % df1) + print ("df2: %s" % df2) + + result = store.select_as_multiple(['df1', 'df2'], where=[Term( + 'index>df2.index[4]')], selector='df2') + expected = concat([df1, df2], axis=1) + expected = expected[5:] + tm.assert_frame_equal(result, expected) # test excpection for diff rows store.append('df3', tm.makeTimeDataFrame(nper=50)) @@ -3141,13 +3237,13 @@ def test_start_stop(self): store.append('df', df) result = store.select( - 'df', [Term("columns", "=", ["A"])], start=0, stop=5) + 'df', [Term("columns=['A']")], start=0, stop=5) expected = df.ix[0:4, ['A']] tm.assert_frame_equal(result, expected) # out of range result = store.select( - 'df', [Term("columns", "=", ["A"])], start=30, stop=40) + 'df', [Term("columns=['A']")], start=30, stop=40) assert(len(result) == 0) tm.assert_isinstance(result, DataFrame) @@ -3160,7 +3256,7 @@ def test_select_filter_corner(self): with ensure_clean(self.path) as store: store.put('frame', df, format='table') - crit 
= Term('columns', df.columns[:75]) + crit = Term('columns=df.columns[:75]') result = store.select('frame', [crit]) tm.assert_frame_equal(result, df.ix[:, df.columns[:75]]) @@ -3330,11 +3426,12 @@ def test_legacy_table_read(self): # old version warning with tm.assert_produces_warning(expected_warning=IncompatibilityWarning): self.assertRaises( - Exception, store.select, 'wp1', Term('minor_axis', '=', 'B')) + Exception, store.select, 'wp1', Term('minor_axis=B')) - with tm.assert_produces_warning(expected_warning=IncompatibilityWarning): df2 = store.select('df2') - store.select('df2', Term('index', '>', df2.index[2])) + result = store.select('df2', Term('index>df2.index[2]')) + expected = df2[df2.index > df2.index[2]] + assert_frame_equal(expected, result) finally: safe_close(store) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index a5c1941a7f2d3..66a71d72019e6 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -30,7 +30,7 @@ import pandas.core.format as fmt import pandas.core.datetools as datetools from pandas.core.api import (DataFrame, Index, Series, notnull, isnull, - MultiIndex, DatetimeIndex, Timestamp, Period) + MultiIndex, DatetimeIndex, Timestamp) from pandas import date_range import pandas as pd from pandas.io.parsers import read_csv @@ -44,6 +44,7 @@ ensure_clean) from pandas.core.indexing import IndexingError from pandas.core.common import PandasError +from pandas.util.compat import OrderedDict import pandas.util.testing as tm import pandas.lib as lib @@ -2119,7 +2120,6 @@ def test_constructor_cast_failure(self): # this is ok df['foo2'] = np.ones((4,2)).tolist() - def test_constructor_dtype_nocast_view(self): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) @@ -3166,7 +3166,6 @@ def test_constructor_single_value(self): with tm.assertRaisesRegexp(TypeError, 'incompatible data and dtype'): DataFrame('a', [1, 2], ['a', 'c'], float) - def test_constructor_with_datetimes(self): intname = 
np.dtype(np.int_).name floatname = np.dtype(np.float_).name @@ -5238,8 +5237,6 @@ def make_dtnat_arr(n,nnat=None): _do_test(mkdf(nrows, ncols,r_idx_nlevels=2,c_idx_nlevels=2), path,rnlvl=2,cnlvl=2) - - def test_to_csv_from_csv_w_some_infs(self): # test roundtrip with inf, -inf, nan, as full columns and mix @@ -8098,6 +8095,45 @@ def test_mask_edge_case_1xN_frame(self): expec = DataFrame([[nan, 2]]) assert_frame_equal(res, expec) + def test_query(self): + try: + import numexpr as ne + except ImportError: + raise nose.SkipTest + # comparison + df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) + assert_frame_equal(df.query('a < b'), df[df.a < df.b]) + + # arith ops + assert_frame_equal(df.query('a + b > b * c'), + df[df.a + df.b > df.b * df.c]) + + local_dict = dict(df.iteritems()) + local_dict.update({'df': df}) + self.assertRaises(NameError, df.query, 'a < d & b < f', + local_dict=local_dict) + + # make sure that it's not just because we didn't pass the locals in + self.assertRaises(AssertionError, self.assertRaises, NameError, + df.query, 'a < b', local_dict=local_dict) + + def test_query_index(self): + try: + import numexpr as ne + except ImportError: + raise nose.SkipTest + + df = DataFrame(np.random.randn(10, 3), index=Index(range(10), + name='blob'), + columns=['a', 'b', 'c']) + assert_frame_equal(df.query('index < b'), df[df.index < df.b]) + assert_frame_equal(df.query('index < 5'), df[df.index < 5]) + assert_frame_equal(df.query('(blob < 5) & (a < b)'), df[(df.index < 5) + & (df.a < + df.b)]) + assert_frame_equal(df.query('blob < b'), df[df.index < df.b]) + + #---------------------------------------------------------------------- # Transposing def test_transpose(self): @@ -8228,7 +8264,6 @@ def test_diff(self): assert_series_equal(the_diff['A'], tf['A'] - tf['A'].shift(1)) - def test_diff_mixed_dtype(self): df = DataFrame(np.random.randn(5, 3)) df['A'] = np.array([1, 2, 3, 4, 5], dtype=object) @@ -10137,7 +10172,6 @@ def 
test_unstack_dtypes(self): expected = Series({'float64' : 2, 'object' : 2}) assert_series_equal(result, expected) - def test_reset_index(self): stacked = self.frame.stack()[::2] stacked = DataFrame({'foo': stacked, 'bar': stacked}) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index eeb5ca4369164..a070fa7ca4216 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -13,12 +13,14 @@ from datetime import datetime from functools import wraps, partial from contextlib import contextmanager +from httplib import HTTPException +from urllib2 import urlopen from distutils.version import LooseVersion from numpy.random import randn, rand import numpy as np -from pandas.core.common import isnull, _is_sequence +from pandas.core.common import isnull, _is_sequence, is_list_like import pandas.core.index as index import pandas.core.series as series import pandas.core.frame as frame @@ -1136,7 +1138,7 @@ def handle_success(self, exc_type, exc_value, traceback): @contextmanager -def assert_produces_warning(expected_warning=Warning, filter_level="always"): +def assert_produces_warning(expected_warning=None, filter_level="always"): """ Context manager for running code that expects to raise (or not raise) warnings. Checks that code raises the expected warning and only the @@ -1162,19 +1164,25 @@ def assert_produces_warning(expected_warning=Warning, filter_level="always"): ..warn:: This is *not* thread-safe. 
""" + if expected_warning is None: + expected_warning = [Warning] + elif not is_list_like(expected_warning): + expected_warning = [expected_warning] with warnings.catch_warnings(record=True) as w: saw_warning = False warnings.simplefilter(filter_level) yield w extra_warnings = [] for actual_warning in w: - if (expected_warning and issubclass(actual_warning.category, - expected_warning)): + if (expected_warning and any(issubclass(actual_warning.category, + ew) for ew in + expected_warning)): saw_warning = True else: extra_warnings.append(actual_warning.category.__name__) if expected_warning: - assert saw_warning, ("Did not see expected warning of class %r." - % expected_warning.__name__) + msg = ', '.join(ew.__name__ for ew in expected_warning) + assert saw_warning, ("Did not see expected warning(s) of " + "class(es): %s." % msg) assert not extra_warnings, ("Caused unexpected warning(s): %r." % extra_warnings) From b35406abda6803dc1c37b1375f73c07254fea551 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 14 Jul 2013 15:00:53 -0400 Subject: [PATCH 04/16] ENH: add automatic query via frame.__getitem__ --- doc/source/api.rst | 12 + doc/source/enhancingperf.rst | 155 +++++++- doc/source/indexing.rst | 130 ++++--- pandas/computation/align.py | 11 +- pandas/computation/engines.py | 2 +- pandas/computation/eval.py | 53 ++- pandas/computation/expr.py | 63 ++-- pandas/computation/ops.py | 44 ++- pandas/computation/tests/test_eval.py | 502 ++++++++++++++------------ pandas/core/frame.py | 91 ++++- pandas/tests/test_frame.py | 107 ++++-- vb_suite/eval.py | 49 +++ vb_suite/suite.py | 3 +- 13 files changed, 857 insertions(+), 365 deletions(-) create mode 100644 vb_suite/eval.py diff --git a/doc/source/api.rst b/doc/source/api.rst index 538965d0be7ad..837afb8996db3 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -155,6 +155,17 @@ Top-level dealing with datetimes to_datetime +Top-level evaluation +~~~~~~~~~~~~~~~~~~~~ + +.. currentmodule:: pandas.computation.eval + +.. 
autosummary:: + :toctree: generated/ + + eval + + Standard moving window functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -452,6 +463,7 @@ Indexing, iteration DataFrame.tail DataFrame.xs DataFrame.isin + DataFrame.query Binary operator functions ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 95428bd27e2a2..3f3a31879752a 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -225,8 +225,8 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra .. note:: - Loop like this would be *extremely* slow in python, but in cython looping over - numpy arrays is *fast*. + Loops like this would be *extremely* slow in python, but in Cython looping + over numpy arrays is *fast*. .. ipython:: python @@ -289,3 +289,154 @@ Further topics - Loading C modules into cython. Read more in the `cython docs `__. + +.. _enhancingperf.eval: + +Expression Evaluation via :func:`~pandas.computation.eval.eval` +--------------------------------------------------------------- + +New in pandas v0.13 a top-level function :func:`~pandas.computation.eval.eval` +implements expression evaluation of expressions containing +:class:`~pandas.core.series.Series` and :class:`~pandas.core.frame.DataFrame` +objects. + +.. note:: + + To benefit from using :func:`~pandas.computation.eval.eval` you need to + install ``numexpr``. See the :ref:`recommended dependencies section + ` for more details. + +The major benefit of using :func:`~pandas.computation.eval.eval` for expression +evaluation rather than just straight-up Python is two-fold: large +:class:`~pandas.core.frame.DataFrame` objects are evaluated more efficiently +and large expressions are evaluated all at once by the underlying engine (by +default ``numexpr`` is used for evaluation). + +.. note:: + + You should not use :func:`~pandas.computation.eval.eval` for simple + expressions or for expressions involving small DataFrames. 
In fact, + :func:`~pandas.computation.eval.eval` is many orders of magnitude slower for + smaller expressions/objects than plain ole' Python. A good rule of thumb is + to only use :func:`~pandas.computation.eval.eval` when you have a + :class:`~pandas.core.frame.DataFrame` with more than 10,000 rows. + + +:func:`~pandas.computation.eval.eval` supports all arithmetic expressions +supported by the engine (by default the engine is ``numexpr``). The ``numexpr`` +engine uses ``numexpr`` under the hood to evaluate expressions efficiently, +while allowing a slightly modified, and we think more intuitive syntax for +expressions. + + +.. note:: + + The larger the frame and the larger the expression the more speedup you will + see from using :func:`~pandas.computation.eval.eval`. + + +:func:`~pandas.computation.eval.eval` Examples +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`~pandas.computation.eval.eval` works wonders for expressions containing +large arrays + +First let's create 4 decent-sized arrays to play with: + +.. ipython:: python + + import pandas as pd + from pandas import DataFrame, Series + from numpy.random import randn + import numpy as np + nrows, ncols = 20000, 100 + df1, df2, df3, df4 = [DataFrame(randn(nrows, ncols)) for _ in xrange(4)] + + +Now let's compare adding them together using plain ol' Python versus +:func:`~pandas.computation.eval.eval`: + + +.. ipython:: python + + %timeit df1 + df2 + df3 + df4 + +.. ipython:: python + + %timeit pd.eval('df1 + df2 + df3 + df4') + + +Now let's do the same thing but with comparisons: + +.. ipython:: python + + %timeit (df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0) + +.. ipython:: python + + %timeit pd.eval('(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)') + + +:func:`~pandas.computation.eval.eval` also works with "unaligned" pandas +objects: + + +.. ipython:: python + + s = Series(randn(50)) + %timeit df1 + df2 + df3 + df4 + s + +.. 
ipython:: python + + %timeit pd.eval('df1 + df2 + df3 + df4 + s') + +There are also two different flavors of parsers and two different engines +to use as the backend. + +:func:`~pandas.computation.eval.eval` Parsers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The default ``'pandas'`` parser allows a more intuitive (we think) syntax +for expressing query-like operations (comparisons, conjunctions and +disjunctions). In particular, the precedence of the ``&`` and ``|`` operators +is made equal to the precedence of the corresponding boolean operations ``and`` +and ``or``. + +For example, the above conjunction can be written without +parentheses. Alternatively, you can use the ``'python'`` parser to enforce +strict Python semantics. + +.. ipython:: python + + expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)' + x = pd.eval(expr, parser='python') + expr_no_parens = 'df1 > 0 & df2 > 0 & df3 > 0 & df4 > 0' + y = pd.eval(expr_no_parens, parser='pandas') + np.all(x == y) + + +:func:`~pandas.computation.eval.eval` Backends +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There's also the option to make :func:`~pandas.computation.eval.eval` operate +identically to plain ol' Python. + +.. note:: + + Using the ``'python'`` engine is generally *not* useful, except for + comparing performance and testing other + :func:`~pandas.computation.eval.eval` engines against it. You will achieve + **no** performance benefits using :func:`~pandas.computation.eval.eval` with + ``engine='python'``. + +You can see that using :func:`~pandas.computation.eval.eval` with the +``'python'`` engine is actually a bit slower (not by much) than evaluating the +same expression in Python: + +.. ipython:: python + + %timeit df1 + df2 + df3 + df4 + +..
ipython:: python + + %timeit pd.eval('df1 + df2 + df3 + df4', engine='python') diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index d2fd11ee43615..47bf5fe29dc86 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1006,6 +1006,77 @@ convert to an integer index: df_new[(df_new['index'] >= 1.0) & (df_new['index'] < 2)] +.. _indexing.query: + +The ``query`` Method +~~~~~~~~~~~~~~~~~~~~ +New in pandas v0.13, :class:`~pandas.core.frame.DataFrame` objects have a +:meth:`~pandas.core.frame.DataFrame.query` method that allows selection using a +string consisting of columns of the calling +:class:`~pandas.core.frame.DataFrame`. + + + +.. _indexing.class: + +Index objects +------------- + +The pandas Index class and its subclasses can be viewed as implementing an +*ordered set* in addition to providing the support infrastructure necessary for +lookups, data alignment, and reindexing. The easiest way to create one directly +is to pass a list or other sequence to ``Index``: + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b']) + index + 'd' in index + +You can also pass a ``name`` to be stored in the index: + + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b'], name='something') + index.name + +Starting with pandas 0.5, the name, if set, will be shown in the console +display: + +.. ipython:: python + + index = Index(list(range(5)), name='rows') + columns = Index(['A', 'B', 'C'], name='cols') + df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) + df + df['A'] + + +Set operations on Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _indexing.set_ops: + +The three main operations are ``union (|)``, ``intersection (&)``, and ``diff +(-)``. These can be directly called as instance methods or used via overloaded +operators: + +.. 
ipython:: python + + a = Index(['c', 'b', 'a']) + b = Index(['c', 'e', 'd']) + a.union(b) + a | b + a & b + a - b + +``isin`` method of Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One additional operation is the ``isin`` method that works analogously to the +``Series.isin`` method found :ref:`here `. + .. _indexing.hierarchical: Hierarchical indexing (MultiIndex) @@ -1354,65 +1425,6 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but if you compute the levels and labels yourself, please be careful. -.. _indexing.class: - -Index objects -------------- - -The pandas Index class and its subclasses can be viewed as implementing an -*ordered set* in addition to providing the support infrastructure necessary for -lookups, data alignment, and reindexing. The easiest way to create one directly -is to pass a list or other sequence to ``Index``: - -.. ipython:: python - - index = Index(['e', 'd', 'a', 'b']) - index - 'd' in index - -You can also pass a ``name`` to be stored in the index: - - -.. ipython:: python - - index = Index(['e', 'd', 'a', 'b'], name='something') - index.name - -Starting with pandas 0.5, the name, if set, will be shown in the console -display: - -.. ipython:: python - - index = Index(list(range(5)), name='rows') - columns = Index(['A', 'B', 'C'], name='cols') - df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) - df - df['A'] - - -Set operations on Index objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _indexing.set_ops: - -The three main operations are ``union (|)``, ``intersection (&)``, and ``diff -(-)``. These can be directly called as instance methods or used via overloaded -operators: - -.. 
ipython:: python - - a = Index(['c', 'b', 'a']) - b = Index(['c', 'e', 'd']) - a.union(b) - a | b - a & b - a - b - -``isin`` method of Index objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -One additional operation is the ``isin`` method that works analogously to the -``Series.isin`` method found :ref:`here `. Setting index metadata (``name(s)``, ``levels``, ``labels``) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/computation/align.py b/pandas/computation/align.py index 65840bb68b4ea..09606fc41a46b 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -166,8 +166,15 @@ def _filter_terms(flat): def _align(terms): """Align a set of terms""" - # flatten the parse tree (a nested list, really) - terms = list(flatten(terms)) + try: + # flatten the parse tree (a nested list, really) + terms = list(flatten(terms)) + except TypeError: + # can't iterate so it must just be a constant or single variable + if isinstance(terms.value, (pd.Series, pd.core.generic.NDFrame)): + typ = type(terms.value) + return typ, _zip_axes_from_type(typ, terms.value.axes) + return np.result_type(terms.type), None # if all resolved variables are numeric scalars if all(term.isscalar for term in terms): diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index ea296ad0e4dd4..d4f23324b672f 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -76,4 +76,4 @@ def _evaluate(self, env): pass -_engines = {'numexpr': NumExprEngine, 'python': PythonEngine } +_engines = {'numexpr': NumExprEngine, 'python': PythonEngine} diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 04e17e3e41ac2..5f234b7864427 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -4,27 +4,49 @@ import numpy as np -import six - -from pandas.computation.expr import Expr, Scope +from pandas.computation.expr import Expr, Scope, _parsers from pandas.computation.engines import _engines -def 
eval(expr, engine='numexpr', truediv=True, local_dict=None, - global_dict=None, resolvers=None): +def _check_engine(engine): + if engine not in _engines: + raise KeyError('Invalid engine {0!r} passed, valid engines are' + ' {1}'.format(engine, _engines.keys())) + if engine == 'numexpr': + try: + import numexpr + except ImportError: + raise ImportError("'numexpr' not found. Cannot use " + "engine='numexpr' if 'numexpr' is not installed") + + +def _check_parser(parser): + if parser not in _parsers: + raise KeyError('Invalid parser {0!r} passed, valid parsers are' + ' {1}'.format(parser, _parsers.keys())) + + + +def eval(expr, parser='pandas', engine='numexpr', truediv=True, + local_dict=None, global_dict=None, resolvers=None): """Evaluate a Python expression as a string using various backends. The following arithmetic operations are supported: +, -, *, /, **, %, // (python engine only) along with the following boolean operations: | (or), & - (and), and ~ (not). All Pandas objects are supported and behave as they - would with in-Python evaluation. + (and), and ~ (not). Series and DataFrame objects are supported and behave + as they would with in-Python evaluation. Parameters ---------- expr : string or Expr object The expression to evaluate. This can be either a string or an ``Expr`` object. - engine : string, optional, default 'numexpr', {'python', 'numexpr' } + parser : str, optional, default 'pandas', {'pandas', 'python'} + The parser to use to construct the syntax tree from the expression. The + default of 'pandas' parses code slightly different than standard + Python. See the :ref:`enhancing performance ` + documentation for more details. + engine : string, optional, default 'numexpr', {'python', 'numexpr'} The engine used to evaluate the expression. Supported engines are - 'numexpr': This default engine evaluates pandas objects using numexpr @@ -32,7 +54,6 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, frames. 
- 'python': Performs operations as if you had eval'd in top level python - truediv : bool, optional, default True Whether to use true division, like in Python >= 3 local_dict : dict or None, optional, default None @@ -42,27 +63,23 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, Returns ------- - obj : ndarray, scalar, DataFrame, Series, or Panel + obj : ndarray, scalar, DataFrame, Series Notes ----- - * The benefits of using ``eval`` are that very large frames that are terms in - long expressions are sped up, sometimes by as much as 10x. - See :ref:`Enhancing performance ` for more details. """ # make sure we're passed a valid engine - if not engine in _engines: - raise KeyError('Invalid engine {0} passed, valid engines are' - ' {1}'.format(_engines.keys())) + _check_engine(engine) + _check_parser(parser) eng = _engines[engine] - if isinstance(expr, six.string_types): + if isinstance(expr, basestring): # need to go 2 up in the call stack from the constructor env = Scope(global_dict, local_dict, frame_level=2, resolvers=resolvers) - parsed_expr = Expr(expr, engine, env, truediv) + parsed_expr = Expr(expr, engine, parser, env, truediv) elif isinstance(expr, Expr): parsed_expr = expr else: diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index cb7b269485f95..23f4341102729 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -8,10 +8,11 @@ from functools import partial from pandas.core.base import StringMixin -from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops -from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms -from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms -from pandas.computation.ops import Term, Constant +from pandas.core import common as com +from pandas.computation.ops import (BinOp, UnaryOp, _reductions, _mathops, + _cmp_ops_syms, _bool_ops_syms, + _arith_ops_syms, _unary_ops_syms, Term, + Constant) import pandas.lib as lib import datetime @@ 
-34,6 +35,10 @@ def __init__(self, gbls=None, lcls=None, frame_level=1, resolvers=None): self.globals['Timestamp'] = lib.Timestamp self.globals['datetime'] = datetime + # SUCH a hack + self.globals['True'] = True + self.globals['False'] = False + self.resolvers = resolvers or [] self.resolver_keys = set(reduce(operator.add, (list(o.keys()) for o in self.resolvers), [])) @@ -219,8 +224,15 @@ def __init__(self, env, preparser=_preparse): self.preparser = preparser def visit(self, node, **kwargs): + parse = lambda x: ast.fix_missing_locations(ast.parse(x)) if isinstance(node, basestring): - node = ast.fix_missing_locations(ast.parse(self.preparser(node))) + clean = self.preparser(node) + elif isinstance(node, ast.AST): + clean = node + else: + raise TypeError("Cannot visit objects of type {0!r}" + "".format(node.__class__.__name__)) + node = parse(clean) method = 'visit_' + node.__class__.__name__ visitor = getattr(self, method, None) @@ -263,7 +275,6 @@ def visit_Index(self, node, **kwargs): """ df.index[4] """ return self.visit(node.value) - def visit_Subscript(self, node, **kwargs): """ df.index[4:6] """ value = self.visit(node.value) @@ -344,21 +355,16 @@ def visit_Compare(self, node, **kwargs): return node -_numexpr_not_supported = frozenset(['Assign', 'BoolOp', 'Not', 'Str', 'Slice', - 'Index', 'Subscript', 'Tuple', 'List', - 'Dict', 'Call']) +_python_not_supported = frozenset(['Assign', 'BoolOp', 'Not', 'Str', 'Slice', + 'Index', 'Subscript', 'Tuple', 'List', + 'Dict', 'Call']) _numexpr_supported_calls = frozenset(_reductions + _mathops) -@disallow(_unsupported_nodes | _numexpr_not_supported) -class NumExprVisitor(BaseExprVisitor): - def __init__(self, env, preparser=None): - if preparser is not None: - raise ValueError("only strict numexpr syntax is supported") - preparser = lambda x: x - super(NumExprVisitor, self).__init__(env, preparser) - +@disallow(_unsupported_nodes | _python_not_supported) +class PandasExprVisitor(BaseExprVisitor): + def __init__(self, 
env, preparser=_preparse): + super(PandasExprVisitor, self).__init__(env, preparser) -_python_not_supported = _numexpr_not_supported @disallow(_unsupported_nodes | _python_not_supported) class PythonExprVisitor(BaseExprVisitor): @@ -369,10 +375,11 @@ class Expr(StringMixin): """Expr object""" - def __init__(self, expr, engine='numexpr', env=None, truediv=True): + def __init__(self, expr, engine='numexpr', parser='pandas', env=None, + truediv=True): self.expr = expr self.env = env or Scope(frame_level=2) - self._visitor = _visitors[engine](self.env) + self._visitor = _parsers[parser](self.env) self.terms = self.parse() self.engine = engine self.truediv = truediv @@ -382,7 +389,7 @@ def __call__(self, env): return self.terms(env) def __unicode__(self): - return unicode(self.terms) + return com.pprint_thing(self.terms) def __len__(self): return len(self.expr) @@ -396,6 +403,18 @@ def align(self): return self.terms.align(self.env) +def maybe_expression(s, kind='python'): + """ loose checking if s is an expression """ + if not isinstance(s, basestring): + return False + try: + visitor = _parsers[kind] + # make sure we have an op at least + return any(op in s for op in visitor.binary_ops) + except: + return False + + def isexpr(s, check_names=True): try: Expr(s) @@ -407,4 +426,4 @@ def isexpr(s, check_names=True): return True -_visitors = {'python': PythonExprVisitor, 'numexpr': NumExprVisitor} +_parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor} diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index b2dd638da1ef3..85459b2fab7a0 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -2,7 +2,6 @@ import numpy as np -import pandas as pd from pandas.util.py3compat import PY3 import pandas.core.common as com from pandas.core.base import StringMixin @@ -34,20 +33,12 @@ def __init__(self, name, env, side=None): self.side = side self.value = self._resolve_name() - try: - # ndframe potentially very slow for large, mixed 
dtype frames - self.type = self.value.values.dtype - except AttributeError: - try: - # ndarray - self.type = self.value.dtype - except AttributeError: - # scalar - self.type = type(self.value) - def __unicode__(self): return com.pprint_thing(self.name) + def __call__(self, *args, **kwargs): + return self.value + def _resolve_name(self): env = self.env key = self.name @@ -59,9 +50,9 @@ def _resolve_name(self): return key raise NameError('name {0!r} is not defined'.format(key)) - if isinstance(res, pd.Panel): - raise NotImplementedError("Panel objects are not supported with " - "eval") + if hasattr(res, 'ndim') and res.ndim > 2: + raise NotImplementedError("N-dimensional objects, where N > 2, are" + " not supported with eval") return res def update(self, value): @@ -79,7 +70,8 @@ def update(self, value): del env.globals[key] env.globals[key] = value except KeyError: - raise NameError('{0!r} is undefined'.format(key)) + raise NameError('name {0!r} is not ' + 'defined'.format(key)) self.value = value @@ -87,6 +79,21 @@ def update(self, value): def isscalar(self): return np.isscalar(self.value) + @property + def type(self): + try: + # ndframe potentially very slow for large, mixed dtype frames + return self.value.values.dtype + except AttributeError: + try: + # ndarray + return self.value.dtype + except AttributeError: + # scalar + return type(self.value) + + return_type = type + class Constant(Term): def __init__(self, value, env): @@ -139,6 +146,11 @@ def return_type(self): op.pow, op.floordiv, op.mod) _arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) +_special_case_arith_ops_syms = '**', '//', '%' +_special_case_arith_ops_funcs = op.pow, op.floordiv, op.mod +_special_case_arith_ops_dict = dict(zip(_special_case_arith_ops_syms, + _special_case_arith_ops_funcs)) + _binary_ops_dict = {} for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict): diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 
fa96342ec9bdc..06b6b211b7d1c 100755 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -1,7 +1,8 @@ #!/usr/bin/env python import unittest -import itertools +import functools +import numbers from itertools import product import ast @@ -18,15 +19,17 @@ from pandas.core import common as com from pandas import DataFrame, Series, Panel from pandas.util.testing import makeCustomDataframe as mkdf -from pandas.computation.engines import _engines, _reconstruct_object -from pandas.computation.align import _align_core -from pandas.computation.expr import NumExprVisitor, PythonExprVisitor -from pandas.computation.ops import _binary_ops_dict, _unary_ops_dict, Term +from pandas.computation.engines import _engines +from pandas.computation.expr import PythonExprVisitor, PandasExprVisitor +from pandas.computation.ops import (_binary_ops_dict, _unary_ops_dict, + _special_case_arith_ops_syms, + _arith_ops_syms) import pandas.computation.expr as expr from pandas.computation import pytables from pandas.computation.expressions import _USE_NUMEXPR from pandas.computation.eval import Scope -from pandas.util.testing import assert_frame_equal, randbool +from pandas.util.testing import (assert_frame_equal, randbool, + assertRaisesRegexp) from pandas.util.py3compat import PY3 @@ -39,30 +42,9 @@ def engine_has_neg_frac(engine): return _engines[engine].has_neg_frac -def fractional(x): - frac, _ = np.modf(np.asanyarray(x)) - return frac - - -def hasfractional(x): - return np.any(fractional(x)) - - -def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2): - f1 = _binary_ops_dict[cmp1] - f2 = _binary_ops_dict[cmp2] - bf = _binary_ops_dict[binop] - env = Scope() - typ, axes = _align_core((Term('lhs', env), Term('rhs', env))) - lhs, rhs = env.locals['lhs'], env.locals['rhs'] - ret_type = np.result_type(lhs, rhs) - return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes, - ret_type) - - -def _eval_single_bin(lhs, cmp1, rhs, has_neg_frac): +def 
_eval_single_bin(lhs, cmp1, rhs, engine): c = _binary_ops_dict[cmp1] - if has_neg_frac: + if engine_has_neg_frac(engine): try: result = c(lhs, rhs) except ValueError: @@ -72,55 +54,57 @@ def _eval_single_bin(lhs, cmp1, rhs, has_neg_frac): return result -def isframe(x): - return isinstance(x, pd.DataFrame) - - -def isseries(x): - return isinstance(x, pd.Series) - - -def are_compatible_types(op, lhs, rhs): - if op in ('&', '|'): - if isframe(lhs) and isseries(rhs) or isframe(rhs) and isseries(lhs): - return False - return True - - -def _eval_bin_and_unary(unary, lhs, arith1, rhs): - binop = _binary_ops_dict[arith1] - unop = expr._unary_ops_dict[unary] - return unop(binop(lhs, rhs)) - - def _series_and_2d_ndarray(lhs, rhs): return (com.is_series(lhs) and isinstance(rhs, np.ndarray) and rhs.ndim > 1 or com.is_series(rhs) and isinstance(lhs, np.ndarray) and lhs.ndim > 1) -class TestBasicEval(unittest.TestCase): +def skip_incompatible_operand(f): + @functools.wraps(f) + def wrapper(self, lhs, arith1, rhs, *args, **kwargs): + if _series_and_2d_ndarray(lhs, rhs): + self.assertRaises(Exception, pd.eval, 'lhs {0} rhs'.format(arith1), + local_dict={'lhs': lhs, 'rhs': rhs}, + engine=self.engine) + else: + f(self, lhs, arith1, rhs, *args, **kwargs) + return wrapper + + +_good_arith_ops = tuple(set(_arith_ops_syms) - + set(_special_case_arith_ops_syms)) + + +class TestEvalPandas(unittest.TestCase): @classmethod - def setUpClass(self): - self.cmp_ops = expr._cmp_ops_syms - self.cmp2_ops = self.cmp_ops[::-1] - self.bin_ops = expr._bool_ops_syms - self.arith_ops = tuple(o for o in expr._arith_ops_syms if o != '//') - self.unary_ops = '+', '-' + def setUpClass(cls): + cls.cmp_ops = expr._cmp_ops_syms + cls.cmp2_ops = cls.cmp_ops[::-1] + cls.bin_ops = expr._bool_ops_syms + cls.special_case_ops = _special_case_arith_ops_syms + cls.arith_ops = _good_arith_ops + cls.unary_ops = '+', '-' def set_current_engine(self): self.engine = 'numexpr' def setup_data(self): - nan_df = 
DataFrame(rand(10, 5)) - nan_df[nan_df > 0.5] = np.nan - self.lhses = (DataFrame(randn(10, 5)), Series(randn(5)), randn(), - np.float64(randn()), randn(10, 5), randn(5), np.nan, - Series([1, 2, np.nan, np.nan, 5]), nan_df) - self.rhses = (DataFrame(randn(10, 5)), Series(randn(5)), randn(), - np.float64(randn()), randn(10, 5), randn(5), np.nan, - Series([1, 2, np.nan, np.nan, 5]), nan_df) + nan_df1 = DataFrame(rand(10, 5)) + nan_df1[nan_df1 > 0.5] = np.nan + nan_df2 = DataFrame(rand(10, 5)) + nan_df2[nan_df2 > 0.5] = np.nan + + self.pandas_lhses = (DataFrame(randn(10, 5)), Series(randn(5)), + Series([1, 2, np.nan, np.nan, 5]), nan_df1) + self.pandas_rhses = (DataFrame(randn(10, 5)), Series(randn(5)), + Series([1, 2, np.nan, np.nan, 5]), nan_df2) + self.scalar_lhses = randn(), np.float64(randn()), np.nan + self.scalar_rhses = randn(), np.float64(randn()), np.nan + + self.lhses = self.pandas_lhses + self.scalar_lhses + self.rhses = self.pandas_rhses + self.scalar_rhses def setUp(self): try: @@ -135,131 +119,148 @@ def setUp(self): @slow def test_complex_cmp_ops(self): - self.setUp() - lhses, rhses = self.lhses, self.rhses - args = itertools.product(lhses, self.cmp_ops, rhses, self.bin_ops, - self.cmp2_ops) - for lhs, cmp1, rhs, binop, cmp2 in args: - self._create_cmp_op_t(lhs, cmp1, rhs, binop, cmp2) + for lhs, cmp1, rhs, binop, cmp2 in product(self.lhses, self.cmp_ops, + self.rhses, self.bin_ops, + self.cmp2_ops): + self.check_complex_cmp_op(lhs, cmp1, rhs, binop, cmp2) def test_simple_cmp_ops(self): bool_lhses = (DataFrame(randbool(size=(10, 5))), Series(randbool((5,))), randbool()) bool_rhses = (DataFrame(randbool(size=(10, 5))), Series(randbool((5,))), randbool()) - args = itertools.product(bool_lhses, bool_rhses, self.cmp_ops) - for lhs, rhs, cmp_op in args: - self._create_simple_cmp_op_t(lhs, rhs, cmp_op) + for lhs, rhs, cmp_op in product(bool_lhses, bool_rhses, self.cmp_ops): + self.check_simple_cmp_op(lhs, cmp_op, rhs) + @slow def test_binary_arith_ops(self): 
- self.setUp() - lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() - rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() - args = itertools.product(lhses, self.arith_ops, rhses) - for lhs, op, rhs in args: - self._create_arith_op_t(lhs, op, rhs) + for lhs, op, rhs in product(self.lhses, self.arith_ops, self.rhses): + self.check_binary_arith_op(lhs, op, rhs) + + def test_modulus(self): + for lhs, rhs in product(self.lhses, self.rhses): + self.check_modulus(lhs, '%', rhs) + + def test_floor_division(self): + for lhs, rhs in product(self.lhses, self.rhses): + self.check_floor_division(lhs, '//', rhs) + + def test_pow(self): + for lhs, rhs in product(self.lhses, self.rhses): + self.check_pow(lhs, '**', rhs) + @slow def test_unary_arith_ops(self): - self.setUp() - lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() - rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() - aops = tuple(aop for aop in self.arith_ops if aop not in '+-') - args = itertools.product(self.unary_ops, lhses, aops, rhses) - for unary_op, lhs, arith_op, rhs in args: - self._create_unary_arith_op_t(unary_op, lhs, arith_op, rhs) - - def test_invert(self): - self.setUp() - lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() - rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() - args = itertools.product(lhses, self.cmp_ops, rhses) - for lhs, op, rhs in args: - self._create_invert_op_t(lhs, op, rhs) - - def _create_cmp_op_t(self, lhs, cmp1, rhs, binop, cmp2): + for unary_op, lhs, arith_op, rhs in product(self.unary_ops, self.lhses, + self.arith_ops, + self.rhses): + self.check_unary_arith_op(lhs, arith_op, rhs, unary_op) + + @slow + def test_single_invert_op(self): + for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): + self.check_single_invert_op(lhs, op, rhs) + + @slow + def test_compound_invert_op(self): + for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): + self.check_compound_invert_op(lhs, op, rhs) + + 
@skip_incompatible_operand + def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, binop=binop, cmp2=cmp2) - if _series_and_2d_ndarray(lhs, rhs): - self.assertRaises(Exception, _eval_from_expr, lhs, cmp1, rhs, - binop, cmp2) - self.assertRaises(Exception, pd.eval, ex, engine=self.engine) - else: - expected = _eval_from_expr(lhs, cmp1, rhs, binop, cmp2) - result = pd.eval(ex, engine=self.engine) - assert_array_equal(result, expected) + lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) + rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) + expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) - def _create_simple_cmp_op_t(self, lhs, rhs, cmp1): + @skip_incompatible_operand + def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) + expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) - if are_compatible_types(cmp1, lhs, rhs): - expected = _eval_single_bin(lhs, cmp1, rhs, - engine_has_neg_frac(self.engine)) - result = pd.eval(ex, engine=self.engine) - assert_array_equal(result, expected) + @skip_incompatible_operand + def check_binary_arith_op(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine) + expected = _eval_single_bin(lhs, arith1, rhs, self.engine) + assert_array_equal(result, expected) + ex = 'lhs {0} rhs {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine) + nlhs = _eval_single_bin(lhs, arith1, rhs, + self.engine) + self.check_alignment(result, nlhs, rhs, arith1) + + def check_alignment(self, result, nlhs, ghs, op): + try: + nlhs, ghs = nlhs.align(ghs) + except (ValueError, TypeError, AttributeError): + # ValueError: series frame or frame series align + # TypeError, AttributeError: series or frame with scalar 
align + pass else: - assert_raises(TypeError, _eval_single_bin, lhs, cmp1, rhs, - engine_has_neg_frac(self.engine)) + expected = self.ne.evaluate('nlhs {0} ghs'.format(op)) + assert_array_equal(result, expected) - def _create_arith_op_t(self, lhs, arith1, rhs): + # the following 3 tests require special casing + + @skip_incompatible_operand + def check_modulus(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) - nan_frac_neg = (arith1 == '**' and np.any(lhs < 0) and - hasfractional(rhs) and np.isscalar(lhs) and - np.isscalar(rhs) and - not (isinstance(lhs, tuple(np.typeDict.values())) - or isinstance(rhs, tuple(np.typeDict.values())))) - if nan_frac_neg and not engine_has_neg_frac(self.engine): - assert_raises(ValueError, pd.eval, ex, engine=self.engine, - local_dict=locals(), global_dict=globals()) - else: - result = pd.eval(ex, engine=self.engine) - - if arith1 != '//': - expected = _eval_single_bin(lhs, arith1, rhs, - engine_has_neg_frac(self.engine)) - # roundoff error with modulus - if arith1 == '%': - assert_allclose(result, expected) - else: - assert_array_equal(result, expected) + result = pd.eval(ex, engine=self.engine) + expected = lhs % rhs + assert_allclose(result, expected) + expected = self.ne.evaluate('expected {0} rhs'.format(arith1)) + assert_allclose(result, expected) - # sanity check on recursive parsing - try: - ghs = rhs.copy() - except AttributeError: - ghs = rhs + @skip_incompatible_operand + def check_floor_division(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) - if nan_frac_neg and not engine_has_neg_frac(self.engine): - assert_raises(ValueError, pd.eval, ex, engine=self.engine, - local_dict=locals(), global_dict=globals()) + if self.engine == 'python': + res = pd.eval(ex, engine=self.engine) + expected = lhs // rhs + assert_array_equal(res, expected) else: - if arith1 == '**': - ex = '(lhs {0} rhs) {0} ghs'.format(arith1) - else: - ex = 'lhs {0} rhs {0} ghs'.format(arith1) - result = pd.eval(ex, engine=self.engine) + 
self.assertRaises(TypeError, pd.eval, ex, local_dict={'lhs': lhs, + 'rhs': rhs}, + engine=self.engine) - try: - nlhs = _eval_single_bin(lhs, arith1, rhs, - engine_has_neg_frac(self.engine)) - except ValueError: - assert_raises(ValueError, _eval_single_bin, lhs, arith1, rhs, - engine_has_neg_frac(self.engine)) + def get_expected_pow_result(self, lhs, rhs): + try: + expected = _eval_single_bin(lhs, '**', rhs, self.engine) + except ValueError as e: + msg = 'negative number cannot be raised to a fractional power' + if e.message == msg: + if self.engine == 'python': + raise nose.SkipTest(e.message) + else: + expected = np.nan + # raise on other, possibly valid ValueErrors else: - try: - nlhs, ghs = nlhs.align(ghs) - except: - pass - if arith1 != '//': - expected = self.ne.evaluate('nlhs {0} ghs'.format(arith1)) - - # roundoff error with modulus - if arith1 == '%': - assert_allclose(result, expected) - else: - assert_array_equal(result, expected) - - def _create_invert_op_t(self, lhs, cmp1, rhs): + raise + return expected + + @skip_incompatible_operand + def check_pow(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + expected = self.get_expected_pow_result(lhs, rhs) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + + ex = '(lhs {0} rhs) {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine) + expected = self.get_expected_pow_result( + self.get_expected_pow_result(lhs, rhs), rhs) + assert_array_equal(result, expected) + + @skip_incompatible_operand + def check_single_invert_op(self, lhs, cmp1, rhs): # simple for el in (lhs, rhs): try: @@ -273,33 +274,33 @@ def _create_invert_op_t(self, lhs, cmp1, rhs): for engine in self.current_engines: assert_array_equal(result, pd.eval('~elb', engine=engine)) + @skip_incompatible_operand + def check_compound_invert_op(self, lhs, cmp1, rhs): # compound ex = '~(lhs {0} rhs)'.format(cmp1) if np.isscalar(lhs) and np.isscalar(rhs): lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) 
- expected = ~_eval_single_bin(lhs, cmp1, rhs, - engine_has_neg_frac(self.engine)) + expected = ~_eval_single_bin(lhs, cmp1, rhs, self.engine) result = pd.eval(ex, engine=self.engine) assert_array_equal(expected, result) - # make sure the other engines work + # make sure the other engines work the same as this one for engine in self.current_engines: ev = pd.eval(ex, engine=self.engine) assert_array_equal(ev, result) - def _create_unary_arith_op_t(self, unary_op, lhs, arith1, rhs): + @skip_incompatible_operand + def check_unary_arith_op(self, lhs, arith1, rhs, unary_op): # simple ex = '{0}lhs'.format(unary_op, arith1) f = _unary_ops_dict[unary_op] - bad_types = tuple(np.typeDict.values()) - - nan_frac_neg = (arith1 == '**' and - np.any(lhs < 0) and - hasfractional(rhs) and - np.isscalar(lhs) and np.isscalar(rhs) and - not (isinstance(lhs, bad_types) or - isinstance(rhs, bad_types)) - and not engine_has_neg_frac(self.engine)) + bad_types = np.floating, float, numbers.Real + + if isinstance(lhs, bad_types): + raise nose.SkipTest("Incompatiable type for ~ operator") + if isinstance(rhs, bad_types): + raise nose.SkipTest("Incompatiable type for ~ operator") + try: expected = f(lhs.values) except AttributeError: @@ -311,40 +312,23 @@ def _create_unary_arith_op_t(self, unary_op, lhs, arith1, rhs): assert_array_equal(result, pd.eval(ex, engine=engine)) ex = '{0}(lhs {1} rhs)'.format(unary_op, arith1) + result = pd.eval(ex, engine=self.engine) - if nan_frac_neg: - assert_raises(ValueError, pd.eval, ex, engine=self.engine, - local_dict=locals(), global_dict=globals()) - else: - # compound - result = pd.eval(ex, engine=self.engine) - #(lhs, rhs), _ = _align((lhs, rhs)) - #if arith1 != '//': - #expected = self.ne.evaluate(ex) - #assert_array_equal(result, expected) - #else: - #assert_raises(TypeError, self.ne.evaluate, ex) +class TestEvalPython(TestEvalPandas): - #for engine in self.current_engines: - #if arith1 != '//': - #if engine_has_neg_frac(engine): - 
#assert_array_equal(result, pd.eval(ex, engine=engine)) - #else: - #assert_raises(TypeError, pd.eval, ex, engine=engine, - #local_dict=locals(), global_dict=globals()) + def set_current_engine(self): + self.engine = 'python' -class TestBasicEvalPython(TestBasicEval): +class TestEvalPandasWithMixedTypeOperands(TestEvalPandas): + def setup_data(self): + super(TestEvalPandasWithMixedTypeOperands, self).setup_data() + self.lhses += randn(10, 5), randn(5) + self.rhses += randn(10, 5), randn(5) - @classmethod - def setUpClass(cls): - cls.cmp_ops = expr._cmp_ops_syms - cls.cmp2_ops = cls.cmp_ops[::-1] - cls.bin_ops = expr._bool_ops_syms - cls.arith_ops = expr._arith_ops_syms - cls.unary_ops = '+', '-' +class TestEvalPythonWithMixedTypeOperands(TestEvalPandasWithMixedTypeOperands): def set_current_engine(self): self.engine = 'python' @@ -373,7 +357,7 @@ def check_align_nested_unary_op(engine): skip_numexpr_engine(engine) s = 'df * ~2' df = mkdf(10, 10, data_gen_f=f) - res = pd.eval(s, engine) + res = pd.eval(s, engine=engine) assert_frame_equal(res, df * ~2) @@ -450,7 +434,6 @@ def check_basic_series_frame_alignment(engine, r_idx_type, c_idx_type, assert_frame_equal(res, expected) -@slow def check_basic_series_frame_alignment_datetime(engine, r_idx_type, c_idx_type, index_name): skip_numexpr_engine(engine) @@ -502,6 +485,7 @@ def test_series_frame_commutativity(): index_name) +@slow def test_basic_frame_series_alignment(): args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('index', 'columns')) for engine, r_idx_type, c_idx_type, index_name in args: @@ -518,6 +502,7 @@ def test_basic_series_frame_alignment_datetime(): c_idx_type, index_name) +@slow def test_basic_series_frame_alignment(): args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('index', 'columns')) for engine, r_idx_type, c_idx_type, index_name in args: @@ -647,13 +632,13 @@ def test_or_fails(): check_or_fails(engine) -_visitors = {'numexpr': NumExprVisitor, 'python': PythonExprVisitor, - 'pytables': 
pytables.ExprVisitor} +_parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor, + 'pandas': PandasExprVisitor} -def check_disallowed_nodes(engine): +def check_disallowed_nodes(visitor): """make sure the disallowed decorator works""" - VisitorClass = _visitors[engine] + VisitorClass = _parsers[visitor] uns_ops = VisitorClass.unsupported_nodes inst = VisitorClass('x + 1') for ops in uns_ops: @@ -661,30 +646,57 @@ def check_disallowed_nodes(engine): def test_disallowed_nodes(): - for engine in ('pytables', 'numexpr', 'python'): - check_disallowed_nodes(engine) + for visitor in _parsers: + check_disallowed_nodes(visitor) -def check_simple_ops(engine): - ops = '+', '*', '/', '-', '%', '**' +def check_simple_arith_ops(engine): + ops = expr._arith_ops_syms + expr._cmp_ops_syms - for op in ops: - expec = _eval_single_bin(1, op, 1, engine_has_neg_frac(engine)) + for op in filter(lambda x: x != '//', ops): + expec = _eval_single_bin(1, op, 1, engine) x = pd.eval('1 {0} 1'.format(op), engine=engine) assert_equal(x, expec) - expec = _eval_single_bin(x, op, 1, engine_has_neg_frac(engine)) + expec = _eval_single_bin(x, op, 1, engine) y = pd.eval('x {0} 1'.format(op), engine=engine) assert_equal(y, expec) - expec = _eval_single_bin(1, op, x + 1, engine_has_neg_frac(engine)) + expec = _eval_single_bin(1, op, x + 1, engine) y = pd.eval('1 {0} (x + 1)'.format(op), engine=engine) assert_equal(y, expec) -def test_simple_ops(): +def check_simple_bool_ops(engine): + for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, + False)): + expec = _eval_single_bin(lhs, op, rhs, engine) + x = pd.eval('lhs {0} rhs'.format(op), engine=engine) + assert_equal(x, expec) + + +def check_bool_ops_with_constants(engine): + asteval = ast.literal_eval + for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), + ('True', 'False')): + expec = _eval_single_bin(asteval(lhs), op, asteval(rhs), engine) + x = pd.eval('{0} {1} {2}'.format(lhs, op, rhs), engine=engine) 
+ assert_equal(x, expec) + + +def test_simple_arith_ops(): + for engine in _engines: + check_simple_arith_ops(engine) + + +def test_simple_bool_ops(): for engine in _engines: - check_simple_ops(engine) + check_simple_bool_ops(engine) + + +def test_bool_ops_with_constants(): + for engine in _engines: + check_bool_ops_with_constants(engine) def check_no_new_locals(engine): @@ -727,6 +739,52 @@ def test_panel_fails(): check_panel_fails(engine) +def check_4d_ndarray_fails(engine): + x = randn(3, 4, 5, 6) + y = Series(randn(10)) + assert_raises(NotImplementedError, pd.eval, 'x + y', local_dict={'x': x, + 'y': y}, + engine=engine) + + +def test_4d_ndarray_fails(): + for engine in _engines: + check_4d_ndarray_fails(engine) + + +def check_constant(engine): + x = pd.eval('1', engine=engine) + assert_equal(x, 1) + + +def test_constant(): + for engine in _engines: + check_constant(engine) + + +def check_single_variable(engine): + df = DataFrame(randn(10, 2)) + df2 = pd.eval('df', engine=engine) + assert_frame_equal(df, df2) + + +def test_single_variable(): + for engine in _engines: + check_single_variable(engine) + + +def test_invalid_engine(): + assertRaisesRegexp(KeyError, 'Invalid engine \'asdf\' passed', + pd.eval, 'x + y', local_dict={'x': 1, 'y': 2}, + engine='asdf') + + +def test_invalid_parser(): + assertRaisesRegexp(KeyError, 'Invalid parser \'asdf\' passed', + pd.eval, 'x + y', local_dict={'x': 1, 'y': 2}, + parser='asdf') + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 59145cd54b360..3f4b283f577b5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -38,6 +38,7 @@ from pandas.sparse.array import SparseArray import pandas.computation.expressions as expressions from pandas.computation.eval import eval as _eval +from pandas.computation.expr import maybe_expression from pandas.compat.scipy import scoreatpercentile as _quantile 
from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) @@ -1832,7 +1833,12 @@ def __getitem__(self, key): elif is_mi_columns: return self._getitem_multilevel(key) else: - return self._getitem_column(key) + try: + return self._getitem_column(key) + except KeyError: + if maybe_expression(key): + return self.query(key) + raise def _getitem_column(self, key): """ return the actual column """ @@ -1899,6 +1905,89 @@ def _getitem_frame(self, key): return self.where(key) def query(self, expr, **kwargs): + """Query the columns of a frame with an expression. + + Parameters + ---------- + expr : string + The query string to evaluate. The result of the evaluation of this + expression is passed to + :meth:`~pandas.core.frame.DataFrame.__getitem__`. + kwargs : dict + See the documentation for :func:`~pandas.computation.eval.eval` for + complete details on the keyword arguments accepted by + :meth:`~pandas.core.frame.DataFrame.query`. + + Returns + ------- + q : DataFrame or Series + + Notes + ----- + This method uses the top-level :func:`~pandas.computation.eval.eval` + function to evaluate the passed query. + + The :meth:`~pandas.core.frame.DataFrame.query` method uses a slightly + modified Python syntax by default. For example, the ``&`` and ``|`` + (bitwise) operators have the precedence of their boolean cousins, + ``and`` and ``or``. This *is* syntactically valid Python, however the + semantics are different. + + You can use a syntax that is semantically identical to Python by + passing the keyword argument ``parser='python'``. + + The ``index`` of the :class:`~pandas.core.frame.DataFrame` instance is + placed in the namespace by default, which allows you to treat the index + as a column in the frame. The identifier ``index`` is used for this + variable, and you can also use the name of the index to identify it in + a query. 
+ + Raises + ------ + NameError + * if not all identifiers in the query can be found + SyntaxError + * if a syntactically *invalid* Python expression is passed + + Examples + -------- + Get the value of the frame where column ``b`` has values between the + values of columns ``a`` and ``c``. + + >>> from pandas import DataFrame + >>> from numpy.random import randn + >>> df = DataFrame(randn(100, 3), columns=list('abc')) + >>> result = df.query('a < b & b < c') + + Do the same thing but fallback on a named index if there is no column + with the name ``a``. + + >>> from pandas import DataFrame, Index + >>> from numpy.random import randn + >>> n = 10 + >>> index = Index(randn(n), name='a') + >>> df = DataFrame(randn(n, 2), index=index, columns=list('bc')) + >>> result = df.query('a < b & b < c') + + A use case for :meth:`~pandas.core.frame.DataFrame.query` is when you + have a collection of :class:`~pandas.core.frame.DataFrame` s that have + a subset of column names in common. You can pass the same query to both + frames *without* having to specify which frame you're interested in + querying + + >>> from pandas import DataFrame, Index + >>> from numpy.random import randn + >>> n = 100 + >>> index = Index(randn(n), name='a') + >>> df = DataFrame(randn(n, 2), index=index, columns=list('bc')) + >>> df2 = DataFrame(randn(n + 10, 3)) + >>> expr = 'a < b & b < c' + >>> results = map(lambda frame: frame.query(expr), [df, df2]) + + See Also + -------- + pandas.computation.eval.eval + """ resolvers = kwargs.get('resolvers', None) if resolvers is None: index_resolvers = {} diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 66a71d72019e6..d53d966c6598e 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -45,6 +45,8 @@ from pandas.core.indexing import IndexingError from pandas.core.common import PandasError from pandas.util.compat import OrderedDict +from pandas.computation.expr import Expr +import pandas.computation as comp import 
pandas.util.testing as tm import pandas.lib as lib @@ -8095,43 +8097,106 @@ def test_mask_edge_case_1xN_frame(self): expec = DataFrame([[nan, 2]]) assert_frame_equal(res, expec) - def test_query(self): + def test_query_expressions(self): try: import numexpr as ne except ImportError: - raise nose.SkipTest - # comparison + raise nose.SkipTest("cannot query engine numexpr when numexpr not installed") df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) - assert_frame_equal(df.query('a < b'), df[df.a < df.b]) + assert_frame_equal(df['a < b'], df[df.a < df.b]) + assert_frame_equal(df['a + b > b * c'], + df[df.a + df.b > df.b * df.c]) + + def test_query_expressions_with_index(self): + try: + import numexpr as ne + except ImportError: + raise nose.SkipTest("cannot query engine numexpr when numexpr not installed") + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=Index(range(10), name='blob'), + columns=['a', 'b', 'c']) + assert_frame_equal(df['index < b'], df[df.index < df.b]) + assert_frame_equal(df['index < 5'], df[df.index < 5]) + assert_frame_equal(df['(blob < 5) & (a < b)'], + df[(df.index < 5) & (df.a < df.b)]) + assert_frame_equal(df['blob < b'], df[df.index < df.b]) + + def test_query(self): + import itertools + for engine, parser in itertools.product(comp.engines._engines, + comp.expr._parsers): + self.check_query(engine, parser) + + def check_query(self, engine, parser): + if engine == 'numexpr': + try: + import numexpr as ne + except ImportError: + raise nose.SkipTest("cannot query engine numexpr when numexpr not installed") - # arith ops - assert_frame_equal(df.query('a + b > b * c'), + df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) + assert_frame_equal(df.query('a < b', engine=engine, parser=parser), df[df.a < df.b]) + assert_frame_equal(df.query('a + b > b * c', engine=engine, parser=parser), df[df.a + df.b > df.b * df.c]) local_dict = dict(df.iteritems()) local_dict.update({'df': df}) self.assertRaises(NameError, 
df.query, 'a < d & b < f', - local_dict=local_dict) + local_dict=local_dict, engine=engine, parser=parser) # make sure that it's not just because we didn't pass the locals in self.assertRaises(AssertionError, self.assertRaises, NameError, - df.query, 'a < b', local_dict=local_dict) + df.query, 'a < b', local_dict=local_dict, + engine=engine, parser=parser) def test_query_index(self): - try: - import numexpr as ne - except ImportError: - raise nose.SkipTest - - df = DataFrame(np.random.randn(10, 3), index=Index(range(10), - name='blob'), + import itertools + for engine, parser in itertools.product(comp.engines._engines, + comp.expr._parsers): + self.check_query_index(engine, parser) + + def check_query_index(self, engine, parser): + if engine == 'numexpr': + try: + import numexpr as ne + except ImportError: + raise nose.SkipTest("cannot query engine numexpr when numexpr not installed") + + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=Index(range(10), name='blob'), + columns=['a', 'b', 'c']) + assert_frame_equal(df.query('index < b', engine=engine, parser=parser), + df[df.index < df.b]) + assert_frame_equal(df.query('index < 5', engine=engine, parser=parser), + df[df.index < 5]) + assert_frame_equal(df.query('(blob < 5) & (a < b)', engine=engine, + parser=parser), + df[(df.index < 5) & (df.a < df.b)]) + assert_frame_equal(df.query('blob < b', engine=engine, parser=parser), + df[df.index < df.b]) + + def test_query_different_parsers(self): + for engine in comp.engines._engines: + self.check_query_different_parsers(engine) + + def check_query_different_parsers(self, engine): + if engine == 'numexpr': + try: + import numexpr as ne + except ImportError: + raise nose.SkipTest("cannot query engine numexpr when numexpr not installed") + df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) + assert_frame_equal(df.query('(a < 5) & (a < b)', parser='python', + engine=engine), + df.query('a < 5 & a < b', parser='pandas', + engine=engine)) + df = 
DataFrame(np.random.randint(10, size=(10, 3)), + index=Index(range(10), name='blob'), columns=['a', 'b', 'c']) - assert_frame_equal(df.query('index < b'), df[df.index < df.b]) - assert_frame_equal(df.query('index < 5'), df[df.index < 5]) - assert_frame_equal(df.query('(blob < 5) & (a < b)'), df[(df.index < 5) - & (df.a < - df.b)]) - assert_frame_equal(df.query('blob < b'), df[df.index < df.b]) + assert_frame_equal(df.query('(blob < 5) & (a < b)', parser='python', + engine=engine), + df.query('blob < 5 & a < b', parser='pandas', + engine=engine)) #---------------------------------------------------------------------- diff --git a/vb_suite/eval.py b/vb_suite/eval.py new file mode 100644 index 0000000000000..c0c983862ea3c --- /dev/null +++ b/vb_suite/eval.py @@ -0,0 +1,49 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +setup = """from pandas_vb_common import * +import pandas as pd +df = DataFrame(np.random.randn(20000, 100)) +df2 = DataFrame(np.random.randn(20000, 100)) +df3 = DataFrame(np.random.randn(20000, 100)) +df4 = DataFrame(np.random.randn(20000, 100)) +""" + +SECTION = 'Eval' + +#---------------------------------------------------------------------- +# binary ops + +#---------------------------------------------------------------------- +# add + +frame_add_eval = \ + Benchmark("pd.eval('df + df2 + df3 + df4')", setup, name='frame_add_eval', + start_date=datetime(2013, 7, 21)) + +frame_add_python = \ + Benchmark("pd.eval('df + df2 + df3 + df4', engine='python')", setup, + name='frame_add_python', start_date=datetime(2013, 7, 21)) + +#---------------------------------------------------------------------- +# mult + +frame_mult_eval = \ + Benchmark("pd.eval('df * df2 * df3 * df4')", setup, name='frame_mult_eval', + start_date=datetime(2012, 7, 21)) + +frame_mult_python = \ + Benchmark("pd.eval('df * df2 * df3 * df4', engine='python')", setup, + name='frame_mult_python', start_date=datetime(2013, 7, 21)) + 
+#---------------------------------------------------------------------- +# multi and + +frame_and_eval = \ + Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')", setup, + name='frame_and_eval', start_date=datetime(2012, 7, 21)) + +frame_and_python = \ + Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', " + "engine='python')", setup, name='frame_and_python', + start_date=datetime(2013, 7, 21)) diff --git a/vb_suite/suite.py b/vb_suite/suite.py index ca83855c2a109..f3c8dfe3032e0 100644 --- a/vb_suite/suite.py +++ b/vb_suite/suite.py @@ -23,7 +23,8 @@ 'sparse', 'reshape', 'stat_ops', - 'timeseries'] + 'timeseries', + 'eval'] by_module = {} benchmarks = [] From 9012e5e5e37f91c11f299b3404dcc2f315135d11 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 22 Jul 2013 12:59:07 -0400 Subject: [PATCH 05/16] ENH: add boolean operators --- pandas/computation/expr.py | 36 ++++++++++++++++++------- pandas/computation/ops.py | 14 ++++++---- pandas/computation/pytables.py | 2 +- pandas/computation/tests/test_eval.py | 38 +++------------------------ 4 files changed, 40 insertions(+), 50 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 23f4341102729..47f879f21b8d6 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -151,8 +151,8 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): 'IfExp', 'DictComp', 'SetComp', 'Repr', 'Lambda', 'Set', 'In', - 'NotIn', 'AST', - 'Is', 'IsNot'])) - + 'NotIn', 'AST', 'Is', + 'IsNot'])) - _hacked_nodes) # we're adding a different assignment in some cases to be equality comparison @@ -211,12 +211,12 @@ class BaseExprVisitor(ast.NodeVisitor): """ binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', - 'BitOr', 'Add', 'Sub', 'Mult', 'Div', 'Pow', 'FloorDiv', - 'Mod') + 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult', 'Div', + 'Pow', 'FloorDiv', 'Mod') binary_op_nodes_map = 
dict(itertools.izip(binary_ops, binary_op_nodes)) unary_ops = _unary_ops_syms - unary_op_nodes = 'UAdd', 'USub', 'Invert' + unary_op_nodes = 'UAdd', 'USub', 'Invert', 'Not' unary_op_nodes_map = dict(itertools.izip(unary_ops, unary_op_nodes)) def __init__(self, env, preparser=_preparse): @@ -354,13 +354,31 @@ def visit_Compare(self, node, **kwargs): self.visit(comp, side='right')) return node + def visit_BoolOp(self, node, **kwargs): + op = self.visit(node.op) + def visitor(x, y): + try: + lhs = self.visit(x) + except TypeError: + lhs = x + + try: + rhs = self.visit(y) + except TypeError: + rhs = y + + return op(lhs, rhs) -_python_not_supported = frozenset(['Assign', 'BoolOp', 'Not', 'Str', 'Slice', - 'Index', 'Subscript', 'Tuple', 'List', - 'Dict', 'Call']) + operands = node.values + return reduce(visitor, operands) + + +_python_not_supported = frozenset(['Assign', 'Str', 'Slice', 'Index', + 'Subscript', 'Tuple', 'List', 'Dict', + 'Call']) _numexpr_supported_calls = frozenset(_reductions + _mathops) -@disallow(_unsupported_nodes | _python_not_supported) +@disallow((_unsupported_nodes | _python_not_supported) - _boolop_nodes) class PandasExprVisitor(BaseExprVisitor): def __init__(self, env, preparser=_preparse): super(PandasExprVisitor, self).__init__(env, preparser) diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 85459b2fab7a0..2a8ef0277f173 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -107,11 +107,15 @@ def _print_operand(opr): return opr.name if is_term(opr) else unicode(opr) +def _get_op(op): + return {'not': '~', 'and': '&', 'or': '|'}.get(op, op) + + class Op(StringMixin): """Hold an operator of unknown arity """ def __init__(self, op, operands, *args, **kwargs): - self.op = op + self.op = _get_op(op) self.operands = operands def __iter__(self): @@ -137,8 +141,8 @@ def return_type(self): _cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne _cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) 
-_bool_ops_syms = '&', '|' -_bool_ops_funcs = op.and_, op.or_ +_bool_ops_syms = '&', '|', 'and', 'or' +_bool_ops_funcs = op.and_, op.or_, op.and_, op.or_ _bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) _arith_ops_syms = '+', '-', '*', '/', '**', '//', '%' @@ -237,8 +241,8 @@ def __init__(self, lhs, rhs, *args, **kwargs): _cast_inplace(self.operands, np.float_) -_unary_ops_syms = '+', '-', '~' -_unary_ops_funcs = op.pos, op.neg, op.invert +_unary_ops_syms = '+', '-', '~', 'not' +_unary_ops_funcs = op.pos, op.neg, op.invert, op.invert _unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 64a1036bbb20f..ddebbc625c281 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -340,7 +340,6 @@ def prune(self, klass): _op_classes = {'unary': UnaryOp} - class ExprVisitor(BaseExprVisitor): def __init__(self, env, **kwargs): super(ExprVisitor, self).__init__(env) @@ -366,6 +365,7 @@ def visit_USub(self, node, **kwargs): def visit_Index(self, node, **kwargs): return self.visit(node.value).value + class Expr(expr.Expr): """ hold a pytables like expression, comprised of possibly multiple 'terms' diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 06b6b211b7d1c..cbabf2897a656 100755 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -23,7 +23,7 @@ from pandas.computation.expr import PythonExprVisitor, PandasExprVisitor from pandas.computation.ops import (_binary_ops_dict, _unary_ops_dict, _special_case_arith_ops_syms, - _arith_ops_syms) + _arith_ops_syms, Constant) import pandas.computation.expr as expr from pandas.computation import pytables from pandas.computation.expressions import _USE_NUMEXPR @@ -599,39 +599,6 @@ def test_is_expr(): check_is_expr(engine) -def check_not_fails(engine): - x = True - assert_raises(NotImplementedError, pd.eval, 'not x', engine=engine, 
- local_dict={'x': x}) - - -def test_not_fails(): - for engine in _engines: - check_not_fails(engine) - - -def check_and_fails(engine): - x, y = False, True - assert_raises(NotImplementedError, pd.eval, 'x and y', engine=engine, - local_dict={'x': x, 'y': y}) - - -def test_and_fails(): - for engine in _engines: - check_and_fails(engine) - - -def check_or_fails(engine): - x, y = True, False - assert_raises(NotImplementedError, pd.eval, 'x or y', engine=engine, - local_dict={'x': x, 'y': y}) - - -def test_or_fails(): - for engine in _engines: - check_or_fails(engine) - - _parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor, 'pandas': PandasExprVisitor} @@ -641,8 +608,9 @@ def check_disallowed_nodes(visitor): VisitorClass = _parsers[visitor] uns_ops = VisitorClass.unsupported_nodes inst = VisitorClass('x + 1') + for ops in uns_ops: - assert_raises(NotImplementedError, getattr(inst, ops), inst, ast.AST()) + assert_raises(NotImplementedError, getattr(inst, ops)) def test_disallowed_nodes(): From 4093cd9d01d50be2a89b0aaf926839d01e3d99ff Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 22 Jul 2013 16:48:54 -0400 Subject: [PATCH 06/16] ENH: add chained comparisons --- doc/source/api.rst | 2 +- doc/source/enhancingperf.rst | 103 ++-- doc/source/io.rst | 12 +- pandas/computation/align.py | 20 +- pandas/computation/common.py | 11 - pandas/computation/eval.py | 82 +-- pandas/computation/expr.py | 91 ++- pandas/computation/ops.py | 3 +- pandas/computation/pytables.py | 4 +- pandas/computation/tests/test_eval.py | 765 +++++++++++++------------- pandas/core/common.py | 10 + pandas/core/frame.py | 47 +- pandas/io/tests/test_data.py | 2 +- pandas/io/tests/test_pytables.py | 22 + pandas/tests/test_frame.py | 19 + pandas/util/testing.py | 18 +- 16 files changed, 677 insertions(+), 534 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 837afb8996db3..affa840781c34 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -158,7 +158,7 
@@ Top-level dealing with datetimes Top-level evaluation ~~~~~~~~~~~~~~~~~~~~ -.. currentmodule:: pandas.computation.eval +.. currentmodule:: pandas .. autosummary:: :toctree: generated/ diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 3f3a31879752a..47d2acc578a21 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -292,53 +292,53 @@ Read more in the `cython docs `__. .. _enhancingperf.eval: -Expression Evaluation via :func:`~pandas.computation.eval.eval` ---------------------------------------------------------------- +.. versionadded:: 0.13 -New in pandas v0.13 a top-level function :func:`~pandas.computation.eval.eval` -implements expression evaluation of expressions containing -:class:`~pandas.core.series.Series` and :class:`~pandas.core.frame.DataFrame` -objects. +Expression Evaluation via :func:`~pandas.eval` +---------------------------------------------- + +New in pandas v0.13 a top-level function :func:`~pandas.eval` implements +expression evaluation of expressions containing :class:`~pandas.Series` and +:class:`~pandas.DataFrame` objects. .. note:: - To benefit from using :func:`~pandas.computation.eval.eval` you need to + To benefit from using :func:`~pandas.eval` you need to install ``numexpr``. See the :ref:`recommended dependencies section ` for more details. -The major benefit of using :func:`~pandas.computation.eval.eval` for expression -evaluation rather than just straight-up Python is two-fold: large -:class:`~pandas.core.frame.DataFrame` objects are evaluated more efficiently -and large expressions are evaluated all at once by the underlying engine (by -default ``numexpr`` is used for evaluation). 
+The major benefit of using :func:`~pandas.eval` for expression evaluation +rather than just straight-up Python is two-fold: large +:class:`~pandas.DataFrame` objects are evaluated more efficiently and large +expressions are evaluated all at once by the underlying engine (by default +``numexpr`` is used for evaluation). .. note:: - You should not use :func:`~pandas.computation.eval.eval` for simple + You should not use :func:`~pandas.eval` for simple expressions or for expressions involving small DataFrames. In fact, - :func:`~pandas.computation.eval.eval` is many orders of magnitude slower for - smaller expressions/objects than plain ole' Python. A good rule of thumb is - to only use :func:`~pandas.computation.eval.eval` when you have a + :func:`~pandas.eval` is many orders of magnitude slower for + smaller expressions/objects than plain ol' Python. A good rule of thumb is + to only use :func:`~pandas.eval` when you have a :class:`~pandas.core.frame.DataFrame` with more than 10,000 rows. -:func:`~pandas.computation.eval.eval` supports all arithmetic expressions -supported by the engine (by default the engine is ``numexpr``). The ``numexpr`` -engine uses ``numexpr`` under the hood to evaluate expressions efficiently, -while allowing a slightly modified, and we think more intuitive syntax for -expressions. +:func:`~pandas.eval` supports all arithmetic expressions +supported by the engine. The ``numexpr`` engine uses ``numexpr`` under the hood +to evaluate expressions efficiently, while allowing a slightly modified--and we +think more intuitive--syntax for expressions. .. note:: The larger the frame and the larger the expression the more speedup you will - see from using :func:`~pandas.computation.eval.eval`. + see from using :func:`~pandas.eval`. 
-:func:`~pandas.computation.eval.eval` Examples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +:func:`~pandas.eval` Examples +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`~pandas.computation.eval.eval` works wonders for expressions containing +:func:`~pandas.eval` works wonders for expressions containing large arrays First let's create 4 decent-sized arrays to play with: @@ -354,7 +354,7 @@ First let's create 4 decent-sized arrays to play with: Now let's compare adding them together using plain ol' Python versus -:func:`~pandas.computation.eval.eval`: +:func:`~pandas.eval`: .. ipython:: python @@ -377,8 +377,7 @@ Now let's do the same thing but with comparisons: %timeit pd.eval('(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)') -:func:`~pandas.computation.eval.eval` also works with "unaligned" pandas -objects: +:func:`~pandas.eval` also works with "unaligned" pandas objects: .. ipython:: python @@ -393,18 +392,17 @@ objects: There are also two different flavors of parsers and and two different engines to use as the backend. -:func:`~pandas.computation.eval.eval` Parsers -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +:func:`~pandas.eval` Parsers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The default ``"pandas"`` parser allows a bit more intuitive (we think) syntax -for expressing query-like operations (comparisons, conjunctions and -disjunctions). In particular, the precedence of the ``&`` and ``|`` operators -is made equal to the precedence of the corresponding boolean operations ``and`` -and ``or``. +The default ``"pandas"`` parser allows a more intuitive syntax for expressing +query-like operations (comparisons, conjunctions and disjunctions). In +particular, the precedence of the ``&`` and ``|`` operators is made equal to +the precedence of the corresponding boolean operations ``and`` and ``or``. -For example, the above conjunction can be written without -parentheses. Alternatively, you can use the ``'python'`` parser to enforce -strict Python semantics. 
+For example, the above conjunction can be written without parentheses. +Alternatively, you can use the ``'python'`` parser to enforce strict Python +semantics. .. ipython:: python @@ -415,23 +413,34 @@ strict Python semantics. np.all(x == y) -:func:`~pandas.computation.eval.eval` Backends -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The same expression can be "anded" with the word :keyword:`and` as well: + +.. ipython:: python + + expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)' + x = pd.eval(expr, parser='python') + expr_with_ands = 'df1 > 0 and df2 > 0 and df3 > 0 and df4 > 0' + y = pd.eval(expr_with_ands, parser='pandas') + np.all(x == y) + + +:func:`~pandas.eval` Backends +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -There's also the option to make :func:`~pandas.computation.eval.eval` operate -identical to plain ol' Python. +There's also the option to make :func:`~pandas.eval` operate identically to plain +ol' Python. .. note:: Using the ``'python'`` engine is generally *not* useful, except for comparing performance and testing other - :func:`~pandas.computation.eval.eval` engines against it. You will acheive - **no** performance benefits using :func:`~pandas.computation.eval.eval` with + :func:`~pandas.eval` engines against it. You will achieve + **no** performance benefits using :func:`~pandas.eval` with ``engine='python'``. -You can see this by using :func:`~pandas.computation.eval.eval` with the -``'python'`` engine is actually a bit slower (not by much) than evaluating the -same expression in Python: +You can see that using :func:`~pandas.eval` with the ``'python'`` engine is +actually a bit slower (not by much) than evaluating the same expression in +Python: .. ipython:: python diff --git a/doc/source/io.rst b/doc/source/io.rst index a20d2a7aa51b5..dff1b4836e88a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2003,8 +2003,9 @@ These rules are similar to how boolean expressions are used in pandas for indexi .. 
note:: - ``=`` will be automatically expanded to the comparison operator ``==`` - - ``~`` is the not operator, but can only be used in very limited circumstances - - If a list/tuple of expressions are passed they will be combined via ``&``. + - ``~`` is the not operator, but can only be used in very limited + circumstances + - If a list/tuple of expressions is passed they will be combined via ``&`` The following are valid expressions: @@ -2022,7 +2023,7 @@ The ``indexers`` are on the left-hand side of the sub-expression: - ``columns``, ``major_axis``, ``ts`` -The right-hand side of the sub-expression (after a comparsion operator), can be: +The right-hand side of the sub-expression (after a comparison operator) can be: - functions that will be evaluated, e.g. ``Timestamp('2012-02-01')`` - strings, e.g. ``"bar"`` @@ -2038,8 +2039,9 @@ Here is an example: store store.select('wp', "major_axis>Timestamp('20000102') & minor_axis=['A', 'B']") -The ``columns`` keyword can be supplied to select a list of columns to be returned, -this is equivalent to passing a ``'columns=list_of_columns_to_filter'``: +The ``columns`` keyword can be supplied to select a list of columns to be +returned; this is equivalent to passing a +``'columns=list_of_columns_to_filter'``: .. 
ipython:: python diff --git a/pandas/computation/align.py b/pandas/computation/align.py index 09606fc41a46b..fcc1dd1b0334f 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -1,3 +1,4 @@ +import warnings from functools import partial, wraps from itertools import izip @@ -6,7 +7,6 @@ import pandas as pd import pandas.core.common as com from pandas.computation.ops import is_const -from pandas.computation.common import flatten def _align_core_single_unary_op(term): @@ -123,11 +123,23 @@ def _align_core(terms): if hasattr(ti, 'reindex_axis'): transpose = com.is_series(ti) and naxes > 1 + reindexer = axes[naxes - 1] if transpose else items + + term_axis_size = len(ti.axes[axis]) + reindexer_size = len(reindexer) + + if (np.log10(abs(reindexer_size - term_axis_size)) >= 1 and + reindexer_size >= 10000): + warnings.warn("Alignment difference on axis {0} is larger" + " than an order of magnitude on term {1!r}, " + "performance may suffer".format(axis, term), + category=pd.io.common.PerformanceWarning) if transpose: - f = partial(ti.reindex, index=axes[naxes - 1], copy=False) + f = partial(ti.reindex, index=reindexer, copy=False) else: - f = partial(ti.reindex_axis, items, axis=axis, copy=False) + f = partial(ti.reindex_axis, reindexer, axis=axis, + copy=False) if pd.lib.is_bool_array(ti.values): r = f(fill_value=True) @@ -168,7 +180,7 @@ def _align(terms): """Align a set of terms""" try: # flatten the parse tree (a nested list, really) - terms = list(flatten(terms)) + terms = list(com.flatten(terms)) except TypeError: # can't iterate so it must just be a constant or single variable if isinstance(terms.value, (pd.Series, pd.core.generic.NDFrame)): diff --git a/pandas/computation/common.py b/pandas/computation/common.py index 4061984dd5e08..e69de29bb2d1d 100644 --- a/pandas/computation/common.py +++ b/pandas/computation/common.py @@ -1,11 +0,0 @@ -import collections -from pandas.core.common import is_string - - -def flatten(l): - for el in l: - if 
isinstance(el, collections.Iterable) and not is_string(el): - for s in flatten(el): - yield s - else: - yield el diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 5f234b7864427..abd4785cb5ea3 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -4,11 +4,12 @@ import numpy as np -from pandas.computation.expr import Expr, Scope, _parsers +from pandas.computation.expr import Expr, _parsers, _ensure_scope from pandas.computation.engines import _engines def _check_engine(engine): + """make sure a valid engine is passed""" if engine not in _engines: raise KeyError('Invalid engine {0!r} passed, valid engines are' ' {1}'.format(engine, _engines.keys())) @@ -21,79 +22,96 @@ def _check_engine(engine): def _check_parser(parser): + """make sure a valid parser is passed""" if parser not in _parsers: raise KeyError('Invalid parser {0!r} passed, valid parsers are' ' {1}'.format(parser, _parsers.keys())) - - def eval(expr, parser='pandas', engine='numexpr', truediv=True, local_dict=None, global_dict=None, resolvers=None): """Evaluate a Python expression as a string using various backends. - The following arithmetic operations are supported: +, -, *, /, **, %, // - (python engine only) along with the following boolean operations: | (or), & - (and), and ~ (not). Series and DataFrame objects are supported and behave - as they would with in-Python evaluation. + The following arithmetic operations are supported: ``+``, ``-``, ``*``, + ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following + boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). + Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, + :keyword:`or`, and :keyword:`not` with the same semantics as the + corresponding bitwise operators. :class:`~pandas.Series` and + :class:`~pandas.DataFrame` objects are supported and behave as they would + with plain ol' Python evaluation. 
Parameters ---------- - expr : string or Expr object - The expression to evaluate. This can be either a string or an ``Expr`` - object. - parser : str, optional, default 'pandas', {'pandas', 'python'} + expr : string + The expression to evaluate. + parser : string, optional, default 'pandas', {'pandas', 'python'} The parser to use to construct the syntax tree from the expression. The default of 'pandas' parses code slightly different than standard Python. See the :ref:`enhancing performance ` documentation for more details. engine : string, optional, default 'numexpr', {'python', 'numexpr'} + The engine used to evaluate the expression. Supported engines are - - 'numexpr': This default engine evaluates pandas objects using numexpr - for large speed ups in complex expressions with large - frames. - - 'python': Performs operations as if you had eval'd in top level - python + - ``'numexpr'``: This default engine evaluates pandas objects using + numexpr for large speed ups in complex expressions + with large frames. + - ``'python'``: Performs operations as if you had ``eval``'d in top + level python. This engine is generally not that useful. + truediv : bool, optional, default True Whether to use true division, like in Python >= 3 local_dict : dict or None, optional, default None A dictionary of local variables, taken from locals() by default. global_dict : dict or None, optional, default None A dictionary of global variables, taken from globals() by default. + resolvers : dict of dict-like or None, default None + A dictionary of dict-like object (specifically they must implement the + ``get`` method) that you can use to inject an additional collection of + namespaces to use for variable lookup. This is used in the + :meth:`~pandas.DataFrame.query` method to inject the + :attr:`~pandas.DataFrame.index` and :attr:`~pandas.DataFrame.columns` + variables that refer to their respective :class:`~pandas.DataFrame` + instance attributes. 
Returns ------- - obj : ndarray, scalar, DataFrame, Series + ret : ndarray, numeric scalar, :class:`~pandas.DataFrame`, :class:`~pandas.Series` Notes ----- - See :ref:`Enhancing performance ` for more details. + The ``dtype`` of any objects involved in an arithmetic ``%`` operation are + recursively cast to ``float64``. + + See the :ref:`enhancing performance ` documentation for + more details. + + See Also + -------- + pandas.DataFrame.query """ - # make sure we're passed a valid engine + # make sure we're passed a valid engine and parser _check_engine(engine) _check_parser(parser) - eng = _engines[engine] + env = _ensure_scope(global_dict=global_dict, local_dict=local_dict, + resolvers=resolvers) if isinstance(expr, basestring): - # need to go 2 up in the call stack from the constructor - env = Scope(global_dict, local_dict, frame_level=2, - resolvers=resolvers) - parsed_expr = Expr(expr, engine, parser, env, truediv) - elif isinstance(expr, Expr): - parsed_expr = expr + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, + truediv=truediv) else: - raise TypeError("eval only accepts strings and Expr objects, you " - "passed a {0!r}".format(expr.__class__.__name__)) - + raise TypeError("eval only accepts strings, you passed an object of " + "type {0!r}".format(expr.__class__.__name__)) # construct the engine and evaluate + eng = _engines[engine] ret = eng(parsed_expr).evaluate() - # sanity check for a number + # sanity check for a number if it's a scalar result # TODO: eventually take out if np.isscalar(ret): if not isinstance(ret, (np.number, np.bool_, numbers.Number)): - raise TypeError('scalar result must be numeric or bool, passed ' - 'type is {0!r}'.format(ret.__class__.__name__)) + raise TypeError('scalar result must be numeric or bool, return' + ' type is {0!r}'.format(ret.__class__.__name__)) return ret diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 47f879f21b8d6..abcf8cc38d88c 100644 --- a/pandas/computation/expr.py 
+++ b/pandas/computation/expr.py @@ -18,16 +18,33 @@ import datetime -class Scope(object): +def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None, + **kwargs): + """ ensure that we are grabbing the correct scope """ + return Scope(global_dict, local_dict, level=level, resolvers=resolvers) + + +class Scope(StringMixin): __slots__ = ('globals', 'locals', 'resolvers', '_global_resolvers', - 'resolver_keys', '_resolver') + 'resolver_keys', '_resolver', 'level') + + def __init__(self, gbls=None, lcls=None, level=1, resolvers=None): + self.level = level + self.resolvers = resolvers or [] + self.globals = dict() + self.locals = dict() - def __init__(self, gbls=None, lcls=None, frame_level=1, resolvers=None): - frame = sys._getframe(frame_level) + if isinstance(lcls, Scope): + ld, lcls = lcls, dict() + self.locals.update(ld.locals) + self.globals.update(ld.globals) + self.resolvers.extend(ld.resolvers) + self.update(ld.level) + frame = sys._getframe(level) try: - self.globals = gbls or frame.f_globals.copy() - self.locals = lcls or frame.f_locals.copy() + self.globals.update(gbls or frame.f_globals.copy()) + self.locals.update(lcls or frame.f_locals.copy()) finally: del frame @@ -39,12 +56,19 @@ def __init__(self, gbls=None, lcls=None, frame_level=1, resolvers=None): self.globals['True'] = True self.globals['False'] = False - self.resolvers = resolvers or [] self.resolver_keys = set(reduce(operator.add, (list(o.keys()) for o in self.resolvers), [])) self._global_resolvers = self.resolvers + [self.locals, self.globals] self._resolver = None + def __unicode__(self): + return "locals: {0}\nglobals: {0}\nresolvers: {0}".format(self.locals.keys(), + self.globals.keys(), + self.resolver_keys) + + def __getitem__(self, key): + return self.locals.get(key,self.globals[key]) + @property def resolver(self): if self._resolver is None: @@ -58,14 +82,14 @@ def resolve_key(key): return self._resolver - def update(self, scope_level=None): + def update(self, 
level=None): # we are always 2 levels below the caller # plus the caller maybe below the env level # in which case we need addtl levels sl = 2 - if scope_level is not None: - sl += scope_level + if level is not None: + sl += level # add sl frames to the scope starting with the # most distant and overwritting with more current @@ -79,6 +103,7 @@ def update(self, scope_level=None): frames.append(frame) for f in frames[::-1]: self.locals.update(f.f_locals) + self.globals.update(f.f_globals) finally: del frame del frames @@ -312,7 +337,15 @@ def visit_Attribute(self, node, **kwargs): ctx = node.ctx.__class__ if ctx == ast.Load: # resolve the value - return getattr(self.visit(value).value, attr) + resolved = self.visit(value).value + try: + return getattr(resolved, attr) + except (AttributeError): + + # something like datetime.datetime where scope is overriden + if isinstance(value, ast.Name) and value.id == attr: + return resolved + raise ValueError("Invalid Attribute context {0}".format(ctx.__name__)) def visit_Call(self, node, **kwargs): @@ -348,11 +381,17 @@ def visit_Call(self, node, **kwargs): def visit_Compare(self, node, **kwargs): ops = node.ops comps = node.comparators + if len(comps) == 1: + return self.visit(ops[0])(self.visit(node.left, side='left'), + self.visit(comps[0], side='right')) + left = node.left + values = [] for op, comp in itertools.izip(ops, comps): - vop = self.visit(op) - node = vop(self.visit(node.left, side='left'), - self.visit(comp, side='right')) - return node + new_node = self.visit(ast.Compare(comparators=[comp], left=left, + ops=[op])) + left = comp + values.append(new_node) + return self.visit(ast.BoolOp(op=ast.And(), values=values)) def visit_BoolOp(self, node, **kwargs): op = self.visit(node.op) @@ -396,7 +435,7 @@ class Expr(StringMixin): def __init__(self, expr, engine='numexpr', parser='pandas', env=None, truediv=True): self.expr = expr - self.env = env or Scope(frame_level=2) + self.env = _ensure_scope(level=2,local_dict=env) 
self._visitor = _parsers[parser](self.env) self.terms = self.parse() self.engine = engine @@ -421,21 +460,25 @@ def align(self): return self.terms.align(self.env) -def maybe_expression(s, kind='python'): +_needs_filter = frozenset(['and', 'or', 'not']) + + +def maybe_expression(s, kind='pandas'): """ loose checking if s is an expression """ if not isinstance(s, basestring): return False - try: - visitor = _parsers[kind] - # make sure we have an op at least - return any(op in s for op in visitor.binary_ops) - except: - return False + visitor = _parsers[kind] + ops = visitor.binary_ops + visitor.unary_ops + filtered = frozenset(ops) - _needs_filter + # make sure we have an op at least + return any(op in s or ' and ' in s or ' or ' in s or ' not ' in s + for op in filtered) def isexpr(s, check_names=True): + env = _ensure_scope() try: - Expr(s) + Expr(s,env=env) except SyntaxError: return False except NameError: diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 2a8ef0277f173..9a1a96cec30d1 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -5,7 +5,6 @@ from pandas.util.py3compat import PY3 import pandas.core.common as com from pandas.core.base import StringMixin -from pandas.computation.common import flatten _reductions = 'sum', 'prod' @@ -134,7 +133,7 @@ def return_type(self): # clobber types to bool if the op is a boolean operator if self.op in (_cmp_ops_syms + _bool_ops_syms): return np.bool_ - return np.result_type(*(term.type for term in flatten(self))) + return np.result_type(*(term.type for term in com.flatten(self))) _cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=' diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index ddebbc625c281..2819a63f3706e 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -27,12 +27,12 @@ def _ensure_decoded(s): class Scope(expr.Scope): __slots__ = 'globals', 'locals', 'queryables' - def __init__(self, gbls=None, lcls=None, 
queryables=None, frame_level=1): + def __init__(self, gbls=None, lcls=None, queryables=None, level=1): super( Scope, self).__init__(gbls=gbls, lcls=lcls, - frame_level=frame_level) + level=level) self.queryables = queryables or dict() diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index cbabf2897a656..7a2704a321ec9 100755 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -23,19 +23,19 @@ from pandas.computation.expr import PythonExprVisitor, PandasExprVisitor from pandas.computation.ops import (_binary_ops_dict, _unary_ops_dict, _special_case_arith_ops_syms, - _arith_ops_syms, Constant) + _arith_ops_syms) import pandas.computation.expr as expr from pandas.computation import pytables from pandas.computation.expressions import _USE_NUMEXPR -from pandas.computation.eval import Scope from pandas.util.testing import (assert_frame_equal, randbool, - assertRaisesRegexp) + assertRaisesRegexp, + assert_produces_warning) from pandas.util.py3compat import PY3 def skip_numexpr_engine(engine): if not _USE_NUMEXPR and engine == 'numexpr': - raise nose.SkipTest + raise nose.SkipTest("not using numexpr") def engine_has_neg_frac(engine): @@ -46,12 +46,13 @@ def _eval_single_bin(lhs, cmp1, rhs, engine): c = _binary_ops_dict[cmp1] if engine_has_neg_frac(engine): try: - result = c(lhs, rhs) - except ValueError: - result = np.nan - else: - result = c(lhs, rhs) - return result + return c(lhs, rhs) + except ValueError as e: + if e.message == ('negative number cannot be raised to a ' + 'fractional power'): + return np.nan + raise + return c(lhs, rhs) def _series_and_2d_ndarray(lhs, rhs): @@ -60,6 +61,10 @@ def _series_and_2d_ndarray(lhs, rhs): > 1) +def _bool_and_frame(lhs, rhs): + return isinstance(lhs, bool) and com.is_frame(rhs) + + def skip_incompatible_operand(f): @functools.wraps(f) def wrapper(self, lhs, arith1, rhs, *args, **kwargs): @@ -75,7 +80,6 @@ def wrapper(self, lhs, arith1, rhs, *args, 
**kwargs): _good_arith_ops = tuple(set(_arith_ops_syms) - set(_special_case_arith_ops_syms)) - class TestEvalPandas(unittest.TestCase): @classmethod @@ -87,6 +91,11 @@ def setUpClass(cls): cls.arith_ops = _good_arith_ops cls.unary_ops = '+', '-' + @classmethod + def tearDownClass(cls): + del cls.cmp_ops, cls.cmp2_ops, cls.bin_ops, cls.special_case_ops + del cls.arith_ops, cls.unary_ops + def set_current_engine(self): self.engine = 'numexpr' @@ -103,8 +112,10 @@ def setup_data(self): self.scalar_lhses = randn(), np.float64(randn()), np.nan self.scalar_rhses = randn(), np.float64(randn()), np.nan - self.lhses = self.pandas_lhses + self.scalar_lhses - self.rhses = self.pandas_rhses + self.scalar_rhses + self.lhses = self.pandas_lhses + self.scalar_lhses + (randn(10, 5), + randn(5)) + self.rhses = self.pandas_rhses + self.scalar_rhses + (randn(10, 5), + randn(5)) def setUp(self): try: @@ -114,8 +125,12 @@ def setUp(self): raise nose.SkipTest self.set_current_engine() self.setup_data() - self.current_engines = filter(lambda x: x != self.engine, - _engines.iterkeys()) + self.current_engines = filter(lambda x: x != self.engine, _engines) + + def tearDown(self): + del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses + del self.pandas_rhses, self.pandas_lhses, self.current_engines, self.ne + del self.engine @slow def test_complex_cmp_ops(self): @@ -166,6 +181,14 @@ def test_compound_invert_op(self): for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): self.check_compound_invert_op(lhs, op, rhs) + @slow + def test_chained_cmp_op(self): + mids = self.lhses + cmp_ops = tuple(set(self.cmp_ops) - set(['==', '!=', '<=', '>='])) + for lhs, cmp1, mid, cmp2, rhs in product(self.lhses, self.cmp_ops, + mids, cmp_ops, self.rhses): + self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs) + @skip_incompatible_operand def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, @@ -177,6 +200,46 @@ 
def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): result = pd.eval(ex, engine=self.engine) assert_array_equal(result, expected) + def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): + # these are not compatible operands + if _series_and_2d_ndarray(lhs, mid): + self.assertRaises(ValueError, _eval_single_bin, lhs, cmp2, mid, + self.engine) + else: + lhs_new = _eval_single_bin(lhs, cmp1, mid, self.engine) + + if _series_and_2d_ndarray(mid, rhs): + self.assertRaises(ValueError, _eval_single_bin, mid, cmp2, rhs, + self.engine) + else: + rhs_new = _eval_single_bin(mid, cmp2, rhs, self.engine) + + try: + lhs_new + rhs_new + except NameError: + pass + else: + # these are not compatible operands + if (com.is_series(lhs_new) and com.is_frame(rhs_new) or + _bool_and_frame(lhs_new, rhs_new)): + self.assertRaises(TypeError, _eval_single_bin, lhs_new, '&', + rhs_new, self.engine) + elif _series_and_2d_ndarray(lhs_new, rhs_new): + # TODO: once #4319 is fixed add this test back in + #self.assertRaises(Exception, _eval_single_bin, lhs_new, '&', + #rhs_new, self.engine) + pass + else: + ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) + ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2) + ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2) + expected = _eval_single_bin(lhs_new, '&', rhs_new, self.engine) + + for ex in (ex1, ex2, ex3): + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + @skip_incompatible_operand def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) @@ -321,267 +384,351 @@ def set_current_engine(self): self.engine = 'python' -class TestEvalPandasWithMixedTypeOperands(TestEvalPandas): - def setup_data(self): - super(TestEvalPandasWithMixedTypeOperands, self).setup_data() - self.lhses += randn(10, 5), randn(5) - self.rhses += randn(10, 5), randn(5) - - -class TestEvalPythonWithMixedTypeOperands(TestEvalPandasWithMixedTypeOperands): - def set_current_engine(self): - self.engine = 'python' 
- - -def test_syntax_error_exprs(): - for engine in _engines: - e = 's +' - assert_raises(SyntaxError, pd.eval, e, engine=engine) - - -def test_name_error_exprs(): - for engine in _engines: - e = 's + t' - assert_raises(NameError, pd.eval, e, engine=engine) - - -def test_align_nested_unary_op(): - for engine in _engines: - yield check_align_nested_unary_op, engine - - f = lambda *args, **kwargs: np.random.randn() -def check_align_nested_unary_op(engine): - skip_numexpr_engine(engine) - s = 'df * ~2' - df = mkdf(10, 10, data_gen_f=f) - res = pd.eval(s, engine=engine) - assert_frame_equal(res, df * ~2) - - -def check_basic_frame_alignment(engine): - df = mkdf(10, 10, data_gen_f=f) - df2 = mkdf(20, 10, data_gen_f=f) - res = pd.eval('df + df2', engine=engine) - assert_frame_equal(res, df + df2) - - -def test_basic_frame_alignment(): - for engine in _engines: - yield check_basic_frame_alignment, engine - - -def check_medium_complex_frame_alignment(engine, r1, r2, c1, c2): - skip_numexpr_engine(engine) - df = mkdf(5, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) - df2 = mkdf(10, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) - df3 = mkdf(15, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) - res = pd.eval('df + df2 + df3', engine=engine) - assert_frame_equal(res, df + df2 + df3) - - -@slow -def test_medium_complex_frame_alignment(): - args = product(_engines, *([INDEX_TYPES[:4]] * 4)) - for engine, r1, r2, c1, c2 in args: - check_medium_complex_frame_alignment(engine, r1, r2, c1, c2) - - -def check_basic_frame_series_alignment(engine, r_idx_type, c_idx_type, - index_name): - skip_numexpr_engine(engine) - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) - index = getattr(df, index_name) - s = Series(np.random.randn(5), index[:5]) - - if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': - assert_raises(ValueError, pd.eval, 'df + s', local_dict=locals()) - assert_raises(ValueError, df.add, s, axis=1) - else: - res = pd.eval('df 
+ s', engine=engine) - expected = df + s - assert_frame_equal(res, expected) - - -def check_not_both_period_fails_otherwise_succeeds(lhs, rhs, r_idx_type, - c_idx_type, index_name, s, - df, *terms): - if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': - assert_raises(ValueError, pd.eval, lhs, local_dict=locals()) - assert_raises(ValueError, pd.eval, rhs, local_dict=locals()) - else: - a, b = pd.eval(lhs), pd.eval(rhs) - assert_frame_equal(a, b) - - -def check_basic_series_frame_alignment(engine, r_idx_type, c_idx_type, - index_name): - skip_numexpr_engine(engine) - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) - index = getattr(df, index_name) - s = Series(np.random.randn(5), index[:5]) - - if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': - assert_raises(ValueError, pd.eval, 's + df', local_dict=locals()) - assert_raises(ValueError, df.add, s, axis=1) - else: - res = pd.eval('s + df', engine=engine) - expected = s + df - assert_frame_equal(res, expected) - +class TestAlignment(unittest.TestCase): -def check_basic_series_frame_alignment_datetime(engine, r_idx_type, c_idx_type, - index_name): - skip_numexpr_engine(engine) - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) - index = getattr(df, index_name) - s = Series(np.random.randn(5), index[:5]) - if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': - assert_raises(ValueError, pd.eval, 's + df', local_dict=locals()) - assert_raises(ValueError, df.add, s, axis=1) - else: - res = pd.eval('s + df', engine=engine) - expected = s + df - assert_frame_equal(res, expected) - - if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': - assert_raises(ValueError, pd.eval, 'df + s', local_dict=locals()) - assert_raises(ValueError, df.add, s, axis=1) - else: - res = pd.eval('df + s', engine=engine) - expected = df + s - assert_frame_equal(res, expected) + @classmethod + def setUpClass(cls): + 
cls.INDEX_TYPES = 'i', 'f', 's', 'u', 'dt', # 'p' + @classmethod + def tearDownClass(cls): + del cls.INDEX_TYPES + + def check_align_nested_unary_op(self, engine): + skip_numexpr_engine(engine) + s = 'df * ~2' + df = mkdf(10, 10, data_gen_f=f) + res = pd.eval(s, engine=engine) + assert_frame_equal(res, df * ~2) + + def test_align_nested_unary_op(self): + for engine in _engines: + self.check_align_nested_unary_op(engine) + + def check_basic_frame_alignment(self, engine): + df = mkdf(10, 10, data_gen_f=f) + df2 = mkdf(20, 10, data_gen_f=f) + res = pd.eval('df + df2', engine=engine) + assert_frame_equal(res, df + df2) + + def test_basic_frame_alignment(self): + for engine in _engines: + self.check_basic_frame_alignment(engine) + + def check_medium_complex_frame_alignment(self, engine, r1, r2, c1, c2): + skip_numexpr_engine(engine) + df = mkdf(5, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(10, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + df3 = mkdf(15, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + res = pd.eval('df + df2 + df3', engine=engine) + assert_frame_equal(res, df + df2 + df3) -def check_series_frame_commutativity(engine, r_idx_type, c_idx_type, op, - index_name): - skip_numexpr_engine(engine) - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) - index = getattr(df, index_name) - s = Series(np.random.randn(5), index[:5]) + @slow + def test_medium_complex_frame_alignment(self): + args = product(_engines, *([self.INDEX_TYPES[:4]] * 4)) + for engine, r1, r2, c1, c2 in args: + self.check_medium_complex_frame_alignment(engine, r1, r2, c1, c2) + + def check_basic_frame_series_alignment(self, engine, r_idx_type, + c_idx_type, index_name): + def testit(): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + res = pd.eval('df + s', engine=engine) + if r_idx_type == 'dt' or 
c_idx_type == 'dt': + if engine == 'numexpr': + expected = df.add(s) + else: + expected = df + s + else: + expected = df + s + assert_frame_equal(res, expected) - lhs = 's {0} df'.format(op) - rhs = 'df {0} s'.format(op) - check_not_both_period_fails_otherwise_succeeds(lhs, rhs, r_idx_type, - c_idx_type, index_name, s, - df) + testit() + @slow + def test_basic_frame_series_alignment(self): + args = product(_engines, self.INDEX_TYPES, self.INDEX_TYPES, + ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + self.check_basic_frame_series_alignment(engine, r_idx_type, + c_idx_type, index_name) -INDEX_TYPES = 'i', 'f', 's', 'u', # 'dt', # 'p' + def check_basic_series_frame_alignment(self, engine, r_idx_type, + c_idx_type, index_name): + def testit(): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + res = pd.eval('s + df', engine=engine) + if r_idx_type == 'dt' or c_idx_type == 'dt': + if engine == 'numexpr': + expected = df.add(s) + else: + expected = s + df + else: + expected = s + df + assert_frame_equal(res, expected) + testit() -@slow -def test_series_frame_commutativity(): - args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('+', '*'), ('index', - 'columns')) - for engine, r_idx_type, c_idx_type, op, index_name in args: - check_series_frame_commutativity(engine, r_idx_type, c_idx_type, op, - index_name) + @slow + def test_basic_series_frame_alignment(self): + args = product(_engines, self.INDEX_TYPES, self.INDEX_TYPES, + ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + self.check_basic_series_frame_alignment(engine, r_idx_type, + c_idx_type, index_name) + def check_series_frame_commutativity(self, engine, r_idx_type, c_idx_type, + op, index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + 
index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) -@slow -def test_basic_frame_series_alignment(): - args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('index', 'columns')) - for engine, r_idx_type, c_idx_type, index_name in args: - check_basic_frame_series_alignment(engine, r_idx_type, c_idx_type, - index_name) + lhs = 's {0} df'.format(op) + rhs = 'df {0} s'.format(op) + a = pd.eval(lhs, engine=engine) + b = pd.eval(rhs, engine=engine) + if r_idx_type != 'dt' and c_idx_type != 'dt': + if engine == 'numexpr': + assert_frame_equal(a, b) -@slow -def test_basic_series_frame_alignment_datetime(): - idx_types = INDEX_TYPES - args = product(_engines, idx_types, idx_types, ('index', 'columns')) - for engine, r_idx_type, c_idx_type, index_name in args: - check_basic_series_frame_alignment_datetime(engine, r_idx_type, - c_idx_type, index_name) + @slow + def test_series_frame_commutativity(self): + args = product(_engines, self.INDEX_TYPES, self.INDEX_TYPES, ('+', + '*'), + ('index', 'columns')) + for engine, r_idx_type, c_idx_type, op, index_name in args: + self.check_series_frame_commutativity(engine, r_idx_type, + c_idx_type, op, index_name) + + def check_complex_series_frame_alignment(self, engine, index_name, obj, r1, + r2, c1, c2): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + index = getattr(locals()[obj], index_name) + s = Series(np.random.randn(5), index[:5]) + if engine != 'python': + expected = df2.add(s, axis=1).add(df) + else: + expected = df2 + s + df + res = pd.eval('df2 + s + df', engine=engine) + expected = df2 + s + df + assert_tuple_equal(res.shape, expected.shape) + assert_frame_equal(res, expected) + @slow + def test_complex_series_frame_alignment(self): + args = product(_engines, ('index', 'columns'), ('df', 'df2'), + *([self.INDEX_TYPES[:4]] * 4)) + for engine, index_name, obj, r1, r2, c1, c2 in args: + 
self.check_complex_series_frame_alignment(engine, index_name, obj, + r1, r2, c1, c2) + + def test_performance_warning_for_asenine_alignment(self): + df = DataFrame(randn(1000, 10)) + s = Series(randn(10000)) + with assert_produces_warning(pd.io.common.PerformanceWarning): + pd.eval('df + s') + + s = Series(randn(1000)) + with assert_produces_warning(False): + pd.eval('df + s') + + df = DataFrame(randn(10, 10000)) + s = Series(randn(10000)) + with assert_produces_warning(False): + pd.eval('df + s') + +class TestOperations(unittest.TestCase): + + def check_simple_arith_ops(self, engine): + ops = expr._arith_ops_syms + expr._cmp_ops_syms + + for op in filter(lambda x: x != '//', ops): + expec = _eval_single_bin(1, op, 1, engine) + x = pd.eval('1 {0} 1'.format(op), engine=engine) + assert_equal(x, expec) + + expec = _eval_single_bin(x, op, 1, engine) + y = pd.eval('x {0} 1'.format(op), engine=engine) + assert_equal(y, expec) + + expec = _eval_single_bin(1, op, x + 1, engine) + y = pd.eval('1 {0} (x + 1)'.format(op), engine=engine) + assert_equal(y, expec) + + def check_simple_bool_ops(self, engine): + for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, + False)): + expec = _eval_single_bin(lhs, op, rhs, engine) + x = pd.eval('lhs {0} rhs'.format(op), engine=engine) + assert_equal(x, expec) + + def check_bool_ops_with_constants(self, engine): + asteval = ast.literal_eval + for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), + ('True', 'False')): + expec = _eval_single_bin(asteval(lhs), op, asteval(rhs), engine) + x = pd.eval('{0} {1} {2}'.format(lhs, op, rhs), engine=engine) + assert_equal(x, expec) + + def test_simple_arith_ops(self): + for engine in _engines: + self.check_simple_arith_ops(engine) + + def test_simple_bool_ops(self): + for engine in _engines: + self.check_simple_bool_ops(engine) + + def test_bool_ops_with_constants(self): + for engine in _engines: + self.check_bool_ops_with_constants(engine) + + def 
check_panel_fails(self, engine): + x = Panel(randn(3, 4, 5)) + y = Series(randn(10)) + assert_raises(NotImplementedError, pd.eval, 'x + y', + local_dict={'x': x, 'y': y}, engine=engine) + + def test_panel_fails(self): + for engine in _engines: + self.check_panel_fails(engine) + + def check_4d_ndarray_fails(self, engine): + x = randn(3, 4, 5, 6) + y = Series(randn(10)) + assert_raises(NotImplementedError, pd.eval, 'x + y', local_dict={'x': x, + 'y': y}, + engine=engine) + + def test_4d_ndarray_fails(self): + for engine in _engines: + self.check_4d_ndarray_fails(engine) + + def check_constant(self, engine): + x = pd.eval('1', engine=engine) + assert_equal(x, 1) + + def test_constant(self): + for engine in _engines: + self.check_constant(engine) + + def check_single_variable(self, engine): + df = DataFrame(randn(10, 2)) + df2 = pd.eval('df', engine=engine) + assert_frame_equal(df, df2) + + def test_single_variable(self): + for engine in _engines: + self.check_single_variable(engine) + + def test_truediv(self): + for engine in _engines: + self.check_truediv(engine) + + def check_truediv(self, engine): + s = np.array([1]) + ex = 's / 1' + + if PY3: + res = pd.eval(ex, truediv=False) + assert_array_equal(res, np.array([1.0])) + + res = pd.eval(ex, truediv=True) + assert_array_equal(res, np.array([1.0])) + else: + res = pd.eval(ex, truediv=False) + assert_array_equal(res, np.array([1])) -@slow -def test_basic_series_frame_alignment(): - args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('index', 'columns')) - for engine, r_idx_type, c_idx_type, index_name in args: - check_basic_series_frame_alignment(engine, r_idx_type, c_idx_type, - index_name) - - -def check_complex_series_frame_alignment(engine, index_name, obj, r1, r2, c1, - c2): - skip_numexpr_engine(engine) - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) - df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) - index = getattr(locals()[obj], index_name) - s = Series(np.random.randn(5), 
index[:5]) - if engine != 'python': - expected = df2.add(s, axis=1).add(df) - else: - expected = df2 + s + df - res = pd.eval('df2 + s + df', engine=engine) - expected = df2 + s + df - assert_tuple_equal(res.shape, expected.shape) - assert_frame_equal(res, expected) + res = pd.eval(ex, truediv=True) + assert_array_equal(res, np.array([1.0])) -@slow -def test_complex_series_frame_alignment(): - args = product(_engines, ('index', 'columns'), ('df', 'df2'), - *([INDEX_TYPES[:4]] * 4)) - for engine, index_name, obj, r1, r2, c1, c2 in args: - check_complex_series_frame_alignment(engine, index_name, obj, r1, r2, - c1, c2) +_var_s = randn(10) +class TestScope(unittest.TestCase): -def check_datetime_index_rows_punts_to_python(engine): - df = mkdf(10, 10, data_gen_f=f, r_idx_type='dt', c_idx_type='dt') - index = getattr(df, 'index') - s = Series(np.random.randn(5), index[:5]) - env = Scope(globals(), locals()) + def check_global_scope(self, engine): + e = '_var_s * 2' + assert_array_equal(_var_s * 2, pd.eval(e, engine=engine)) + def test_global_scope(self): + for engine in _engines: + self.check_global_scope(engine) -def test_datetime_index_rows_punts_to_python(): - for engine in _engines: - check_datetime_index_rows_punts_to_python(engine) + def check_no_new_locals(self, engine): + x = 1 + lcls = locals().copy() + pd.eval('x + 1', local_dict=lcls) + lcls2 = locals().copy() + lcls2.pop('lcls') + assert_equal(lcls, lcls2) + def test_no_new_locals(self): + for engine in _engines: + self.check_no_new_locals(engine) -def test_truediv(): - for engine in _engines: - check_truediv(engine) + def check_no_new_globals(self, engine): + x = 1 + gbls = globals().copy() + pd.eval('x + 1') + gbls2 = globals().copy() + assert_equal(gbls, gbls2) + def test_no_new_globals(self): + for engine in _engines: + self.check_no_new_globals(engine) -def check_truediv(engine): - s = np.array([1]) - ex = 's / 1' + def test_nested_scope(self): + x = 1 + result = pd.eval('x + 1') + 
self.assertEqual(result, 2) - if PY3: - res = pd.eval(ex, truediv=False) - assert_array_equal(res, np.array([1.0])) + df = DataFrame(np.random.randn(2000, 10)) + df2 = DataFrame(np.random.randn(2000, 10)) + expected = df[(df>0) & (df2>0)] - res = pd.eval(ex, truediv=True) - assert_array_equal(res, np.array([1.0])) - else: - res = pd.eval(ex, truediv=False) - assert_array_equal(res, np.array([1])) + result = df['(df>0) & (df2>0)'] + assert_frame_equal(result,expected) - res = pd.eval(ex, truediv=True) - assert_array_equal(res, np.array([1.0])) + result = df.query('(df>0) & (df2>0)') + assert_frame_equal(result,expected) + ##### this fails #### + #result = pd.eval('df[(df>0) & (df2>0)]') + #assert_frame_equal(result,expected) -__var_s = randn(10) + #### also fails #### + #self.assertRaises(NotImplementedError, pd.eval, + #'df[(df > 0) & (df2 > 0)]') -def check_global_scope(engine): - e = '__var_s * 2' - assert_array_equal(__var_s * 2, pd.eval(e, engine=engine)) +def test_invalid_engine(): + assertRaisesRegexp(KeyError, 'Invalid engine \'asdf\' passed', + pd.eval, 'x + y', local_dict={'x': 1, 'y': 2}, + engine='asdf') -def test_global_scope(): - for engine in _engines: - yield check_global_scope, engine +def test_invalid_parser(): + assertRaisesRegexp(KeyError, 'Invalid parser \'asdf\' passed', + pd.eval, 'x + y', local_dict={'x': 1, 'y': 2}, + parser='asdf') def check_is_expr(engine): @@ -600,8 +747,7 @@ def test_is_expr(): _parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor, - 'pandas': PandasExprVisitor} - + 'pandas': PandasExprVisitor} def check_disallowed_nodes(visitor): """make sure the disallowed decorator works""" @@ -618,139 +764,16 @@ def test_disallowed_nodes(): check_disallowed_nodes(visitor) -def check_simple_arith_ops(engine): - ops = expr._arith_ops_syms + expr._cmp_ops_syms - - for op in filter(lambda x: x != '//', ops): - expec = _eval_single_bin(1, op, 1, engine) - x = pd.eval('1 {0} 1'.format(op), engine=engine) - 
assert_equal(x, expec) - - expec = _eval_single_bin(x, op, 1, engine) - y = pd.eval('x {0} 1'.format(op), engine=engine) - assert_equal(y, expec) - - expec = _eval_single_bin(1, op, x + 1, engine) - y = pd.eval('1 {0} (x + 1)'.format(op), engine=engine) - assert_equal(y, expec) - - -def check_simple_bool_ops(engine): - for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, - False)): - expec = _eval_single_bin(lhs, op, rhs, engine) - x = pd.eval('lhs {0} rhs'.format(op), engine=engine) - assert_equal(x, expec) - - -def check_bool_ops_with_constants(engine): - asteval = ast.literal_eval - for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), - ('True', 'False')): - expec = _eval_single_bin(asteval(lhs), op, asteval(rhs), engine) - x = pd.eval('{0} {1} {2}'.format(lhs, op, rhs), engine=engine) - assert_equal(x, expec) - - -def test_simple_arith_ops(): - for engine in _engines: - check_simple_arith_ops(engine) - - -def test_simple_bool_ops(): - for engine in _engines: - check_simple_bool_ops(engine) - - -def test_bool_ops_with_constants(): - for engine in _engines: - check_bool_ops_with_constants(engine) - - -def check_no_new_locals(engine): - x = 1 - lcls = locals().copy() - pd.eval('x + 1', local_dict=lcls) - lcls2 = locals().copy() - lcls2.pop('lcls') - assert_equal(lcls, lcls2) - - -def test_no_new_locals(): - for engine in _engines: - check_no_new_locals(engine) - - -def check_no_new_globals(engine): - x = 1 - gbls = globals().copy() - pd.eval('x + 1') - gbls2 = globals().copy() - assert_equal(gbls, gbls2) - - -def test_no_new_globals(): - for engine in _engines: - check_no_new_globals(engine) - - -def check_panel_fails(engine): - x = Panel(randn(3, 4, 5)) - y = Series(randn(10)) - assert_raises(NotImplementedError, pd.eval, 'x + y', local_dict={'x': x, - 'y': y}, - engine=engine) - - -def test_panel_fails(): - for engine in _engines: - check_panel_fails(engine) - - -def check_4d_ndarray_fails(engine): - x = randn(3, 4, 5, 6) - y = 
Series(randn(10)) - assert_raises(NotImplementedError, pd.eval, 'x + y', local_dict={'x': x, - 'y': y}, - engine=engine) - - -def test_4d_ndarray_fails(): - for engine in _engines: - check_4d_ndarray_fails(engine) - - -def check_constant(engine): - x = pd.eval('1', engine=engine) - assert_equal(x, 1) - - -def test_constant(): +def test_syntax_error_exprs(): for engine in _engines: - check_constant(engine) - - -def check_single_variable(engine): - df = DataFrame(randn(10, 2)) - df2 = pd.eval('df', engine=engine) - assert_frame_equal(df, df2) + e = 's +' + assert_raises(SyntaxError, pd.eval, e, engine=engine) -def test_single_variable(): +def test_name_error_exprs(): for engine in _engines: - check_single_variable(engine) - - -def test_invalid_engine(): - assertRaisesRegexp(KeyError, 'Invalid engine \'asdf\' passed', - pd.eval, 'x + y', local_dict={'x': 1, 'y': 2}, - engine='asdf') - - -def test_invalid_parser(): - assertRaisesRegexp(KeyError, 'Invalid parser \'asdf\' passed', - pd.eval, 'x + y', local_dict={'x': 1, 'y': 2}, - parser='asdf') + e = 's + t' + assert_raises(NameError, pd.eval, e, engine=engine) if __name__ == '__main__': diff --git a/pandas/core/common.py b/pandas/core/common.py index 89407121f959a..74f41355cccd8 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -3,6 +3,7 @@ """ import re +import collections import codecs import csv import sys @@ -224,6 +225,15 @@ def notnull(obj): return -res +def flatten(l): + for el in l: + if isinstance(el, collections.Iterable) and not is_string(el): + for s in flatten(el): + yield s + else: + yield el + + def mask_missing(arr, values_to_mask): """ Return a masking array of same size/shape as arr diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3f4b283f577b5..dd92dfa235521 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -38,7 +38,7 @@ from pandas.sparse.array import SparseArray import pandas.computation.expressions as expressions from pandas.computation.eval import eval 
as _eval -from pandas.computation.expr import maybe_expression +from pandas.computation.expr import maybe_expression, _ensure_scope from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) @@ -1837,7 +1837,8 @@ def __getitem__(self, key): return self._getitem_column(key) except KeyError: if maybe_expression(key): - return self.query(key) + env = _ensure_scope(level=2) + return self.query(key, local_dict=env) raise def _getitem_column(self, key): @@ -1912,11 +1913,11 @@ def query(self, expr, **kwargs): expr : string The query string to evaluate. The result of the evaluation of this expression is passed to - :meth:`~pandas.core.frame.DataFrame.__getitem__`. + :meth:`~pandas.DataFrame.__getitem__`. kwargs : dict - See the documentation for :func:`~pandas.computation.eval.eval` for - complete details on the keyword arguments accepted by - :meth:`~pandas.core.frame.DataFrame.query`. + See the documentation for :func:`~pandas.eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.query`. Returns ------- @@ -1924,30 +1925,32 @@ def query(self, expr, **kwargs): Notes ----- - This method uses the top-level :func:`~pandas.computation.eval.eval` - function to evaluate the passed query. + This method uses the top-level :func:`~pandas.eval` function to + evaluate the passed query. - The :meth:`~pandas.core.frame.DataFrame.query` method uses a slightly + The :meth:`~pandas.DataFrame.query` method uses a slightly modified Python syntax by default. For example, the ``&`` and ``|`` (bitwise) operators have the precedence of their boolean cousins, - ``and`` and ``or``. This *is* syntactically valid Python, however the - semantics are different. + :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, + however the semantics are different. 
You can use a syntax that is semantically identical to Python by - passing the keyword argument ``parser='numexpr'``. + passing the keyword argument ``parser='python'``. - The ``index`` of the :class:`~pandas.core.frame.DataFrame` instance is - placed in the namespace by default, which allows you to treat the index - as a column in the frame. The identifier ``index`` is used for this - variable, and you can also use the name of the index to identify it in - a query. + The :attr:`~pandas.DataFrame.index` and + :attr:`~pandas.DataFrame.columns` attributes of the + :class:`~pandas.DataFrame` instance are placed in the namespace by + default, which allows you to treat both the index and columns of the + frame as a column in the frame. + The identifier ``index`` is used for this variable, and you can also + use the name of the index to identify it in a query. Raises ------ NameError - * if not all identifiers in the query can be found + * If not all identifiers in the query can be found SyntaxError - * if a syntactically *invalid* Python expression is passed + * If a syntactically invalid Python expression is passed Examples -------- @@ -1986,9 +1989,9 @@ def query(self, expr, **kwargs): See Also -------- - pandas.computation.eval.eval + pandas.eval """ - resolvers = kwargs.get('resolvers', None) + resolvers = kwargs.pop('resolvers', None) if resolvers is None: index_resolvers = {} if self.index.name is not None: @@ -1996,7 +1999,7 @@ def query(self, expr, **kwargs): index_resolvers.update({'index': self.index, 'columns': self.columns}) resolvers = [self, index_resolvers] - kwargs.update({'resolvers': resolvers}) + kwargs['local_dict'] = _ensure_scope(resolvers=resolvers, **kwargs) return self[_eval(expr, **kwargs)] def _slice(self, slobj, axis=0, raise_on_error=False): diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py index 34b2811876f30..1cffccea2289f 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ -277,7 +277,7 @@
def setUpClass(cls): except ImportError: raise nose.SkipTest - with assert_produces_warning(): + with assert_produces_warning(FutureWarning): cls.aapl = web.Options('aapl') today = datetime.today() diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 88173c001fa7e..dfcbf0a984dab 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -2126,6 +2126,28 @@ def test_terms(self): for t in terms: store.select('p4d', t) + def test_same_name_scoping(self): + + with ensure_clean(self.path) as store: + + import pandas as pd + df = DataFrame(np.random.randn(20, 2),index=pd.date_range('20130101',periods=20)) + store.put('df', df, table=True) + expected = df[df.index>pd.Timestamp('20130105')] + + import datetime + result = store.select('df','index>datetime.datetime(2013,1,5)') + assert_frame_equal(result,expected) + + from datetime import datetime + + # technically an error, but allow it + result = store.select('df','index>datetime.datetime(2013,1,5)') + assert_frame_equal(result,expected) + + result = store.select('df','index>datetime(2013,1,5)') + assert_frame_equal(result,expected) + def test_series(self): s = tm.makeStringSeries() diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index d53d966c6598e..dccd7a8d14cac 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -8097,6 +8097,25 @@ def test_mask_edge_case_1xN_frame(self): expec = DataFrame([[nan, 2]]) assert_frame_equal(res, expec) + def test_query_expressions_correct_failure(self): + try: + import numexpr as ne + except ImportError: + raise nose.SkipTest("cannot query engine numexpr when numexpr not installed") + df = self.frame + exprs = 'and', 'or', 'not' + exprs += tuple(x + tm.rands(5) for x in exprs) + exprs += tuple(tm.rands(5) + x for x in exprs) + + for e in exprs: + self.assertRaises(KeyError, df.__getitem__, e) + + for e in (' and ', ' or ', ' not '): + self.assertRaises(SyntaxError, df.__getitem__, e) 
+ + x = tm.randbool(size=(self.frame.shape[0],)) + self.assertRaises(KeyError, df.__getitem__, 'x') + def test_query_expressions(self): try: import numexpr as ne diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a070fa7ca4216..461f5ab0f21c9 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -20,7 +20,7 @@ from numpy.random import randn, rand import numpy as np -from pandas.core.common import isnull, _is_sequence, is_list_like +from pandas.core.common import isnull, _is_sequence import pandas.core.index as index import pandas.core.series as series import pandas.core.frame as frame @@ -1138,7 +1138,7 @@ def handle_success(self, exc_type, exc_value, traceback): @contextmanager -def assert_produces_warning(expected_warning=None, filter_level="always"): +def assert_produces_warning(expected_warning=Warning, filter_level="always"): """ Context manager for running code that expects to raise (or not raise) warnings. Checks that code raises the expected warning and only the @@ -1164,25 +1164,19 @@ def assert_produces_warning(expected_warning=None, filter_level="always"): ..warn:: This is *not* thread-safe. """ - if expected_warning is None: - expected_warning = [Warning] - elif not is_list_like(expected_warning): - expected_warning = [expected_warning] with warnings.catch_warnings(record=True) as w: saw_warning = False warnings.simplefilter(filter_level) yield w extra_warnings = [] for actual_warning in w: - if (expected_warning and any(issubclass(actual_warning.category, - ew) for ew in - expected_warning)): + if (expected_warning and issubclass(actual_warning.category, + expected_warning)): saw_warning = True else: extra_warnings.append(actual_warning.category.__name__) if expected_warning: - msg = ', '.join(ew.__name__ for ew in expected_warning) - assert saw_warning, ("Did not see expected warning(s) of " - "class(es): %s." % msg) + assert saw_warning, ("Did not see expected warning of class %r." 
+ % expected_warning.__name__) assert not extra_warnings, ("Caused unexpected warning(s): %r." % extra_warnings) From 4cbfdc25bd4d71c4debc02aaa6425d37ccdfba94 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 24 Jul 2013 20:56:53 -0400 Subject: [PATCH 07/16] ENH: add subscripting to eval expressions --- doc/source/indexing.rst | 119 ++++++++++++- pandas/computation/align.py | 20 +-- pandas/computation/engines.py | 49 ++++-- pandas/computation/eval.py | 1 + pandas/computation/expr.py | 186 +++++++++++++------- pandas/computation/ops.py | 15 +- pandas/computation/pytables.py | 27 +++ pandas/computation/tests/test_eval.py | 241 ++++++++++++++++++++++---- pandas/core/common.py | 10 +- pandas/core/frame.py | 46 +---- pandas/util/testing.py | 3 +- vb_suite/eval.py | 99 +++++++++-- 12 files changed, 623 insertions(+), 193 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 47bf5fe29dc86..9f68934f658d8 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1008,14 +1008,121 @@ convert to an integer index: .. _indexing.query: -The ``query`` Method -~~~~~~~~~~~~~~~~~~~~ -New in pandas v0.13, :class:`~pandas.core.frame.DataFrame` objects have a -:meth:`~pandas.core.frame.DataFrame.query` method that allows selection using a -string consisting of columns of the calling -:class:`~pandas.core.frame.DataFrame`. +.. versionadded:: 0.13 + +The :meth:`~pandas.DataFrame.query` Method +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +:class:`~pandas.DataFrame` objects have a :meth:`~pandas.DataFrame.query` +method that allows selection using a string consisting of columns of the +calling :class:`~pandas.DataFrame`. + +You can get the value of the frame where column ``b`` has values +between the values of columns ``a`` and ``c``. + +.. ipython:: python + :suppress: + + from numpy.random import randint, rand + +.. 
ipython:: python + + n = 20 + df = DataFrame(rand(n, 3), columns=list('abc')) + df + df[(df.a < df.b) & (df.b < df.c)] + df.query('(a < b) & (b < c)') + +Do the same thing but fall back on a named index if there is no column +with the name ``a``. + +.. ipython:: python + + index = Index(np.arange(n), name='a') + df = DataFrame(randint(n, size=(n, 2)), index=index, columns=list('bc')) + df + df.query('a < b and b < c') + +A use case for :meth:`~pandas.DataFrame.query` is when you have a collection of +:class:`~pandas.DataFrame` s that have a subset of column names (or index +names) in common. You can pass the same query to both frames *without* having +to specify which frame you're interested in querying. + +.. ipython:: python + + df2 = DataFrame(randint(n + 10, size=(n + 10, 3)), columns=list('abc')) + df2 + expr = 'a < b & b < c' + map(lambda frame: frame.query(expr), [df, df2]) + +A chained comparison would also work in this situation, yielding slightly +cleaner syntax. + +.. ipython:: python + + expr = 'a < b < c' + map(lambda frame: frame.query(expr), [df, df2]) + +One neat feature of :meth:`~pandas.DataFrame.query` is that you can pass an +expression ``expr`` into ``df[]``, e.g., ``df[expr]``. + +This functionality can of course be combined with a slightly modified and more +readable Python syntax implemented in the workhorse function that underlies +:meth:`~pandas.DataFrame.query`--:func:`~pandas.eval`. + +Full numpy-like syntax + +.. ipython:: python + + df['(a < b) & (b < c)'] + +Slightly nicer by removing the parentheses + +.. ipython:: python + + df['a < b & b < c'] + +Use English instead of symbols + +.. ipython:: python + + df['a < b and b < c'] + +Pretty close to how you might write it on paper + +.. ipython:: python + + df['a < b < c'] + +As you can see, these are all equivalent ways to express the same operation (in +fact, they are all ultimately parsed into something very similar to the first +example of the indexing syntax above).
+ +You can also negate boolean expressions with the word ``not`` or the ``~`` +operator. + +.. ipython:: python + + df = DataFrame(rand(n, 3), columns=list('abc')) + df['bools'] = rand(len(df)) > 0.5 + df['~bools'] + df['not bools'] + df['not bools'] == df['~bools'] + df['not bools'] == df[~df.bools] + +Of course, expressions can be arbitrarily complex too + +.. ipython:: python + + # nice short query syntax + pretty = df['a < b < c and (not bools) or bools > 2'] + + # equivalent in pure Python, yuck! + yuck = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] + pretty + yuck + yuck == pretty .. _indexing.class: diff --git a/pandas/computation/align.py b/pandas/computation/align.py index fcc1dd1b0334f..5f81fcd60432e 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -42,13 +42,12 @@ def _maybe_promote_shape(values, naxes): if ndims == naxes: return values - ndim = set(xrange(ndims)) - nax = set(xrange(naxes)) + ndim, nax = range(ndims), range(naxes) axes_slice = [slice(None)] * naxes - # symmetric difference of numaxes and ndims - slices = nax - ndim + # set difference of numaxes and ndims + slices = com.difference(nax, ndim) if ndims == naxes: if slices: @@ -78,7 +77,7 @@ def wrapper(terms): return _align_core_single_unary_op(terms[0]) term_values = (term.value for term in terms) - # only scalars + # only scalars or indexes if all(isinstance(term.value, pd.Index) or term.isscalar for term in terms): return np.result_type(*term_values), None @@ -88,7 +87,7 @@ def wrapper(terms): if all_has_size and all(term.value.size == 1 for term in terms): return np.result_type(*term_values), None - # no pandas so just punt to the evaluator + # no pandas objects if not _any_pandas_objects(terms): return np.result_type(*term_values), None @@ -101,7 +100,7 @@ def _align_core(terms): term_index = [i for i, term in enumerate(terms) if hasattr(term.value, 'axes')] term_dims = [terms[i].value.ndim for i in term_index] - ndims = 
pd.Series(dict(zip(term_index, term_dims))) + ndims = pd.Series(dict(izip(term_index, term_dims))) # initial axes are the axes of the largest-axis'd term biggest = terms[ndims.idxmax()].value @@ -115,7 +114,8 @@ def _align_core(terms): ax, itm = naxes - 1, term.value.index else: ax, itm = axis, items - axes[ax] = axes[ax].join(itm, how='outer') + if not axes[ax].equals(itm): + axes[ax] = axes[ax].join(itm, how='outer') for i, ndim in ndims.iteritems(): for axis, items in izip(xrange(ndim), axes): @@ -163,10 +163,10 @@ def _align_core(terms): def _filter_terms(flat): # numeric literals - literals = set(filter(is_const, flat)) + literals = frozenset(filter(is_const, flat)) # these are strings which are variable names - names = set(flat) - literals + names = frozenset(flat) - literals # literals are not names and names are not literals, so intersection should # be empty diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index d4f23324b672f..cd161352b97e0 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -1,10 +1,11 @@ import abc +from pandas.core import common as com from pandas.computation.align import _align, _reconstruct_object class AbstractEngine(object): - """""" + """AbstractEngine object serving as a base class for all engines.""" __metaclass__ = abc.ABCMeta has_neg_frac = False @@ -14,12 +15,24 @@ def __init__(self, expr): self.aligned_axes = None self.result_type = None - @abc.abstractmethod def convert(self): - """Convert an expression for evaluation.""" - pass + """Convert an expression for evaluation. + + Defaults to return the expression as a string. + """ + return com.pprint_thing(self.expr) def evaluate(self): + """Run the engine on the expression + + This method performs alignment which is necessary no matter what engine + is being used, thus its implementation is in the base class. + + Returns + ------- + obj : object + The result of the passed expression. 
+ """ if not self._is_aligned: self.result_type, self.aligned_axes = _align(self.expr.terms) @@ -33,7 +46,18 @@ def _is_aligned(self): @abc.abstractmethod def _evaluate(self, env): - """Return an evaluated expression.""" + """Return an evaluated expression. + + Parameters + ---------- + env : Scope + The local and global environment in which to evaluate an + expression. + + Notes + ----- + This method must be implemented by any class the subclasses this class. + """ pass @@ -44,15 +68,12 @@ class NumExprEngine(AbstractEngine): def __init__(self, expr): super(NumExprEngine, self).__init__(expr) - def convert(self): - """Return a string""" - return '%s' % self.expr - def _evaluate(self, env): import numexpr as ne try: - return ne.evaluate(self.convert(), local_dict=env.locals, + s = self.convert() + return ne.evaluate(s, local_dict=env.locals, global_dict=env.globals, truediv=self.expr.truediv) except KeyError as e: @@ -60,15 +81,15 @@ def _evaluate(self, env): class PythonEngine(AbstractEngine): - """Use NumPy even if numexpr is installed""" + """Evaluate an expression in Python space. + + Mostly for testing purposes. + """ has_neg_frac = False def __init__(self, expr): super(PythonEngine, self).__init__(expr) - def convert(self): - pass - def evaluate(self): return self.expr(self.expr.env) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index abd4785cb5ea3..072bd3feb3a59 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -27,6 +27,7 @@ def _check_parser(parser): raise KeyError('Invalid parser {0!r} passed, valid parsers are' ' {1}'.format(parser, _parsers.keys())) + def eval(expr, parser='pandas', engine='numexpr', truediv=True, local_dict=None, global_dict=None, resolvers=None): """Evaluate a Python expression as a string using various backends. 
diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index abcf8cc38d88c..f968a6d3fa000 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -4,18 +4,18 @@ import inspect import itertools import tokenize +import datetime + from cStringIO import StringIO from functools import partial +import pandas as pd from pandas.core.base import StringMixin from pandas.core import common as com -from pandas.computation.ops import (BinOp, UnaryOp, _reductions, _mathops, - _cmp_ops_syms, _bool_ops_syms, - _arith_ops_syms, _unary_ops_syms, Term, - Constant) - -import pandas.lib as lib -import datetime +from pandas.computation.ops import (_cmp_ops_syms, _bool_ops_syms, + _arith_ops_syms, _unary_ops_syms) +from pandas.computation.ops import _reductions, _mathops +from pandas.computation.ops import BinOp, UnaryOp, Term, Constant def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None, @@ -25,49 +25,71 @@ def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None, class Scope(StringMixin): + """Object to hold scope, with a few bells to deal with some custom syntax + added by pandas. 
+ + Parameters + ---------- + gbls : dict or None, optional, default None + lcls : dict or Scope or None, optional, default None + level : int, optional, default 1 + resolvers : list-like or None, optional, default None + + Attributes + ---------- + globals : dict + locals : dict + level : int + resolvers : tuple + resolver_keys : frozenset + """ __slots__ = ('globals', 'locals', 'resolvers', '_global_resolvers', 'resolver_keys', '_resolver', 'level') def __init__(self, gbls=None, lcls=None, level=1, resolvers=None): self.level = level - self.resolvers = resolvers or [] + self.resolvers = tuple(resolvers or []) self.globals = dict() self.locals = dict() + self.ntemps = 0 # number of temporary variables in this scope if isinstance(lcls, Scope): ld, lcls = lcls, dict() - self.locals.update(ld.locals) - self.globals.update(ld.globals) - self.resolvers.extend(ld.resolvers) + self.locals.update(ld.locals.copy()) + self.globals.update(ld.globals.copy()) + self.resolvers += ld.resolvers self.update(ld.level) frame = sys._getframe(level) try: - self.globals.update(gbls or frame.f_globals.copy()) - self.locals.update(lcls or frame.f_locals.copy()) + self.globals.update(gbls or frame.f_globals) + self.locals.update(lcls or frame.f_locals) finally: del frame # add some useful defaults - self.globals['Timestamp'] = lib.Timestamp + self.globals['Timestamp'] = pd.lib.Timestamp self.globals['datetime'] = datetime # SUCH a hack self.globals['True'] = True self.globals['False'] = False - self.resolver_keys = set(reduce(operator.add, (list(o.keys()) for o in - self.resolvers), [])) - self._global_resolvers = self.resolvers + [self.locals, self.globals] + self.resolver_keys = frozenset(reduce(operator.add, (list(o.keys()) for + o in + self.resolvers), + [])) + self._global_resolvers = self.resolvers + (self.locals, self.globals) self._resolver = None def __unicode__(self): - return "locals: {0}\nglobals: {0}\nresolvers: {0}".format(self.locals.keys(), - self.globals.keys(), - 
self.resolver_keys) + return com.pprint_thing("locals: {0}\nglobals: {0}\nresolvers: " + "{0}".format(self.locals.keys(), + self.globals.keys(), + self.resolver_keys)) def __getitem__(self, key): - return self.locals.get(key,self.globals[key]) + return self.resolver(key) @property def resolver(self): @@ -83,9 +105,14 @@ def resolve_key(key): return self._resolver def update(self, level=None): + """Update the current scope by going back `level` levels. + Parameters + ---------- + level : int or None, optional, default None + """ # we are always 2 levels below the caller - # plus the caller maybe below the env level + # plus the caller may be below the env level # in which case we need addtl levels sl = 2 if level is not None: @@ -105,8 +132,38 @@ def update(self, level=None): self.locals.update(f.f_locals) self.globals.update(f.f_globals) finally: - del frame - del frames + del frame, frames + + def add_tmp(self, value, where='locals'): + """Add a temporary variable to the scope. + + Parameters + ---------- + value : object + An arbitrary object to be assigned to a temporary variable. + where : basestring, optional, default 'locals', {'locals', 'globals'} + What scope to add the value to. + + Returns + ------- + name : basestring + The name of the temporary variable created. 
+ """ + d = getattr(self, where, None) + + if d is None: + raise AttributeError("Cannot add value to non-existent scope " + "{0!r}".format(where)) + if not isinstance(d, dict): + raise TypeError("Cannot add value to object of type {0!r}, " + "scope must be a dictionary" + "".format(d.__class__.__name__)) + name = 'tmp_var_{0}_{1}'.format(self.ntemps, pd.util.testing.rands(10)) + d[name] = value + + # only increment if the variable gets put in the scope + self.ntemps += 1 + return name def _rewrite_assign(source): @@ -117,21 +174,12 @@ def _rewrite_assign(source): return tokenize.untokenize(res) -def _parenthesize_booleans(source, ops='|&'): - res = source - for op in ops: - terms = res.split(op) - - t = [] - for term in terms: - t.append('({0})'.format(term)) - - res = op.join(t) - return res +def _replace_booleans(source): + return source.replace('|', ' or ').replace('&', ' and ') def _preparse(source): - return _parenthesize_booleans(_rewrite_assign(source)) + return _replace_booleans(_rewrite_assign(source)) @@ -168,16 +216,15 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): _hacked_nodes = frozenset(['Assign', 'Module', 'Expr']) +_unsupported_expr_nodes = frozenset(['Yield', 'GeneratorExp', 'IfExp', + 'DictComp', 'SetComp', 'Repr', 'Lambda', + 'Set', 'In', 'NotIn', 'AST', 'Is', + 'IsNot']) + # these nodes are low priority or won't ever be supported (e.g., AST) _unsupported_nodes = ((_stmt_nodes | _mod_nodes | _handler_nodes | _arguments_nodes | _keyword_nodes | _alias_nodes | - _expr_context_nodes | frozenset(['Yield', - 'GeneratorExp', - 'IfExp', 'DictComp', - 'SetComp', 'Repr', - 'Lambda', 'Set', 'In', - 'NotIn', 'AST', 'Is', - 'IsNot'])) - + _expr_context_nodes | _unsupported_expr_nodes) - _hacked_nodes) # we're adding a different assignment in some cases to be equality comparison @@ -249,7 +296,7 @@ def __init__(self, env, preparser=_preparse): self.preparser = preparser def visit(self, node, **kwargs): - parse = lambda x: 
ast.fix_missing_locations(ast.parse(x)) + parse = ast.parse if isinstance(node, basestring): clean = self.preparser(node) elif isinstance(node, ast.AST): @@ -301,15 +348,23 @@ def visit_Index(self, node, **kwargs): return self.visit(node.value) def visit_Subscript(self, node, **kwargs): - """ df.index[4:6] """ value = self.visit(node.value) slobj = self.visit(node.slice) - + expr = com.pprint_thing(slobj) + result = pd.eval(expr, local_dict=self.env.locals, + global_dict=self.env.globals, + resolvers=self.env.resolvers) try: - return Constant(value[slobj], self.env) - except TypeError: - raise ValueError("cannot subscript [{0}] with " - "[{1}]".format(value, slobj)) + # a Term instance + v = value.value[result] + except AttributeError: + # an Op instance + lhs = pd.eval(com.pprint_thing(value), local_dict=self.env.locals, + global_dict=self.env.globals, + resolvers=self.env.resolvers) + v = lhs[result] + name = self.env.add_tmp(v) + return Term(name, env=self.env) def visit_Slice(self, node, **kwargs): """ df.index[slice(4,6)] """ @@ -334,14 +389,15 @@ def visit_Attribute(self, node, **kwargs): attr = node.attr value = node.value - ctx = node.ctx.__class__ - if ctx == ast.Load: + ctx = node.ctx + if isinstance(ctx, ast.Load): # resolve the value resolved = self.visit(value).value try: - return getattr(resolved, attr) - except (AttributeError): - + v = getattr(resolved, attr) + name = self.env.add_tmp(v) + return Term(name, self.env) + except AttributeError: # something like datetime.datetime where scope is overriden if isinstance(value, ast.Name) and value.id == attr: return resolved @@ -412,20 +468,22 @@ def visitor(x, y): return reduce(visitor, operands) -_python_not_supported = frozenset(['Assign', 'Str', 'Slice', 'Index', - 'Subscript', 'Tuple', 'List', 'Dict', - 'Call']) +_python_not_supported = frozenset(['Assign', 'Str', 'Tuple', 'List', 'Dict', + 'Call', 'BoolOp']) _numexpr_supported_calls = frozenset(_reductions + _mathops) -@disallow((_unsupported_nodes | 
_python_not_supported) - _boolop_nodes) + +@disallow((_unsupported_nodes | _python_not_supported) - + (_boolop_nodes | frozenset(['BoolOp', 'Attribute']))) class PandasExprVisitor(BaseExprVisitor): - def __init__(self, env, preparser=_preparse): + def __init__(self, env, preparser=_replace_booleans): super(PandasExprVisitor, self).__init__(env, preparser) -@disallow(_unsupported_nodes | _python_not_supported) +@disallow(_unsupported_nodes | _python_not_supported | frozenset(['Not'])) class PythonExprVisitor(BaseExprVisitor): - pass + def __init__(self, env, preparser=lambda x: x): + super(PythonExprVisitor, self).__init__(env, preparser=preparser) class Expr(StringMixin): @@ -471,8 +529,8 @@ def maybe_expression(s, kind='pandas'): ops = visitor.binary_ops + visitor.unary_ops filtered = frozenset(ops) - _needs_filter # make sure we have an op at least - return any(op in s or ' and ' in s or ' or ' in s or ' not ' in s - for op in filtered) + return any(op in s or ' and ' in s or ' or ' in s or 'not ' in s for op in + filtered) def isexpr(s, check_names=True): diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 9a1a96cec30d1..d1c8484cbb997 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -93,6 +93,12 @@ def type(self): return_type = type + @property + def raw(self): + return com.pprint_thing('{0}(name={1!r}, type={2})' + ''.format(self.__class__.__name__, self.name, + self.type)) + class Constant(Term): def __init__(self, value, env): @@ -103,7 +109,7 @@ def _resolve_name(self): def _print_operand(opr): - return opr.name if is_term(opr) else unicode(opr) + return opr.name if is_term(opr) else com.pprint_thing(opr) def _get_op(op): @@ -135,6 +141,12 @@ def return_type(self): return np.bool_ return np.result_type(*(term.type for term in com.flatten(self))) + @property + def raw(self): + parened = ('{0}({1!r}, {2})'.format(self.__class__.__name__, self.op, + ', '.join('{0}'.format(opr.raw) for + opr in self.operands))) + 
return parened _cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=' _cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne @@ -279,4 +291,3 @@ def __call__(self, env): def __unicode__(self): return com.pprint_thing('{0}({1})'.format(self.op, self.operand)) - diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 2819a63f3706e..7a4ab6e1b6004 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -365,6 +365,33 @@ def visit_USub(self, node, **kwargs): def visit_Index(self, node, **kwargs): return self.visit(node.value).value + def visit_Subscript(self, node, **kwargs): + value = self.visit(node.value) + slobj = self.visit(node.slice) + try: + return Constant(value[slobj], self.env) + except TypeError: + raise ValueError("cannot subscript {0!r} with " + "{1!r}".format(value, slobj)) + + def visit_Attribute(self, node, **kwargs): + attr = node.attr + value = node.value + + ctx = node.ctx.__class__ + if ctx == ast.Load: + # resolve the value + resolved = self.visit(value).value + try: + return getattr(resolved, attr) + except AttributeError: + + # something like datetime.datetime where scope is overriden + if isinstance(value, ast.Name) and value.id == attr: + return resolved + + raise ValueError("Invalid Attribute context {0}".format(ctx.__name__)) + class Expr(expr.Expr): diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 7a2704a321ec9..1881b4716d766 100755 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -7,8 +7,7 @@ import ast import nose -from nose.tools import assert_raises, assert_tuple_equal -from nose.tools import assert_true, assert_false, assert_equal +from nose.tools import assert_raises, assert_true, assert_false, assert_equal from numpy.random import randn, rand import numpy as np @@ -29,7 +28,7 @@ from pandas.computation.expressions import _USE_NUMEXPR from pandas.util.testing import (assert_frame_equal, randbool, 
assertRaisesRegexp, - assert_produces_warning) + assert_produces_warning, assert_series_equal) from pandas.util.py3compat import PY3 @@ -77,8 +76,7 @@ def wrapper(self, lhs, arith1, rhs, *args, **kwargs): return wrapper -_good_arith_ops = tuple(set(_arith_ops_syms) - - set(_special_case_arith_ops_syms)) +_good_arith_ops = com.difference(_arith_ops_syms, _special_case_arith_ops_syms) class TestEvalPandas(unittest.TestCase): @@ -160,6 +158,7 @@ def test_floor_division(self): for lhs, rhs in product(self.lhses, self.rhses): self.check_floor_division(lhs, '//', rhs) + @slow def test_pow(self): for lhs, rhs in product(self.lhses, self.rhses): self.check_pow(lhs, '**', rhs) @@ -391,11 +390,11 @@ class TestAlignment(unittest.TestCase): @classmethod def setUpClass(cls): - cls.INDEX_TYPES = 'i', 'f', 's', 'u', 'dt', # 'p' + cls.index_types = 'i', 'f', 's', 'u', 'dt', # 'p' @classmethod def tearDownClass(cls): - del cls.INDEX_TYPES + del cls.index_types def check_align_nested_unary_op(self, engine): skip_numexpr_engine(engine) @@ -408,15 +407,35 @@ def test_align_nested_unary_op(self): for engine in _engines: self.check_align_nested_unary_op(engine) - def check_basic_frame_alignment(self, engine): - df = mkdf(10, 10, data_gen_f=f) - df2 = mkdf(20, 10, data_gen_f=f) + def check_basic_frame_alignment(self, engine, r_idx_type, c_idx_type): + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) res = pd.eval('df + df2', engine=engine) assert_frame_equal(res, df + df2) + @slow def test_basic_frame_alignment(self): - for engine in _engines: - self.check_basic_frame_alignment(engine) + args = product(_engines, self.index_types, self.index_types) + for engine, r, c in args: + self.check_basic_frame_alignment(engine, r, c) + + def check_frame_comparison(self, engine, r_idx_type, c_idx_type): + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + 
res = pd.eval('df < 2', engine=engine) + assert_frame_equal(res, df < 2) + + df3 = DataFrame(randn(*df.shape), index=df.index, columns=df.columns) + res = pd.eval('df < df3', engine=engine) + assert_frame_equal(res, df < df3) + + @slow + def test_frame_comparison(self): + args = product(_engines, self.index_types, self.index_types) + for engine, r, c in args: + self.check_frame_comparison(engine, r, c) def check_medium_complex_frame_alignment(self, engine, r1, r2, c1, c2): skip_numexpr_engine(engine) @@ -428,7 +447,7 @@ def check_medium_complex_frame_alignment(self, engine, r1, r2, c1, c2): @slow def test_medium_complex_frame_alignment(self): - args = product(_engines, *([self.INDEX_TYPES[:4]] * 4)) + args = product(_engines, *([self.index_types] * 4)) for engine, r1, r2, c1, c2 in args: self.check_medium_complex_frame_alignment(engine, r1, r2, c1, c2) @@ -455,7 +474,7 @@ def testit(): @slow def test_basic_frame_series_alignment(self): - args = product(_engines, self.INDEX_TYPES, self.INDEX_TYPES, + args = product(_engines, self.index_types, self.index_types, ('index', 'columns')) for engine, r_idx_type, c_idx_type, index_name in args: self.check_basic_frame_series_alignment(engine, r_idx_type, @@ -484,7 +503,7 @@ def testit(): @slow def test_basic_series_frame_alignment(self): - args = product(_engines, self.INDEX_TYPES, self.INDEX_TYPES, + args = product(_engines, self.index_types, self.index_types, ('index', 'columns')) for engine, r_idx_type, c_idx_type, index_name in args: self.check_basic_series_frame_alignment(engine, r_idx_type, @@ -509,7 +528,7 @@ def check_series_frame_commutativity(self, engine, r_idx_type, c_idx_type, @slow def test_series_frame_commutativity(self): - args = product(_engines, self.INDEX_TYPES, self.INDEX_TYPES, ('+', + args = product(_engines, self.index_types, self.index_types, ('+', '*'), ('index', 'columns')) for engine, r_idx_type, c_idx_type, op, index_name in args: @@ -519,27 +538,41 @@ def test_series_frame_commutativity(self): 
def check_complex_series_frame_alignment(self, engine, index_name, obj, r1, r2, c1, c2): skip_numexpr_engine(engine) - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) - df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + df = mkdf(10, 5, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(20, 5, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) index = getattr(locals()[obj], index_name) s = Series(np.random.randn(5), index[:5]) - if engine != 'python': - expected = df2.add(s, axis=1).add(df) + + if r2 == 'dt' or c2 == 'dt': + if engine == 'numexpr': + expected2 = df2.add(s) + else: + expected2 = df2 + s else: - expected = df2 + s + df + expected2 = df2 + s + + if r1 == 'dt' or c1 == 'dt': + if engine == 'numexpr': + expected = expected2.add(df) + else: + expected = expected2 + df + else: + expected = expected2 + df + res = pd.eval('df2 + s + df', engine=engine) - expected = df2 + s + df - assert_tuple_equal(res.shape, expected.shape) + self.assertEqual(res.shape, expected.shape) assert_frame_equal(res, expected) @slow def test_complex_series_frame_alignment(self): + index_types = [self.index_types] * 4 args = product(_engines, ('index', 'columns'), ('df', 'df2'), - *([self.INDEX_TYPES[:4]] * 4)) + *index_types) for engine, index_name, obj, r1, r2, c1, c2 in args: self.check_complex_series_frame_alignment(engine, index_name, obj, r1, r2, c1, c2) + @slow def test_performance_warning_for_asenine_alignment(self): df = DataFrame(randn(1000, 10)) s = Series(randn(10000)) @@ -555,6 +588,7 @@ def test_performance_warning_for_asenine_alignment(self): with assert_produces_warning(False): pd.eval('df + s') + class TestOperations(unittest.TestCase): def check_simple_arith_ops(self, engine): @@ -659,6 +693,131 @@ def check_truediv(self, engine): res = pd.eval(ex, truediv=True) assert_array_equal(res, np.array([1.0])) + def test_python_fails_and(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NotImplementedError, pd.eval, 'df > 2 and 
df > 3', + local_dict={'df': df}, parser='python') + + def test_python_fails_or(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NotImplementedError, pd.eval, 'df > 2 or df > 3', + local_dict={'df': df}, parser='python') + + def test_python_fails_not(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NotImplementedError, pd.eval, 'not df > 2', + local_dict={'df': df}, parser='python') + + def test_python_fails_ampersand(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(TypeError, pd.eval, + '(df + 2)[df > 1] > 0 & (df > 0)', + local_dict={'df': df}, parser='python') + + def test_python_fails_pipe(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(TypeError, pd.eval, + '(df + 2)[df > 1] > 0 | (df > 0)', + local_dict={'df': df}, parser='python') + + def check_failing_subscript_with_name_error(self, engine): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NameError, pd.eval, 'df[x > 2] > 2', + local_dict={'df': df}, engine=engine) + + def test_failing_subscript_with_name_error(self): + for engine in _engines: + self.check_failing_subscript_with_name_error(engine) + + def check_lhs_expression_subscript(self, engine): + df = DataFrame(np.random.randn(5, 3)) + result = pd.eval('(df + 1)[df > 2]', engine=engine) + expected = (df + 1)[df > 2] + assert_frame_equal(result, expected) + + def test_lhs_expression_subscript(self): + for engine in _engines: + self.check_lhs_expression_subscript(engine) + + def check_attr_expression(self, engine): + df = DataFrame(np.random.randn(5, 3), columns=list('abc')) + expr1 = 'df.a < df.b' + expec1 = df.a < df.b + expr2 = 'df.a + df.b + df.c' + expec2 = df.a + df.b + df.c + expr3 = 'df.a + df.b + df.c[df.b < 0]' + expec3 = df.a + df.b + df.c[df.b < 0] + exprs = expr1, expr2, expr3 + expecs = expec1, expec2, expec3 + for e, expec in zip(exprs, expecs): + assert_series_equal(expec, pd.eval(e, engine=engine)) + + def test_attr_expression(self): + for engine in 
_engines: + self.check_attr_expression(engine) + + def check_assignment_fails(self, engine, parser): + df = DataFrame(np.random.randn(5, 3), columns=list('abc')) + df2 = DataFrame(np.random.randn(5, 3)) + expr1 = 'df = df2' + self.assertRaises(NotImplementedError, pd.eval, expr1, + local_dict={'df': df, 'df2': df2}, engine=engine, + parser=parser) + + def test_assignment_fails(self): + for engine, parser in product(_engines.iterkeys(), ('pandas', + 'python')): + self.check_assignment_fails(engine, parser) + + def check_basic_period_index_boolean_expression(self, engine): + df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') + + e = df < 2 + r = pd.eval('df < 2', engine=engine) + x = df < 2 + + assert_frame_equal(r, e) + assert_frame_equal(x, e) + + def test_basic_period_index_expression_python(self): + for engine in _engines: + self.check_basic_period_index_boolean_expression(engine) + + def check_basic_period_index_subscript_expression(self, engine): + df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') + r = pd.eval('df[df < 2 + 3]', engine=engine) + e = df[df < 2 + 3] + assert_frame_equal(r, e) + + def test_basic_period_index_subscript_expression(self): + for engine in _engines: + self.check_basic_period_index_subscript_expression(engine) + + def check_nested_period_index_subscript_expression(self, engine): + df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') + r = pd.eval('df[df[df < 2] < 2] + df * 2', engine=engine) + e = df[df[df < 2] < 2] + df * 2 + assert_frame_equal(r, e) + + def test_nested_period_index_subscript_expression(self): + for engine in _engines: + self.check_nested_period_index_subscript_expression(engine) + + def test_simple_not_expression(self): + df = DataFrame(randn(10, 3), columns=list('abc')) + df['bools'] = rand(len(df)) > 0.5 + res = df['not bools'] + res2 = df['~bools'] + expec = df[~df.bools] + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + def test_complex_boolean_expression(self): 
+ df = DataFrame(randn(10, 3), columns=list('abc')) + df['bools'] = rand(len(df)) > 0.5 + res = df['a < b < c and (not bools) or bools > 2'] + expec = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] + assert_frame_equal(res, expec) _var_s = randn(10) @@ -695,28 +854,36 @@ def test_no_new_globals(self): for engine in _engines: self.check_no_new_globals(engine) - def test_nested_scope(self): + def check_nested_scope(self, engine): + # smoke test x = 1 - result = pd.eval('x + 1') + result = pd.eval('x + 1', engine=engine) self.assertEqual(result, 2) - df = DataFrame(np.random.randn(2000, 10)) - df2 = DataFrame(np.random.randn(2000, 10)) + df = DataFrame(np.random.randn(5, 3)) + df2 = DataFrame(np.random.randn(5, 3)) expected = df[(df>0) & (df2>0)] result = df['(df>0) & (df2>0)'] - assert_frame_equal(result,expected) + assert_frame_equal(result, expected) + + result = df.query('(df>0) & (df2>0)', engine=engine) + assert_frame_equal(result, expected) + + result = pd.eval('df[(df > 0) and (df2 > 0)]', engine=engine) + assert_frame_equal(result, expected) - result = df.query('(df>0) & (df2>0)') - assert_frame_equal(result,expected) + result = pd.eval('df[(df > 0) and (df2 > 0) and df[df > 0] > 0]', engine=engine) + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] + assert_frame_equal(result, expected) - ##### this fails #### - #result = pd.eval('df[(df>0) & (df2>0)]') - #assert_frame_equal(result,expected) + result = pd.eval('df[(df>0) & (df2>0)]',engine=engine) + expected = df.query('(df>0) & (df2>0)', engine=engine) + assert_frame_equal(result, expected) - #### also fails #### - #self.assertRaises(NotImplementedError, pd.eval, - #'df[(df > 0) & (df2 > 0)]') + def test_nested_scope(self): + for engine in _engines: + self.check_nested_scope(engine) def test_invalid_engine(): diff --git a/pandas/core/common.py b/pandas/core/common.py index 74f41355cccd8..757d3eb6f1925 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -4,6 +4,7 @@ 
import re import collections +import numbers import codecs import csv import sys @@ -1668,7 +1669,7 @@ def is_bool(obj): def is_string(obj): - return isinstance(obj, (basestring, np.str_, np.unicode_)) + return isinstance(obj, basestring) def is_series(obj): @@ -1690,8 +1691,9 @@ def is_pd_obj(obj): def is_ndframe(obj): return isinstance(obj, pd.core.generic.NDFrame) + def is_integer(obj): - return isinstance(obj, (int, long, np.integer)) + return isinstance(obj, (numbers.Integral, np.integer)) def is_float(obj): @@ -1699,7 +1701,7 @@ def is_float(obj): def is_complex(obj): - return isinstance(obj, (complex, np.complexfloating)) + return isinstance(obj, (numbers.Complex, np.complexfloating)) def is_iterator(obj): @@ -1708,7 +1710,7 @@ def is_iterator(obj): def is_number(obj): - return isinstance(obj, (np.number, int, long, float, complex)) + return isinstance(obj, (numbers.Number, np.number)) def is_integer_dtype(arr_or_dtype): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dd92dfa235521..5f4c283d6fffc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1934,8 +1934,12 @@ def query(self, expr, **kwargs): :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, however the semantics are different. - You can use a syntax that is semantically identical to Python by - passing the keyword argument ``parser='python'``. + You can change the semantics of the expression by passing the keyword + argument ``parser='python'``. This enforces the same semantics as + evaluation in Python space. Likewise, you can pass ``engine='python'`` + to evaluate an expression using Python itself as a backend. This is not + recommended as it is inefficient compared to using ``numexpr`` as the + engine. 
The :attr:`~pandas.DataFrame.index` and :attr:`~pandas.DataFrame.columns` attributes of the @@ -1945,6 +1949,9 @@ def query(self, expr, **kwargs): The identifier ``index`` is used for this variable, and you can also use the name of the index to identify it in a query. + For further details and examples see the ``query`` documentation in + :ref:`indexing `. + Raises ------ NameError @@ -1952,41 +1959,6 @@ def query(self, expr, **kwargs): SyntaxError * If a syntactically invalid Python expression is passed - Examples - -------- - Get the value of the frame where column ``b`` has values between the - values of columns ``a`` and ``c``. - - >>> from pandas import DataFrame - >>> from numpy.random import randn - >>> df = DataFrame(randn(100, 3), columns=list('abc')) - >>> result = df.query('a < b & b < c') - - Do the same thing but fallback on a named index if there is no column - with the name ``a``. - - >>> from pandas import DataFrame, Index - >>> from numpy.random import randn - >>> n = 10 - >>> index = Index(randn(n), name='a') - >>> df = DataFrame(randn(n, 2), index=index, columns=list('bc')) - >>> result = df.query('a < b & b < c') - - A use case for :meth:`~pandas.core.frame.DataFrame.query` is when you - have a collection of :class:`~pandas.core.frame.DataFrame` s that have - a subset of column names in common. 
You can pass the same query to both - frames *without* having to specify which frame you're interested in - querying - - >>> from pandas import DataFrame, Index - >>> from numpy.random import randn - >>> n = 100 - >>> index = Index(randn(n), name='a') - >>> df = DataFrame(randn(n, 2), index=index, columns=list('bc')) - >>> df2 = DataFrame(randn(n + 10, 3)) - >>> expr = 'a < b & b < c' - >>> results = map(lambda frame: frame.query(expr), [df, df2]) - See Also -------- pandas.eval diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 461f5ab0f21c9..e82aef13723ba 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -14,7 +14,6 @@ from functools import wraps, partial from contextlib import contextmanager from httplib import HTTPException -from urllib2 import urlopen from distutils.version import LooseVersion from numpy.random import randn, rand @@ -36,7 +35,7 @@ from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex -from pandas.io.common import urlopen, HTTPException +from pandas.io.common import urlopen Index = index.Index MultiIndex = index.MultiIndex diff --git a/vb_suite/eval.py b/vb_suite/eval.py index c0c983862ea3c..c666cd431cbb4 100644 --- a/vb_suite/eval.py +++ b/vb_suite/eval.py @@ -1,7 +1,7 @@ from vbench.benchmark import Benchmark from datetime import datetime -setup = """from pandas_vb_common import * +common_setup = """from pandas_vb_common import * import pandas as pd df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) @@ -9,6 +9,11 @@ df4 = DataFrame(np.random.randn(20000, 100)) """ +setup = common_setup + """ +import pandas.computation.expressions as expr +expr.set_numexpr_threads(1) +""" + SECTION = 'Eval' #---------------------------------------------------------------------- @@ -16,34 +21,94 @@ #---------------------------------------------------------------------- # add - -frame_add_eval = \ - Benchmark("pd.eval('df + df2 + df3 + df4')", setup, 
name='frame_add_eval', +eval_frame_add_all_threads = \ + Benchmark("pd.eval('df + df2 + df3 + df4')", common_setup, + name='eval_frame_add_all_threads', start_date=datetime(2013, 7, 21)) -frame_add_python = \ - Benchmark("pd.eval('df + df2 + df3 + df4', engine='python')", setup, - name='frame_add_python', start_date=datetime(2013, 7, 21)) + +eval_frame_add_one_thread = \ + Benchmark("pd.eval('df + df2 + df3 + df4')", setup, + name='eval_frame_add_one_thread', + start_date=datetime(2013, 7, 26)) + +eval_frame_add_python = \ + Benchmark("pd.eval('df + df2 + df3 + df4', engine='python')", common_setup, + name='eval_frame_add_python', start_date=datetime(2013, 7, 21)) + +eval_frame_add_python_one_thread = \ + Benchmark("pd.eval('df + df2 + df3 + df4', engine='python')", setup, + name='eval_frame_add_python_one_thread', + start_date=datetime(2013, 7, 26)) #---------------------------------------------------------------------- # mult -frame_mult_eval = \ - Benchmark("pd.eval('df * df2 * df3 * df4')", setup, name='frame_mult_eval', +eval_frame_mult_all_threads = \ + Benchmark("pd.eval('df * df2 * df3 * df4')", common_setup, + name='eval_frame_mult_all_threads', start_date=datetime(2012, 7, 21)) -frame_mult_python = \ - Benchmark("pdl.eval('df * df2 * df3 * df4', engine='python')", setup, - name='frame_mult_python', start_date=datetime(2013, 7, 21)) +eval_frame_mult_one_thread = \ + Benchmark("pd.eval('df * df2 * df3 * df4')", setup, + name='eval_frame_mult_one_thread', + start_date=datetime(2012, 7, 26)) + +eval_frame_mult_python = \ + Benchmark("pdl.eval('df * df2 * df3 * df4', engine='python')", + common_setup, + name='eval_frame_mult_python', start_date=datetime(2013, 7, 21)) + +eval_frame_mult_python_one_thread = \ + Benchmark("pd.eval('df * df2 * df3 * df4', engine='python')", setup, + name='eval_frame_mult_python_one_thread', + start_date=datetime(2012, 7, 26)) #---------------------------------------------------------------------- # multi and -frame_and_eval = \ 
+eval_frame_and_all_threads = \ + Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')", + common_setup, + name='eval_frame_and_all_threads', + start_date=datetime(2012, 7, 21)) + +eval_frame_and_one_thread = \ Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)')", setup, - name='frame_and_eval', start_date=datetime(2012, 7, 21)) + name='eval_frame_and_one_thread', + start_date=datetime(2012, 7, 26)) -frame_and_python = \ - Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', " - "engine='python')", setup, name='frame_and_python', +setup = common_setup +eval_frame_and_python = \ + Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')", + common_setup, name='eval_frame_and_python', start_date=datetime(2013, 7, 21)) + +eval_frame_and_one_thread = \ + Benchmark("pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine='python')", + setup, + name='eval_frame_and_python_one_thread', + start_date=datetime(2012, 7, 26)) + +#-------------------------------------------------------------------- +# chained comp +eval_frame_chained_cmp_all_threads = \ + Benchmark("pd.eval('df < df2 < df3 < df4')", common_setup, + name='eval_frame_chained_cmp_all_threads', + start_date=datetime(2012, 7, 21)) + +eval_frame_chained_cmp_one_thread = \ + Benchmark("pd.eval('df < df2 < df3 < df4')", setup, + name='eval_frame_chained_cmp_one_thread', + start_date=datetime(2012, 7, 26)) + +setup = common_setup +eval_frame_chained_cmp_python = \ + Benchmark("pd.eval('df < df2 < df3 < df4', engine='python')", + common_setup, name='eval_frame_chained_cmp_python', + start_date=datetime(2013, 7, 26)) + +eval_frame_chained_cmp_one_thread = \ + Benchmark("pd.eval('df < df2 < df3 < df4', engine='python')", setup, + name='eval_frame_chained_cmp_python_one_thread', + start_date=datetime(2012, 7, 26)) From 3ad3c1636076563995d9ca7440888899ab70434e Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 29 Jul 2013 11:19:13 -0400 Subject: 
[PATCH 08/16] ENH: added interpretation of 'in' in pytables Term syntax which is syntactically equivalent of '=' --- doc/source/indexing.rst | 4 +- doc/source/io.rst | 1 + pandas/compat/__init__.py | 2 + pandas/computation/align.py | 26 +- pandas/computation/common.py | 12 + pandas/computation/engines.py | 37 +- pandas/computation/eval.py | 15 +- pandas/computation/expr.py | 191 +++-- pandas/computation/ops.py | 126 +++- pandas/computation/pytables.py | 110 +-- pandas/computation/tests/test_eval.py | 960 +++++++++++++++----------- pandas/core/common.py | 20 +- pandas/io/pytables.py | 41 +- pandas/io/tests/test_pytables.py | 75 +- pandas/tests/test_common.py | 22 +- pandas/tests/test_expressions.py | 34 +- pandas/tests/test_frame.py | 392 +++++++---- pandas/util/testing.py | 19 +- setup.py | 1 + 19 files changed, 1296 insertions(+), 792 deletions(-) diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 9f68934f658d8..e5e6e84cc0a0d 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1037,8 +1037,8 @@ with the name ``a``. .. 
ipython:: python - index = Index(np.arange(n), name='a') - df = DataFrame(randint(n, size=(n, 2)), index=index, columns=list('bc')) + df = DataFrame(randint(n, size=(n, 2)), columns=list('bc')) + df.index.name = 'a' df df.query('a < b and b < c') diff --git a/doc/source/io.rst b/doc/source/io.rst index dff1b4836e88a..19fcbd6f4c851 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2011,6 +2011,7 @@ The following are valid expressions: - ``'index>=date'`` - ``"columns=['A', 'D']"`` + - ``"columns in ['A', 'D']"`` - ``'columns=A'`` - ``'columns==A'`` - ``"~(columns=['A','B'])"`` diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 12c929cd59820..10e1464739203 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -46,11 +46,13 @@ from StringIO import StringIO BytesIO = StringIO import cPickle + import httplib except ImportError: import builtins from io import StringIO, BytesIO cStringIO = StringIO import pickle as cPickle + import http.client as httplib if PY3: diff --git a/pandas/computation/align.py b/pandas/computation/align.py index 5f81fcd60432e..794a209b53f46 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -1,10 +1,11 @@ import warnings from functools import partial, wraps -from itertools import izip +from pandas.compat import zip, range import numpy as np import pandas as pd +from pandas import compat import pandas.core.common as com from pandas.computation.ops import is_const @@ -25,7 +26,7 @@ def _align_core_single_unary_op(term): def _zip_axes_from_type(typ, new_axes): axes = {} - for ax_ind, ax_name in typ._AXIS_NAMES.iteritems(): + for ax_ind, ax_name in compat.iteritems(typ._AXIS_NAMES): axes[ax_name] = new_axes[ax_ind] return axes @@ -47,7 +48,7 @@ def _maybe_promote_shape(values, naxes): axes_slice = [slice(None)] * naxes # set difference of numaxes and ndims - slices = com.difference(nax, ndim) + slices = list(set(nax) - set(ndim)) if ndims == naxes: if slices: @@ -100,7 
+101,7 @@ def _align_core(terms): term_index = [i for i, term in enumerate(terms) if hasattr(term.value, 'axes')] term_dims = [terms[i].value.ndim for i in term_index] - ndims = pd.Series(dict(izip(term_index, term_dims))) + ndims = pd.Series(dict(zip(term_index, term_dims))) # initial axes are the axes of the largest-axis'd term biggest = terms[ndims.idxmax()].value @@ -114,11 +115,10 @@ def _align_core(terms): ax, itm = naxes - 1, term.value.index else: ax, itm = axis, items - if not axes[ax].equals(itm): - axes[ax] = axes[ax].join(itm, how='outer') + axes[ax] = axes[ax].join(itm, how='outer') - for i, ndim in ndims.iteritems(): - for axis, items in izip(xrange(ndim), axes): + for i, ndim in compat.iteritems(ndims): + for axis, items in zip(range(ndim), axes): ti = terms[i].value if hasattr(ti, 'reindex_axis'): @@ -216,17 +216,21 @@ def _reconstruct_object(typ, obj, axes, dtype): An object of type ``typ`` with the value `obj` and possible axes `axes`. """ - #import ipdb; ipdb.set_trace() try: typ = typ.type except AttributeError: pass + try: + res_t = np.result_type(obj.dtype, dtype) + except AttributeError: + res_t = dtype + if (not isinstance(typ, partial) and issubclass(typ, pd.core.generic.PandasObject)): - return typ(obj, dtype=dtype, **axes) + return typ(obj, dtype=res_t, **axes) - ret_value = typ(obj).astype(dtype) + ret_value = typ(obj).astype(res_t) try: ret = ret_value.item() diff --git a/pandas/computation/common.py b/pandas/computation/common.py index e69de29bb2d1d..3253039050b78 100644 --- a/pandas/computation/common.py +++ b/pandas/computation/common.py @@ -0,0 +1,12 @@ +import numpy as np + + +def _ensure_decoded(s): + """ if we have bytes, decode them to unicode """ + if isinstance(s, (np.bytes_, bytes)): + s = s.decode('UTF-8') + return s + + +class NameResolutionError(NameError): + pass diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index cd161352b97e0..794b80615f9ea 100644 --- a/pandas/computation/engines.py +++ 
b/pandas/computation/engines.py @@ -1,8 +1,9 @@ import abc +from pandas import compat from pandas.core import common as com from pandas.computation.align import _align, _reconstruct_object - +from pandas.computation.ops import UndefinedVariableError class AbstractEngine(object): """AbstractEngine object serving as a base class for all engines.""" @@ -22,6 +23,9 @@ def convert(self): """ return com.pprint_thing(self.expr) + def pre_evaluate(self): + self.expr.check_name_clashes() + def evaluate(self): """Run the engine on the expression @@ -36,7 +40,9 @@ def evaluate(self): if not self._is_aligned: self.result_type, self.aligned_axes = _align(self.expr.terms) - res = self._evaluate(self.expr.env) + # make sure no names in resolvers and locals/globals clash + self.pre_evaluate() + res = self._evaluate() return _reconstruct_object(self.result_type, res, self.aligned_axes, self.expr.terms.return_type) @@ -45,7 +51,7 @@ def _is_aligned(self): return self.aligned_axes is not None and self.result_type is not None @abc.abstractmethod - def _evaluate(self, env): + def _evaluate(self): """Return an evaluated expression. 
Parameters @@ -68,16 +74,26 @@ class NumExprEngine(AbstractEngine): def __init__(self, expr): super(NumExprEngine, self).__init__(expr) - def _evaluate(self, env): + def _evaluate(self): import numexpr as ne + # add the resolvers to locals + self.expr.add_resolvers_to_locals() + + # convert the expression to syntactically valid Python + s = self.convert() + try: - s = self.convert() - return ne.evaluate(s, local_dict=env.locals, - global_dict=env.globals, + return ne.evaluate(s, local_dict=self.expr.env.locals, + global_dict=self.expr.env.globals, truediv=self.expr.truediv) except KeyError as e: - raise NameError('{0!r} is not defined'.format(e.message)) + # python 3 compat kludge + try: + msg = e.message + except AttributeError: + msg = compat.text_type(e) + raise UndefinedVariableError(msg) class PythonEngine(AbstractEngine): @@ -91,9 +107,10 @@ def __init__(self, expr): super(PythonEngine, self).__init__(expr) def evaluate(self): - return self.expr(self.expr.env) + self.pre_evaluate() + return self.expr() - def _evaluate(self, env): + def _evaluate(self): pass diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 072bd3feb3a59..cb8af98928564 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -1,10 +1,12 @@ #!/usr/bin/env python import numbers - import numpy as np -from pandas.computation.expr import Expr, _parsers, _ensure_scope +from pandas import compat +from pandas.compat import string_types +from pandas.computation.expr import (Expr, _parsers, _ensure_scope, + _check_syntax) from pandas.computation.engines import _engines @@ -29,7 +31,7 @@ def _check_parser(parser): def eval(expr, parser='pandas', engine='numexpr', truediv=True, - local_dict=None, global_dict=None, resolvers=None): + local_dict=None, global_dict=None, resolvers=None, level=2): """Evaluate a Python expression as a string using various backends. 
The following arithmetic operations are supported: ``+``, ``-``, ``*``, @@ -74,6 +76,9 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, :attr:`~pandas.DataFrame.index` and :attr:`~pandas.DataFrame.columns` variables that refer to their respective :class:`~pandas.DataFrame` instance attributes. + level : int, optional, default 2 + The number of prior stack frames to traverse and add to the current + scope. Returns ------- @@ -96,9 +101,9 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, _check_parser(parser) env = _ensure_scope(global_dict=global_dict, local_dict=local_dict, - resolvers=resolvers) + resolvers=resolvers, level=level) - if isinstance(expr, basestring): + if isinstance(expr, string_types): parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, truediv=truediv) else: diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index f968a6d3fa000..5cff968727c5c 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -2,20 +2,21 @@ import operator import sys import inspect -import itertools import tokenize import datetime -from cStringIO import StringIO from functools import partial import pandas as pd +from pandas import compat +from pandas.compat import StringIO, zip, reduce, string_types from pandas.core.base import StringMixin from pandas.core import common as com +from pandas.computation.common import NameResolutionError from pandas.computation.ops import (_cmp_ops_syms, _bool_ops_syms, - _arith_ops_syms, _unary_ops_syms) -from pandas.computation.ops import _reductions, _mathops -from pandas.computation.ops import BinOp, UnaryOp, Term, Constant + _arith_ops_syms, _unary_ops_syms, is_term) +from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG +from pandas.computation.ops import BinOp, UnaryOp, Term, Constant, Div def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None, @@ -24,6 +25,18 @@ def _ensure_scope(level=2, global_dict=None, 
local_dict=None, resolvers=None, return Scope(global_dict, local_dict, level=level, resolvers=resolvers) +def _check_disjoint_resolver_names(resolver_keys, local_keys, global_keys): + res_locals = com.intersection(resolver_keys, local_keys) + if res_locals: + msg = "resolvers and locals overlap on names {0}".format(res_locals) + raise NameResolutionError(msg) + + res_globals = com.intersection(resolver_keys, global_keys) + if res_globals: + msg = "resolvers and globals overlap on names {0}".format(res_globals) + raise NameResolutionError(msg) + + class Scope(StringMixin): """Object to hold scope, with a few bells to deal with some custom syntax added by pandas. @@ -75,12 +88,15 @@ def __init__(self, gbls=None, lcls=None, level=1, resolvers=None): self.globals['True'] = True self.globals['False'] = False + self.resolver_keys = frozenset(reduce(operator.add, (list(o.keys()) for o in self.resolvers), [])) self._global_resolvers = self.resolvers + (self.locals, self.globals) self._resolver = None + self.resolver_dict = dict((k, self.resolve(k)) + for k in self.resolver_keys) def __unicode__(self): return com.pprint_thing("locals: {0}\nglobals: {0}\nresolvers: " @@ -89,20 +105,18 @@ def __unicode__(self): self.resolver_keys)) def __getitem__(self, key): - return self.resolver(key) + return self.resolve(key, globally=False) - @property - def resolver(self): - if self._resolver is None: - def resolve_key(key): - for resolver in self._global_resolvers: - try: - return resolver[key] - except KeyError: - pass - self._resolver = resolve_key - - return self._resolver + def resolve(self, key, globally=False): + resolvers = self.locals, self.globals + if globally: + resolvers = self._global_resolvers + + for resolver in resolvers: + try: + return resolver[key] + except KeyError: + pass def update(self, level=None): """Update the current scope by going back `level` levels. 
@@ -178,6 +192,10 @@ def _replace_booleans(source): return source.replace('|', ' or ').replace('&', ' and ') +def _replace_locals(source, local_symbol='@'): + return source.replace(local_symbol, _LOCAL_TAG) + + def _preparse(source): return _replace_booleans(_rewrite_assign(source)) @@ -265,12 +283,14 @@ def f(self, node, *args, **kwargs): def add_ops(op_classes): def f(cls): - for op_attr_name, op_class in op_classes.iteritems(): + for op_attr_name, op_class in compat.iteritems(op_classes): ops = getattr(cls, '{0}_ops'.format(op_attr_name)) ops_map = getattr(cls, '{0}_op_nodes_map'.format(op_attr_name)) for op in ops: - setattr(cls, 'visit_{0}'.format(ops_map[op]), - _op_maker(op_class, op)) + op_node = ops_map[op] + if op_node is not None: + setattr(cls, 'visit_{0}'.format(op_node), + _op_maker(op_class, op)) return cls return f @@ -278,26 +298,30 @@ def f(cls): @disallow(_unsupported_nodes) @add_ops(_op_classes) class BaseExprVisitor(ast.NodeVisitor): + const_type = Constant + term_type = Term """Custom ast walker """ binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', - 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult', 'Div', + 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult', None, 'Pow', 'FloorDiv', 'Mod') - binary_op_nodes_map = dict(itertools.izip(binary_ops, binary_op_nodes)) + binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) unary_ops = _unary_ops_syms unary_op_nodes = 'UAdd', 'USub', 'Invert', 'Not' - unary_op_nodes_map = dict(itertools.izip(unary_ops, unary_op_nodes)) + unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) - def __init__(self, env, preparser=_preparse): + def __init__(self, env, engine, parser, preparser=_preparse): self.env = env + self.engine = engine + self.parser = parser self.preparser = preparser def visit(self, node, **kwargs): parse = ast.parse - if isinstance(node, basestring): + if isinstance(node, string_types): clean = self.preparser(node) 
elif isinstance(node, ast.AST): clean = node @@ -325,21 +349,27 @@ def visit_BinOp(self, node, **kwargs): right = self.visit(node.right, side='right') return op(left, right) + def visit_Div(self, node, **kwargs): + return lambda lhs, rhs: Div(lhs, rhs, + truediv=self.env.locals['truediv']) + def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) - return op(self.visit(node.operand)) + operand = self.visit(node.operand) + return op(operand) def visit_Name(self, node, **kwargs): - return Term(node.id, self.env) + return self.term_type(node.id, self.env, **kwargs) def visit_Num(self, node, **kwargs): - return Constant(node.n, self.env) + return self.const_type(node.n, self.env) def visit_Str(self, node, **kwargs): - return Constant(node.s, self.env) + return self.const_type(node.s, self.env) def visit_List(self, node, **kwargs): - return Constant([self.visit(e).value for e in node.elts], self.env) + return self.const_type([self.visit(e).value for e in node.elts], + self.env) visit_Tuple = visit_List @@ -351,20 +381,18 @@ def visit_Subscript(self, node, **kwargs): value = self.visit(node.value) slobj = self.visit(node.slice) expr = com.pprint_thing(slobj) - result = pd.eval(expr, local_dict=self.env.locals, - global_dict=self.env.globals, - resolvers=self.env.resolvers) + result = pd.eval(expr, local_dict=self.env, engine=self.engine, + parser=self.parser) try: # a Term instance v = value.value[result] except AttributeError: # an Op instance - lhs = pd.eval(com.pprint_thing(value), local_dict=self.env.locals, - global_dict=self.env.globals, - resolvers=self.env.resolvers) + lhs = pd.eval(com.pprint_thing(value), local_dict=self.env, + engine=self.engine, parser=self.parser) v = lhs[result] name = self.env.add_tmp(v) - return Term(name, env=self.env) + return self.term_type(name, env=self.env) def visit_Slice(self, node, **kwargs): """ df.index[slice(4,6)] """ @@ -396,7 +424,7 @@ def visit_Attribute(self, node, **kwargs): try: v = getattr(resolved, attr) name 
= self.env.add_tmp(v) - return Term(name, self.env) + return self.term_type(name, self.env) except AttributeError: # something like datetime.datetime where scope is overriden if isinstance(value, ast.Name) and value.id == attr: @@ -432,19 +460,25 @@ def visit_Call(self, node, **kwargs): if node.kwargs is not None: keywords.update(self.visit(node.kwargs).value) - return Constant(res(*args, **keywords), self.env) + return self.const_type(res(*args, **keywords), self.env) def visit_Compare(self, node, **kwargs): ops = node.ops comps = node.comparators + + def translate(op): + if isinstance(op,ast.In): + return ast.Eq() + return op + if len(comps) == 1: - return self.visit(ops[0])(self.visit(node.left, side='left'), - self.visit(comps[0], side='right')) + return self.visit(translate(ops[0]))(self.visit(node.left, side='left'), + self.visit(comps[0], side='right')) left = node.left values = [] - for op, comp in itertools.izip(ops, comps): + for op, comp in zip(ops, comps): new_node = self.visit(ast.Compare(comparators=[comp], left=left, - ops=[op])) + ops=[translate(op)])) left = comp values.append(new_node) return self.visit(ast.BoolOp(op=ast.And(), values=values)) @@ -476,32 +510,43 @@ def visitor(x, y): @disallow((_unsupported_nodes | _python_not_supported) - (_boolop_nodes | frozenset(['BoolOp', 'Attribute']))) class PandasExprVisitor(BaseExprVisitor): - def __init__(self, env, preparser=_replace_booleans): - super(PandasExprVisitor, self).__init__(env, preparser) + def __init__(self, env, engine, parser, + preparser=lambda x: _replace_locals(_replace_booleans(x))): + super(PandasExprVisitor, self).__init__(env, engine, parser, preparser) @disallow(_unsupported_nodes | _python_not_supported | frozenset(['Not'])) class PythonExprVisitor(BaseExprVisitor): - def __init__(self, env, preparser=lambda x: x): - super(PythonExprVisitor, self).__init__(env, preparser=preparser) + def __init__(self, env, engine, parser, preparser=lambda x: x): + super(PythonExprVisitor, 
self).__init__(env, engine, parser, + preparser=preparser) class Expr(StringMixin): + """Expr object holding scope - """Expr object""" - + Parameters + ---------- + expr : str + engine : str, optional, default 'numexpr' + parser : str, optional, default 'pandas' + env : Scope, optional, default None + truediv : bool, optional, default True + level : int, optional, default 2 + """ def __init__(self, expr, engine='numexpr', parser='pandas', env=None, - truediv=True): + truediv=True, level=2): self.expr = expr - self.env = _ensure_scope(level=2,local_dict=env) - self._visitor = _parsers[parser](self.env) - self.terms = self.parse() + self.env = _ensure_scope(level=level, local_dict=env) self.engine = engine + self.parser = parser + self._visitor = _parsers[parser](self.env, self.engine, self.parser) + self.terms = self.parse() self.truediv = truediv - def __call__(self, env): - env.locals['truediv'] = self.truediv - return self.terms(env) + def __call__(self): + self.env.locals['truediv'] = self.truediv + return self.terms(self.env) def __unicode__(self): return com.pprint_thing(self.terms) @@ -510,20 +555,38 @@ def __len__(self): return len(self.expr) def parse(self): - """return a Termset""" + """Parse an expression""" return self._visitor.visit(self.expr) def align(self): """align a set of Terms""" return self.terms.align(self.env) + @property + def names(self): + """Get the names in an expression""" + if is_term(self.terms): + return frozenset([self.terms.name]) + return frozenset(term.name for term in com.flatten(self.terms)) + + def check_name_clashes(self): + env = self.env + names = self.names + res_keys = frozenset(env.resolver_dict.iterkeys()) & names + lcl_keys = frozenset(env.locals.iterkeys()) & names + gbl_keys = frozenset(env.globals.iterkeys()) & names + _check_disjoint_resolver_names(res_keys, lcl_keys, gbl_keys) + + def add_resolvers_to_locals(self): + self.env.locals.update(self.env.resolver_dict) + _needs_filter = frozenset(['and', 'or', 'not']) 
def maybe_expression(s, kind='pandas'): """ loose checking if s is an expression """ - if not isinstance(s, basestring): + if not isinstance(s, string_types): return False visitor = _parsers[kind] ops = visitor.binary_ops + visitor.unary_ops @@ -534,15 +597,17 @@ def maybe_expression(s, kind='pandas'): def isexpr(s, check_names=True): - env = _ensure_scope() try: - Expr(s,env=env) + Expr(s, env=_ensure_scope() if check_names else None) except SyntaxError: return False except NameError: return not check_names - else: - return True + return True + + +def _check_syntax(s): + ast.parse(s) _parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor} diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index d1c8484cbb997..c0d3c7bdd81dd 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -1,10 +1,13 @@ import operator as op +from functools import partial import numpy as np -from pandas.util.py3compat import PY3 +import pandas as pd +from pandas.compat import PY3, string_types import pandas.core.common as com from pandas.core.base import StringMixin +from pandas.computation.common import _ensure_decoded _reductions = 'sum', 'prod' @@ -26,11 +29,12 @@ class BinaryOperatorError(OperatorError): class Term(StringMixin): - def __init__(self, name, env, side=None): - self.name = name + def __init__(self, name, env, side=None, encoding=None): + self._name = name self.env = env self.side = side - self.value = self._resolve_name() + self._value = self._resolve_name() + self.encoding = encoding def __unicode__(self): return com.pprint_thing(self.name) @@ -45,7 +49,7 @@ def _resolve_name(self): self.update(res) if res is None: - if not isinstance(key, basestring): + if not isinstance(key, string_types): return key raise NameError('name {0!r} is not defined'.format(key)) @@ -57,7 +61,7 @@ def _resolve_name(self): def update(self, value): env = self.env key = self.name - if isinstance(key, basestring): + if isinstance(key, string_types): 
try: del env.locals[key] env.locals[key] = value @@ -76,20 +80,20 @@ def update(self, value): @property def isscalar(self): - return np.isscalar(self.value) + return np.isscalar(self._value) @property def type(self): try: - # ndframe potentially very slow for large, mixed dtype frames - return self.value.values.dtype + # potentially very slow for large, mixed dtype frames + return self._value.values.dtype except AttributeError: try: # ndarray - return self.value.dtype + return self._value.dtype except AttributeError: # scalar - return type(self.value) + return type(self._value) return_type = type @@ -99,13 +103,50 @@ def raw(self): ''.format(self.__class__.__name__, self.name, self.type)) + @property + def kind(self): + try: + return self.type.__name__ + except AttributeError: + return self.type.type.__name__ + + @property + def value(self): + kind = self.kind.lower() + if kind == 'datetime64': + try: + return self._value.asi8 + except AttributeError: + return self._value.view('i8') + elif kind == 'datetime': + return pd.Timestamp(self._value) + elif kind == 'timestamp': + return self._value.asm8.view('i8') + return self._value + + @value.setter + def value(self, new_value): + self._value = new_value + + @property + def name(self): + return self._name + + @name.setter + def name(self, new_name): + self._name = new_name + class Constant(Term): def __init__(self, value, env): super(Constant, self).__init__(value, env) def _resolve_name(self): - return self.name + return self._name + + @property + def name(self): + return self.value def _print_operand(opr): @@ -122,6 +163,7 @@ class Op(StringMixin): def __init__(self, op, operands, *args, **kwargs): self.op = _get_op(op) self.operands = operands + self.encoding = kwargs.get('encoding', None) def __iter__(self): return iter(self.operands) @@ -148,6 +190,7 @@ def raw(self): opr in self.operands))) return parened + _cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=' _cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne 
_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) @@ -175,16 +218,11 @@ def raw(self): def _cast_inplace(terms, dtype): dt = np.dtype(dtype) for term in terms: - # cast all the way down the tree since operands must be try: - _cast_inplace(term.operands, dtype) + new_value = term.value.astype(dt) except AttributeError: - # we've bottomed out so actually do the cast - try: - new_value = term.value.astype(dt) - except AttributeError: - new_value = dt.type(term.value) - term.update(new_value) + new_value = dt.type(term.value) + term.update(new_value) def is_term(obj): @@ -209,11 +247,13 @@ def __init__(self, op, lhs, rhs, **kwargs): self.lhs = lhs self.rhs = rhs + self.convert_values() + try: self.func = _binary_ops_dict[op] except KeyError: keys = _binary_ops_dict.keys() - raise BinaryOperatorError('Invalid binary operator {0}, valid' + raise BinaryOperatorError('Invalid binary operator {0!r}, valid' ' operators are {1}'.format(op, keys)) def __call__(self, env): @@ -245,11 +285,45 @@ def __call__(self, env): return res - -class Mod(BinOp): - def __init__(self, lhs, rhs, *args, **kwargs): - super(Mod, self).__init__('%', lhs, rhs, *args, **kwargs) - _cast_inplace(self.operands, np.float_) + def convert_values(self): + def stringify(value): + if self.encoding is not None: + encoder = partial(com.pprint_thing_encoded, + encoding=self.encoding) + else: + encoder = com.pprint_thing + return encoder(value) + + lhs, rhs = self.lhs, self.rhs + + if (is_term(lhs) and lhs.kind.startswith('datetime') and is_term(rhs) + and rhs.isscalar): + v = rhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = _ensure_decoded(v) + v = pd.Timestamp(v) + if v.tz is not None: + v = v.tz_convert('UTC') + self.rhs.update(v) + + if (is_term(rhs) and rhs.kind.startswith('datetime') and + is_term(lhs) and lhs.isscalar): + v = lhs.value + if isinstance(v, (int, float)): + v = stringify(v) + v = _ensure_decoded(v) + v = pd.Timestamp(v) + if v.tz is not None: + v = 
v.tz_convert('UTC') + self.lhs.update(v) + + +class Div(BinOp): + def __init__(self, lhs, rhs, truediv=True, *args, **kwargs): + super(Div, self).__init__('/', lhs, rhs, *args, **kwargs) + if truediv or PY3: + _cast_inplace(com.flatten(self), np.float_) _unary_ops_syms = '+', '-', '~', 'not' diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 7a4ab6e1b6004..2d98397366b7f 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -6,22 +6,13 @@ from functools import partial from datetime import datetime -import numpy as np - +import pandas as pd +from pandas.compat import u, string_types import pandas.core.common as com -import pandas.lib as lib from pandas.computation import expr, ops -from pandas.computation.ops import is_term, Constant +from pandas.computation.ops import is_term from pandas.computation.expr import BaseExprVisitor -from pandas import Index -from pandas.core.common import is_list_like - - -def _ensure_decoded(s): - """ if we have bytes, decode them to unicode """ - if isinstance(s, (np.bytes_, bytes)): - s = s.decode('UTF-8') - return s +from pandas.computation.common import _ensure_decoded class Scope(expr.Scope): @@ -42,7 +33,6 @@ def __init__(self, name, env, side=None): super(Term, self).__init__(name, env, side=side) def _resolve_name(self): - # must be a queryables if self.side == 'left': if self.name not in self.env.queryables: @@ -53,6 +43,22 @@ def _resolve_name(self): return self.env.locals.get(self.name, self.env.globals.get(self.name, self.name)) + @property + def value(self): + return self._value + + +class Constant(Term): + def __init__(self, value, env): + super(Constant, self).__init__(value, env) + + def _resolve_name(self): + return self._name + + @property + def name(self): + return self._value + class BinOp(ops.BinOp): @@ -112,7 +118,7 @@ def pr(left, right): def conform(self, rhs): """ inplace conform rhs """ - if not is_list_like(rhs): + if not com.is_list_like(rhs): rhs 
= [rhs] if hasattr(self.rhs, 'ravel'): rhs = rhs.ravel() @@ -144,43 +150,48 @@ def convert_value(self, v): accepted by pytables """ def stringify(value): - value = str(value) if self.encoding is not None: - value = value.encode(self.encoding) - return value + encoder = partial(com.pprint_thing_encoded, + encoding=self.encoding) + else: + encoder = com.pprint_thing + return encoder(value) kind = _ensure_decoded(self.kind) - if kind == u'datetime64' or kind == u'datetime': - + if kind == u('datetime64') or kind == u('datetime'): if isinstance(v, (int, float)): v = stringify(v) v = _ensure_decoded(v) - v = lib.Timestamp(v) + v = pd.Timestamp(v) if v.tz is not None: v = v.tz_convert('UTC') return TermValue(v, v.value, kind) - elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u'date': + elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u('date'): v = time.mktime(v.timetuple()) - return TermValue(v, lib.Timestamp(v), kind) - elif kind == u'integer': + return TermValue(v, pd.Timestamp(v), kind) + elif kind == u('integer'): v = int(float(v)) return TermValue(v, v, kind) - elif kind == u'float': + elif kind == u('float'): v = float(v) return TermValue(v, v, kind) - elif kind == u'bool': - if isinstance(v, basestring): - v = not v.strip().lower() in [u'false', u'f', u'no', u'n', - u'none', u'0', u'[]', u'{}', u''] + elif kind == u('bool'): + if isinstance(v, string_types): + v = not v.strip().lower() in [u('false'), u('f'), u('no'), + u('n'), u('none'), u('0'), + u('[]'), u('{}'), u('')] else: v = bool(v) return TermValue(v, v, kind) - elif not isinstance(v, basestring): + elif not isinstance(v, string_types): v = stringify(v) - return TermValue(v, stringify(v), u'string') + return TermValue(v, stringify(v), u('string')) # string quoting - return TermValue(v, stringify(v), u'string') + return TermValue(v, stringify(v), u('string')) + + def convert_values(self): + pass class FilterBinOp(BinOp): @@ -203,7 +214,7 @@ def format(self): def 
evaluate(self): - if not isinstance(self.lhs, basestring): + if not isinstance(self.lhs, string_types): return self if not self.is_valid: @@ -221,7 +232,7 @@ def evaluate(self): self.filter = ( self.lhs, filter_op, - Index([v.value for v in values])) + pd.Index([v.value for v in values])) return self return None @@ -233,7 +244,7 @@ def evaluate(self): self.filter = ( self.lhs, filter_op, - Index([v.value for v in values])) + pd.Index([v.value for v in values])) else: raise TypeError( @@ -276,7 +287,7 @@ def format(self): def evaluate(self): - if not isinstance(self.lhs, basestring): + if not isinstance(self.lhs, string_types): return self if not self.is_valid: @@ -341,26 +352,26 @@ def prune(self, klass): _op_classes = {'unary': UnaryOp} class ExprVisitor(BaseExprVisitor): - def __init__(self, env, **kwargs): - super(ExprVisitor, self).__init__(env) + const_type = Constant + term_type = Term + + def __init__(self, env, engine, parser, **kwargs): + super(ExprVisitor, self).__init__(env, engine, parser) for bin_op in self.binary_ops: setattr(self, 'visit_{0}'.format(self.binary_op_nodes_map[bin_op]), lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs)) - def visit_Name(self, node, side=None, **kwargs): - return Term(node.id, self.env, side=side, **kwargs) - def visit_UnaryOp(self, node, **kwargs): if isinstance(node.op, (ast.Not, ast.Invert)): return UnaryOp('~', self.visit(node.operand)) elif isinstance(node.op, ast.USub): - return Constant(-self.visit(node.operand).value, self.env) + return self.const_type(-self.visit(node.operand).value, self.env) elif isinstance(node.op, ast.UAdd): raise NotImplementedError('Unary addition not supported') def visit_USub(self, node, **kwargs): - return Constant(-self.visit(node.operand).value, self.env) + return self.const_type(-self.visit(node.operand).value, self.env) def visit_Index(self, node, **kwargs): return self.visit(node.value).value @@ -369,7 +380,7 @@ def visit_Subscript(self, node, **kwargs): value = 
self.visit(node.value) slobj = self.visit(node.slice) try: - return Constant(value[slobj], self.env) + return self.const_type(value[slobj], self.env) except TypeError: raise ValueError("cannot subscript {0!r} with " "{1!r}".format(value, slobj)) @@ -400,7 +411,7 @@ class Expr(expr.Expr): Parameters ---------- where : string term expression, Expr, or list-like of Exprs - queryables : a kinds map (dict of column name -> kind), or None i column is non-indexable + queryables : a "kinds" map (dict of column name -> kind), or None if column is non-indexable encoding : an encoding that will encode the query terms Returns @@ -457,6 +468,7 @@ def __init__(self, where, op=None, value=None, queryables=None, if queryables is not None: self.env.queryables.update(queryables) self._visitor = ExprVisitor(self.env, queryables=queryables, + parser='pytables', engine='pytables', encoding=encoding) self.terms = self.parse() @@ -465,7 +477,7 @@ def parse_back_compat(self, w, op=None, value=None): if isinstance(w, dict): w, op, value = w.get('field'), w.get('op'), w.get('value') - if not isinstance(w, basestring): + if not isinstance(w, string_types): raise TypeError( "where must be passed as a string if op/value are passed") warnings.warn("passing a dict to Expr is deprecated, " @@ -473,7 +485,7 @@ def parse_back_compat(self, w, op=None, value=None): DeprecationWarning) if op is not None: - if not isinstance(w, basestring): + if not isinstance(w, string_types): raise TypeError( "where must be passed as a string if op/value are passed") @@ -493,7 +505,7 @@ def parse_back_compat(self, w, op=None, value=None): def __unicode__(self): if self.terms is not None: - return unicode(self.terms) + return com.pprint_thing(self.terms) return self.expr def evaluate(self): @@ -525,7 +537,7 @@ def __init__(self, value, converted, kind): def tostring(self, encoding): """ quote the string if not encoded else encode and return """ - if self.kind == u'string': + if self.kind == u('string'): if encoding is 
not None: return self.converted return '"%s"' % self.converted diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 1881b4716d766..8a8a04824cf29 100755 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -16,7 +16,7 @@ import pandas as pd from pandas.core import common as com -from pandas import DataFrame, Series, Panel +from pandas import DataFrame, Series, Panel, date_range from pandas.util.testing import makeCustomDataframe as mkdf from pandas.computation.engines import _engines from pandas.computation.expr import PythonExprVisitor, PandasExprVisitor @@ -29,12 +29,11 @@ from pandas.util.testing import (assert_frame_equal, randbool, assertRaisesRegexp, assert_produces_warning, assert_series_equal) -from pandas.util.py3compat import PY3 +from pandas.compat import PY3, u - -def skip_numexpr_engine(engine): +def skip_if_no_ne(engine='numexpr'): if not _USE_NUMEXPR and engine == 'numexpr': - raise nose.SkipTest("not using numexpr") + raise nose.SkipTest("numexpr engine not installed or disabled") def engine_has_neg_frac(engine): @@ -47,8 +46,13 @@ def _eval_single_bin(lhs, cmp1, rhs, engine): try: return c(lhs, rhs) except ValueError as e: - if e.message == ('negative number cannot be raised to a ' - 'fractional power'): + try: + msg = e.message + except AttributeError: + msg = e + msg = u(msg) + if msg == u('negative number cannot be raised to a fractional' + ' power'): return np.nan raise return c(lhs, rhs) @@ -76,26 +80,27 @@ def wrapper(self, lhs, arith1, rhs, *args, **kwargs): return wrapper -_good_arith_ops = com.difference(_arith_ops_syms, _special_case_arith_ops_syms) +def _is_py3_complex_incompat(result, expected): + return (PY3 and isinstance(expected, (complex, np.complexfloating)) and + np.isnan(result)) -class TestEvalPandas(unittest.TestCase): +_good_arith_ops = com.difference(_arith_ops_syms, _special_case_arith_ops_syms) + +class 
TestEvalNumexprPandas(unittest.TestCase): @classmethod def setUpClass(cls): - cls.cmp_ops = expr._cmp_ops_syms - cls.cmp2_ops = cls.cmp_ops[::-1] - cls.bin_ops = expr._bool_ops_syms - cls.special_case_ops = _special_case_arith_ops_syms - cls.arith_ops = _good_arith_ops - cls.unary_ops = '+', '-' + skip_if_no_ne() + import numexpr as ne + cls.ne = ne + cls.engine = 'numexpr' + cls.parser = 'pandas' @classmethod def tearDownClass(cls): - del cls.cmp_ops, cls.cmp2_ops, cls.bin_ops, cls.special_case_ops - del cls.arith_ops, cls.unary_ops - - def set_current_engine(self): - self.engine = 'numexpr' + del cls.engine, cls.parser + if hasattr(cls, 'ne'): + del cls.ne def setup_data(self): nan_df1 = DataFrame(rand(10, 5)) @@ -115,20 +120,22 @@ def setup_data(self): self.rhses = self.pandas_rhses + self.scalar_rhses + (randn(10, 5), randn(5)) + def setup_ops(self): + self.cmp_ops = expr._cmp_ops_syms + self.cmp2_ops = self.cmp_ops[::-1] + self.bin_ops = expr._bool_ops_syms + self.special_case_ops = _special_case_arith_ops_syms + self.arith_ops = _good_arith_ops + self.unary_ops = '+', '-', '~', 'not ' + def setUp(self): - try: - import numexpr as ne - self.ne = ne - except ImportError: - raise nose.SkipTest - self.set_current_engine() + self.setup_ops() self.setup_data() self.current_engines = filter(lambda x: x != self.engine, _engines) def tearDown(self): del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses - del self.pandas_rhses, self.pandas_lhses, self.current_engines, self.ne - del self.engine + del self.pandas_rhses, self.pandas_lhses, self.current_engines @slow def test_complex_cmp_ops(self): @@ -196,7 +203,7 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine) - result = pd.eval(ex, engine=self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) 
assert_array_equal(result, expected) def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): @@ -236,24 +243,25 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): expected = _eval_single_bin(lhs_new, '&', rhs_new, self.engine) for ex in (ex1, ex2, ex3): - result = pd.eval(ex, engine=self.engine) + result = pd.eval(ex, engine=self.engine, + parser=self.parser) assert_array_equal(result, expected) @skip_incompatible_operand def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) - result = pd.eval(ex, engine=self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) assert_array_equal(result, expected) @skip_incompatible_operand def check_binary_arith_op(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) - result = pd.eval(ex, engine=self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = _eval_single_bin(lhs, arith1, rhs, self.engine) assert_array_equal(result, expected) ex = 'lhs {0} rhs {0} rhs'.format(arith1) - result = pd.eval(ex, engine=self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) nlhs = _eval_single_bin(lhs, arith1, rhs, self.engine) self.check_alignment(result, nlhs, rhs, arith1) @@ -274,7 +282,7 @@ def check_alignment(self, result, nlhs, ghs, op): @skip_incompatible_operand def check_modulus(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) - result = pd.eval(ex, engine=self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs % rhs assert_allclose(result, expected) expected = self.ne.evaluate('expected {0} rhs'.format(arith1)) @@ -285,25 +293,31 @@ def check_floor_division(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) if self.engine == 'python': - res = pd.eval(ex, engine=self.engine) + res = pd.eval(ex, engine=self.engine, parser=self.parser) expected = lhs // rhs assert_array_equal(res, expected) else: 
self.assertRaises(TypeError, pd.eval, ex, local_dict={'lhs': lhs, 'rhs': rhs}, - engine=self.engine) + engine=self.engine, parser=self.parser) def get_expected_pow_result(self, lhs, rhs): try: expected = _eval_single_bin(lhs, '**', rhs, self.engine) except ValueError as e: msg = 'negative number cannot be raised to a fractional power' - if e.message == msg: + try: + emsg = e.message + except AttributeError: + emsg = e + + emsg = u(emsg) + + if emsg == msg: if self.engine == 'python': - raise nose.SkipTest(e.message) + raise nose.SkipTest(emsg) else: expected = np.nan - # raise on other, possibly valid ValueErrors else: raise return expected @@ -312,14 +326,20 @@ def get_expected_pow_result(self, lhs, rhs): def check_pow(self, lhs, arith1, rhs): ex = 'lhs {0} rhs'.format(arith1) expected = self.get_expected_pow_result(lhs, rhs) - result = pd.eval(ex, engine=self.engine) - assert_array_equal(result, expected) + result = pd.eval(ex, engine=self.engine, parser=self.parser) - ex = '(lhs {0} rhs) {0} rhs'.format(arith1) - result = pd.eval(ex, engine=self.engine) - expected = self.get_expected_pow_result( - self.get_expected_pow_result(lhs, rhs), rhs) - assert_array_equal(result, expected) + if (np.isscalar(lhs) and np.isscalar(rhs) and + _is_py3_complex_incompat(result, expected)): + self.assertRaises(AssertionError, assert_array_equal, result, + expected) + else: + assert_array_equal(result, expected) + + ex = '(lhs {0} rhs) {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + expected = self.get_expected_pow_result( + self.get_expected_pow_result(lhs, rhs), rhs) + assert_array_equal(result, expected) @skip_incompatible_operand def check_single_invert_op(self, lhs, cmp1, rhs): @@ -330,11 +350,13 @@ def check_single_invert_op(self, lhs, cmp1, rhs): except AttributeError: elb = np.array([bool(el)]) expected = ~elb - result = pd.eval('~elb', engine=self.engine) + result = pd.eval('~elb', engine=self.engine, parser=self.parser) 
assert_array_equal(expected, result) for engine in self.current_engines: - assert_array_equal(result, pd.eval('~elb', engine=engine)) + skip_if_no_ne(engine) + assert_array_equal(result, pd.eval('~elb', engine=engine, + parser=self.parser)) @skip_incompatible_operand def check_compound_invert_op(self, lhs, cmp1, rhs): @@ -343,12 +365,13 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): if np.isscalar(lhs) and np.isscalar(rhs): lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) expected = ~_eval_single_bin(lhs, cmp1, rhs, self.engine) - result = pd.eval(ex, engine=self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) assert_array_equal(expected, result) # make sure the other engines work the same as this one for engine in self.current_engines: - ev = pd.eval(ex, engine=self.engine) + skip_if_no_ne(engine) + ev = pd.eval(ex, engine=self.engine, parser=self.parser) assert_array_equal(ev, result) @skip_incompatible_operand @@ -367,100 +390,202 @@ def check_unary_arith_op(self, lhs, arith1, rhs, unary_op): expected = f(lhs.values) except AttributeError: expected = f(lhs) - result = pd.eval(ex, engine=self.engine) + + result = pd.eval(ex, engine=self.engine, parser=self.parser) assert_array_equal(result, expected) for engine in self.current_engines: - assert_array_equal(result, pd.eval(ex, engine=engine)) + skip_if_no_ne(engine) + assert_array_equal(result, pd.eval(ex, engine=engine, + parser=self.parser)) ex = '{0}(lhs {1} rhs)'.format(unary_op, arith1) - result = pd.eval(ex, engine=self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) -class TestEvalPython(TestEvalPandas): +class TestEvalNumexprPython(TestEvalNumexprPandas): + @classmethod + def setUpClass(cls): + skip_if_no_ne() + import numexpr as ne + cls.ne = ne + cls.engine = 'numexpr' + cls.parser = 'python' + + def setup_ops(self): + self.cmp_ops = expr._cmp_ops_syms + self.cmp2_ops = self.cmp_ops[::-1] + self.bin_ops = (s for s in expr._bool_ops_syms if s not in 
('and', 'or')) + self.special_case_ops = _special_case_arith_ops_syms + self.arith_ops = _good_arith_ops + self.unary_ops = '+', '-', '~' - def set_current_engine(self): - self.engine = 'python' + def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): + ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) + self.assertRaises(NotImplementedError, pd.eval, ex1, + local_dict={'lhs': lhs, 'mid': mid, 'rhs': rhs}, + engine=self.engine, parser=self.parser) -f = lambda *args, **kwargs: np.random.randn() +class TestEvalPythonPython(TestEvalNumexprPython): + @classmethod + def setUpClass(cls): + cls.engine = 'python' + cls.parser = 'python' + @skip_incompatible_operand + def check_modulus(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + result = pd.eval(ex, engine=self.engine) + expected = lhs % rhs + assert_allclose(result, expected) + expected = eval('expected {0} rhs'.format(arith1)) + assert_allclose(result, expected) -class TestAlignment(unittest.TestCase): + def check_alignment(self, result, nlhs, ghs, op): + try: + nlhs, ghs = nlhs.align(ghs) + except (ValueError, TypeError, AttributeError): + # ValueError: series frame or frame series align + # TypeError, AttributeError: series or frame with scalar align + pass + else: + expected = eval('nlhs {0} ghs'.format(op)) + assert_array_equal(result, expected) + +class TestEvalPythonPandas(TestEvalPythonPython): @classmethod def setUpClass(cls): - cls.index_types = 'i', 'f', 's', 'u', 'dt', # 'p' + cls.engine = 'python' + cls.parser = 'pandas' - @classmethod - def tearDownClass(cls): - del cls.index_types + def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): + # these are not compatible operands + if _series_and_2d_ndarray(lhs, mid): + self.assertRaises(ValueError, _eval_single_bin, lhs, cmp2, mid, + self.engine) + else: + lhs_new = _eval_single_bin(lhs, cmp1, mid, self.engine) + + if _series_and_2d_ndarray(mid, rhs): + self.assertRaises(ValueError, _eval_single_bin, mid, cmp2, rhs, + self.engine) + else: 
+ rhs_new = _eval_single_bin(mid, cmp2, rhs, self.engine) + + try: + lhs_new + rhs_new + except NameError: + pass + else: + # these are not compatible operands + if (com.is_series(lhs_new) and com.is_frame(rhs_new) or + _bool_and_frame(lhs_new, rhs_new)): + self.assertRaises(TypeError, _eval_single_bin, lhs_new, '&', + rhs_new, self.engine) + elif _series_and_2d_ndarray(lhs_new, rhs_new): + # TODO: once #4319 is fixed add this test back in + #self.assertRaises(Exception, _eval_single_bin, lhs_new, '&', + #rhs_new, self.engine) + pass + else: + ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) + ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2) + ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2) + expected = _eval_single_bin(lhs_new, '&', rhs_new, self.engine) - def check_align_nested_unary_op(self, engine): - skip_numexpr_engine(engine) + for ex in (ex1, ex2, ex3): + result = pd.eval(ex, engine=self.engine, + parser=self.parser) + assert_array_equal(result, expected) + + + +f = lambda *args, **kwargs: np.random.randn() + + +ENGINES_PARSERS = list(product(_engines, expr._parsers)) + + +#------------------------------------- +# basic and complex alignment + +class TestAlignment(object): + + index_types = 'i', 'f', 's', 'u', 'dt', # 'p' + + def check_align_nested_unary_op(self, engine, parser): + skip_if_no_ne(engine) s = 'df * ~2' - df = mkdf(10, 10, data_gen_f=f) - res = pd.eval(s, engine=engine) + df = mkdf(5, 3, data_gen_f=f) + res = pd.eval(s, engine=engine, parser=parser) assert_frame_equal(res, df * ~2) def test_align_nested_unary_op(self): - for engine in _engines: - self.check_align_nested_unary_op(engine) + for engine, parser in ENGINES_PARSERS: + yield self.check_align_nested_unary_op, engine, parser - def check_basic_frame_alignment(self, engine, r_idx_type, c_idx_type): - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) - df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) - res = 
pd.eval('df + df2', engine=engine) - assert_frame_equal(res, df + df2) + def check_basic_frame_alignment(self, engine, parser): + skip_if_no_ne(engine) + args = product(self.index_types, repeat=2) + for r_idx_type, c_idx_type in args: + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + res = pd.eval('df + df2', engine=engine, parser=parser) + assert_frame_equal(res, df + df2) @slow def test_basic_frame_alignment(self): - args = product(_engines, self.index_types, self.index_types) - for engine, r, c in args: - self.check_basic_frame_alignment(engine, r, c) + for engine, parser in ENGINES_PARSERS: + yield self.check_basic_frame_alignment, engine, parser - def check_frame_comparison(self, engine, r_idx_type, c_idx_type): - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) - res = pd.eval('df < 2', engine=engine) - assert_frame_equal(res, df < 2) + def check_frame_comparison(self, engine, parser): + skip_if_no_ne(engine) + args = product(self.index_types, repeat=2) + for r_idx_type, c_idx_type in args: + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + res = pd.eval('df < 2', engine=engine, parser=parser) + assert_frame_equal(res, df < 2) - df3 = DataFrame(randn(*df.shape), index=df.index, columns=df.columns) - res = pd.eval('df < df3', engine=engine) - assert_frame_equal(res, df < df3) + df3 = DataFrame(randn(*df.shape), index=df.index, + columns=df.columns) + res = pd.eval('df < df3', engine=engine, parser=parser) + assert_frame_equal(res, df < df3) @slow def test_frame_comparison(self): - args = product(_engines, self.index_types, self.index_types) - for engine, r, c in args: - self.check_frame_comparison(engine, r, c) - - def check_medium_complex_frame_alignment(self, engine, r1, r2, c1, c2): - skip_numexpr_engine(engine) - df = mkdf(5, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) - df2 
= mkdf(10, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) - df3 = mkdf(15, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) - res = pd.eval('df + df2 + df3', engine=engine) - assert_frame_equal(res, df + df2 + df3) + for engine, parser in ENGINES_PARSERS: + yield self.check_frame_comparison, engine, parser + + def check_medium_complex_frame_alignment(self, engine, parser): + skip_if_no_ne(engine) + args = product(self.index_types, repeat=4) + for r1, c1, r2, c2 in args: + df = mkdf(5, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(10, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + df3 = mkdf(15, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + res = pd.eval('df + df2 + df3', engine=engine, parser=parser) + assert_frame_equal(res, df + df2 + df3) @slow def test_medium_complex_frame_alignment(self): - args = product(_engines, *([self.index_types] * 4)) - for engine, r1, r2, c1, c2 in args: - self.check_medium_complex_frame_alignment(engine, r1, r2, c1, c2) - - def check_basic_frame_series_alignment(self, engine, r_idx_type, - c_idx_type, index_name): - def testit(): - skip_numexpr_engine(engine) + for engine, parser in ENGINES_PARSERS: + yield self.check_medium_complex_frame_alignment, engine, parser + + def check_basic_frame_series_alignment(self, engine, parser): + skip_if_no_ne(engine) + def testit(r_idx_type, c_idx_type, index_name): df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type) index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) - res = pd.eval('df + s', engine=engine) + res = pd.eval('df + s', engine=engine, parser=parser) if r_idx_type == 'dt' or c_idx_type == 'dt': if engine == 'numexpr': expected = df.add(s) @@ -470,26 +595,25 @@ def testit(): expected = df + s assert_frame_equal(res, expected) - testit() + args = product(self.index_types, self.index_types, ('index', + 'columns')) + for r_idx_type, c_idx_type, index_name in args: + testit(r_idx_type, c_idx_type, index_name) @slow def 
test_basic_frame_series_alignment(self): - args = product(_engines, self.index_types, self.index_types, - ('index', 'columns')) - for engine, r_idx_type, c_idx_type, index_name in args: - self.check_basic_frame_series_alignment(engine, r_idx_type, - c_idx_type, index_name) - - def check_basic_series_frame_alignment(self, engine, r_idx_type, - c_idx_type, index_name): - def testit(): - skip_numexpr_engine(engine) + for engine, parser in ENGINES_PARSERS: + yield self.check_basic_frame_series_alignment, engine, parser + + def check_basic_series_frame_alignment(self, engine, parser): + skip_if_no_ne(engine) + def testit(r_idx_type, c_idx_type, index_name): df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, c_idx_type=c_idx_type) index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) - res = pd.eval('s + df', engine=engine) + res = pd.eval('s + df', engine=engine, parser=parser) if r_idx_type == 'dt' or c_idx_type == 'dt': if engine == 'numexpr': expected = df.add(s) @@ -499,247 +623,237 @@ def testit(): expected = s + df assert_frame_equal(res, expected) - testit() + args = product(self.index_types, self.index_types, ('index', + 'columns')) + for r_idx_type, c_idx_type, index_name in args: + testit(r_idx_type, c_idx_type, index_name) @slow def test_basic_series_frame_alignment(self): - args = product(_engines, self.index_types, self.index_types, + for engine, parser in ENGINES_PARSERS: + yield self.check_basic_series_frame_alignment, engine, parser + + def check_series_frame_commutativity(self, engine, parser): + skip_if_no_ne(engine) + args = product(self.index_types, self.index_types, ('+', '*'), ('index', 'columns')) - for engine, r_idx_type, c_idx_type, index_name in args: - self.check_basic_series_frame_alignment(engine, r_idx_type, - c_idx_type, index_name) - - def check_series_frame_commutativity(self, engine, r_idx_type, c_idx_type, - op, index_name): - skip_numexpr_engine(engine) - df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - 
c_idx_type=c_idx_type) - index = getattr(df, index_name) - s = Series(np.random.randn(5), index[:5]) - - lhs = 's {0} df'.format(op) - rhs = 'df {0} s'.format(op) - a = pd.eval(lhs, engine=engine) - b = pd.eval(rhs, engine=engine) - - if r_idx_type != 'dt' and c_idx_type != 'dt': - if engine == 'numexpr': - assert_frame_equal(a, b) + for r_idx_type, c_idx_type, op, index_name in args: + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + lhs = 's {0} df'.format(op) + rhs = 'df {0} s'.format(op) + a = pd.eval(lhs, engine=engine, parser=parser) + b = pd.eval(rhs, engine=engine, parser=parser) + + if r_idx_type != 'dt' and c_idx_type != 'dt': + if engine == 'numexpr': + assert_frame_equal(a, b) @slow def test_series_frame_commutativity(self): - args = product(_engines, self.index_types, self.index_types, ('+', - '*'), - ('index', 'columns')) - for engine, r_idx_type, c_idx_type, op, index_name in args: - self.check_series_frame_commutativity(engine, r_idx_type, - c_idx_type, op, index_name) - - def check_complex_series_frame_alignment(self, engine, index_name, obj, r1, - r2, c1, c2): - skip_numexpr_engine(engine) - df = mkdf(10, 5, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) - df2 = mkdf(20, 5, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) - index = getattr(locals()[obj], index_name) - s = Series(np.random.randn(5), index[:5]) - - if r2 == 'dt' or c2 == 'dt': - if engine == 'numexpr': - expected2 = df2.add(s) + for engine, parser in ENGINES_PARSERS: + yield self.check_series_frame_commutativity, engine, parser + + def check_complex_series_frame_alignment(self, engine, parser): + skip_if_no_ne(engine) + index_types = [self.index_types] * 4 + args = product(('index', 'columns'), ('df', 'df2'), *index_types) + for index_name, obj, r1, r2, c1, c2 in args: + df = mkdf(10, 5, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(20, 5, data_gen_f=f, r_idx_type=r2, 
c_idx_type=c2) + index = getattr(locals()[obj], index_name) + s = Series(np.random.randn(5), index[:5]) + + if r2 == 'dt' or c2 == 'dt': + if engine == 'numexpr': + expected2 = df2.add(s) + else: + expected2 = df2 + s else: expected2 = df2 + s - else: - expected2 = df2 + s - if r1 == 'dt' or c1 == 'dt': - if engine == 'numexpr': - expected = expected2.add(df) + if r1 == 'dt' or c1 == 'dt': + if engine == 'numexpr': + expected = expected2.add(df) + else: + expected = expected2 + df else: expected = expected2 + df - else: - expected = expected2 + df - res = pd.eval('df2 + s + df', engine=engine) - self.assertEqual(res.shape, expected.shape) - assert_frame_equal(res, expected) + res = pd.eval('df2 + s + df', engine=engine, parser=parser) + assert_equal(res.shape, expected.shape) + assert_frame_equal(res, expected) @slow def test_complex_series_frame_alignment(self): - index_types = [self.index_types] * 4 - args = product(_engines, ('index', 'columns'), ('df', 'df2'), - *index_types) - for engine, index_name, obj, r1, r2, c1, c2 in args: - self.check_complex_series_frame_alignment(engine, index_name, obj, - r1, r2, c1, c2) + for engine, parser in ENGINES_PARSERS: + yield self.check_complex_series_frame_alignment, engine, parser - @slow - def test_performance_warning_for_asenine_alignment(self): + def check_performance_warning_for_asenine_alignment(self, engine, parser): + skip_if_no_ne(engine) df = DataFrame(randn(1000, 10)) s = Series(randn(10000)) - with assert_produces_warning(pd.io.common.PerformanceWarning): - pd.eval('df + s') + if engine == 'numexpr': + seen = pd.io.common.PerformanceWarning + else: + seen = False + + with assert_produces_warning(seen): + pd.eval('df + s', engine=engine, parser=parser) s = Series(randn(1000)) with assert_produces_warning(False): - pd.eval('df + s') + pd.eval('df + s', engine=engine, parser=parser) df = DataFrame(randn(10, 10000)) s = Series(randn(10000)) with assert_produces_warning(False): - pd.eval('df + s') + pd.eval('df + 
s', engine=engine, parser=parser) + + def test_performance_warning_for_asenine_alignment(self): + for engine, parser in ENGINES_PARSERS: + yield self.check_performance_warning_for_asenine_alignment, engine, parser -class TestOperations(unittest.TestCase): +#------------------------------------ +# slightly more complex ops - def check_simple_arith_ops(self, engine): +class TestOperationsNumExprPandas(unittest.TestCase): + @classmethod + def setUpClass(cls): + skip_if_no_ne() + cls.engine = 'numexpr' + cls.parser = 'pandas' + + @classmethod + def tearDownClass(cls): + del cls.engine, cls.parser + + def eval(self, *args, **kwargs): + kwargs['engine'] = self.engine + kwargs['parser'] = self.parser + return pd.eval(*args, **kwargs) + + def test_simple_arith_ops(self): ops = expr._arith_ops_syms + expr._cmp_ops_syms for op in filter(lambda x: x != '//', ops): - expec = _eval_single_bin(1, op, 1, engine) - x = pd.eval('1 {0} 1'.format(op), engine=engine) + expec = _eval_single_bin(1, op, 1, self.engine) + x = self.eval('1 {0} 1'.format(op)) assert_equal(x, expec) - expec = _eval_single_bin(x, op, 1, engine) - y = pd.eval('x {0} 1'.format(op), engine=engine) + expec = _eval_single_bin(x, op, 1, self.engine) + y = self.eval('x {0} 1'.format(op), local_dict={'x': x}) assert_equal(y, expec) - expec = _eval_single_bin(1, op, x + 1, engine) - y = pd.eval('1 {0} (x + 1)'.format(op), engine=engine) + expec = _eval_single_bin(1, op, x + 1, self.engine) + y = self.eval('1 {0} (x + 1)'.format(op), local_dict={'x': x}) assert_equal(y, expec) - def check_simple_bool_ops(self, engine): + def test_simple_bool_ops(self): for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): - expec = _eval_single_bin(lhs, op, rhs, engine) - x = pd.eval('lhs {0} rhs'.format(op), engine=engine) + expec = _eval_single_bin(lhs, op, rhs, self.engine) + x = self.eval('lhs {0} rhs'.format(op), local_dict={'lhs': lhs, + 'rhs': rhs}) assert_equal(x, expec) - def 
check_bool_ops_with_constants(self, engine): + def test_bool_ops_with_constants(self): asteval = ast.literal_eval for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), ('True', 'False')): - expec = _eval_single_bin(asteval(lhs), op, asteval(rhs), engine) - x = pd.eval('{0} {1} {2}'.format(lhs, op, rhs), engine=engine) + expec = _eval_single_bin(asteval(lhs), op, asteval(rhs), + self.engine) + x = self.eval('{0} {1} {2}'.format(lhs, op, rhs), + local_dict={'lhs': lhs, 'rhs': rhs}) assert_equal(x, expec) - def test_simple_arith_ops(self): - for engine in _engines: - self.check_simple_arith_ops(engine) - - def test_simple_bool_ops(self): - for engine in _engines: - self.check_simple_bool_ops(engine) - - def test_bool_ops_with_constants(self): - for engine in _engines: - self.check_bool_ops_with_constants(engine) - - def check_panel_fails(self, engine): + def test_panel_fails(self): x = Panel(randn(3, 4, 5)) y = Series(randn(10)) - assert_raises(NotImplementedError, pd.eval, 'x + y', - local_dict={'x': x, 'y': y}, engine=engine) - - def test_panel_fails(self): - for engine in _engines: - self.check_panel_fails(engine) + assert_raises(NotImplementedError, self.eval, 'x + y', + local_dict={'x': x, 'y': y}) - def check_4d_ndarray_fails(self, engine): + def test_4d_ndarray_fails(self): x = randn(3, 4, 5, 6) y = Series(randn(10)) - assert_raises(NotImplementedError, pd.eval, 'x + y', local_dict={'x': x, - 'y': y}, - engine=engine) - - def test_4d_ndarray_fails(self): - for engine in _engines: - self.check_4d_ndarray_fails(engine) - - def check_constant(self, engine): - x = pd.eval('1', engine=engine) - assert_equal(x, 1) + assert_raises(NotImplementedError, self.eval, 'x + y', + local_dict={'x': x, 'y': y}) def test_constant(self): - for engine in _engines: - self.check_constant(engine) + x = self.eval('1') + assert_equal(x, 1) - def check_single_variable(self, engine): + def test_single_variable(self): df = DataFrame(randn(10, 2)) - df2 = pd.eval('df', 
engine=engine) + df2 = self.eval('df', local_dict={'df': df}) assert_frame_equal(df, df2) - def test_single_variable(self): - for engine in _engines: - self.check_single_variable(engine) - def test_truediv(self): - for engine in _engines: - self.check_truediv(engine) - - def check_truediv(self, engine): s = np.array([1]) ex = 's / 1' + d = {'s': s} if PY3: - res = pd.eval(ex, truediv=False) + res = self.eval(ex, truediv=False, local_dict=d) assert_array_equal(res, np.array([1.0])) - res = pd.eval(ex, truediv=True) + res = self.eval(ex, truediv=True, local_dict=d) assert_array_equal(res, np.array([1.0])) + + res = self.eval('1 / 2', truediv=True) + expec = 0.5 + self.assertEqual(res, expec) + + res = self.eval('1 / 2', truediv=False) + expec = 0.5 + self.assertEqual(res, expec) + + res = self.eval('s / 2', truediv=False, local_dict={'s': s}) + expec = 0.5 + self.assertEqual(res, expec) + + res = self.eval('s / 2', truediv=True, local_dict={'s': s}) + expec = 0.5 + self.assertEqual(res, expec) else: - res = pd.eval(ex, truediv=False) + res = self.eval(ex, truediv=False, local_dict=d) assert_array_equal(res, np.array([1])) - res = pd.eval(ex, truediv=True) + res = self.eval(ex, truediv=True, local_dict=d) assert_array_equal(res, np.array([1.0])) - def test_python_fails_and(self): - df = DataFrame(np.random.randn(5, 3)) - self.assertRaises(NotImplementedError, pd.eval, 'df > 2 and df > 3', - local_dict={'df': df}, parser='python') - - def test_python_fails_or(self): - df = DataFrame(np.random.randn(5, 3)) - self.assertRaises(NotImplementedError, pd.eval, 'df > 2 or df > 3', - local_dict={'df': df}, parser='python') + res = self.eval('1 / 2', truediv=True) + expec = 0.5 + self.assertEqual(res, expec) - def test_python_fails_not(self): - df = DataFrame(np.random.randn(5, 3)) - self.assertRaises(NotImplementedError, pd.eval, 'not df > 2', - local_dict={'df': df}, parser='python') + res = self.eval('1 / 2', truediv=False) + expec = 0 + self.assertEqual(res, expec) - def 
test_python_fails_ampersand(self): - df = DataFrame(np.random.randn(5, 3)) - self.assertRaises(TypeError, pd.eval, - '(df + 2)[df > 1] > 0 & (df > 0)', - local_dict={'df': df}, parser='python') - - def test_python_fails_pipe(self): - df = DataFrame(np.random.randn(5, 3)) - self.assertRaises(TypeError, pd.eval, - '(df + 2)[df > 1] > 0 | (df > 0)', - local_dict={'df': df}, parser='python') + res = self.eval('s / 2', truediv=False, local_dict={'s': s}) + expec = 0 + self.assertEqual(res, expec) - def check_failing_subscript_with_name_error(self, engine): - df = DataFrame(np.random.randn(5, 3)) - self.assertRaises(NameError, pd.eval, 'df[x > 2] > 2', - local_dict={'df': df}, engine=engine) + res = self.eval('s / 2', truediv=True, local_dict={'s': s}) + expec = 0.5 + self.assertEqual(res, expec) def test_failing_subscript_with_name_error(self): - for engine in _engines: - self.check_failing_subscript_with_name_error(engine) + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NameError, self.eval, 'df[x > 2] > 2', + local_dict={'df': df}) - def check_lhs_expression_subscript(self, engine): + def test_lhs_expression_subscript(self): df = DataFrame(np.random.randn(5, 3)) - result = pd.eval('(df + 1)[df > 2]', engine=engine) + result = self.eval('(df + 1)[df > 2]', local_dict={'df': df}) expected = (df + 1)[df > 2] assert_frame_equal(result, expected) - def test_lhs_expression_subscript(self): - for engine in _engines: - self.check_lhs_expression_subscript(engine) - - def check_attr_expression(self, engine): + def test_attr_expression(self): df = DataFrame(np.random.randn(5, 3), columns=list('abc')) expr1 = 'df.a < df.b' expec1 = df.a < df.b @@ -750,197 +864,265 @@ def check_attr_expression(self, engine): exprs = expr1, expr2, expr3 expecs = expec1, expec2, expec3 for e, expec in zip(exprs, expecs): - assert_series_equal(expec, pd.eval(e, engine=engine)) + assert_series_equal(expec, self.eval(e, local_dict={'df': df})) - def test_attr_expression(self): - for engine 
in _engines: - self.check_attr_expression(engine) - - def check_assignment_fails(self, engine, parser): + def test_assignment_fails(self): df = DataFrame(np.random.randn(5, 3), columns=list('abc')) df2 = DataFrame(np.random.randn(5, 3)) expr1 = 'df = df2' - self.assertRaises(NotImplementedError, pd.eval, expr1, - local_dict={'df': df, 'df2': df2}, engine=engine, - parser=parser) + self.assertRaises(NotImplementedError, self.eval, expr1, + local_dict={'df': df, 'df2': df2}) - def test_assignment_fails(self): - for engine, parser in product(_engines.iterkeys(), ('pandas', - 'python')): - self.check_assignment_fails(engine, parser) - - def check_basic_period_index_boolean_expression(self, engine): + def test_basic_period_index_boolean_expression(self): df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') e = df < 2 - r = pd.eval('df < 2', engine=engine) + r = self.eval('df < 2', local_dict={'df': df}) x = df < 2 assert_frame_equal(r, e) assert_frame_equal(x, e) - def test_basic_period_index_expression_python(self): - for engine in _engines: - self.check_basic_period_index_boolean_expression(engine) - - def check_basic_period_index_subscript_expression(self, engine): + def test_basic_period_index_subscript_expression(self): df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') - r = pd.eval('df[df < 2 + 3]', engine=engine) + r = self.eval('df[df < 2 + 3]', local_dict={'df': df}) e = df[df < 2 + 3] assert_frame_equal(r, e) - def test_basic_period_index_subscript_expression(self): - for engine in _engines: - self.check_basic_period_index_subscript_expression(engine) - - def check_nested_period_index_subscript_expression(self, engine): + def test_nested_period_index_subscript_expression(self): df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') - r = pd.eval('df[df[df < 2] < 2] + df * 2', engine=engine) + r = self.eval('df[df[df < 2] < 2] + df * 2', local_dict={'df': df}) e = df[df[df < 2] < 2] + df * 2 assert_frame_equal(r, e) - def 
test_nested_period_index_subscript_expression(self): - for engine in _engines: - self.check_nested_period_index_subscript_expression(engine) - - def test_simple_not_expression(self): - df = DataFrame(randn(10, 3), columns=list('abc')) - df['bools'] = rand(len(df)) > 0.5 - res = df['not bools'] - res2 = df['~bools'] - expec = df[~df.bools] - assert_frame_equal(res, expec) - assert_frame_equal(res2, expec) - - def test_complex_boolean_expression(self): - df = DataFrame(randn(10, 3), columns=list('abc')) - df['bools'] = rand(len(df)) > 0.5 - res = df['a < b < c and (not bools) or bools > 2'] - expec = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] - assert_frame_equal(res, expec) + def test_date_boolean(self): + df = DataFrame(randn(5, 3)) + df['dates1'] = date_range('1/1/2012', periods=5) + res = self.eval('df.dates1 < 20130101', local_dict={'df': df}) + expec = df.dates1 < '20130101' + assert_series_equal(res, expec) + + +class TestOperationsNumExprPython(TestOperationsNumExprPandas): + @classmethod + def setUpClass(cls): + if not _USE_NUMEXPR: + raise nose.SkipTest("numexpr engine not installed") + cls.engine = 'numexpr' + cls.parser = 'python' + + def test_fails_and(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NotImplementedError, pd.eval, 'df > 2 and df > 3', + local_dict={'df': df}, parser=self.parser, + engine=self.engine) + + def test_fails_or(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NotImplementedError, pd.eval, 'df > 2 or df > 3', + local_dict={'df': df}, parser=self.parser, + engine=self.engine) + + def test_fails_not(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(NotImplementedError, pd.eval, 'not df > 2', + local_dict={'df': df}, parser=self.parser, + engine=self.engine) + + def test_fails_ampersand(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(TypeError, pd.eval, + '(df + 2)[df > 1] > 0 & (df > 0)', + local_dict={'df': df}, parser=self.parser, + 
engine=self.engine) + + def test_fails_pipe(self): + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(TypeError, pd.eval, + '(df + 2)[df > 1] > 0 | (df > 0)', + local_dict={'df': df}, parser=self.parser, + engine=self.engine) + + def test_bool_ops_with_constants(self): + from ast import literal_eval as asteval + for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), + ('True', 'False')): + if op not in ('and', 'or'): + expec = _eval_single_bin(asteval(lhs), op, asteval(rhs), + self.engine) + x = self.eval('{0} {1} {2}'.format(lhs, op, rhs), + local_dict={'lhs': lhs, 'rhs': rhs}) + assert_equal(x, expec) + else: + self.assertRaises(NotImplementedError, + self.eval, + '{0} {1} {2}'.format(lhs, op, rhs), + local_dict={'lhs': lhs, 'rhs': rhs}) + + def test_simple_bool_ops(self): + for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, + False)): + if op not in ('and', 'or'): + expec = _eval_single_bin(lhs, op, rhs, self.engine) + x = self.eval('lhs {0} rhs'.format(op), local_dict={'lhs': lhs, + 'rhs': rhs}) + assert_equal(x, expec) + else: + self.assertRaises(NotImplementedError, + self.eval, + 'lhs {0} rhs'.format(op), + local_dict={'lhs': lhs, 'rhs': rhs}) + + +class TestOperationsPythonPython(TestOperationsNumExprPython): + @classmethod + def setUpClass(cls): + cls.engine = cls.parser = 'python' + + def test_fails_ampersand(self): + raise nose.SkipTest("known failer for now") + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(TypeError, pd.eval, + '(df + 2)[df > 1] > 0 & (df > 0)', + local_dict={'df': df}, parser=self.parser, + engine=self.engine) + + def test_fails_pipe(self): + raise nose.SkipTest("known failer for now") + df = DataFrame(np.random.randn(5, 3)) + self.assertRaises(TypeError, pd.eval, + '(df + 2)[df > 1] > 0 | (df > 0)', + local_dict={'df': df}, parser=self.parser, + engine=self.engine) + + +class TestOperationsPythonPandas(TestOperationsNumExprPandas): + @classmethod + def setUpClass(cls): + cls.engine 
= 'python' + cls.parser = 'pandas' + _var_s = randn(10) -class TestScope(unittest.TestCase): - def check_global_scope(self, engine): - e = '_var_s * 2' - assert_array_equal(_var_s * 2, pd.eval(e, engine=engine)) +class TestScope(object): + def check_global_scope(self, e, engine, parser): + skip_if_no_ne(engine) + assert_array_equal(_var_s * 2, pd.eval(e, engine=engine, + parser=parser)) def test_global_scope(self): - for engine in _engines: - self.check_global_scope(engine) + e = '_var_s * 2' + for engine, parser in product(_engines, expr._parsers): + yield self.check_global_scope, e, engine, parser - def check_no_new_locals(self, engine): + def check_no_new_locals(self, engine, parser): + skip_if_no_ne(engine) x = 1 lcls = locals().copy() - pd.eval('x + 1', local_dict=lcls) + pd.eval('x + 1', local_dict=lcls, engine=engine, parser=parser) lcls2 = locals().copy() lcls2.pop('lcls') assert_equal(lcls, lcls2) def test_no_new_locals(self): - for engine in _engines: - self.check_no_new_locals(engine) + for engine, parser in product(_engines, expr._parsers): + yield self.check_no_new_locals, engine, parser - def check_no_new_globals(self, engine): + def check_no_new_globals(self, engine, parser): + skip_if_no_ne(engine) x = 1 gbls = globals().copy() - pd.eval('x + 1') + pd.eval('x + 1', engine=engine, parser=parser) gbls2 = globals().copy() assert_equal(gbls, gbls2) def test_no_new_globals(self): - for engine in _engines: - self.check_no_new_globals(engine) - - def check_nested_scope(self, engine): - # smoke test - x = 1 - result = pd.eval('x + 1', engine=engine) - self.assertEqual(result, 2) - - df = DataFrame(np.random.randn(5, 3)) - df2 = DataFrame(np.random.randn(5, 3)) - expected = df[(df>0) & (df2>0)] - - result = df['(df>0) & (df2>0)'] - assert_frame_equal(result, expected) - - result = df.query('(df>0) & (df2>0)', engine=engine) - assert_frame_equal(result, expected) - - result = pd.eval('df[(df > 0) and (df2 > 0)]', engine=engine) - assert_frame_equal(result, 
expected) - - result = pd.eval('df[(df > 0) and (df2 > 0) and df[df > 0] > 0]', engine=engine) - expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] - assert_frame_equal(result, expected) - - result = pd.eval('df[(df>0) & (df2>0)]',engine=engine) - expected = df.query('(df>0) & (df2>0)', engine=engine) - assert_frame_equal(result, expected) - - def test_nested_scope(self): - for engine in _engines: - self.check_nested_scope(engine) + for engine, parser in product(_engines, expr._parsers): + yield self.check_no_new_globals, engine, parser def test_invalid_engine(): + skip_if_no_ne() assertRaisesRegexp(KeyError, 'Invalid engine \'asdf\' passed', pd.eval, 'x + y', local_dict={'x': 1, 'y': 2}, engine='asdf') def test_invalid_parser(): + skip_if_no_ne() assertRaisesRegexp(KeyError, 'Invalid parser \'asdf\' passed', pd.eval, 'x + y', local_dict={'x': 1, 'y': 2}, parser='asdf') -def check_is_expr(engine): +def check_is_expr_syntax(engine): + skip_if_no_ne(engine) s = 1 - valid = 's + 1' - invalid = 's +' + valid1 = 's + 1' + valid2 = '__y + _xx' + assert_true(expr.isexpr(valid1, check_names=False)) + assert_true(expr.isexpr(valid2, check_names=False)) + + +def check_is_expr_names(engine): + skip_if_no_ne(engine) + r, s = 1, 2 + valid = 's + r' + invalid = '__y + __x' assert_true(expr.isexpr(valid, check_names=True)) - assert_true(expr.isexpr(valid, check_names=False)) - assert_false(expr.isexpr(invalid, check_names=False)) assert_false(expr.isexpr(invalid, check_names=True)) -def test_is_expr(): +def test_is_expr_syntax(): for engine in _engines: - check_is_expr(engine) + yield check_is_expr_syntax, engine + + +def test_is_expr_names(): + for engine in _engines: + yield check_is_expr_names, engine _parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor, 'pandas': PandasExprVisitor} -def check_disallowed_nodes(visitor): - """make sure the disallowed decorator works""" - VisitorClass = _parsers[visitor] +def check_disallowed_nodes(engine, parser): + 
skip_if_no_ne(engine) + VisitorClass = _parsers[parser] uns_ops = VisitorClass.unsupported_nodes - inst = VisitorClass('x + 1') + inst = VisitorClass('x + 1', engine, parser) for ops in uns_ops: assert_raises(NotImplementedError, getattr(inst, ops)) def test_disallowed_nodes(): - for visitor in _parsers: - check_disallowed_nodes(visitor) + for engine, visitor in product(_parsers, repeat=2): + yield check_disallowed_nodes, engine, visitor + + +def check_syntax_error_exprs(engine, parser): + skip_if_no_ne(engine) + e = 's +' + assert_raises(SyntaxError, pd.eval, e, engine=engine, parser=parser) def test_syntax_error_exprs(): - for engine in _engines: - e = 's +' - assert_raises(SyntaxError, pd.eval, e, engine=engine) + for engine, parser in ENGINES_PARSERS: + yield check_syntax_error_exprs, engine, parser + + +def check_name_error_exprs(engine, parser): + skip_if_no_ne(engine) + e = 's + t' + assert_raises(NameError, pd.eval, e, engine=engine, parser=parser) def test_name_error_exprs(): - for engine in _engines: - e = 's + t' - assert_raises(NameError, pd.eval, e, engine=engine) + for engine, parser in ENGINES_PARSERS: + yield check_name_error_exprs, engine, parser if __name__ == '__main__': diff --git a/pandas/core/common.py b/pandas/core/common.py index 757d3eb6f1925..8ada14485a780 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -9,8 +9,11 @@ import csv import sys +from distutils.version import LooseVersion + from numpy.lib.format import read_array, write_array import numpy as np + import pandas.algos as algos import pandas.lib as lib import pandas.tslib as tslib @@ -227,6 +230,21 @@ def notnull(obj): def flatten(l): + """Flatten an arbitrarily nested sequence. + + Parameters + ---------- + l : sequence + The non string sequence to flatten + + Notes + ----- + This doesn't consider strings sequences. 
+ + Returns + ------- + flattened : generator + """ for el in l: if isinstance(el, collections.Iterable) and not is_string(el): for s in flatten(el): @@ -1669,7 +1687,7 @@ def is_bool(obj): def is_string(obj): - return isinstance(obj, basestring) + return isinstance(obj, string_types) def is_series(obj): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 3695a994bf0a8..3744363138214 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2,12 +2,9 @@ High level interface to PyTables for reading and writing pandas data structures to disk """ -from __future__ import print_function # pylint: disable-msg=E1101,W0613,W0603 from datetime import datetime, date -from pandas.compat import map, range, zip, lrange, lmap, u -from pandas import compat import time import re import copy @@ -22,7 +19,7 @@ from pandas.sparse.array import BlockIndex, IntIndex from pandas.tseries.api import PeriodIndex, DatetimeIndex from pandas.core.base import StringMixin -from pandas.core.common import adjoin, is_list_like, pprint_thing +from pandas.core.common import adjoin, pprint_thing from pandas.core.algorithms import match, unique from pandas.core.categorical import Categorical from pandas.core.common import _asarray_tuplesafe @@ -32,6 +29,8 @@ from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type import pandas.core.common as com from pandas.tools.merge import concat +from pandas import compat +from pandas.compat import u, PY3, range from pandas.io.common import PerformanceWarning from pandas.core.config import get_option from pandas.computation.pytables import Expr @@ -59,7 +58,7 @@ def _ensure_decoded(s): def _ensure_encoding(encoding): # set the encoding if we need if encoding is None: - if compat.PY3: + if PY3: encoding = _default_encoding return encoding @@ -264,7 +263,8 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, f = lambda store: store.put(key, value, **kwargs) if isinstance(path_or_buf, 
compat.string_types): - with get_store(path_or_buf, mode=mode, complevel=complevel, complib=complib) as store: + with get_store(path_or_buf, mode=mode, complevel=complevel, + complib=complib) as store: f(store) else: f(path_or_buf) @@ -499,7 +499,7 @@ def open(self, mode='a'): self._handle = h5_open(self._path, self._mode) except IOError as e: # pragma: no cover if 'can not be written' in str(e): - print('Opening %s in read-only mode' % self._path) + print ('Opening %s in read-only mode' % self._path) self._handle = h5_open(self._path, 'r') else: raise @@ -654,7 +654,9 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, compat.string_types): - return self.select(key=keys, where=where, columns=columns, start=start, stop=stop, iterator=iterator, chunksize=chunksize, **kwargs) + return self.select(key=keys, where=where, columns=columns, + start=start, stop=stop, iterator=iterator, + chunksize=chunksize, **kwargs) if not isinstance(keys, (list, tuple)): raise TypeError("keys must be a list/tuple") @@ -1537,7 +1539,7 @@ def __init__(self, values=None, kind=None, typ=None, super(DataCol, self).__init__( values=values, kind=kind, typ=typ, cname=cname, **kwargs) self.dtype = None - self.dtype_attr = u("%s_dtype") % self.name + self.dtype_attr = u("%s_dtype" % self.name) self.set_data(data) def __unicode__(self): @@ -3474,15 +3476,14 @@ def write_data_chunk(self, indexes, mask, values): rows = rows[~mask.ravel().astype(bool)] except Exception as detail: - raise Exception("cannot create row-data -> %s" % str(detail)) + raise Exception("cannot create row-data -> %s" % detail) try: if len(rows): self.table.append(rows) self.table.flush() except Exception as detail: - raise Exception( - "tables cannot write this data -> %s" % str(detail)) + raise TypeError("tables cannot write this data -> %s" % detail) def delete(self, where=None, **kwargs): @@ -3526,7 +3527,7 
@@ def delete(self, where=None, **kwargs): # we must remove in reverse order! pg = groups.pop() for g in reversed(groups): - rows = l.take(lrange(g, pg)) + rows = l.take(range(g, pg)) table.removeRows(start=rows[rows.index[0] ], stop=rows[rows.index[-1]] + 1) pg = g @@ -4239,19 +4240,7 @@ def generate(self, where): if where is None: return None - if not isinstance(where, (list, tuple)): - where = [where] - else: - - # make this a list of we think that we only have a sigle term & no - # operands inside any terms - if not any([isinstance(w, (list, tuple, Term)) for w in where]): - - if not any([isinstance(w, compat.string_types) and Term._search.match(w) for w in where]): - where = [where] - - queryables = self.table.queryables() - return [Term(c, queryables=queryables, encoding=self.table.encoding) for c in where] + return Expr(where, queryables=self.table.queryables(), encoding=self.table.encoding) def select(self): """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index dfcbf0a984dab..b13c8e83d8777 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1,5 +1,3 @@ -from __future__ import print_function -from pandas.compat import range, lrange, u import nose import unittest import sys @@ -24,8 +22,8 @@ assert_series_equal) from pandas import concat, Timestamp from pandas import compat, _np_version_under1p7 -from pandas.core import common as com - +from pandas.compat import range, lrange, u +from pandas.util.testing import assert_produces_warning try: import tables @@ -752,7 +750,7 @@ def test_encoding(self): raise nose.SkipTest('system byteorder is not little, skipping test_encoding!') with ensure_clean(self.path) as store: - df = DataFrame(dict(A='foo',B='bar'),index=lrange(5)) + df = DataFrame(dict(A='foo',B='bar'),index=range(5)) df.loc[2,'A'] = np.nan df.loc[3,'B'] = np.nan _maybe_remove(store, 'df') @@ -906,7 +904,7 @@ def test_append_with_different_block_ordering(self): for i in range(10): 
df = DataFrame(np.random.randn(10,2),columns=list('AB')) - df['index'] = lrange(10) + df['index'] = range(10) df['index'] += i*10 df['int64'] = Series([1]*len(df),dtype='int64') df['int16'] = Series([1]*len(df),dtype='int16') @@ -1082,7 +1080,7 @@ def check_col(key,name,size): def check_col(key,name,size): self.assert_(getattr(store.get_storer(key).table.description,name).itemsize == size) - df = DataFrame(dict(A = 'foo', B = 'bar'),index=lrange(10)) + df = DataFrame(dict(A = 'foo', B = 'bar'),index=range(10)) # a min_itemsize that creates a data_column _maybe_remove(store, 'df') @@ -1317,9 +1315,8 @@ def test_big_table_frame(self): raise nose.SkipTest('no big table frame') # create and write a big table - df = DataFrame(np.random.randn(2000 * 100, 100), - index=lrange(2000 * 100), - columns=['E%03d' % i for i in range(100)]) + df = DataFrame(np.random.randn(2000 * 100, 100), index=range( + 2000 * 100), columns=['E%03d' % i for i in range(100)]) for x in range(20): df['String%03d' % x] = 'string%03d' % x @@ -1331,7 +1328,7 @@ def test_big_table_frame(self): recons = store.select('df') assert isinstance(recons, DataFrame) - print("\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x)) + print ("\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x)) def test_big_table2_frame(self): # this is a really big table: 1m rows x 60 float columns, 20 string, 20 datetime @@ -1342,15 +1339,14 @@ def test_big_table2_frame(self): print ("\nbig_table2 start") import time start_time = time.time() - df = DataFrame(np.random.randn(1000 * 1000, 60), - index=lrange(int(1000 * 1000)), - columns=['E%03d' % i for i in range(60)]) + df = DataFrame(np.random.randn(1000 * 1000, 60), index=range(int( + 1000 * 1000)), columns=['E%03d' % i for i in range(60)]) for x in range(20): df['String%03d' % x] = 'string%03d' % x for x in range(20): df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0) - print("\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f" + print 
("\nbig_table2 frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time)) def f(chunksize): @@ -1361,9 +1357,9 @@ def f(chunksize): for c in [10000, 50000, 250000]: start_time = time.time() - print("big_table2 frame [chunk->%s]" % c) + print ("big_table2 frame [chunk->%s]" % c) rows = f(c) - print("big_table2 frame [rows->%s,chunk->%s] -> %5.2f" + print ("big_table2 frame [rows->%s,chunk->%s] -> %5.2f" % (rows, c, time.time() - start_time)) def test_big_put_frame(self): @@ -1372,14 +1368,14 @@ def test_big_put_frame(self): print ("\nbig_put start") import time start_time = time.time() - df = DataFrame(np.random.randn(1000 * 1000, 60), index=lrange(int( + df = DataFrame(np.random.randn(1000 * 1000, 60), index=range(int( 1000 * 1000)), columns=['E%03d' % i for i in range(60)]) for x in range(20): df['String%03d' % x] = 'string%03d' % x for x in range(20): df['datetime%03d' % x] = datetime.datetime(2001, 1, 2, 0, 0) - print("\nbig_put frame (creation of df) [rows->%s] -> %5.2f" + print ("\nbig_put frame (creation of df) [rows->%s] -> %5.2f" % (len(df.index), time.time() - start_time)) with ensure_clean(self.path, mode='w') as store: @@ -1387,8 +1383,8 @@ def test_big_put_frame(self): store = HDFStore(self.path, mode='w') store.put('df', df) - print(df.get_dtype_counts()) - print("big_put frame [shape->%s] -> %5.2f" + print (df.get_dtype_counts()) + print ("big_put frame [shape->%s] -> %5.2f" % (df.shape, time.time() - start_time)) def test_big_table_panel(self): @@ -1414,7 +1410,7 @@ def test_big_table_panel(self): recons = store.select('wp') assert isinstance(recons, Panel) - print("\nbig_table panel [%s] -> %5.2f" % (rows, time.time() - x)) + print ("\nbig_table panel [%s] -> %5.2f" % (rows, time.time() - x)) def test_append_diff_item_order(self): @@ -1715,7 +1711,7 @@ def test_unimplemented_dtypes_table_columns(self): # py3 ok for unicode if not compat.PY3: - l.append(('unicode', u('\u03c3'))) + l.append(('unicode', u('\\u03c3'))) ### 
currently not supported dtypes #### for n, f in l: @@ -1764,14 +1760,14 @@ def compare(a,b): compare(store.select('df_tz',where=Term('A>=df.A[3]')),df[df.A>=df.A[3]]) _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130103',tz='US/Eastern')),index=lrange(5)) + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130103',tz='US/Eastern')),index=range(5)) store.append('df_tz',df) result = store['df_tz'] compare(result,df) assert_frame_equal(result,df) _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='EET')),index=lrange(5)) + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='EET')),index=range(5)) self.assertRaises(TypeError, store.append, 'df_tz', df) # this is ok @@ -1782,7 +1778,7 @@ def compare(a,b): assert_frame_equal(result,df) # can't append with diff timezone - df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='CET')),index=lrange(5)) + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='CET')),index=range(5)) self.assertRaises(ValueError, store.append, 'df_tz', df) # as index @@ -2679,7 +2675,7 @@ def test_select_with_many_inputs(self): df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), A=np.random.randn(300), - B=lrange(300), + B=range(300), users = ['a']*50 + ['b']*50 + ['c']*100 + ['a%03d' % i for i in range(100)])) _maybe_remove(store, 'df') store.append('df', df, data_columns=['ts', 'A', 'B', 'users']) @@ -2700,7 +2696,7 @@ def test_select_with_many_inputs(self): expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(selector) ] tm.assert_frame_equal(expected, result) - selector = lrange(100,200) + selector = range(100,200) result = store.select('df', [Term('B=selector')]) expected = df[ df.B.isin(selector) ] tm.assert_frame_equal(expected, result) 
@@ -2948,6 +2944,11 @@ def test_frame_select_complex(self): expected = df.loc[:,df.columns-['A','B']] tm.assert_frame_equal(result, expected) + # in + result = store.select('df', "index>df.index[3] & columns in ['A','B']") + expected = df.loc[df.index>df.index[3]].reindex(columns=['A','B']) + tm.assert_frame_equal(result, expected) + def test_invalid_filtering(self): # can't use more than one filter (atm) @@ -3029,7 +3030,7 @@ def f(): # valid result = store.select_column('df', 'index') tm.assert_almost_equal(result.values, Series(df.index).values) - tm.assert_isinstance(result,Series) + self.assert_(isinstance(result,Series)) # not a data indexable column self.assertRaises( @@ -3228,18 +3229,6 @@ def test_select_as_multiple(self): tm.assert_frame_equal(result, expected) # multiple (diff selector) - try: - result = store.select_as_multiple(['df1', 'df2'], where=[Term( - 'index', '>', df2.index[4])], selector='df2') - expected = concat([df1, df2], axis=1) - expected = expected[5:] - tm.assert_frame_equal(result, expected) - except (Exception), detail: - print ("error in select_as_multiple %s" % str(detail)) - print ("store: %s" % store) - print ("df1: %s" % df1) - print ("df2: %s" % df2) - result = store.select_as_multiple(['df1', 'df2'], where=[Term( 'index>df2.index[4]')], selector='df2') expected = concat([df1, df2], axis=1) @@ -3267,7 +3256,7 @@ def test_start_stop(self): result = store.select( 'df', [Term("columns=['A']")], start=30, stop=40) assert(len(result) == 0) - tm.assert_isinstance(result, DataFrame) + assert(type(result) == DataFrame) def test_select_filter_corner(self): @@ -3505,7 +3494,7 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): # check keys if keys is None: - keys = list(store.keys()) + keys = store.keys() self.assert_(set(keys) == set(tstore.keys())) # check indicies & nrows diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 96131d782893f..5fc02d97b239e 100644 --- 
a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -4,12 +4,12 @@ import nose from nose.tools import assert_equal -import unittest import numpy as np from pandas.tslib import iNaT -from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp -import pandas.compat as compat +from pandas import (Series, DataFrame, date_range, DatetimeIndex, Timestamp, + Panel) +from pandas import compat from pandas.compat import range, long, lrange, lmap, u from pandas.core.common import notnull, isnull import pandas.compat as compat @@ -17,11 +17,8 @@ import pandas.util.testing as tm import pandas.core.config as cf -import numpy as np from numpy.random import randn -from pandas.tslib import iNaT - _multiprocess_can_split_ = True @@ -114,15 +111,18 @@ def test_isnull_lists(): def test_is_string(): - class MyString(str): + class MyUnicode(compat.text_type): pass - class MyUnicode(unicode): - pass + if not compat.PY3: + class MyString(str): + pass + else: + MyString = MyUnicode strings = ('s', np.str_('a'), np.unicode_('unicode_string'), - MyString('a _string blah'), u'asdf', MyUnicode(u'asdf')) - not_strings = [], 1, {}, set(), np.array(['1']), np.array([u'1']) + MyString('asdfasdfasdf'), u('asdf'), MyUnicode(u('asdf'))) + not_strings = [], 1, {}, set(), np.array(['1']), np.array([u('1')]) for string in strings: assert com.is_string(string), '{0} is not a string'.format(string) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index ff76c7c070946..f81620b897a4a 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -4,31 +4,25 @@ import unittest import nose -import operator -from numpy import random, nan from numpy.random import randn + +import operator import numpy as np from numpy.testing import assert_array_equal -import pandas as pan -from pandas.core.api import DataFrame, Series, notnull, isnull -from pandas.core import expressions as expr +from pandas.core.api import DataFrame +from 
pandas.computation import expressions as expr -from pandas.util.testing import (assert_almost_equal, - assert_series_equal, - assert_frame_equal) +from pandas.util.testing import assert_series_equal, assert_frame_equal from pandas import compat -import pandas.util.testing as tm -import pandas.lib as lib - -from numpy.testing.decorators import slow if not expr._USE_NUMEXPR: - raise nose.SkipTest + raise nose.SkipTest("numexpr not available") + -_frame = DataFrame(np.random.randn(10000, 4), columns = list('ABCD'), dtype='float64') -_frame2 = DataFrame(np.random.randn(100, 4), columns = list('ABCD'), dtype='float64') +_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64') +_frame2 = DataFrame(randn(100, 4), columns = list('ABCD'), dtype='float64') _mixed = DataFrame({ 'A' : _frame['A'].copy(), 'B' : _frame['B'].astype('float32'), 'C' : _frame['C'].astype('int64'), 'D' : _frame['D'].astype('int32') }) _mixed2 = DataFrame({ 'A' : _frame2['A'].copy(), 'B' : _frame2['B'].astype('float32'), 'C' : _frame2['C'].astype('int64'), 'D' : _frame2['D'].astype('int32') }) _integer = DataFrame(np.random.randint(1, 100, size=(10001, 4)), columns = list('ABCD'), dtype='int64') @@ -128,11 +122,11 @@ def testit(): result = expr.evaluate(op, op_str, f, f, use_numexpr=True) expected = expr.evaluate(op, op_str, f, f, use_numexpr=False) assert_array_equal(result,expected.values) - + result = expr._can_use_numexpr(op, op_str, f2, f2, 'evaluate') self.assert_(result == False) - + expr.set_use_numexpr(False) testit() expr.set_use_numexpr(True) @@ -149,7 +143,7 @@ def testit(): f11 = f f12 = f + 1 - + f21 = f2 f22 = f2 + 1 @@ -163,7 +157,7 @@ def testit(): result = expr.evaluate(op, op_str, f11, f12, use_numexpr=True) expected = expr.evaluate(op, op_str, f11, f12, use_numexpr=False) assert_array_equal(result,expected.values) - + result = expr._can_use_numexpr(op, op_str, f21, f22, 'evaluate') self.assert_(result == False) @@ -180,7 +174,7 @@ def test_where(self): def 
testit(): for f in [ self.frame, self.frame2, self.mixed, self.mixed2 ]: - + for cond in [ True, False ]: c = np.empty(f.shape,dtype=np.bool_) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index dccd7a8d14cac..7ba62a75a00cd 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -18,7 +18,7 @@ from pandas import compat from numpy import random, nan -from numpy.random import randn +from numpy.random import randn, rand import numpy as np import numpy.ma as ma from numpy.testing import assert_array_equal @@ -8097,129 +8097,9 @@ def test_mask_edge_case_1xN_frame(self): expec = DataFrame([[nan, 2]]) assert_frame_equal(res, expec) - def test_query_expressions_correct_failure(self): - try: - import numexpr as ne - except ImportError: - raise nose.SkipTest("cannot query engine numexpr when numexpr not installed") - df = self.frame - exprs = 'and', 'or', 'not' - exprs += tuple(x + tm.rands(5) for x in exprs) - exprs += tuple(tm.rands(5) + x for x in exprs) - - for e in exprs: - self.assertRaises(KeyError, df.__getitem__, e) - - for e in (' and ', ' or ', ' not '): - self.assertRaises(SyntaxError, df.__getitem__, e) - - x = tm.randbool(size=(self.frame.shape[0],)) - self.assertRaises(KeyError, df.__getitem__, 'x') - - def test_query_expressions(self): - try: - import numexpr as ne - except ImportError: - raise nose.SkipTest("cannot query engine numexpr when numexpr not installed") - df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) - assert_frame_equal(df['a < b'], df[df.a < df.b]) - assert_frame_equal(df['a + b > b * c'], - df[df.a + df.b > df.b * df.c]) - - def test_query_expressions_with_index(self): - try: - import numexpr as ne - except ImportError: - raise nose.SkipTest("cannot query engine numexpr when numexpr not installed") - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=Index(range(10), name='blob'), - columns=['a', 'b', 'c']) - assert_frame_equal(df['index < b'], df[df.index < df.b]) - 
assert_frame_equal(df['index < 5'], df[df.index < 5]) - assert_frame_equal(df['(blob < 5) & (a < b)'], - df[(df.index < 5) & (df.a < df.b)]) - assert_frame_equal(df['blob < b'], df[df.index < df.b]) - - def test_query(self): - import itertools - for engine, parser in itertools.product(comp.engines._engines, - comp.expr._parsers): - self.check_query(engine, parser) - - def check_query(self, engine, parser): - if engine == 'numexpr': - try: - import numexpr as ne - except ImportError: - raise nose.SkipTest("cannot query engine numexpr when numexpr not installed") - - df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) - assert_frame_equal(df.query('a < b', engine=engine, parser=parser), df[df.a < df.b]) - assert_frame_equal(df.query('a + b > b * c', engine=engine, parser=parser), - df[df.a + df.b > df.b * df.c]) - - local_dict = dict(df.iteritems()) - local_dict.update({'df': df}) - self.assertRaises(NameError, df.query, 'a < d & b < f', - local_dict=local_dict, engine=engine, parser=parser) - - # make sure that it's not just because we didn't pass the locals in - self.assertRaises(AssertionError, self.assertRaises, NameError, - df.query, 'a < b', local_dict=local_dict, - engine=engine, parser=parser) - - def test_query_index(self): - import itertools - for engine, parser in itertools.product(comp.engines._engines, - comp.expr._parsers): - self.check_query_index(engine, parser) - - def check_query_index(self, engine, parser): - if engine == 'numexpr': - try: - import numexpr as ne - except ImportError: - raise nose.SkipTest("cannot query engine numexpr when numexpr not installed") - - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=Index(range(10), name='blob'), - columns=['a', 'b', 'c']) - assert_frame_equal(df.query('index < b', engine=engine, parser=parser), - df[df.index < df.b]) - assert_frame_equal(df.query('index < 5', engine=engine, parser=parser), - df[df.index < 5]) - assert_frame_equal(df.query('(blob < 5) & (a < b)', engine=engine, 
- parser=parser), - df[(df.index < 5) & (df.a < df.b)]) - assert_frame_equal(df.query('blob < b', engine=engine, parser=parser), - df[df.index < df.b]) - - def test_query_different_parsers(self): - for engine in comp.engines._engines: - self.check_query_different_parsers(engine) - - def check_query_different_parsers(self, engine): - if engine == 'numexpr': - try: - import numexpr as ne - except ImportError: - raise nose.SkipTest("cannot query engine numexpr when numexpr not installed") - df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) - assert_frame_equal(df.query('(a < 5) & (a < b)', parser='python', - engine=engine), - df.query('a < 5 & a < b', parser='pandas', - engine=engine)) - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=Index(range(10), name='blob'), - columns=['a', 'b', 'c']) - assert_frame_equal(df.query('(blob < 5) & (a < b)', parser='python', - engine=engine), - df.query('blob < 5 & a < b', parser='pandas', - engine=engine)) - - #---------------------------------------------------------------------- # Transposing + def test_transpose(self): frame = self.frame dft = frame.T @@ -11224,10 +11104,270 @@ def test_isin_with_string_scalar(self): with tm.assertRaises(TypeError): df.isin('aaa') +def skip_if_no_ne(engine='numexpr'): + if engine == 'numexpr': + try: + import numexpr as ne + except ImportError: + raise nose.SkipTest("cannot query engine numexpr when numexpr not " + "installed") + + +class TestDataFrameQueryNumExprPandas(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.engine = 'numexpr' + cls.parser = 'pandas' + skip_if_no_ne(cls.engine) + cls.frame = _frame.copy() + + @classmethod + def tearDownClass(cls): + del cls.frame, cls.engine, cls.parser + + def test_date_query_method(self): + engine, parser = self.engine, self.parser + df = DataFrame(randn(5, 3)) + df['dates1'] = date_range('1/1/2012', periods=5) + df['dates2'] = date_range('1/1/2013', periods=5) + df['dates3'] = date_range('1/1/2014', 
periods=5) + res = df.query('dates1 < 20130101 < dates3', engine=engine, + parser=parser) + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + assert_frame_equal(res, expec) + + def test_query_scope(self): + engine, parser = self.engine, self.parser + from pandas.computation.common import NameResolutionError + + df = DataFrame({"i": lrange(10), + "+": lrange(3, 13), "r": lrange(4, 14)}) + i, s = 5, 6 + self.assertRaises(NameResolutionError, df.query, 'i < 5', + local_dict=locals(), global_dict=globals(), + engine=engine, parser=parser) + self.assertRaises(NameResolutionError, df.query, 'i - +', engine=engine, + local_dict=locals(), global_dict=globals(), + parser=parser) + self.assertRaises(NameResolutionError, df.query, 'i == s', + engine=engine, local_dict=locals(), + global_dict=globals(), parser=parser) + df.index.name = 'sin' + self.assertRaises(NameResolutionError, df.query, 'sin > 5', + engine=engine, parser=parser, local_dict=locals(), + global_dict=globals()) + + def test_query(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) + assert_frame_equal(df.query('a < b', engine=engine, parser=parser), df[df.a < df.b]) + assert_frame_equal(df.query('a + b > b * c', engine=engine, parser=parser), + df[df.a + df.b > df.b * df.c]) + + local_dict = dict(df.iteritems()) + local_dict.update({'df': df}) + self.assertRaises(NameError, df.query, 'a < d & b < f', + local_dict=local_dict, engine=engine, parser=parser) + + # make sure that it's not just because we didn't pass the locals in + self.assertRaises(AssertionError, self.assertRaises, NameError, + df.query, 'a < b', local_dict={'df': df}, + engine=engine, parser=parser) + + def test_query_index(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=Index(range(10), name='blob'), + columns=['a', 'b', 'c']) + assert_frame_equal(df.query('index < b', engine=engine, parser=parser), 
+ df[df.index < df.b]) + assert_frame_equal(df.query('index < 5', engine=engine, parser=parser), + df[df.index < 5]) + assert_frame_equal(df.query('(blob < 5) & (a < b)', engine=engine, + parser=parser), + df[(df.index < 5) & (df.a < df.b)]) + assert_frame_equal(df.query('blob < b', engine=engine, parser=parser), + df[df.index < df.b]) + + def test_nested_scope(self): + engine = self.engine + parser = self.parser + # smoke test + x = 1 + result = pd.eval('x + 1', engine=engine, parser=parser) + self.assertEqual(result, 2) + + df = DataFrame(np.random.randn(5, 3)) + df2 = DataFrame(np.random.randn(5, 3)) + expected = df[(df>0) & (df2>0)] + + result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + assert_frame_equal(result, expected) + + result = pd.eval('df[(df > 0) and (df2 > 0)]', engine=engine, + parser=parser) + assert_frame_equal(result, expected) + + result = pd.eval('df[(df > 0) and (df2 > 0) and df[df > 0] > 0]', + engine=engine, parser=parser) + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] + assert_frame_equal(result, expected) + + result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) + expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + assert_frame_equal(result, expected) + + +class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): + @classmethod + def setUpClass(cls): + cls.engine = 'numexpr' + cls.parser = 'python' + skip_if_no_ne(cls.engine) + cls.frame = _frame.copy() + + @classmethod + def tearDownClass(cls): + del cls.frame, cls.engine, cls.parser + + def test_date_query_method(self): + engine, parser = self.engine, self.parser + df = DataFrame(randn(5, 3)) + df['dates1'] = date_range('1/1/2012', periods=5) + df['dates2'] = date_range('1/1/2013', periods=5) + df['dates3'] = date_range('1/1/2014', periods=5) + res = df.query('(df.dates1 < 20130101) & (20130101 < df.dates3)', + engine=engine, parser=parser) + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + 
assert_frame_equal(res, expec) + + def test_nested_scope(self): + engine = self.engine + parser = self.parser + # smoke test + x = 1 + result = pd.eval('x + 1', engine=engine, parser=parser) + self.assertEqual(result, 2) + + df = DataFrame(np.random.randn(5, 3)) + df2 = DataFrame(np.random.randn(5, 3)) + expected = df[(df>0) & (df2>0)] + + result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + assert_frame_equal(result, expected) + + result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine, + parser=parser) + assert_frame_equal(result, expected) + + result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]', + engine=engine, parser=parser) + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] + assert_frame_equal(result, expected) + + result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) + expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + assert_frame_equal(result, expected) + + +class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas): + @classmethod + def setUpClass(cls): + cls.engine = 'python' + cls.parser = 'pandas' + cls.frame = _frame.copy() + + @classmethod + def tearDownClass(cls): + del cls.frame, cls.engine, cls.parser + + +class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython): + @classmethod + def setUpClass(cls): + cls.engine = cls.parser = 'python' + cls.frame = _frame.copy() + + @classmethod + def tearDownClass(cls): + del cls.frame, cls.engine, cls.parser + + +class TestDataFrameQueryGetitem(unittest.TestCase): + @classmethod + def setUpClass(cls): + skip_if_no_ne() + cls.frame = _frame.copy() + + @classmethod + def tearDownClass(cls): + del cls.frame + + def test_nested_scope(self): + df = DataFrame(np.random.randn(5, 3)) + df2 = DataFrame(np.random.randn(5, 3)) + expected = df[(df>0) & (df2>0)] + + result = df['(df>0) & (df2>0)'] + assert_frame_equal(result, expected) + + def test_date_query_getitem(self): + df = DataFrame(randn(5, 3)) + df['dates1'] = 
date_range('1/1/2012', periods=5) + df['dates2'] = date_range('1/1/2013', periods=5) + df['dates3'] = date_range('1/1/2014', periods=5) + res = df['dates1 < 20130101 < dates3'] + expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] + assert_frame_equal(res, expec) + + def test_query_expressions_correct_failure(self): + df = self.frame + exprs = 'and', 'or', 'not' + exprs += tuple(x + tm.rands(5) for x in exprs) + exprs += tuple(tm.rands(5) + x for x in exprs) + + for e in exprs: + self.assertRaises(KeyError, df.__getitem__, e) + + for e in (' and ', ' or ', ' not '): + self.assertRaises(SyntaxError, df.__getitem__, e) + + x = tm.randbool(size=(self.frame.shape[0],)) + self.assertRaises(KeyError, df.__getitem__, 'x') + + def test_query_expressions_with_index(self): + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=Index(range(10), name='blob'), + columns=['a', 'b', 'c']) + assert_frame_equal(df['index < b'], df[df.index < df.b]) + assert_frame_equal(df['index < 5'], df[df.index < 5]) + assert_frame_equal(df['(blob < 5) & (a < b)'], + df[(df.index < 5) & (df.a < df.b)]) + assert_frame_equal(df['blob < b'], df[df.index < df.b]) + + def test_query_expressions(self): + df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) + assert_frame_equal(df['a < b'], df[df.a < df.b]) + assert_frame_equal(df['a + b > b * c'], + df[df.a + df.b > df.b * df.c]) + + def test_simple_not_expression(self): + df = DataFrame(randn(10, 3), columns=list('abc')) + df['bools'] = rand(len(df)) > 0.5 + res = df['not bools'] + res2 = df['~bools'] + expec = df[~df.bools] + assert_frame_equal(res, expec) + assert_frame_equal(res2, expec) + + def test_complex_boolean_expression(self): + df = DataFrame(randn(10, 3), columns=list('abc')) + df['bools'] = rand(len(df)) > 0.5 + res = df['a < b < c and (not bools) or bools > 2'] + expec = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] + assert_frame_equal(res, expec) + + if __name__ == '__main__': - # 
unittest.main() - import nose - # nose.runmodule(argv=[__file__,'-vvs','-x', '--ipdb-failure'], - # exit=False) nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index e82aef13723ba..bf895e2abd97e 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -13,7 +13,6 @@ from datetime import datetime from functools import wraps, partial from contextlib import contextmanager -from httplib import HTTPException from distutils.version import LooseVersion from numpy.random import randn, rand @@ -28,7 +27,7 @@ import pandas.compat as compat from pandas.compat import( map, zip, range, unichr, lrange, lmap, lzip, u, callable, Counter, - raise_with_traceback + raise_with_traceback, httplib ) from pandas import bdate_range @@ -52,6 +51,7 @@ def randbool(size=(), p=0.5): return rand(*size) <= p + def rands(n): choices = string.ascii_letters + string.digits return ''.join(random.choice(choices) for _ in range(n)) @@ -65,7 +65,6 @@ def randu(n): #------------------------------------------------------------------------------ # Console debugging tools - def debug(f, *args, **kwargs): from pdb import Pdb as OldPdb try: @@ -756,7 +755,7 @@ def dec(f): return wrapper -_network_error_classes = IOError, HTTPException +_network_error_classes = IOError, httplib.HTTPException @optional_args @@ -800,13 +799,13 @@ def network(t, raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, >>> import nose >>> @network ... def test_network(): - ... with urlopen("rabbit://bonanza.com") as f: - ... pass + ... with urlopen("rabbit://bonanza.com") as f: + ... pass ... >>> try: - ... test_network() + ... test_network() ... except nose.SkipTest: - ... print "SKIPPING!" + ... print("SKIPPING!") ... SKIPPING! @@ -815,8 +814,8 @@ def network(t, raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, >>> @network(raise_on_error=True) ... def test_network(): - ... with urlopen("complaint://deadparrot.com") as f: - ... 
pass + ... with urlopen("complaint://deadparrot.com") as f: + ... pass ... >>> test_network() Traceback (most recent call last): diff --git a/setup.py b/setup.py index 955dedb74c180..ffd6089bdc88d 100755 --- a/setup.py +++ b/setup.py @@ -507,6 +507,7 @@ def pxd(name): packages=['pandas', 'pandas.compat', 'pandas.computation', + 'pandas.computation.tests', 'pandas.core', 'pandas.io', 'pandas.rpy', From 0160225c19bade97f069d0a5de05819d83b2de46 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 5 Aug 2013 14:15:22 -0400 Subject: [PATCH 09/16] ENH: add local scoping with @lcl_variable syntax --- pandas/computation/ops.py | 59 +++++++++++++++++++++++++++++++-------- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index c0d3c7bdd81dd..cd59ab2927e7b 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -1,10 +1,11 @@ +import re import operator as op from functools import partial import numpy as np import pandas as pd -from pandas.compat import PY3, string_types +from pandas.compat import PY3, string_types, text_type import pandas.core.common as com from pandas.core.base import StringMixin from pandas.computation.common import _ensure_decoded @@ -16,6 +17,16 @@ 'arccosh', 'arcsinh', 'arctanh', 'arctan2', 'abs') +_LOCAL_TAG = '__pd_eval_local_' +_TAG_RE = re.compile('^{0}'.format(_LOCAL_TAG)) + + +class UndefinedVariableError(NameError): + def __init__(self, *args): + super(UndefinedVariableError, + self).__init__('name {0!r} is not defined'.format(args[0])) + + class OperatorError(Exception): pass @@ -33,9 +44,14 @@ def __init__(self, name, env, side=None, encoding=None): self._name = name self.env = env self.side = side + self.local = _TAG_RE.search(text_type(name)) is not None self._value = self._resolve_name() self.encoding = encoding + @property + def local_name(self): + return _TAG_RE.sub('', self.name) + def __unicode__(self): return com.pprint_thing(self.name) @@ -43,15 
+59,16 @@ def __call__(self, *args, **kwargs): return self.value def _resolve_name(self): + #import ipdb; ipdb.set_trace() env = self.env key = self.name - res = env.resolver(key) + res = env.resolve(self.local_name, globally=not self.local) self.update(res) if res is None: if not isinstance(key, string_types): return key - raise NameError('name {0!r} is not defined'.format(key)) + raise UndefinedVariableError(key) if hasattr(res, 'ndim') and res.ndim > 2: raise NotImplementedError("N-dimensional objects, where N > 2, are" @@ -62,19 +79,37 @@ def update(self, value): env = self.env key = self.name if isinstance(key, string_types): - try: - del env.locals[key] - env.locals[key] = value - except KeyError: - if key in env.resolver_keys: + if self.local: + local_name = self.local_name + + try: + del env.locals[local_name] env.locals[key] = value - else: + except KeyError: try: - del env.globals[key] + del env.globals[local_name] env.globals[key] = value except KeyError: - raise NameError('name {0!r} is not ' - 'defined'.format(key)) + try: + del env.locals[key] + env.locals[key] = value + except KeyError: + try: + del env.globals[key] + env.globals[key] = value + except KeyError: + raise UndefinedVariableError(key) + else: + for r in (env.resolver_dict, env.locals, env.globals): + try: + del r[key] + except KeyError: + pass + else: + r[key] = value + break + else: + raise UndefinedVariableError(key) self.value = value From cca01738ace059964c9a8b9855c2c8a7901d5080 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 5 Aug 2013 14:30:36 -0400 Subject: [PATCH 10/16] ENH: add DataFrame.eval method --- pandas/computation/align.py | 11 +- pandas/computation/eval.py | 19 +- pandas/computation/expr.py | 36 ++-- pandas/computation/ops.py | 75 +++++--- pandas/computation/pytables.py | 22 ++- pandas/computation/tests/test_eval.py | 241 ++++++++++++++---------- pandas/core/common.py | 54 +++--- pandas/core/frame.py | 22 ++- pandas/core/series.py | 18 +- pandas/io/pytables.py | 
34 ++-- pandas/io/tests/test_pytables.py | 5 +- pandas/tests/test_common.py | 64 +------ pandas/tests/test_frame.py | 261 +++++++++++++++++++++++--- vb_suite/indexing.py | 2 +- 14 files changed, 540 insertions(+), 324 deletions(-) diff --git a/pandas/computation/align.py b/pandas/computation/align.py index 794a209b53f46..ec51887ff6df0 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -11,7 +11,7 @@ def _align_core_single_unary_op(term): - if isinstance(term.value, np.ndarray) and not com.is_series(term.value): + if isinstance(term.value, np.ndarray): typ = partial(np.asanyarray, dtype=term.value.dtype) else: typ = type(term.value) @@ -67,7 +67,8 @@ def _maybe_promote_shape(values, naxes): def _any_pandas_objects(terms): """Check a sequence of terms for instances of PandasObject.""" - return any(com.is_pd_obj(term.value) for term in terms) + return any(isinstance(term.value, pd.core.generic.PandasObject) + for term in terms) def _filter_special_cases(f): @@ -111,7 +112,7 @@ def _align_core(terms): for term in (terms[i] for i in term_index): for axis, items in enumerate(term.value.axes): - if com.is_series(term.value) and naxes > 1: + if isinstance(term.value, pd.Series) and naxes > 1: ax, itm = naxes - 1, term.value.index else: ax, itm = axis, items @@ -122,7 +123,7 @@ def _align_core(terms): ti = terms[i].value if hasattr(ti, 'reindex_axis'): - transpose = com.is_series(ti) and naxes > 1 + transpose = isinstance(ti, pd.Series) and naxes > 1 reindexer = axes[naxes - 1] if transpose else items term_axis_size = len(ti.axes[axis]) @@ -183,7 +184,7 @@ def _align(terms): terms = list(com.flatten(terms)) except TypeError: # can't iterate so it must just be a constant or single variable - if isinstance(terms.value, (pd.Series, pd.core.generic.NDFrame)): + if isinstance(terms.value, pd.core.generic.NDFrame): typ = type(terms.value) return typ, _zip_axes_from_type(typ, terms.value.axes) return np.result_type(terms.type), None diff --git 
a/pandas/computation/eval.py b/pandas/computation/eval.py index cb8af98928564..ff073889376aa 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -3,10 +3,8 @@ import numbers import numpy as np -from pandas import compat from pandas.compat import string_types -from pandas.computation.expr import (Expr, _parsers, _ensure_scope, - _check_syntax) +from pandas.computation.expr import Expr, _parsers, _ensure_scope from pandas.computation.engines import _engines @@ -47,12 +45,12 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, ---------- expr : string The expression to evaluate. - parser : string, optional, default 'pandas', {'pandas', 'python'} + parser : string, default 'pandas', {'pandas', 'python'} The parser to use to construct the syntax tree from the expression. The default of 'pandas' parses code slightly different than standard Python. See the :ref:`enhancing performance ` documentation for more details. - engine : string, optional, default 'numexpr', {'python', 'numexpr'} + engine : string, default 'numexpr', {'python', 'numexpr'} The engine used to evaluate the expression. Supported engines are @@ -62,11 +60,11 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, - ``'python'``: Performs operations as if you had ``eval``'d in top level python. This engine is generally not that useful. - truediv : bool, optional, default True + truediv : bool, default True Whether to use true division, like in Python >= 3 - local_dict : dict or None, optional, default None + local_dict : dict or None, default None A dictionary of local variables, taken from locals() by default. - global_dict : dict or None, optional, default None + global_dict : dict or None, default None A dictionary of global variables, taken from globals() by default. 
resolvers : dict of dict-like or None, default None A dictionary of dict-like object (specifically they must implement the @@ -76,7 +74,7 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, :attr:`~pandas.DataFrame.index` and :attr:`~pandas.DataFrame.columns` variables that refer to their respective :class:`~pandas.DataFrame` instance attributes. - level : int, optional, default 2 + level : int, default 2 The number of prior stack frames to traverse and add to the current scope. @@ -112,7 +110,8 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, # construct the engine and evaluate eng = _engines[engine] - ret = eng(parsed_expr).evaluate() + eng_inst = eng(parsed_expr) + ret = eng_inst.evaluate() # sanity check for a number if it's a scalar result # TODO: eventually take out diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 5cff968727c5c..1fbc0b72289b1 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -26,12 +26,12 @@ def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None, def _check_disjoint_resolver_names(resolver_keys, local_keys, global_keys): - res_locals = com.intersection(resolver_keys, local_keys) + res_locals = list(com.intersection(resolver_keys, local_keys)) if res_locals: msg = "resolvers and locals overlap on names {0}".format(res_locals) raise NameResolutionError(msg) - res_globals = com.intersection(resolver_keys, global_keys) + res_globals = list(com.intersection(resolver_keys, global_keys)) if res_globals: msg = "resolvers and globals overlap on names {0}".format(res_globals) raise NameResolutionError(msg) @@ -172,7 +172,9 @@ def add_tmp(self, value, where='locals'): raise TypeError("Cannot add value to object of type {0!r}, " "scope must be a dictionary" "".format(d.__class__.__name__)) - name = 'tmp_var_{0}_{1}'.format(self.ntemps, pd.util.testing.rands(10)) + name = 'tmp_var_{0}_{1}_{2}'.format(value.__class__.__name__, + self.ntemps, + 
pd.util.testing.rands(10)) d[name] = value # only increment if the variable gets put in the scope @@ -320,18 +322,15 @@ def __init__(self, env, engine, parser, preparser=_preparse): self.preparser = preparser def visit(self, node, **kwargs): - parse = ast.parse if isinstance(node, string_types): clean = self.preparser(node) - elif isinstance(node, ast.AST): - clean = node - else: + node = ast.fix_missing_locations(ast.parse(clean)) + elif not isinstance(node, ast.AST): raise TypeError("Cannot visit objects of type {0!r}" "".format(node.__class__.__name__)) - node = parse(clean) method = 'visit_' + node.__class__.__name__ - visitor = getattr(self, method, None) + visitor = getattr(self, method) return visitor(node, **kwargs) def visit_Module(self, node, **kwargs): @@ -365,11 +364,12 @@ def visit_Num(self, node, **kwargs): return self.const_type(node.n, self.env) def visit_Str(self, node, **kwargs): - return self.const_type(node.s, self.env) + name = self.env.add_tmp(node.s) + return self.term_type(name, self.env) def visit_List(self, node, **kwargs): - return self.const_type([self.visit(e).value for e in node.elts], - self.env) + name = self.env.add_tmp([self.visit(e).value for e in node.elts]) + return self.term_type(name, self.env) visit_Tuple = visit_List @@ -467,7 +467,7 @@ def visit_Compare(self, node, **kwargs): comps = node.comparators def translate(op): - if isinstance(op,ast.In): + if isinstance(op, ast.In): return ast.Eq() return op @@ -502,8 +502,8 @@ def visitor(x, y): return reduce(visitor, operands) -_python_not_supported = frozenset(['Assign', 'Str', 'Tuple', 'List', 'Dict', - 'Call', 'BoolOp']) +_python_not_supported = frozenset(['Assign', 'Tuple', 'Dict', 'Call', + 'BoolOp']) _numexpr_supported_calls = frozenset(_reductions + _mathops) @@ -572,9 +572,9 @@ def names(self): def check_name_clashes(self): env = self.env names = self.names - res_keys = frozenset(env.resolver_dict.iterkeys()) & names - lcl_keys = frozenset(env.locals.iterkeys()) & names - 
gbl_keys = frozenset(env.globals.iterkeys()) & names + res_keys = frozenset(env.resolver_dict.keys()) & names + lcl_keys = frozenset(env.locals.keys()) & names + gbl_keys = frozenset(env.globals.keys()) & names _check_disjoint_resolver_names(res_keys, lcl_keys, gbl_keys) def add_resolvers_to_locals(self): diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index cd59ab2927e7b..0ae2d2f28c44d 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -1,6 +1,7 @@ import re import operator as op from functools import partial +from itertools import product, islice, chain import numpy as np @@ -23,8 +24,12 @@ class UndefinedVariableError(NameError): def __init__(self, *args): - super(UndefinedVariableError, - self).__init__('name {0!r} is not defined'.format(args[0])) + msg = 'name {0!r} is not defined' + subbed = _TAG_RE.sub('', args[0]) + if subbed != args[0]: + subbed = '@' + subbed + msg = 'local variable {0!r} is not defined' + super(UndefinedVariableError, self).__init__(msg.format(subbed)) class OperatorError(Exception): @@ -39,6 +44,19 @@ class BinaryOperatorError(OperatorError): pass +def _possibly_update_key(d, value, old_key, new_key=None): + if new_key is None: + new_key = old_key + + try: + del d[old_key] + except KeyError: + return False + else: + d[new_key] = value + return True + + class Term(StringMixin): def __init__(self, name, env, side=None, encoding=None): self._name = name @@ -76,37 +94,40 @@ def _resolve_name(self): return res def update(self, value): + """ + search order for local (i.e., @variable) variables: + + scope, key_variable + [('locals', 'local_name'), + ('globals', 'local_name'), + ('locals', 'key'), + ('globals', 'key')] + """ env = self.env key = self.name + + # if it's a variable name (otherwise a constant) if isinstance(key, string_types): if self.local: + # get it's name WITHOUT the local tag (defined above) local_name = self.local_name - try: - del env.locals[local_name] - env.locals[key] = value - 
except KeyError: - try: - del env.globals[local_name] - env.globals[key] = value - except KeyError: - try: - del env.locals[key] - env.locals[key] = value - except KeyError: - try: - del env.globals[key] - env.globals[key] = value - except KeyError: - raise UndefinedVariableError(key) + # search for the local in the above specified order + scope_pairs = product([env.locals, env.globals], + [local_name, key]) + + # a[::2] + a[1::2] but iterators + scope_iter = chain(islice(scope_pairs, None, None, 2), + islice(scope_pairs, 1, None, 2)) + for d, k in scope_iter: + if _possibly_update_key(d, value, k, key): + break + else: + raise UndefinedVariableError(key) else: + # otherwise we look in resolvers -> locals -> globals for r in (env.resolver_dict, env.locals, env.globals): - try: - del r[key] - except KeyError: - pass - else: - r[key] = value + if _possibly_update_key(r, value, key): break else: raise UndefinedVariableError(key) @@ -332,7 +353,7 @@ def stringify(value): lhs, rhs = self.lhs, self.rhs if (is_term(lhs) and lhs.kind.startswith('datetime') and is_term(rhs) - and rhs.isscalar): + and rhs.isscalar): v = rhs.value if isinstance(v, (int, float)): v = stringify(v) @@ -343,7 +364,7 @@ def stringify(value): self.rhs.update(v) if (is_term(rhs) and rhs.kind.startswith('datetime') and - is_term(lhs) and lhs.isscalar): + is_term(lhs) and lhs.isscalar): v = lhs.value if isinstance(v, (int, float)): v = stringify(v) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 2d98397366b7f..4067c22beb507 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -348,7 +348,6 @@ def prune(self, klass): return None - _op_classes = {'unary': UnaryOp} class ExprVisitor(BaseExprVisitor): @@ -403,6 +402,9 @@ def visit_Attribute(self, node, **kwargs): raise ValueError("Invalid Attribute context {0}".format(ctx.__name__)) + def translate_In(self, op): + return ast.Eq() if isinstance(op, ast.In) else op + class Expr(expr.Expr): @@ 
-449,7 +451,7 @@ def __init__(self, where, op=None, value=None, queryables=None, if isinstance(where, Expr): lcls.update(where.env.locals) - where = str(where) + where = where.expr elif isinstance(where, (list, tuple)): @@ -465,7 +467,7 @@ def __init__(self, where, op=None, value=None, queryables=None, self.env = Scope(lcls=lcls) self.env.update(scope_level) - if queryables is not None: + if queryables is not None and isinstance(self.expr, string_types): self.env.queryables.update(queryables) self._visitor = ExprVisitor(self.env, queryables=queryables, parser='pytables', engine='pytables', @@ -506,7 +508,7 @@ def parse_back_compat(self, w, op=None, value=None): def __unicode__(self): if self.terms is not None: return com.pprint_thing(self.terms) - return self.expr + return com.pprint_thing(self.expr) def evaluate(self): """ create and return the numexpr condition and filter """ @@ -542,3 +544,15 @@ def tostring(self, encoding): return self.converted return '"%s"' % self.converted return self.converted + + +def maybe_expression(s): + """ loose checking if s is a pytables-acceptable expression """ + if not isinstance(s, string_types): + return False + ops = ExprVisitor.binary_ops + ExprVisitor.unary_ops + ('=',) + + # make sure we have an op at least + return any(op in s for op in ops) + + diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 8a8a04824cf29..df60ce427f441 100755 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -18,19 +18,23 @@ from pandas.core import common as com from pandas import DataFrame, Series, Panel, date_range from pandas.util.testing import makeCustomDataframe as mkdf + +from pandas.computation import pytables +from pandas.computation.expressions import _USE_NUMEXPR from pandas.computation.engines import _engines from pandas.computation.expr import PythonExprVisitor, PandasExprVisitor from pandas.computation.ops import (_binary_ops_dict, _unary_ops_dict, 
_special_case_arith_ops_syms, - _arith_ops_syms) + _arith_ops_syms, _bool_ops_syms) import pandas.computation.expr as expr -from pandas.computation import pytables -from pandas.computation.expressions import _USE_NUMEXPR from pandas.util.testing import (assert_frame_equal, randbool, assertRaisesRegexp, assert_produces_warning, assert_series_equal) from pandas.compat import PY3, u +_series_frame_incompatible = _bool_ops_syms +_scalar_skip = 'in', 'not in' + def skip_if_no_ne(engine='numexpr'): if not _USE_NUMEXPR and engine == 'numexpr': raise nose.SkipTest("numexpr engine not installed or disabled") @@ -59,13 +63,19 @@ def _eval_single_bin(lhs, cmp1, rhs, engine): def _series_and_2d_ndarray(lhs, rhs): - return (com.is_series(lhs) and isinstance(rhs, np.ndarray) and rhs.ndim > 1 - or com.is_series(rhs) and isinstance(lhs, np.ndarray) and lhs.ndim - > 1) + return ((isinstance(lhs, Series) and + isinstance(rhs, np.ndarray) and rhs.ndim > 1) + or (isinstance(rhs, Series) and + isinstance(lhs, np.ndarray) and lhs.ndim > 1)) + + +def _series_and_frame(lhs, rhs): + return ((isinstance(lhs, Series) and isinstance(rhs, DataFrame)) + or (isinstance(rhs, Series) and isinstance(lhs, DataFrame))) def _bool_and_frame(lhs, rhs): - return isinstance(lhs, bool) and com.is_frame(rhs) + return isinstance(lhs, bool) and isinstance(rhs, pd.core.generic.NDFrame) def skip_incompatible_operand(f): @@ -87,6 +97,7 @@ def _is_py3_complex_incompat(result, expected): _good_arith_ops = com.difference(_arith_ops_syms, _special_case_arith_ops_syms) + class TestEvalNumexprPandas(unittest.TestCase): @classmethod def setUpClass(cls): @@ -115,10 +126,8 @@ def setup_data(self): self.scalar_lhses = randn(), np.float64(randn()), np.nan self.scalar_rhses = randn(), np.float64(randn()), np.nan - self.lhses = self.pandas_lhses + self.scalar_lhses + (randn(10, 5), - randn(5)) - self.rhses = self.pandas_rhses + self.scalar_rhses + (randn(10, 5), - randn(5)) + self.lhses = self.pandas_lhses + 
self.scalar_lhses + self.rhses = self.pandas_rhses + self.scalar_rhses def setup_ops(self): self.cmp_ops = expr._cmp_ops_syms @@ -191,44 +200,79 @@ def test_compound_invert_op(self): def test_chained_cmp_op(self): mids = self.lhses cmp_ops = tuple(set(self.cmp_ops) - set(['==', '!=', '<=', '>='])) - for lhs, cmp1, mid, cmp2, rhs in product(self.lhses, self.cmp_ops, + for lhs, cmp1, mid, cmp2, rhs in product(self.lhses, cmp_ops, mids, cmp_ops, self.rhses): self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs) - @skip_incompatible_operand def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): + skip_these = 'in', 'not in' ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, binop=binop, cmp2=cmp2) - lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) - rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) - expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine) - result = pd.eval(ex, engine=self.engine, parser=self.parser) - assert_array_equal(result, expected) + scalar_with_in_notin = (np.isscalar(rhs) and (cmp1 in skip_these or + cmp2 in skip_these)) + if scalar_with_in_notin: + self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, + parser=self.parser, local_dict={'lhs': lhs, + 'rhs': rhs}) + elif (_series_and_frame(lhs, rhs) and (cmp1 in + _series_frame_incompatible or + cmp2 in _series_frame_incompatible)): + self.assertRaises(TypeError, pd.eval, ex, + local_dict={'lhs': lhs, 'rhs': rhs}, + engine=self.engine, parser=self.parser) + else: + lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) + rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) + if (isinstance(lhs_new, Series) and isinstance(rhs_new, DataFrame) + and binop in _series_frame_incompatible): + pass + # TODO: the code below should be added back when left and right + # hand side bool ops are fixed. 
+ + #try: + #self.assertRaises(Exception, pd.eval, ex, + #local_dict={'lhs': lhs, 'rhs': rhs}, + #engine=self.engine, parser=self.parser) + #except AssertionError: + #import ipdb; ipdb.set_trace() + #raise + + else: + expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + assert_array_equal(result, expected) + @skip_incompatible_operand def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): - # these are not compatible operands - if _series_and_2d_ndarray(lhs, mid): - self.assertRaises(ValueError, _eval_single_bin, lhs, cmp2, mid, - self.engine) - else: - lhs_new = _eval_single_bin(lhs, cmp1, mid, self.engine) + skip_these = 'in', 'not in' + + def check_operands(left, right, cmp_op): + if (np.isscalar(right) and not np.isscalar(left) and cmp_op in + skip_these): + self.assertRaises(Exception, _eval_single_bin, left, cmp_op, + right, self.engine) + elif _series_and_2d_ndarray(right, left): + self.assertRaises(Exception, _eval_single_bin, right, cmp_op, + left, self.engine) + elif (np.isscalar(right) and np.isscalar(left) and cmp_op in + skip_these): + self.assertRaises(Exception, _eval_single_bin, right, cmp_op, + left, self.engine) + else: + new = _eval_single_bin(left, cmp_op, right, self.engine) + return new + return - if _series_and_2d_ndarray(mid, rhs): - self.assertRaises(ValueError, _eval_single_bin, mid, cmp2, rhs, - self.engine) - else: - rhs_new = _eval_single_bin(mid, cmp2, rhs, self.engine) + lhs_new = check_operands(lhs, mid, cmp1) + rhs_new = check_operands(mid, rhs, cmp2) - try: - lhs_new - rhs_new - except NameError: - pass - else: + if lhs_new is not None and rhs_new is not None: # these are not compatible operands - if (com.is_series(lhs_new) and com.is_frame(rhs_new) or - _bool_and_frame(lhs_new, rhs_new)): + if isinstance(lhs_new, Series) and isinstance(rhs_new, DataFrame): + self.assertRaises(TypeError, _eval_single_bin, lhs_new, '&', + rhs_new, self.engine) + elif 
(_bool_and_frame(lhs_new, rhs_new)): self.assertRaises(TypeError, _eval_single_bin, lhs_new, '&', rhs_new, self.engine) elif _series_and_2d_ndarray(lhs_new, rhs_new): @@ -240,7 +284,11 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2) ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2) - expected = _eval_single_bin(lhs_new, '&', rhs_new, self.engine) + try: + expected = _eval_single_bin(lhs_new, '&', rhs_new, self.engine) + except TypeError: + import ipdb; ipdb.set_trace() + raise for ex in (ex1, ex2, ex3): result = pd.eval(ex, engine=self.engine, @@ -250,9 +298,14 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): @skip_incompatible_operand def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) - expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) - result = pd.eval(ex, engine=self.engine, parser=self.parser) - assert_array_equal(result, expected) + if cmp1 in ('in', 'not in') and not com.is_list_like(rhs): + self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, + parser=self.parser, local_dict={'lhs': lhs, + 'rhs': rhs}) + else: + expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + assert_array_equal(result, expected) @skip_incompatible_operand def check_binary_arith_op(self, lhs, arith1, rhs): @@ -360,19 +413,26 @@ def check_single_invert_op(self, lhs, cmp1, rhs): @skip_incompatible_operand def check_compound_invert_op(self, lhs, cmp1, rhs): - # compound + skip_these = 'in', 'not in' ex = '~(lhs {0} rhs)'.format(cmp1) - if np.isscalar(lhs) and np.isscalar(rhs): - lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) - expected = ~_eval_single_bin(lhs, cmp1, rhs, self.engine) - result = pd.eval(ex, engine=self.engine, parser=self.parser) - assert_array_equal(expected, result) - # make sure the other engines work the same as this one 
- for engine in self.current_engines: - skip_if_no_ne(engine) - ev = pd.eval(ex, engine=self.engine, parser=self.parser) - assert_array_equal(ev, result) + if np.isscalar(rhs) and cmp1 in skip_these: + self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, + parser=self.parser, local_dict={'lhs': lhs, + 'rhs': rhs}) + else: + # compound + if np.isscalar(lhs) and np.isscalar(rhs): + lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) + expected = ~_eval_single_bin(lhs, cmp1, rhs, self.engine) + result = pd.eval(ex, engine=self.engine, parser=self.parser) + assert_array_equal(expected, result) + + # make sure the other engines work the same as this one + for engine in self.current_engines: + skip_if_no_ne(engine) + ev = pd.eval(ex, engine=self.engine, parser=self.parser) + assert_array_equal(ev, result) @skip_incompatible_operand def check_unary_arith_op(self, lhs, arith1, rhs, unary_op): @@ -461,46 +521,8 @@ def setUpClass(cls): cls.parser = 'pandas' def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): - # these are not compatible operands - if _series_and_2d_ndarray(lhs, mid): - self.assertRaises(ValueError, _eval_single_bin, lhs, cmp2, mid, - self.engine) - else: - lhs_new = _eval_single_bin(lhs, cmp1, mid, self.engine) - - if _series_and_2d_ndarray(mid, rhs): - self.assertRaises(ValueError, _eval_single_bin, mid, cmp2, rhs, - self.engine) - else: - rhs_new = _eval_single_bin(mid, cmp2, rhs, self.engine) - - try: - lhs_new - rhs_new - except NameError: - pass - else: - # these are not compatible operands - if (com.is_series(lhs_new) and com.is_frame(rhs_new) or - _bool_and_frame(lhs_new, rhs_new)): - self.assertRaises(TypeError, _eval_single_bin, lhs_new, '&', - rhs_new, self.engine) - elif _series_and_2d_ndarray(lhs_new, rhs_new): - # TODO: once #4319 is fixed add this test back in - #self.assertRaises(Exception, _eval_single_bin, lhs_new, '&', - #rhs_new, self.engine) - pass - else: - ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) - ex2 = 'lhs {0} 
mid and mid {1} rhs'.format(cmp1, cmp2) - ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2) - expected = _eval_single_bin(lhs_new, '&', rhs_new, self.engine) - - for ex in (ex1, ex2, ex3): - result = pd.eval(ex, engine=self.engine, - parser=self.parser) - assert_array_equal(result, expected) - + TestEvalNumexprPandas.check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, + rhs) f = lambda *args, **kwargs: np.random.randn() @@ -741,24 +763,35 @@ def test_simple_arith_ops(self): ops = expr._arith_ops_syms + expr._cmp_ops_syms for op in filter(lambda x: x != '//', ops): - expec = _eval_single_bin(1, op, 1, self.engine) - x = self.eval('1 {0} 1'.format(op)) - assert_equal(x, expec) + ex = '1 {0} 1'.format(op) + ex2 = 'x {0} 1'.format(op) + ex3 = '1 {0} (x + 1)'.format(op) - expec = _eval_single_bin(x, op, 1, self.engine) - y = self.eval('x {0} 1'.format(op), local_dict={'x': x}) - assert_equal(y, expec) + if op in ('in', 'not in'): + self.assertRaises(TypeError, pd.eval, ex, + engine=self.engine, parser=self.parser) + else: + expec = _eval_single_bin(1, op, 1, self.engine) + x = self.eval(ex, engine=self.engine, parser=self.parser) + assert_equal(x, expec) - expec = _eval_single_bin(1, op, x + 1, self.engine) - y = self.eval('1 {0} (x + 1)'.format(op), local_dict={'x': x}) - assert_equal(y, expec) + expec = _eval_single_bin(x, op, 1, self.engine) + y = self.eval(ex2, local_dict={'x': x}, engine=self.engine, + parser=self.parser) + assert_equal(y, expec) + + expec = _eval_single_bin(1, op, x + 1, self.engine) + y = self.eval(ex3, local_dict={'x': x}, + engine=self.engine, parser=self.parser) + assert_equal(y, expec) def test_simple_bool_ops(self): - for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, - False)): + for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), + (True, False)): expec = _eval_single_bin(lhs, op, rhs, self.engine) x = self.eval('lhs {0} rhs'.format(op), local_dict={'lhs': lhs, - 'rhs': rhs}) + 'rhs': rhs}, + 
engine=self.engine, parser=self.parser) assert_equal(x, expec) def test_bool_ops_with_constants(self): diff --git a/pandas/core/common.py b/pandas/core/common.py index 8ada14485a780..c1ff6a2200225 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -18,12 +18,12 @@ import pandas.lib as lib import pandas.tslib as tslib from pandas import compat -from pandas.compat import StringIO, BytesIO, range, long, u, zip, map +from pandas.compat import (StringIO, BytesIO, range, long, u, zip, map, + string_types) from datetime import timedelta from pandas.core.config import get_option from pandas.core import array as pa -import pandas as pd class PandasError(Exception): pass @@ -33,14 +33,18 @@ class AmbiguousIndexError(PandasError, KeyError): pass _POSSIBLY_CAST_DTYPES = set([np.dtype(t) - for t in ['M8[ns]', 'm8[ns]', 'O', 'int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64']]) + for t in ['M8[ns]', 'm8[ns]', 'O', 'int8', + 'uint8', 'int16', 'uint16', 'int32', + 'uint32', 'int64', 'uint64']]) _NS_DTYPE = np.dtype('M8[ns]') _TD_DTYPE = np.dtype('m8[ns]') _INT64_DTYPE = np.dtype(np.int64) _DATELIKE_DTYPES = set([np.dtype(t) for t in ['M8[ns]', 'm8[ns]']]) -# define abstract base classes to enable isinstance type checking on our objects + +# define abstract base classes to enable isinstance type checking on our +# objects def create_pandas_abc_type(name, attr, comp): @classmethod def _check(cls, inst): @@ -50,15 +54,22 @@ def _check(cls, inst): meta = type("ABCBase", (type,), dct) return meta(name, tuple(), dct) + ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",)) -ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp", ('sparse_series', 'sparse_time_series')) -ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp", ('sparse_array', 'sparse_series')) 
+ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp", + ('sparse_series', + 'sparse_time_series')) +ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp", + ('sparse_array', 'sparse_series')) + class _ABCGeneric(type): def __instancecheck__(cls, inst): return hasattr(inst, "_data") + + ABCGeneric = _ABCGeneric("ABCGeneric", tuple(), {}) def isnull(obj): @@ -229,6 +240,11 @@ def notnull(obj): return -res +def _iterable_not_string(x): + return (isinstance(x, collections.Iterable) and + not isinstance(x, compat.string_types)) + + def flatten(l): """Flatten an arbitrarily nested sequence. @@ -246,7 +262,7 @@ def flatten(l): flattened : generator """ for el in l: - if isinstance(el, collections.Iterable) and not is_string(el): + if _iterable_not_string(el): for s in flatten(el): yield s else: @@ -1686,30 +1702,6 @@ def is_bool(obj): return isinstance(obj, (bool, np.bool_)) -def is_string(obj): - return isinstance(obj, string_types) - - -def is_series(obj): - return isinstance(obj, pd.Series) - - -def is_frame(obj): - return isinstance(obj, pd.DataFrame) - - -def is_panel(obj): - return isinstance(obj, pd.Panel) - - -def is_pd_obj(obj): - return isinstance(obj, pd.core.generic.PandasObject) - - -def is_ndframe(obj): - return isinstance(obj, pd.core.generic.NDFrame) - - def is_integer(obj): return isinstance(obj, (numbers.Integral, np.integer)) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f4c283d6fffc..ed3ecd3700f31 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -35,7 +35,6 @@ create_block_manager_from_arrays, create_block_manager_from_blocks) from pandas.core.series import Series, _radd_compat -from pandas.sparse.array import SparseArray import pandas.computation.expressions as expressions from pandas.computation.eval import eval as _eval from pandas.computation.expr import maybe_expression, _ensure_scope @@ -53,13 +52,12 @@ import pandas.core.datetools as datetools import pandas.core.common as com 
import pandas.core.format as fmt -import pandas.core.generic as generic import pandas.core.nanops as nanops import pandas.lib as lib import pandas.algos as _algos -from pandas.core.config import get_option, set_option +from pandas.core.config import get_option #---------------------------------------------------------------------- # Docstring templates @@ -1963,6 +1961,20 @@ def query(self, expr, **kwargs): -------- pandas.eval """ + # need to go up at least 4 stack frames + # 4 expr.Scope + # 3 expr._ensure_scope + # 2 self.eval + # 1 self.query + # 0 self.query caller (implicit) + level = kwargs.setdefault('level', 4) + if level < 4: + raise ValueError("Going up fewer than 4 stack frames will not" + " capture the necessary variable scope for a " + "query expression") + return self[self.eval(expr, **kwargs)] + + def eval(self, expr, **kwargs): resolvers = kwargs.pop('resolvers', None) if resolvers is None: index_resolvers = {} @@ -1970,9 +1982,9 @@ def query(self, expr, **kwargs): index_resolvers[self.index.name] = self.index index_resolvers.update({'index': self.index, 'columns': self.columns}) - resolvers = [self, index_resolvers] + resolvers = [index_resolvers, self] kwargs['local_dict'] = _ensure_scope(resolvers=resolvers, **kwargs) - return self[_eval(expr, **kwargs)] + return _eval(expr, **kwargs) def _slice(self, slobj, axis=0, raise_on_error=False): axis = self._get_block_manager_axis(axis) diff --git a/pandas/core/series.py b/pandas/core/series.py index 893483f0f2636..beb398dfe6fd0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -8,7 +8,6 @@ import operator from distutils.version import LooseVersion import types -import warnings from numpy import nan, ndarray import numpy as np @@ -18,8 +17,10 @@ _default_index, _maybe_promote, _maybe_upcast, _asarray_tuplesafe, is_integer_dtype, _NS_DTYPE, _TD_DTYPE, - _infer_dtype_from_scalar, is_list_like, _values_from_object, - _possibly_cast_to_datetime, _possibly_castable, 
_possibly_convert_platform, + _infer_dtype_from_scalar, is_list_like, + _values_from_object, + _possibly_cast_to_datetime, _possibly_castable, + _possibly_convert_platform, ABCSparseArray) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, _ensure_index, _handle_legacy_indexes) @@ -29,7 +30,6 @@ from pandas.core import generic from pandas.core.internals import SingleBlockManager from pandas.core.categorical import Categorical -import pandas.core.expressions as expressions from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex, Period from pandas.tseries.offsets import DateOffset @@ -775,12 +775,9 @@ def put(self, *args, **kwargs): def __len__(self): return len(self._data) - @property - def size(self): - return self.__len__() - def view(self, dtype=None): - return self._constructor(self.values.view(dtype), index=self.index, name=self.name) + return self._constructor(self.values.view(dtype), index=self.index, + name=self.name) def __array__(self, result=None): """ the array interface, return my values """ @@ -790,7 +787,8 @@ def __array_wrap__(self, result): """ Gets called prior to a ufunc (and after) """ - return self._constructor(result, index=self.index, name=self.name, copy=False) + return self._constructor(result, index=self.index, name=self.name, + copy=False) def __contains__(self, key): return key in self.index diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 3744363138214..a4491a87b290d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -12,7 +12,6 @@ import warnings import numpy as np -import pandas from pandas import (Series, TimeSeries, DataFrame, Panel, Panel4D, Index, MultiIndex, Int64Index, Timestamp, _np_version_under1p7) from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel @@ -30,10 +29,10 @@ import pandas.core.common as com from pandas.tools.merge import concat from pandas import compat -from pandas.compat import u, PY3, range +from 
pandas.compat import u, PY3, range, lrange from pandas.io.common import PerformanceWarning from pandas.core.config import get_option -from pandas.computation.pytables import Expr +from pandas.computation.pytables import Expr, maybe_expression import pandas.lib as lib import pandas.algos as algos @@ -62,22 +61,27 @@ def _ensure_encoding(encoding): encoding = _default_encoding return encoding + Term = Expr + def _ensure_term(where): - """ ensure that the where is a Term or a list of Term - this makes sure that we are capturing the scope of variables - that are passed """ - - # create the terms here with a frame_level=2 (we are 2 levels down) - if isinstance(where, (list, tuple)): - where = [ w if isinstance(w, Term) else Term(w, scope_level=2) for w in where if w is not None ] - elif where is None or isinstance(where, Coordinates): - pass - elif not isinstance(where, Term): + """ + ensure that the where is a Term or a list of Term + this makes sure that we are capturing the scope of variables + that are passed + create the terms here with a frame_level=2 (we are 2 levels down) + """ + + # only consider list/tuple here as an ndarray is automaticaly a coordinate list + if isinstance(where, (list,tuple)): + where = [w if not maybe_expression(w) else Term(w, scope_level=2) + for w in where if w is not None ] + elif maybe_expression(where): where = Term(where, scope_level=2) return where + class PossibleDataLossError(Exception): pass @@ -2438,7 +2442,7 @@ def read(self, **kwargs): sdict = {} for name in items: key = 'sparse_frame_%s' % name - s = SparseFrameStorer(self.parent, getattr(self.group, key)) + s = SparseFrameFixed(self.parent, getattr(self.group, key)) s.infer_axes() sdict[name] = s.read() return SparsePanel(sdict, items=items, default_kind=self.default_kind, @@ -3527,7 +3531,7 @@ def delete(self, where=None, **kwargs): # we must remove in reverse order! 
pg = groups.pop() for g in reversed(groups): - rows = l.take(range(g, pg)) + rows = l.take(lrange(g, pg)) table.removeRows(start=rows[rows.index[0] ], stop=rows[rows.index[-1]] + 1) pg = g diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index b13c8e83d8777..ee42a58a38c1c 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1931,8 +1931,8 @@ def test_remove_where(self): # try to remove non-table (with crit) # non-table ok (where = None) wp = tm.makePanel() - store.put('wp', wp, fmt='t') - store.remove('wp', [("minor_axis=['A', 'D']")]) + store.put('wp', wp, format='table') + store.remove('wp', ["minor_axis=['A', 'D']"]) rs = store.select('wp') expected = wp.reindex(minor_axis=['B', 'C']) assert_panel_equal(rs, expected) @@ -2031,6 +2031,7 @@ def test_invalid_terms(self): df.ix[0:4,'string'] = 'bar' wp = tm.makePanel() p4d = tm.makePanel4D() + store.put('df', df, format='table') store.put('wp', wp, format='table') store.put('p4d', p4d, format='table') diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 5fc02d97b239e..8c5764a3f59a6 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -7,18 +7,14 @@ import numpy as np from pandas.tslib import iNaT -from pandas import (Series, DataFrame, date_range, DatetimeIndex, Timestamp, - Panel) +from pandas import Series, DataFrame, date_range, DatetimeIndex, Timestamp from pandas import compat from pandas.compat import range, long, lrange, lmap, u from pandas.core.common import notnull, isnull -import pandas.compat as compat import pandas.core.common as com import pandas.util.testing as tm import pandas.core.config as cf -from numpy.random import randn - _multiprocess_can_split_ = True @@ -110,64 +106,6 @@ def test_isnull_lists(): assert(not result.any()) -def test_is_string(): - class MyUnicode(compat.text_type): - pass - - if not compat.PY3: - class MyString(str): - pass - else: - MyString = MyUnicode - - 
strings = ('s', np.str_('a'), np.unicode_('unicode_string'), - MyString('asdfasdfasdf'), u('asdf'), MyUnicode(u('asdf'))) - not_strings = [], 1, {}, set(), np.array(['1']), np.array([u('1')]) - - for string in strings: - assert com.is_string(string), '{0} is not a string'.format(string) - - for not_string in not_strings: - assert not com.is_string(not_string), ('{0} is a ' - 'string'.format(not_string)) - - -def test_is_frame(): - df = DataFrame(randn(2, 1)) - assert com.is_frame(df) - assert not com.is_frame('s') - - -def test_is_series(): - s = Series(randn(2)) - assert com.is_series(s) - assert not com.is_series(s.values) - - -def test_is_panel(): - p = Panel(randn(2, 3, 4)) - assert com.is_panel(p) - assert not com.is_panel(2) - - -def test_is_pd_obj(): - df = DataFrame(randn(2, 1)) - s = Series(randn(2)) - p = Panel(randn(2, 3, 4)) - for obj in (df, s, p): - assert com.is_pd_obj(obj) - assert not com.is_pd_obj(obj.values) - - -def test_is_ndframe(): - df = DataFrame(randn(2, 1)) - p = Panel(randn(2, 3, 4)) - # should add series after @jreback's ndframe to series pr - for obj in (df, p): - assert com.is_ndframe(obj) - assert not com.is_ndframe(obj.values) - - def test_isnull_datetime(): assert (not isnull(datetime.now())) assert notnull(datetime.now()) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 7ba62a75a00cd..8145fd9c5c67d 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11,6 +11,8 @@ import nose import functools import itertools +from itertools import product + from pandas.compat import( map, zip, range, long, lrange, lmap, lzip, OrderedDict, cPickle as pickle, u, StringIO @@ -84,6 +86,7 @@ def _check_mixed_float(df, dtype = None): if dtypes.get('D'): assert(df.dtypes['D'] == dtypes['D']) + def _check_mixed_int(df, dtype = None): dtypes = dict(A = 'int32', B = 'uint64', C = 'uint8', D = 'int64') if isinstance(dtype, compat.string_types): @@ -100,8 +103,6 @@ def _check_mixed_int(df, dtype = None): 
assert(df.dtypes['D'] == dtypes['D']) - - class CheckIndexing(object): _multiprocess_can_split_ = True @@ -125,6 +126,14 @@ def test_getitem(self): with assertRaisesRegexp(KeyError, 'no item named random'): self.frame['random'] + df = self.frame.copy() + df['$10'] = randn(len(df)) + ad = randn(len(df)) + df['@awesome_domain'] = ad + self.assertRaises(KeyError, df.__getitem__, 'df["$10"]') + res = df['@awesome_domain'] + assert_array_equal(ad, res.values) + def test_getitem_dupe_cols(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b']) try: @@ -11113,17 +11122,21 @@ def skip_if_no_ne(engine='numexpr'): "installed") +def skip_if_no_pandas_parser(parser): + if parser != 'pandas': + raise nose.SkipTest("cannot evaluate with parser {0!r}".format(parser)) + + class TestDataFrameQueryNumExprPandas(unittest.TestCase): @classmethod def setUpClass(cls): cls.engine = 'numexpr' cls.parser = 'pandas' - skip_if_no_ne(cls.engine) - cls.frame = _frame.copy() + skip_if_no_ne() @classmethod def tearDownClass(cls): - del cls.frame, cls.engine, cls.parser + del cls.engine, cls.parser def test_date_query_method(self): engine, parser = self.engine, self.parser @@ -11140,28 +11153,37 @@ def test_query_scope(self): engine, parser = self.engine, self.parser from pandas.computation.common import NameResolutionError - df = DataFrame({"i": lrange(10), - "+": lrange(3, 13), "r": lrange(4, 14)}) + df = DataFrame({"i": lrange(10), "+": lrange(3, 13), + "r": lrange(4, 14)}) i, s = 5, 6 self.assertRaises(NameResolutionError, df.query, 'i < 5', - local_dict=locals(), global_dict=globals(), - engine=engine, parser=parser) - self.assertRaises(NameResolutionError, df.query, 'i - +', engine=engine, - local_dict=locals(), global_dict=globals(), + engine=engine, parser=parser, local_dict={'i': i}) + self.assertRaises(SyntaxError, df.query, 'i - +', engine=engine, parser=parser) self.assertRaises(NameResolutionError, df.query, 'i == s', - engine=engine, local_dict=locals(), - 
global_dict=globals(), parser=parser) + engine=engine, parser=parser, local_dict={'i': i, + 's': s}) + + def test_query_scope_index(self): + engine, parser = self.engine, self.parser + from pandas.computation.common import NameResolutionError + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=Index(range(10), name='blob'), + columns=['a', 'b', 'c']) + from numpy import sin df.index.name = 'sin' self.assertRaises(NameResolutionError, df.query, 'sin > 5', - engine=engine, parser=parser, local_dict=locals(), - global_dict=globals()) + engine=engine, parser=parser, local_dict={'sin': + sin}) def test_query(self): engine, parser = self.engine, self.parser df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) - assert_frame_equal(df.query('a < b', engine=engine, parser=parser), df[df.a < df.b]) - assert_frame_equal(df.query('a + b > b * c', engine=engine, parser=parser), + + assert_frame_equal(df.query('a < b', engine=engine, parser=parser), + df[df.a < df.b]) + assert_frame_equal(df.query('a + b > b * c', engine=engine, + parser=parser), df[df.a + df.b > df.b * df.c]) local_dict = dict(df.iteritems()) @@ -11174,20 +11196,34 @@ def test_query(self): df.query, 'a < b', local_dict={'df': df}, engine=engine, parser=parser) - def test_query_index(self): + def test_query_index_with_name(self): engine, parser = self.engine, self.parser df = DataFrame(np.random.randint(10, size=(10, 3)), index=Index(range(10), name='blob'), columns=['a', 'b', 'c']) - assert_frame_equal(df.query('index < b', engine=engine, parser=parser), - df[df.index < df.b]) - assert_frame_equal(df.query('index < 5', engine=engine, parser=parser), - df[df.index < 5]) - assert_frame_equal(df.query('(blob < 5) & (a < b)', engine=engine, - parser=parser), - df[(df.index < 5) & (df.a < df.b)]) - assert_frame_equal(df.query('blob < b', engine=engine, parser=parser), - df[df.index < df.b]) + res = df.query('(blob < 5) & (a < b)', engine=engine, parser=parser) + expec = df[(df.index < 5) & (df.a 
< df.b)] + assert_frame_equal(res, expec) + + res = df.query('blob < b', engine=engine, parser=parser) + expec = df[df.index < df.b] + + assert_frame_equal(res, expec) + + def test_query_index_without_name(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=range(10), columns=['a', 'b', 'c']) + + # "index" should refer to the index + res = df.query('index < b', engine=engine, parser=parser) + expec = df[df.index < df.b] + assert_frame_equal(res, expec) + + # test against a scalar + res = df.query('index < 5', engine=engine, parser=parser) + expec = df[df.index < 5] + assert_frame_equal(res, expec) def test_nested_scope(self): engine = self.engine @@ -11217,6 +11253,40 @@ def test_nested_scope(self): expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) assert_frame_equal(result, expected) + def test_local_syntax(self): + skip_if_no_pandas_parser(self.parser) + + from pandas.computation.common import NameResolutionError + + engine, parser = self.engine, self.parser + df = DataFrame(randn(100, 10), columns=list('abcdefghij')) + b = 1 + expect = df[df.a < b] + result = df.query('a < @b', engine=engine, parser=parser) + assert_frame_equal(result, expect) + + # scope issue with self.assertRaises so just catch it and let it pass + try: + df.query('a < @b', engine=engine, parser=parser) + except NameResolutionError: + pass + + del b + expect = df[df.a < df.b] + result = df.query('a < b', engine=engine, parser=parser) + assert_frame_equal(result, expect) + + def test_chained_cmp_and_in(self): + skip_if_no_pandas_parser(self.parser) + engine, parser = self.engine, self.parser + cols = list('abc') + df = DataFrame(randn(100, len(cols)), columns=cols) + res = df.query('a < b < c and a not in b not in c', engine=engine, + parser=parser) + ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) + expec = df[ind] + assert_frame_equal(res, expec) + class 
TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): @classmethod @@ -11304,9 +11374,9 @@ def tearDownClass(cls): del cls.frame def test_nested_scope(self): - df = DataFrame(np.random.randn(5, 3)) + df = DataFrame(np.random.randn(5, 3)) df2 = DataFrame(np.random.randn(5, 3)) - expected = df[(df>0) & (df2>0)] + expected = df[(df > 0) & (df2 > 0)] result = df['(df>0) & (df2>0)'] assert_frame_equal(result, expected) @@ -11367,6 +11437,139 @@ def test_complex_boolean_expression(self): expec = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] assert_frame_equal(res, expec) + def test_local_syntax(self): + from pandas.computation.common import NameResolutionError + df = DataFrame(randn(1000, 10), columns=list('abcdefghij')) + b = 1 + expect = df[df.a < b] + result = df['a < @b'] + assert_frame_equal(result, expect) + + # scope issue with self.assertRaises so just catch it and let it pass + try: + df['a < b'] + except NameResolutionError: + pass + + del b + expect = df[df.a < df.b] + result = df['a < b'] + assert_frame_equal(result, expect) + + +PARSERS = 'python', 'pandas' +ENGINES = 'python', 'numexpr' + + +class TestDataFrameQueryStrings(object): + def check_str_query_method(self, parser, engine): + skip_if_no_pandas_parser(parser) + df = DataFrame(randn(10, 1), columns=['b']) + df['strings'] = Series(list('aabbccddee')) + expect = df[df.strings == 'a'] + res = df.query('strings == "a"', engine=engine, parser=parser) + assert_frame_equal(res, expect) + assert_frame_equal(res, df[df.strings.isin(['a'])]) + + def test_str_query_method(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_str_query_method, parser, engine + + def test_str_list_query_method(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_str_list_query_method, parser, engine + + def check_str_list_query_method(self, parser, engine): + skip_if_no_pandas_parser(parser) + df = DataFrame(randn(10, 1), columns=['b']) + df['strings'] 
= Series(list('aabbccddee')) + expect = df[df.strings.isin(['a', 'b'])] + res = df.query('strings == ["a", "b"]', engine=engine, parser=parser) + assert_frame_equal(res, expect) + + def test_str_query(self): + skip_if_no_ne() + df = DataFrame(randn(10, 1), columns=['b']) + df['strings'] = Series(list('aabbccddee')) + expect = df[df.strings == 'a'] + res = df['strings == "a"'] + assert_frame_equal(res, expect) + + res = df['"a" == strings'] + assert_frame_equal(res, expect) + + def test_str_query_list(self): + skip_if_no_ne() + df = DataFrame(randn(10, 1), columns=['b']) + df['strings'] = Series(list('aabbccddee')) + expect = df[df.strings.isin(['a', 'b'])] + res = df['strings == ["a", "b"]'] + assert_frame_equal(res, expect) + + res = df['["a", "b"] == strings'] + assert_frame_equal(res, expect) + +class TestDataFrameEvalNumExprPandas(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.engine = 'numexpr' + cls.parser = 'pandas' + skip_if_no_ne() + + @classmethod + def tearDownClass(cls): + del cls.engine, cls.parser + + def setUp(self): + self.frame = DataFrame(randn(10, 3), columns=list('abc')) + + def tearDown(self): + del self.frame + + def test_simple_expr(self): + res = self.frame.eval('a + b', engine=self.engine, parser=self.parser) + expect = self.frame.a + self.frame.b + assert_series_equal(res, expect) + + def test_bool_arith_expr(self): + res = self.frame.eval('a[a < 1] + b', engine=self.engine, + parser=self.parser) + expect = self.frame.a[self.frame.a < 1] + self.frame.b + assert_series_equal(res, expect) + + +class TestDataFrameEvalNumExprPython(TestDataFrameEvalNumExprPandas): + @classmethod + def setUpClass(cls): + cls.engine = 'numexpr' + cls.parser = 'python' + skip_if_no_ne() + + @classmethod + def tearDownClass(cls): + del cls.engine, cls.parser + + +class TestDataFrameEvalPythonPandas(TestDataFrameEvalNumExprPandas): + @classmethod + def setUpClass(cls): + cls.engine = 'python' + cls.parser = 'pandas' + + @classmethod + def 
tearDownClass(cls): + del cls.engine, cls.parser + + +class TestDataFrameEvalPythonPython(TestDataFrameEvalNumExprPython): + @classmethod + def setUpClass(cls): + cls.engine = cls.parser = 'python' + + @classmethod + def tearDownClass(cls): + del cls.engine, cls.parser + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py index 2fb5a22ce0cb8..beefec256ed81 100644 --- a/vb_suite/indexing.py +++ b/vb_suite/indexing.py @@ -106,7 +106,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(50000, 100)) df2 = DataFrame(np.random.randn(50000, 100)) expr.set_numexpr_threads(1) From 0d8997a6db0f4aca6cb397cd116143355baba2e9 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 2 Sep 2013 18:00:22 -0400 Subject: [PATCH 11/16] ENH: add in, not in, and string/list query support --- bench/bench_with_subset.R | 53 ++++ bench/bench_with_subset.py | 37 +++ doc/source/api.rst | 1 + doc/source/comparison_with_r.rst | 96 +++++-- doc/source/enhancingperf.rst | 135 ++++++++-- doc/source/indexing.rst | 207 +++++++++++++-- doc/source/io.rst | 30 ++- doc/source/v0.10.0.txt | 5 +- doc/source/v0.13.0.txt | 174 +++++++++++++ pandas/computation/align.py | 21 +- pandas/computation/common.py | 3 +- pandas/computation/engines.py | 14 +- pandas/computation/eval.py | 150 ++++++++--- pandas/computation/expr.py | 329 ++++++++++++++++++----- pandas/computation/ops.py | 217 ++++++++++------ pandas/computation/pytables.py | 21 +- pandas/computation/tests/test_eval.py | 294 ++++++++++++++++++--- pandas/core/common.py | 2 + pandas/core/frame.py | 94 +++++-- pandas/io/pytables.py | 12 +- pandas/io/tests/test_pytables.py | 41 ++- pandas/tests/test_frame.py | 361 +++++++++++++++++++++++++- pandas/util/testing.py | 8 + 23 files changed, 1973 insertions(+), 332 deletions(-) create 
mode 100644 bench/bench_with_subset.R create mode 100644 bench/bench_with_subset.py diff --git a/bench/bench_with_subset.R b/bench/bench_with_subset.R new file mode 100644 index 0000000000000..69d0f7a9eec63 --- /dev/null +++ b/bench/bench_with_subset.R @@ -0,0 +1,53 @@ +library(microbenchmark) +library(data.table) + + +data.frame.subset.bench <- function (n=1e7, times=30) { + df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + print(microbenchmark(subset(df, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c), + times=times)) +} + + +# data.table allows something very similar to query with an expression +# but we have chained comparisons AND we're faster BOO YAH! +data.table.subset.expression.bench <- function (n=1e7, times=30) { + dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + print(microbenchmark(dt[, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c], + times=times)) +} + + +# compare against subset with data.table for good measure +data.table.subset.bench <- function (n=1e7, times=30) { + dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + print(microbenchmark(subset(dt, a <= b & b <= (c ^ 2 + b ^ 2 - a) & b > c), + times=times)) +} + + +data.frame.with.bench <- function (n=1e7, times=30) { + df <- data.frame(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + + print(microbenchmark(with(df, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3), + times=times)) +} + + +data.table.with.bench <- function (n=1e7, times=30) { + dt <- data.table(a=rnorm(n), b=rnorm(n), c=rnorm(n)) + print(microbenchmark(with(dt, a + b * (c ^ 2 + b ^ 2 - a) / (a * c) ^ 3), + times=times)) +} + + +bench <- function () { + data.frame.subset.bench() + data.table.subset.expression.bench() + data.table.subset.bench() + data.frame.with.bench() + data.table.with.bench() +} + + +bench() diff --git a/bench/bench_with_subset.py b/bench/bench_with_subset.py new file mode 100644 index 0000000000000..878b9c08e62d8 --- /dev/null +++ b/bench/bench_with_subset.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +""" +Microbenchmarks 
for comparison with R's "with" and "subset" functions +""" + +from __future__ import print_function +from timeit import timeit + + +def bench_with(n=1e7, times=10, repeat=3): + setup = "from pandas import DataFrame\n" + setup += "from numpy.random import randn\n" + setup += "df = DataFrame(randn(%d, 3), columns=list('abc'))\n" % n + setup += "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'" + print('DataFrame.eval:') + print(timeit('df.eval(s)', setup=setup, repeat=repeat, number=times)) + + +def bench_subset(n=1e7, times=10, repeat=3): + setup = "from pandas import DataFrame\n" + setup += "from numpy.random import randn\n" + setup += "df = DataFrame(randn(%d, 3), columns=list('abc'))\n" % n + setup += "s = 'a <= b <= (c ** 2 + b ** 2 - a) and b > c'" + print('DataFrame.query:') + print(timeit('df.query(s)', setup=setup, repeat=repeat, number=times)) + print('DataFrame.__getitem__:') + print(timeit('df[s]', setup=setup, repeat=repeat, number=times)) + + +def bench(): + bench_with() + bench_subset() + + +if __name__ == '__main__': + bench() diff --git a/doc/source/api.rst b/doc/source/api.rst index affa840781c34..28c1515e93bc5 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -514,6 +514,7 @@ Computations / Descriptive Stats DataFrame.cumsum DataFrame.describe DataFrame.diff + DataFrame.eval DataFrame.kurt DataFrame.mad DataFrame.max diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index 5759768051c0e..012a6fe6baf96 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -1,28 +1,88 @@ .. currentmodule:: pandas .. _compare_with_r: -******************************* Comparison with R / R libraries ******************************* -Since pandas aims to provide a lot of the data manipulation and analysis -functionality that people use R for, this page was started to provide a more -detailed look at the R language and it's many 3rd party libraries as they -relate to pandas. 
In offering comparisons with R and CRAN libraries, we care -about the following things: +Since ``pandas`` aims to provide a lot of the data manipulation and analysis +functionality that people use `R `__ for, this page +was started to provide a more detailed look at the `R language +`__ and its many third +party libraries as they relate to ``pandas``. In comparisons with R and CRAN +libraries, we care about the following things: - - **Functionality / flexibility**: what can / cannot be done with each tool - - **Performance**: how fast are operations. Hard numbers / benchmarks are + - **Functionality / flexibility**: what can/cannot be done with each tool + - **Performance**: how fast are operations. Hard numbers/benchmarks are preferable - - **Ease-of-use**: is one tool easier or harder to use (you may have to be - the judge of this given side-by-side code comparisons) + - **Ease-of-use**: Is one tool easier/harder to use (you may have to be + the judge of this, given side-by-side code comparisons) + +This page is also here to offer a bit of a translation guide for users of these +R packages. + +Base R +------ + +|subset|_ +~~~~~~~~~~ + +.. versionadded:: 0.13 + +The :meth:`~pandas.DataFrame.query` method is similar to the base R ``subset`` +function. In R you might want to get the rows of a ``data.frame`` where one +column's values are less than another column's values: + + .. code-block:: r + + df <- data.frame(a=rnorm(10), b=rnorm(10)) + subset(df, a <= b) + df[df$a <= df$b,] # note the comma + +In ``pandas``, there are a few ways to perform subsetting. You can use +:meth:`~pandas.DataFrame.query` or pass an expression as if it were an +index/slice as well as standard boolean indexing: + + .. 
ipython:: python + + from pandas import DataFrame + from numpy.random import randn + + df = DataFrame({'a': randn(10), 'b': randn(10)}) + df.query('a <= b') + df['a <= b'] + df[df.a <= df.b] + df.loc[df.a <= df.b] -As I do not have an encyclopedic knowledge of R packages, feel free to suggest -additional CRAN packages to add to this list. This is also here to offer a big -of a translation guide for users of these R packages. +For more details and examples see :ref:`the query documentation +`. -data.frame ----------- + +|with|_ +~~~~~~~~ + +.. versionadded:: 0.13 + +An expression using a data.frame called ``df`` in R with the columns ``a`` and +``b`` would be evaluated using ``with`` like so: + + .. code-block:: r + + df <- data.frame(a=rnorm(10), b=rnorm(10)) + with(df, a + b) + df$a + df$b # same as the previous expression + +In ``pandas`` the equivalent expression, using the +:meth:`~pandas.DataFrame.eval` method, would be: + + .. ipython:: python + + df = DataFrame({'a': randn(10), 'b': randn(10)}) + df.eval('a + b') + df.a + df.b # same as the previous expression + +In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than +evaluation in pure Python. For more details and examples see :ref:`the eval +documentation `. zoo --- @@ -36,3 +96,9 @@ plyr reshape / reshape2 ------------------ + +.. |with| replace:: ``with`` +.. _with: http://finzi.psych.upenn.edu/R/library/base/html/with.html + +.. |subset| replace:: ``subset`` +.. _subset: http://finzi.psych.upenn.edu/R/library/base/html/subset.html diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 47d2acc578a21..ffd765ed7f6a5 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -292,14 +292,13 @@ Read more in the `cython docs `__. .. _enhancingperf.eval: -.. 
versionadded:: 0.13 - Expression Evaluation via :func:`~pandas.eval` ---------------------------------------------- -New in pandas v0.13 a top-level function :func:`~pandas.eval` implements -expression evaluation of expressions containing :class:`~pandas.Series` and -:class:`~pandas.DataFrame` objects. +.. versionadded:: 0.13 + +The top-level function :func:`~pandas.eval` implements expression evaluation of +:class:`~pandas.Series` and :class:`~pandas.DataFrame` objects. .. note:: @@ -307,11 +306,11 @@ expression evaluation of expressions containing :class:`~pandas.Series` and install ``numexpr``. See the :ref:`recommended dependencies section ` for more details. -The major benefit of using :func:`~pandas.eval` for expression evaluation -rather than just straight-up Python is two-fold: large -:class:`~pandas.DataFrame` objects are evaluated more efficiently and large -expressions are evaluated all at once by the underlying engine (by default -``numexpr`` is used for evaluation). +The point of using :func:`~pandas.eval` for expression evaluation rather than +plain Python is two-fold: 1) large :class:`~pandas.DataFrame` objects are +evaluated more efficiently and 2) large arithmetic and boolean expressions are +evaluated all at once by the underlying engine (by default ``numexpr`` is used +for evaluation). .. note:: @@ -323,11 +322,8 @@ expressions are evaluated all at once by the underlying engine (by default :class:`~pandas.core.frame.DataFrame` with more than 10,000 rows. -:func:`~pandas.eval` supports all arithmetic expressions -supported by the engine. The ``numexpr`` engine uses ``numexpr`` under the hood -to evaluate expressions efficiently, while allowing a slightly modified--and we -think more intuitive--syntax for expressions. - +:func:`~pandas.eval` supports all arithmetic expressions supported by the +engine in addition to some extensions available only in pandas. .. note:: @@ -338,8 +334,7 @@ think more intuitive--syntax for expressions. 
:func:`~pandas.eval` Examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:func:`~pandas.eval` works wonders for expressions containing -large arrays +:func:`~pandas.eval` works wonders for expressions containing large arrays First let's create 4 decent-sized arrays to play with: @@ -377,7 +372,7 @@ Now let's do the same thing but with comparisons: %timeit pd.eval('(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)') -:func:`~pandas.eval` also works with "unaligned" pandas objects: +:func:`~pandas.eval` also works with unaligned pandas objects: .. ipython:: python @@ -389,13 +384,76 @@ Now let's do the same thing but with comparisons: %timeit pd.eval('df1 + df2 + df3 + df4 + s') -There are also two different flavors of parsers and and two different engines -to use as the backend. +The ``DataFrame.eval`` method +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In addition to the top level :func:`~pandas.eval` function you can also +evaluate an expression in the "context" of a ``DataFrame``. + + +.. ipython:: python + + df = DataFrame(randn(10, 2), columns=['a', 'b']) + df.eval('a + b') + + +Any expression that is a valid :func:`~pandas.eval` expression is also a valid +``DataFrame.eval`` expression, with the added benefit that *you don't have to +prefix the name of the* ``DataFrame`` *to the column you're interested in +evaluating*. + + +Local Variables +~~~~~~~~~~~~~~~ + +You can refer to local variables the same way you would in vanilla Python + +.. ipython:: python + + df = DataFrame(randn(10, 2), columns=['a', 'b']) + newcol = randn(len(df)) + df.eval('b + newcol') + +.. note:: + + The one exception is when you have a local (or global) with the same name as + a column in the ``DataFrame`` + + .. ipython:: python + :okexcept: + + df = DataFrame(randn(10, 2), columns=['a', 'b']) + a = randn(len(df)) + df.eval('a + b') + + To deal with these conflicts, a special syntax exists for referring + variables with the same name as a column + + .. 
ipython:: python + + df.eval('@a + b') + + The same is true for :meth:`~pandas.DataFrame.query` and + :meth:`~pandas.DataFrame.__getitem__` passed an expression + + .. ipython:: python + + df.query('@a < b') + df['@a < b'] + + .. ipython:: python + :suppress: + + del a + :func:`~pandas.eval` Parsers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The default ``"pandas"`` parser allows a more intuitive syntax for expressing +There are two different parsers and two different engines you can use as +the backend. + +The default ``'pandas'`` parser allows a more intuitive syntax for expressing query-like operations (comparisons, conjunctions and disjunctions). In particular, the precedence of the ``&`` and ``|`` operators is made equal to the precedence of the corresponding boolean operations ``and`` and ``or``. @@ -413,7 +471,8 @@ semantics. np.all(x == y) -The same expression can be "anded" with the word :keyword:`and` as well: +The same expression can be "anded" together with the word :keyword:`and` as +well: .. ipython:: python @@ -424,6 +483,10 @@ The same expression can be "anded" with the word :keyword:`and` as well: np.all(x == y) +The ``and`` and ``or`` operators here have the same precedence that they would +in vanilla Python. + + :func:`~pandas.eval` Backends ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -432,11 +495,9 @@ ol' Python. .. note:: - Using the ``'python'`` engine is generally *not* useful, except for - comparing performance and testing other - :func:`~pandas.eval` engines against it. You will acheive - **no** performance benefits using :func:`~pandas.eval` with - ``engine='python'``. + Using the ``'python'`` engine is generally *not* useful, except for testing + other :func:`~pandas.eval` engines against it. You will achieve **no** + performance benefits using :func:`~pandas.eval` with ``engine='python'``. 
You can see this by using :func:`~pandas.eval` with the ``'python'`` engine is actually a bit slower (not by much) than evaluating the same expression in @@ -449,3 +510,23 @@ Python: .. ipython:: python %timeit pd.eval('df1 + df2 + df3 + df4', engine='python') + + +:func:`~pandas.eval` Performance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:func:`~pandas.eval` is intended to speed up certain kinds of operations. In +particular, those operations involving complex expressions with large +``DataFrame``/``Series`` objects should see a significant performance benefit. +Here is a plot showing the running time of :func:`~pandas.eval` as function of +the size of the frame involved in the computation. The two lines are two +different engines. + + +.. image:: _static/eval-perf.png + + +Note that operations with smallish objects (around 15,000 rows) are faster +using plain Python: + +.. image:: _static/eval-perf-intersect.png diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index e5e6e84cc0a0d..9cf574247f56d 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1008,10 +1008,11 @@ convert to an integer index: .. _indexing.query: -.. versionadded:: 0.13 - The :meth:`~pandas.DataFrame.query` Method ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.13 + :class:`~pandas.DataFrame` objects have a :meth:`~pandas.DataFrame.query` method that allows selection using a string consisting of columns of the calling :class:`~pandas.DataFrame`. @@ -1042,13 +1043,76 @@ with the name ``a``. df df.query('a < b and b < c') -A use case for :meth:`~pandas.DataFrame.query` is when you have a collection of -:class:`~pandas.DataFrame` s that have a subset of column names (or index -names) in common. You can pass the same query to both frames *without* having -to specify which frame you're interested in querying +If instead you don't want to or cannot name your index, you can use the name +``index`` in your query expression: + +.. 
ipython:: python + :suppress: + + old_index = index + del index + +.. ipython:: python + + df = DataFrame(randint(n, size=(n, 2)), columns=list('bc')) + df + df.query('index < b < c') + +.. ipython:: python + :suppress: + + index = old_index + del old_index + + +:class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can also use the levels of a ``DataFrame`` with a +:class:`~pandas.MultiIndex` as if they were columns in the frame: + +.. ipython:: python + + import pandas.util.testing as tm + + colors = tm.choice(['red', 'green'], size=10) + foods = tm.choice(['eggs', 'ham'], size=10) + colors + foods + + index = MultiIndex.from_arrays([colors, foods], names=['color', 'food']) + df = DataFrame(randn(10, 2), index=index) + df + df.query('color == "red"') + +If the levels of the ``MultiIndex`` are unnamed, you can refer to them using +special names: + .. ipython:: python + index.names = [None, None] + df = DataFrame(randn(10, 2), index=index) + df + df.query('ilevel_0 == "red"') + + +The convention is ``ilevel_0``, which means "index level 0" for the 0th level +of the ``index``. + + +:meth:`~pandas.DataFrame.query` Use Cases +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One use case for :meth:`~pandas.DataFrame.query` is when you have a collection of +:class:`~pandas.DataFrame` objects that have a subset of column names (or index +levels/names) in common. You can pass the same query to both frames *without* +having to specify which frame you're interested in querying + +.. 
ipython:: python + + df = DataFrame(randint(n, size=(n, 2)), columns=list('bc')) + df.index.name = 'a' df2 = DataFrame(randint(n + 10, size=(n + 10, 3)), columns=list('abc')) df2 expr = 'a < b & b < c' @@ -1069,11 +1133,18 @@ This functionality can of course be combined with a slightly modified and more readable Python syntax implemented in the workhorse function that underlies :meth:`~pandas.DataFrame.query`--:func:`~pandas.eval`. + +:meth:`~pandas.DataFrame.query` Python versus pandas Syntax Comparison +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Full numpy-like syntax .. ipython:: python + df = DataFrame(randint(n, size=(n, 3)), columns=list('abc')) + df df['(a < b) & (b < c)'] + df[(df.a < df.b) & (df.b < df.c)] Slightly nicer by removing the parentheses @@ -1097,8 +1168,94 @@ As you can see, these are all equivalent ways to express the same operation (in fact, they are all ultimately parsed into something very similar to the first example of the indexing syntax above). -You can also negate boolean expressions with the word ``not`` or the ``~`` -operator. +The ``in`` and ``not in`` operators +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`~pandas.DataFrame.query` also supports special use of Python's ``in`` and +``not in`` comparison operators, providing a succinct syntax for calling the +``isin`` method of a ``Series`` or ``DataFrame``. + +.. ipython:: python + :suppress: + + old_d = d + del d + +.. ipython:: python + + # get all rows where columns "a" and "b" have overlapping values + df = DataFrame({'a': list('aaaabbbbcccc'), 'b': list('aabbccddeeff'), + 'c': randint(5, size=12), 'd': randint(9, size=12)}) + df + df['a in b'] + + # How you'd do it in pure Python + df[df.b.isin(df.a)] + + df['a not in b'] + + # pure Python + df[~df.b.isin(df.a)] + + +You can, of course, combine this with other expressions for very succinct +queries: + + +.. 
ipython:: python + + # rows where cols a and b have overlapping values and col c's values are less than col d's + df['a in b and c < d'] + + # pure Python + df[df.b.isin(df.a) & (df.c < df.d)] + + +.. note:: + + Note that ``in`` and ``not in`` are evaluated in Python, since ``numexpr`` + has no equivalent of this operation. However, **only the** ``in``/``not in`` + **expression itself** is evaluated in vanilla Python. For example, in the + expression + + .. code-block:: python + + df['a in b + c + d'] + + ``(b + c + d)`` is evaluated by ``numexpr`` and *then* the ``in`` + operation is evaluated in plain Python. In general, any operations that can + be evaluated using ``numexpr`` will be. + +Special use of the ``==`` operator with ``list`` objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Comparing a ``list`` of values to a column using ``==``/``!=`` works similarly +to ``in``/``not in`` + +.. ipython:: python + + df['b == ["a", "b", "c"]'] + + # pure Python + df[df.b.isin(["a", "b", "c"])] + + df['c == [1, 2]'] + + df['c != [1, 2]'] + + # using in/not in + df['[1, 2] in c'] + + df['[1, 2] not in c'] + + # pure Python + df[df.c.isin([1, 2])] + + +Boolean Operators +~~~~~~~~~~~~~~~~~ + +You can negate boolean expressions with the word ``not`` or the ``~`` operator. .. ipython:: python @@ -1113,26 +1270,38 @@ Of course, expressions can be arbitrarily complex too .. ipython:: python - # nice short query syntax - pretty = df['a < b < c and (not bools) or bools > 2'] + # short query syntax + shorter = df['a < b < c and (not bools) or bools > 2'] - # equivalent in pure Python, yuck! - yuck = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] + # equivalent in pure Python + longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] - pretty - yuck + shorter + longer + + shorter == longer + +.. ipython:: python + :suppress: + + d = old_d + del old_d - yuck == pretty .. 
_indexing.class: Index objects ------------- -The pandas Index class and its subclasses can be viewed as implementing an -*ordered set* in addition to providing the support infrastructure necessary for -lookups, data alignment, and reindexing. The easiest way to create one directly -is to pass a list or other sequence to ``Index``: +The pandas :class:`~pandas.Index` class and its subclasses can be viewed as +implementing an *ordered multiset*. Duplicates are allowed. However, if you try +to convert an :class:`~pandas.Index` object with duplicate entries into a +``set``, an exception will be raised. + +:class:`~pandas.Index` also provides the infrastructure necessary for +lookups, data alignment, and reindexing. The easiest way to create an +:class:`~pandas.Index` directly is to pass a ``list`` or other sequence to +:class:`~pandas.Index`: .. ipython:: python diff --git a/doc/source/io.rst b/doc/source/io.rst index 19fcbd6f4c851..7ebfe753e76b3 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2032,7 +2032,26 @@ The right-hand side of the sub-expression (after a comparsion operator) can be: - lists, e.g. ``"['A','B']"`` - variables that are defined in the local names space, e.g. ``date`` -Here is an example: +Here are some examples: + +.. ipython:: python + + dfq = DataFrame(randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) + store.append('dfq',dfq,format='table',data_columns=True) + +Use boolean expressions, with in-line function evaluation. + +.. ipython:: python + + store.select('dfq',"index>Timestamp('20130104') & columns=['A', 'B']") + +Use an inline column reference + +.. ipython:: python + + store.select('dfq',where="A>0 or C>0") + +Works with a Panel as well. .. ipython:: python @@ -2060,6 +2079,15 @@ space. These are in terms of the total number of rows in a table. store.select('wp',"major_axis>20000102 & minor_axis=['A','B']", start=0, stop=10) +.. 
note:: + + ``select`` will raise a ``ValueError`` if the query expression has an unknown + variable reference. Usually this means that you are trying to select on a column + that is **not** a data_column. + + ``select`` will raise a ``SyntaxError`` if the query expression is not valid. + + .. _io.hdf5-timedelta: **Using timedelta64[ns]** diff --git a/doc/source/v0.10.0.txt b/doc/source/v0.10.0.txt index 476760e4b1464..0c86add1225ad 100644 --- a/doc/source/v0.10.0.txt +++ b/doc/source/v0.10.0.txt @@ -258,10 +258,11 @@ Updated PyTables Support store.append('wp',wp) # selecting via A QUERY - store.select('wp', "major_axis>20000102 & minor_axis=['A','B']") + store.select('wp', + [ Term('major_axis>20000102'), Term('minor_axis', '=', ['A','B']) ]) # removing data from tables - store.remove('wp', 'major_axis>wp.major_axis[3]') + store.remove('wp', Term('major_axis>20000103')) store.select('wp') # deleting a store diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index c56af23e85eae..4f43cd5e0120c 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -187,6 +187,96 @@ Indexing API Changes p p.loc[:,:,'C'] +HDFStore API Changes +~~~~~~~~~~~~~~~~~~~~ + + - Query Format Changes. A much more string-like query format is now supported. + + .. ipython:: python + + path = 'test_query.h5' + dfq = DataFrame(randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) + dfq.to_hdf(path,'dfq',format='table',data_columns=True) + + Use boolean expressions, with in-line function evaluation. + + .. ipython:: python + + read_hdf(path,'dfq',where="index>Timestamp('20130104') & columns=['A', 'B']") + + Use an inline column reference + + .. ipython:: python + + read_hdf(path,'dfq',where="A>0 or C>0") + + See :ref:`the docs`. 
+ + - Significant table writing performance improvements + - handle a passed ``Series`` in table format (:issue:`4330`) + - added an ``is_open`` property to indicate if the underlying file handle is_open; + a closed store will now report 'CLOSED' when viewing the store (rather than raising an error) + (:issue:`4409`) + - a close of a ``HDFStore`` now will close that instance of the ``HDFStore`` + but will only close the actual file if the ref count (by ``PyTables``) w.r.t. all of the open handles + are 0. Essentially you have a local instance of ``HDFStore`` referenced by a variable. Once you + close it, it will report closed. Other references (to the same file) will continue to operate + until they themselves are closed. Performing an action on a closed file will raise + ``ClosedFileError`` + + .. ipython:: python + + path = 'test.h5' + df = DataFrame(randn(10,2)) + store1 = HDFStore(path) + store2 = HDFStore(path) + store1.append('df',df) + store2.append('df2',df) + + store1 + store2 + store1.close() + store2 + store2.close() + store2 + + .. ipython:: python + :suppress: + + import os + os.remove(path) + + - removed the ``_quiet`` attribute, replaced by a ``DuplicateWarning`` if retrieving + duplicate rows from a table (:issue:`4367`) + - removed the ``warn`` argument from ``open``. Instead a ``PossibleDataLossError`` exception will + be raised if you try to use ``mode='w'`` with an OPEN file handle (:issue:`4367`) + - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`). + See :ref:`here` for an example. + + - the ``format`` keyword now replaces the ``table`` keyword; allowed values are ``fixed(f)`` or ``table(t)`` + the same defaults as prior < 0.13.0 remain, e.g. ``put`` implies 'fixed' or 'f' (Fixed) format + and ``append`` implies 'table' or 't' (Table) format + + .. 
ipython:: python + + path = 'test.h5' + df = DataFrame(randn(10,2)) + df.to_hdf(path,'df_table',format='table') + df.to_hdf(path,'df_table2',append=True) + df.to_hdf(path,'df_fixed') + with get_store(path) as store: + print store + + .. ipython:: python + :suppress: + + import os + os.remove('test.h5') + os.remove('test_query.h5') + - add the keyword ``dropna=True`` to ``append`` to change whether ALL nan rows are not written + to the store (default is ``True``, ALL nan rows are NOT written), also settable + via the option ``io.hdf.dropna_table`` (:issue:`4625`) + Enhancements ~~~~~~~~~~~~ @@ -271,6 +361,90 @@ Enhancements is evaluated, respecttively. See scipy docs. - DataFrame constructor now accepts a numpy masked record array (:issue:`3478`) + +.. _whatsnew_0130.enhancingperf: + +Performance Enhancements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- :func:`~pandas.eval`: + + - The new :func:`~pandas.eval` function implements expression evaluation using + ``numexpr`` behind the scenes. This results in large speedups for + complicated expressions involving large DataFrames/Series. For example, + + .. ipython:: python + + nrows, ncols = 20000, 100 + df1, df2, df3, df4 = [DataFrame(randn(nrows, ncols)) + for _ in xrange(4)] + + .. ipython:: python + + %timeit pd.eval('df1 + df2 + df3 + df4') + + For more details, see the :ref:`enhancing performance documentation on eval + ` + +- :meth:`~pandas.DataFrame.eval` + + - Similar to :func:`~pandas.eval`, :class:`~pandas.DataFrame` has a new + :meth:`~pandas.DataFrame.eval` that evaluates an expression in the context + of the ``DataFrame``. For example, + + .. ipython:: python + :suppress: + + try: + del a + except NameError: + pass + + try: + del b + except NameError: + pass + + .. 
ipython:: python + + df = DataFrame(randn(10, 2), columns=['a', 'b']) + df.eval('a + b') + + +- :meth:`~pandas.DataFrame.query` + + - In 0.13 a :meth:`~pandas.DataFrame.query` method has been added that allows + you to select elements of a ``DataFrame`` using a natural query syntax + nearly identical to Python syntax. For example, + + .. ipython:: python + :suppress: + + try: + del a + except NameError: + pass + + try: + del b + except NameError: + pass + + try: + del c + except NameError: + pass + + .. ipython:: python + + n = 20 + df = DataFrame(randint(n, size=(n, 3)), columns=['a', 'b', 'c']) + df['a < b < c'] + + selects all the rows of ``df`` where ``a < b < c`` evaluates to ``True``. + For more details see the :ref:`indexing documentation on query + `. + .. _whatsnew_0130.refactoring: Internal Refactoring diff --git a/pandas/computation/align.py b/pandas/computation/align.py index ec51887ff6df0..60975bdc8a5b4 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -1,3 +1,6 @@ +"""Core eval alignment algorithms +""" + import warnings from functools import partial, wraps from pandas.compat import zip, range @@ -7,7 +10,6 @@ import pandas as pd from pandas import compat import pandas.core.common as com -from pandas.computation.ops import is_const def _align_core_single_unary_op(term): @@ -129,11 +131,12 @@ def _align_core(terms): term_axis_size = len(ti.axes[axis]) reindexer_size = len(reindexer) - if (np.log10(abs(reindexer_size - term_axis_size)) >= 1 and - reindexer_size >= 10000): + ordm = np.log10(abs(reindexer_size - term_axis_size)) + if ordm >= 1 and reindexer_size >= 10000: warnings.warn("Alignment difference on axis {0} is larger" " than an order of magnitude on term {1!r}, " - "performance may suffer".format(axis, term), + "by more than {2:.4g}; performance may suffer" + "".format(axis, term.name, ordm), category=pd.io.common.PerformanceWarning) if transpose: @@ -164,7 +167,7 @@ def _align_core(terms): def _filter_terms(flat): 
# numeric literals - literals = frozenset(filter(is_const, flat)) + literals = frozenset(filter(lambda x: isinstance(x, Constant), flat)) # these are strings which are variable names names = frozenset(flat) - literals @@ -213,7 +216,7 @@ def _reconstruct_object(typ, obj, axes, dtype): Returns ------- - reconst : typ + ret : typ An object of type ``typ`` with the value `obj` and possible axes `axes`. """ @@ -231,7 +234,11 @@ def _reconstruct_object(typ, obj, axes, dtype): issubclass(typ, pd.core.generic.PandasObject)): return typ(obj, dtype=res_t, **axes) - ret_value = typ(obj).astype(res_t) + # special case for pathological things like ~True/~False + if hasattr(res_t, 'type') and typ == np.bool_ and res_t != np.bool_: + ret_value = res_t.type(obj) + else: + ret_value = typ(obj).astype(res_t) try: ret = ret_value.item() diff --git a/pandas/computation/common.py b/pandas/computation/common.py index 3253039050b78..9af2197a4fd69 100644 --- a/pandas/computation/common.py +++ b/pandas/computation/common.py @@ -1,10 +1,11 @@ import numpy as np +import pandas as pd def _ensure_decoded(s): """ if we have bytes, decode them to unicode """ if isinstance(s, (np.bytes_, bytes)): - s = s.decode('UTF-8') + s = s.decode(pd.get_option('display.encoding')) return s diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 794b80615f9ea..88efc9eeab5d5 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -1,3 +1,6 @@ +"""Engine classes for :func:`~pandas.eval` +""" + import abc from pandas import compat @@ -5,8 +8,10 @@ from pandas.computation.align import _align, _reconstruct_object from pandas.computation.ops import UndefinedVariableError + class AbstractEngine(object): - """AbstractEngine object serving as a base class for all engines.""" + """Object serving as a base class for all engines.""" + __metaclass__ = abc.ABCMeta has_neg_frac = False @@ -62,7 +67,7 @@ def _evaluate(self): Notes ----- - This method must be implemented by 
any class the subclasses this class. + Must be implemented by subclasses. """ pass @@ -74,13 +79,16 @@ class NumExprEngine(AbstractEngine): def __init__(self, expr): super(NumExprEngine, self).__init__(expr) + def convert(self): + return str(super(NumExprEngine, self).convert()) + def _evaluate(self): import numexpr as ne # add the resolvers to locals self.expr.add_resolvers_to_locals() - # convert the expression to syntactically valid Python + # convert the expression to a valid numexpr expression s = self.convert() try: diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index ff073889376aa..36b1e2bc96090 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -1,18 +1,38 @@ #!/usr/bin/env python +"""Top level ``eval`` module. +""" + import numbers import numpy as np +from pandas.core import common as com from pandas.compat import string_types from pandas.computation.expr import Expr, _parsers, _ensure_scope from pandas.computation.engines import _engines def _check_engine(engine): - """make sure a valid engine is passed""" + """Make sure a valid engine is passed. + + Parameters + ---------- + engine : str + + Raises + ------ + KeyError + * If an invalid engine is passed + ImportError + * If numexpr was requested but doesn't exist + """ if engine not in _engines: raise KeyError('Invalid engine {0!r} passed, valid engines are' - ' {1}'.format(engine, _engines.keys())) + ' {1}'.format(engine, list(_engines.keys()))) + + # TODO: validate this in a more general way (thinking of future engines + # that won't necessarily be import-able) + # Could potentially be done on engine instantiation if engine == 'numexpr': try: import numexpr @@ -22,12 +42,76 @@ def _check_engine(engine): def _check_parser(parser): - """make sure a valid parser is passed""" + """Make sure a valid parser is passed. 
+ + Parameters + ---------- + parser : str + + Raises + ------ + KeyError + * If an invalid parser is passed + """ if parser not in _parsers: raise KeyError('Invalid parser {0!r} passed, valid parsers are' ' {1}'.format(parser, _parsers.keys())) +def _check_resolvers(resolvers): + if resolvers is not None: + for resolver in resolvers: + if not hasattr(resolver, '__getitem__'): + name = type(resolver).__name__ + raise AttributeError('Resolver of type {0!r} must implement ' + 'the __getitem__ method'.format(name)) + + +def _check_expression(expr): + """Make sure an expression is not an empty string + + Parameters + ---------- + expr : object + An object that can be converted to a string + + Raises + ------ + ValueError + * If expr is an empty string + """ + if not expr: + raise ValueError("expr cannot be an empty string") + + +def _convert_expression(expr): + """Convert an object to an expression. + + This function converts an object to an expression (a unicode string) and + checks to make sure it isn't empty after conversion. This is used to + convert operators to their string representation for recursive calls to + :func:`~pandas.eval`. + + Parameters + ---------- + expr : object + The object to be converted to a string. + + Returns + ------- + s : unicode + The string representation of an object. + + Raises + ------ + ValueError + * If the expression is empty. + """ + s = com.pprint_thing(expr) + _check_expression(s) + return s + + def eval(expr, parser='pandas', engine='numexpr', truediv=True, local_dict=None, global_dict=None, resolvers=None, level=2): """Evaluate a Python expression as a string using various backends. @@ -43,13 +127,19 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, Parameters ---------- - expr : string - The expression to evaluate. + expr : str or unicode + The expression to evaluate. This string cannot contain any Python + `statements + `__, + only Python `expressions + `__. 
parser : string, default 'pandas', {'pandas', 'python'} The parser to use to construct the syntax tree from the expression. The - default of 'pandas' parses code slightly different than standard - Python. See the :ref:`enhancing performance ` - documentation for more details. + default of ``'pandas'`` parses code slightly different than standard + Python. Alternatively, you can parse an expression using the + ``'python'`` parser to retain strict Python semantics. See the + :ref:`enhancing performance ` documentation for + more details. engine : string, default 'numexpr', {'python', 'numexpr'} The engine used to evaluate the expression. Supported engines are @@ -60,27 +150,29 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, - ``'python'``: Performs operations as if you had ``eval``'d in top level python. This engine is generally not that useful. - truediv : bool, default True + More backends may be available in the future. + + truediv : bool, optional Whether to use true division, like in Python >= 3 - local_dict : dict or None, default None + local_dict : dict or None, optional A dictionary of local variables, taken from locals() by default. - global_dict : dict or None, default None + global_dict : dict or None, optional A dictionary of global variables, taken from globals() by default. - resolvers : dict of dict-like or None, default None - A dictionary of dict-like object (specifically they must implement the - ``get`` method) that you can use to inject an additional collection of - namespaces to use for variable lookup. This is used in the + resolvers : list of dict-like or None, optional + A list of objects implementing the ``__getitem__`` special method that + you can use to inject an additional collection of namespaces to use for + variable lookup. 
For example, this is used in the :meth:`~pandas.DataFrame.query` method to inject the :attr:`~pandas.DataFrame.index` and :attr:`~pandas.DataFrame.columns` variables that refer to their respective :class:`~pandas.DataFrame` instance attributes. - level : int, default 2 + level : int, optional The number of prior stack frames to traverse and add to the current - scope. + scope. Most users will **not** need to change this parameter. Returns ------- - ret : ndarray, numeric scalar, :class:`~pandas.DataFrame`, :class:`~pandas.Series` + ndarray, numeric scalar, DataFrame, Series Notes ----- @@ -93,30 +185,22 @@ def eval(expr, parser='pandas', engine='numexpr', truediv=True, See Also -------- pandas.DataFrame.query + pandas.DataFrame.eval """ - # make sure we're passed a valid engine and parser + expr = _convert_expression(expr) _check_engine(engine) _check_parser(parser) + _check_resolvers(resolvers) + # get our (possibly passed-in) scope env = _ensure_scope(global_dict=global_dict, local_dict=local_dict, resolvers=resolvers, level=level) - if isinstance(expr, string_types): - parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, - truediv=truediv) - else: - raise TypeError("eval only accepts strings, you passed an object of " - "type {0!r}".format(expr.__class__.__name__)) + parsed_expr = Expr(expr, engine=engine, parser=parser, env=env, + truediv=truediv) - # construct the engine and evaluate + # construct the engine and evaluate the parsed expression eng = _engines[engine] eng_inst = eng(parsed_expr) ret = eng_inst.evaluate() - - # sanity check for a number if it's a scalar result - # TODO: eventually take out - if np.isscalar(ret): - if not isinstance(ret, (np.number, np.bool_, numbers.Number)): - raise TypeError('scalar result must be numeric or bool, return' - ' type is {0!r}'.format(ret.__class__.__name__)) return ret diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 1fbc0b72289b1..d8969e1297cd4 100644 --- 
a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -1,9 +1,13 @@ +""":func:`~pandas.eval` parsers +""" + import ast import operator import sys import inspect import tokenize import datetime +import struct from functools import partial @@ -16,16 +20,20 @@ from pandas.computation.ops import (_cmp_ops_syms, _bool_ops_syms, _arith_ops_syms, _unary_ops_syms, is_term) from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG -from pandas.computation.ops import BinOp, UnaryOp, Term, Constant, Div +from pandas.computation.ops import Op, BinOp, UnaryOp, Term, Constant, Div def _ensure_scope(level=2, global_dict=None, local_dict=None, resolvers=None, **kwargs): - """ ensure that we are grabbing the correct scope """ - return Scope(global_dict, local_dict, level=level, resolvers=resolvers) + """Ensure that we are grabbing the correct scope.""" + return Scope(gbls=global_dict, lcls=local_dict, level=level, + resolvers=resolvers) def _check_disjoint_resolver_names(resolver_keys, local_keys, global_keys): + """Make sure that variables in resolvers don't overlap with locals or + globals. + """ res_locals = list(com.intersection(resolver_keys, local_keys)) if res_locals: msg = "resolvers and locals overlap on names {0}".format(res_locals) @@ -37,6 +45,29 @@ def _check_disjoint_resolver_names(resolver_keys, local_keys, global_keys): raise NameResolutionError(msg) +def _replacer(x, pad_size): + """Replace a number with its padded hexadecimal representation. Used to tag + temporary variables with their calling scope's id. 
+ """ + # get the hex repr of the binary char and remove 0x and pad by pad_size + # zeros + try: + hexin = ord(x) + except TypeError: + # bytes literals masquerade as ints when iterating in py3 + hexin = x + + return hex(hexin).replace('0x', '').rjust(pad_size, '0') + + +def _raw_hex_id(obj, pad_size=2): + """Return the padded hexadecimal id of ``obj``.""" + # interpret as a pointer since that's what really what id returns + packed = struct.pack('@P', id(obj)) + + return ''.join(_replacer(x, pad_size) for x in packed) + + class Scope(StringMixin): """Object to hold scope, with a few bells to deal with some custom syntax added by pandas. @@ -57,14 +88,14 @@ class Scope(StringMixin): resolver_keys : frozenset """ __slots__ = ('globals', 'locals', 'resolvers', '_global_resolvers', - 'resolver_keys', '_resolver', 'level') + 'resolver_keys', '_resolver', 'level', 'ntemps') def __init__(self, gbls=None, lcls=None, level=1, resolvers=None): self.level = level self.resolvers = tuple(resolvers or []) self.globals = dict() self.locals = dict() - self.ntemps = 0 # number of temporary variables in this scope + self.ntemps = 1 # number of temporary variables in this scope if isinstance(lcls, Scope): ld, lcls = lcls, dict() @@ -88,21 +119,20 @@ def __init__(self, gbls=None, lcls=None, level=1, resolvers=None): self.globals['True'] = True self.globals['False'] = False - - self.resolver_keys = frozenset(reduce(operator.add, (list(o.keys()) for - o in - self.resolvers), - [])) + res_keys = (list(o.keys()) for o in self.resolvers) + self.resolver_keys = frozenset(reduce(operator.add, res_keys, [])) self._global_resolvers = self.resolvers + (self.locals, self.globals) self._resolver = None - self.resolver_dict = dict((k, self.resolve(k)) - for k in self.resolver_keys) + + self.resolver_dict = {} + for o in self.resolvers: + self.resolver_dict.update(dict(o)) def __unicode__(self): return com.pprint_thing("locals: {0}\nglobals: {0}\nresolvers: " - "{0}".format(self.locals.keys(), - 
self.globals.keys(), - self.resolver_keys)) + "{0}".format(list(self.locals.keys()), + list(self.globals.keys()), + list(self.resolver_keys))) def __getitem__(self, key): return self.resolve(key, globally=False) @@ -171,18 +201,32 @@ def add_tmp(self, value, where='locals'): if not isinstance(d, dict): raise TypeError("Cannot add value to object of type {0!r}, " "scope must be a dictionary" - "".format(d.__class__.__name__)) - name = 'tmp_var_{0}_{1}_{2}'.format(value.__class__.__name__, - self.ntemps, - pd.util.testing.rands(10)) + "".format(type(d).__name__)) + name = 'tmp_var_{0}_{1}_{2}'.format(type(value).__name__, self.ntemps, + _raw_hex_id(self)) d[name] = value # only increment if the variable gets put in the scope self.ntemps += 1 return name + def remove_tmp(self, name, where='locals'): + d = getattr(self, where, None) + if d is None: + raise AttributeError("Cannot remove value from non-existent scope " + "{0!r}".format(where)) + if not isinstance(d, dict): + raise TypeError("Cannot remove value from object of type {0!r}, " + "scope must be a dictionary" + "".format(type(d).__name__)) + del d[name] + self.ntemps -= 1 + def _rewrite_assign(source): + """Rewrite the assignment operator for PyTables expression that want to use + ``=`` as a substitute for ``==``. + """ res = [] g = tokenize.generate_tokens(StringIO(source).readline) for toknum, tokval, _, _, _ in g: @@ -191,17 +235,30 @@ def _rewrite_assign(source): def _replace_booleans(source): + """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise + precedence is changed to boolean precedence. 
+ """ return source.replace('|', ' or ').replace('&', ' and ') def _replace_locals(source, local_symbol='@'): + """Replace local variables with a syntacticall valid name.""" return source.replace(local_symbol, _LOCAL_TAG) def _preparse(source): + """Compose assignment and boolean replacement.""" return _replace_booleans(_rewrite_assign(source)) +def _is_type(t): + """Factory for a type checking function of type ``t`` or tuple of types.""" + return lambda x: isinstance(x.value, t) + + +_is_list = _is_type(list) +_is_str = _is_type(string_types) + # partition all AST nodes _all_nodes = frozenset(filter(lambda x: isinstance(x, type) and @@ -210,6 +267,7 @@ def _preparse(source): def _filter_nodes(superclass, all_nodes=_all_nodes): + """Filter out AST nodes that are subclasses of ``superclass``.""" node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass)) return frozenset(node_names) @@ -238,8 +296,7 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): _unsupported_expr_nodes = frozenset(['Yield', 'GeneratorExp', 'IfExp', 'DictComp', 'SetComp', 'Repr', 'Lambda', - 'Set', 'In', 'NotIn', 'AST', 'Is', - 'IsNot']) + 'Set', 'AST', 'Is', 'IsNot']) # these nodes are low priority or won't ever be supported (e.g., AST) _unsupported_nodes = ((_stmt_nodes | _mod_nodes | _handler_nodes | @@ -257,6 +314,9 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): def _node_not_implemented(node_name, cls): + """Return a function that raises a NotImplementedError with a passed node + name. + """ def f(self, *args, **kwargs): raise NotImplementedError("{0!r} nodes are not " "implemented".format(node_name)) @@ -264,10 +324,17 @@ def f(self, *args, **kwargs): def disallow(nodes): + """Decorator to disallow certain nodes from parsing. Raises a + NotImplementedError instead. 
+ + Returns + ------- + disallowed : callable + """ def disallowed(cls): cls.unsupported_nodes = () for node in nodes: - new_method = _node_not_implemented(node, cls) + new_method = _node_not_implemented(node, cls) name = 'visit_{0}'.format(node) cls.unsupported_nodes += (name,) setattr(cls, name, new_method) @@ -276,14 +343,29 @@ def disallowed(cls): def _op_maker(op_class, op_symbol): + """Return a function to create an op class with its symbol already passed. + + Returns + ------- + f : callable + """ def f(self, node, *args, **kwargs): + """Return a partial function with an Op subclass with an operator + already passed. + + Returns + ------- + f : callable + """ return partial(op_class, op_symbol, *args, **kwargs) return f _op_classes = {'binary': BinOp, 'unary': UnaryOp} + def add_ops(op_classes): + """Decorator to add default implementation of ops.""" def f(cls): for op_attr_name, op_class in compat.iteritems(op_classes): ops = getattr(cls, '{0}_ops'.format(op_attr_name)) @@ -291,8 +373,8 @@ def f(cls): for op in ops: op_node = ops_map[op] if op_node is not None: - setattr(cls, 'visit_{0}'.format(op_node), - _op_maker(op_class, op)) + made_op = _op_maker(op_class, op) + setattr(cls, 'visit_{0}'.format(op_node), made_op) return cls return f @@ -300,21 +382,36 @@ def f(cls): @disallow(_unsupported_nodes) @add_ops(_op_classes) class BaseExprVisitor(ast.NodeVisitor): + """Custom ast walker. Parsers of other engines should subclass this class + if necessary. 
+ + Parameters + ---------- + env : Scope + engine : str + parser : str + preparser : callable + """ const_type = Constant term_type = Term - """Custom ast walker - """ binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms - binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', - 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult', None, - 'Pow', 'FloorDiv', 'Mod') + binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'In', 'NotIn', + 'BitAnd', 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult', + None, 'Pow', 'FloorDiv', 'Mod') binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) unary_ops = _unary_ops_syms unary_op_nodes = 'UAdd', 'USub', 'Invert', 'Not' unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + rewrite_map = { + ast.Eq: ast.In, + ast.NotEq: ast.NotIn, + ast.In: ast.In, + ast.NotIn: ast.NotIn + } + def __init__(self, env, engine, parser, preparser=_preparse): self.env = env self.engine = engine @@ -342,11 +439,74 @@ def visit_Module(self, node, **kwargs): def visit_Expr(self, node, **kwargs): return self.visit(node.value, **kwargs) + def _rewrite_membership_op(self, node, left, right): + # the kind of the operator (is actually an instance) + op_instance = node.op + op_type = type(op_instance) + + # must be two terms and the comparison operator must be ==/!=/in/not in + if is_term(left) and is_term(right) and op_type in self.rewrite_map: + + left_list, right_list = map(_is_list, (left, right)) + left_str, right_str = map(_is_str, (left, right)) + + # if there are any strings or lists in the expression + if left_list or right_list or left_str or right_str: + op_instance = self.rewrite_map[op_type]() + + # pop the string variable out of locals and replace it with a list + # of one string, kind of a hack + if right_str: + self.env.remove_tmp(right.name) + name = self.env.add_tmp([right.value]) + right = self.term_type(name, self.env) + + # swap the operands so things like a == [1, 2] are translated to + # [1, 2] in a -> 
a.isin([1, 2]) + if right_list or right_str: + left, right = right, left + + op = self.visit(op_instance) + return op, op_instance, left, right + + def _possibly_transform_eq_ne(self, node, left=None, right=None): + if left is None: + left = self.visit(node.left, side='left') + if right is None: + right = self.visit(node.right, side='right') + op, op_class, left, right = self._rewrite_membership_op(node, left, + right) + return op, op_class, left, right + + def _possibly_eval(self, binop, eval_in_python): + # eval `in` and `not in` (for now) in "partial" python space + # things that can be evaluated in "eval" space will be turned into + # temporary variables. for example, + # [1,2] in a + 2 * b + # in that case a + 2 * b will be evaluated using numexpr, and the "in" + # call will be evaluated using isin (in python space) + return binop.evaluate(self.env, self.engine, self.parser, + self.term_type, eval_in_python) + + def _possibly_evaluate_binop(self, op, op_class, lhs, rhs, + eval_in_python=('in', 'not in'), + maybe_eval_in_python=('==', '!=')): + res = op(lhs, rhs) + + # "in"/"not in" ops are always evaluated in python + if res.op in eval_in_python: + return self._possibly_eval(res, eval_in_python) + elif (lhs.return_type == object or rhs.return_type == object and + self.engine != 'pytables'): + # evaluate "==" and "!=" in python if either of our operands has an + # object return type + return self._possibly_eval(res, eval_in_python + + maybe_eval_in_python) + return res + def visit_BinOp(self, node, **kwargs): - op = self.visit(node.op) - left = self.visit(node.left, side='left') - right = self.visit(node.right, side='right') - return op(left, right) + op, op_class, left, right = self._possibly_transform_eq_ne(node) + return self._possibly_evaluate_binop(op, op_class, left, right) def visit_Div(self, node, **kwargs): return lambda lhs, rhs: Div(lhs, rhs, @@ -380,16 +540,15 @@ def visit_Index(self, node, **kwargs): def visit_Subscript(self, node, **kwargs): value 
= self.visit(node.value) slobj = self.visit(node.slice) - expr = com.pprint_thing(slobj) - result = pd.eval(expr, local_dict=self.env, engine=self.engine, + result = pd.eval(slobj, local_dict=self.env, engine=self.engine, parser=self.parser) try: # a Term instance v = value.value[result] except AttributeError: # an Op instance - lhs = pd.eval(com.pprint_thing(value), local_dict=self.env, - engine=self.engine, parser=self.parser) + lhs = pd.eval(value, local_dict=self.env, engine=self.engine, + parser=self.parser) v = lhs[result] name = self.env.add_tmp(v) return self.term_type(name, env=self.env) @@ -454,61 +613,62 @@ def visit_Call(self, node, **kwargs): keywords = {} for key in node.keywords: if not isinstance(key, ast.keyword): - raise ValueError( - "keyword error in function call '{0}'".format(node.func.id)) + raise ValueError("keyword error in function call " + "'{0}'".format(node.func.id)) keywords[key.arg] = self.visit(key.value).value if node.kwargs is not None: keywords.update(self.visit(node.kwargs).value) return self.const_type(res(*args, **keywords), self.env) + def translate_In(self, op): + return op + def visit_Compare(self, node, **kwargs): ops = node.ops comps = node.comparators - def translate(op): - if isinstance(op, ast.In): - return ast.Eq() - return op - + # base case: we have something like a CMP b if len(comps) == 1: - return self.visit(translate(ops[0]))(self.visit(node.left, side='left'), - self.visit(comps[0], side='right')) + op = self.translate_In(ops[0]) + binop = ast.BinOp(op=op, left=node.left, right=comps[0]) + return self.visit(binop) + + # recursive case: we have a chained comparison, a CMP b CMP c, etc. 
left = node.left values = [] for op, comp in zip(ops, comps): new_node = self.visit(ast.Compare(comparators=[comp], left=left, - ops=[translate(op)])) + ops=[self.translate_In(op)])) left = comp values.append(new_node) return self.visit(ast.BoolOp(op=ast.And(), values=values)) + def _try_visit_binop(self, bop): + if isinstance(bop, (Op, Term)): + return bop + return self.visit(bop) + def visit_BoolOp(self, node, **kwargs): - op = self.visit(node.op) def visitor(x, y): - try: - lhs = self.visit(x) - except TypeError: - lhs = x - - try: - rhs = self.visit(y) - except TypeError: - rhs = y + lhs = self._try_visit_binop(x) + rhs = self._try_visit_binop(y) - return op(lhs, rhs) + op, op_class, lhs, rhs = self._possibly_transform_eq_ne(node, lhs, + rhs) + return self._possibly_evaluate_binop(op, node.op, lhs, rhs) operands = node.values return reduce(visitor, operands) _python_not_supported = frozenset(['Assign', 'Tuple', 'Dict', 'Call', - 'BoolOp']) + 'BoolOp', 'In', 'NotIn']) _numexpr_supported_calls = frozenset(_reductions + _mathops) @disallow((_unsupported_nodes | _python_not_supported) - - (_boolop_nodes | frozenset(['BoolOp', 'Attribute']))) + (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn']))) class PandasExprVisitor(BaseExprVisitor): def __init__(self, env, engine, parser, preparser=lambda x: _replace_locals(_replace_booleans(x))): @@ -523,7 +683,7 @@ def __init__(self, env, engine, parser, preparser=lambda x: x): class Expr(StringMixin): - """Expr object holding scope + """Object encapsulating an expression. Parameters ---------- @@ -578,25 +738,58 @@ def check_name_clashes(self): _check_disjoint_resolver_names(res_keys, lcl_keys, gbl_keys) def add_resolvers_to_locals(self): + """Add the extra scope (resolvers) to local scope + + Notes + ----- + This should be done after parsing and pre-evaluation, otherwise + unnecessary name clashes will occur. 
+ """ self.env.locals.update(self.env.resolver_dict) -_needs_filter = frozenset(['and', 'or', 'not']) +# these we don't look for since column names can have these characters +_needs_filter = frozenset(['and', 'or', 'not', 'not in', 'in']) + +# these OTOH can only be operators, so you cannot create column names that are +# valid expressions +_ops_to_filter = frozenset([' and ', ' or ', 'not ', ' in ']) + +# if you don't filter out the above expressions you'll get a stack overflow, +# because DataFrame.__getitem__ will continue to search for a column name then +# an expression then a column name then an expression, and so on, until you +# blow up the stack and kill a kitten. def maybe_expression(s, kind='pandas'): - """ loose checking if s is an expression """ + """Loose checking if ``s`` is an expression. + + Parameters + ---------- + s : str or unicode + The expression to check + kind : str or unicode + The parser whose ops to check + + Returns + ------- + bool + ``True`` the expression contains some operators that would be valid + when parsed with the ``kind`` parser, otherwise ``False``. 
+ """ if not isinstance(s, string_types): return False + visitor = _parsers[kind] ops = visitor.binary_ops + visitor.unary_ops - filtered = frozenset(ops) - _needs_filter + filtered = (frozenset(ops) | _ops_to_filter) - _needs_filter + # make sure we have an op at least - return any(op in s or ' and ' in s or ' or ' in s or 'not ' in s for op in - filtered) + return any(op in s for op in filtered) def isexpr(s, check_names=True): + """Strict checking for a valid expression.""" try: Expr(s, env=_ensure_scope() if check_names else None) except SyntaxError: @@ -606,8 +799,4 @@ def isexpr(s, check_names=True): return True -def _check_syntax(s): - ast.parse(s) - - _parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor} diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 0ae2d2f28c44d..14f67a3ab6723 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -1,3 +1,6 @@ +"""Operator classes for eval. +""" + import re import operator as op from functools import partial @@ -23,6 +26,7 @@ class UndefinedVariableError(NameError): + """NameError subclass for local variables.""" def __init__(self, *args): msg = 'name {0!r} is not defined' subbed = _TAG_RE.sub('', args[0]) @@ -32,18 +36,6 @@ def __init__(self, *args): super(UndefinedVariableError, self).__init__(msg.format(subbed)) -class OperatorError(Exception): - pass - - -class UnaryOperatorError(OperatorError): - pass - - -class BinaryOperatorError(OperatorError): - pass - - def _possibly_update_key(d, value, old_key, new_key=None): if new_key is None: new_key = old_key @@ -58,6 +50,13 @@ def _possibly_update_key(d, value, old_key, new_key=None): class Term(StringMixin): + def __new__(cls, name, env, side=None, encoding=None): + klass = Constant if not isinstance(name, string_types) else cls + supr_new = super(Term, klass).__new__ + if PY3: + return supr_new(klass) + return supr_new(klass, name, env, side=side, encoding=encoding) + def __init__(self, name, env, side=None, 
encoding=None): self._name = name self.env = env @@ -76,8 +75,10 @@ def __unicode__(self): def __call__(self, *args, **kwargs): return self.value + def evaluate(self, *args, **kwargs): + return self + def _resolve_name(self): - #import ipdb; ipdb.set_trace() env = self.env key = self.name res = env.resolve(self.local_name, globally=not self.local) @@ -194,8 +195,9 @@ def name(self, new_name): class Constant(Term): - def __init__(self, value, env): - super(Constant, self).__init__(value, env) + def __init__(self, value, env, side=None, encoding=None): + super(Constant, self).__init__(value, env, side=side, + encoding=encoding) def _resolve_name(self): return self._name @@ -205,19 +207,14 @@ def name(self): return self.value -def _print_operand(opr): - return opr.name if is_term(opr) else com.pprint_thing(opr) - - -def _get_op(op): - return {'not': '~', 'and': '&', 'or': '|'}.get(op, op) +_bool_op_map = {'not': '~', 'and': '&', 'or': '|'} class Op(StringMixin): """Hold an operator of unknown arity """ def __init__(self, op, operands, *args, **kwargs): - self.op = _get_op(op) + self.op = _bool_op_map.get(op, op) self.operands = operands self.encoding = kwargs.get('encoding', None) @@ -228,7 +225,7 @@ def __unicode__(self): """Print a generic n-ary operator and its operands using infix notation""" # recurse over the operands - parened = ('({0})'.format(_print_operand(opr)) + parened = ('({0})'.format(com.pprint_thing(opr)) for opr in self.operands) return com.pprint_thing(' {0} '.format(self.op).join(parened)) @@ -239,16 +236,33 @@ def return_type(self): return np.bool_ return np.result_type(*(term.type for term in com.flatten(self))) - @property - def raw(self): - parened = ('{0}({1!r}, {2})'.format(self.__class__.__name__, self.op, - ', '.join('{0}'.format(opr.raw) for - opr in self.operands))) - return parened + +def _in(x, y): + """Compute the vectorized membership of ``x in y`` if possible, otherwise + use Python. 
+ """ + try: + return y.isin(x) + except AttributeError: + return x in y + except TypeError: + return y.isin([x]) -_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=' -_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne +def _not_in(x, y): + """Compute the vectorized membership of ``x not in y`` if possible, + otherwise use Python. + """ + try: + return ~y.isin(x) + except AttributeError: + return x not in y + except TypeError: + return ~y.isin([x]) + + +_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', 'in', 'not in' +_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, _in, _not_in _cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) _bool_ops_syms = '&', '|', 'and', 'or' @@ -272,6 +286,15 @@ def raw(self): def _cast_inplace(terms, dtype): + """Cast an expression inplace. + + Parameters + ---------- + terms : Op + The expression that should cast. + dtype : str or numpy.dtype + The dtype to cast to. + """ dt = np.dtype(dtype) for term in terms: try: @@ -285,18 +308,14 @@ def is_term(obj): return isinstance(obj, Term) -def is_const(obj): - return isinstance(obj, Constant) - - class BinOp(Op): """Hold a binary operator and its operands Parameters ---------- - op : str or Op - left : str or Op - right : str or Op + op : str + left : Term or Op + right : Term or Op """ def __init__(self, op, lhs, rhs, **kwargs): super(BinOp, self).__init__(op, (lhs, rhs)) @@ -309,39 +328,71 @@ def __init__(self, op, lhs, rhs, **kwargs): self.func = _binary_ops_dict[op] except KeyError: keys = _binary_ops_dict.keys() - raise BinaryOperatorError('Invalid binary operator {0!r}, valid' + raise ValueError('Invalid binary operator {0!r}, valid' ' operators are {1}'.format(op, keys)) def __call__(self, env): + """Recursively evaluate an expression in Python space. + + Parameters + ---------- + env : Scope + + Returns + ------- + object + The result of an evaluated expression. 
+ """ # handle truediv if self.op == '/' and env.locals['truediv']: self.func = op.truediv - # recurse over the left nodes - try: - left = self.lhs(env) - except TypeError: - left = self.lhs + # recurse over the left/right nodes + left = self.lhs(env) + right = self.rhs(env) - # recurse over the right nodes - try: - right = self.rhs(env) - except TypeError: - right = self.rhs - - # base cases - if is_term(left) and is_term(right): - res = self.func(left.value, right.value) - elif not is_term(left) and is_term(right): - res = self.func(left, right.value) - elif is_term(left) and not is_term(right): - res = self.func(left.value, right) - elif not (is_term(left) or is_term(right)): - res = self.func(left, right) + return self.func(left, right) - return res + def evaluate(self, env, engine, parser, term_type, eval_in_python): + """Evaluate a binary operation *before* being passed to the engine. + + Parameters + ---------- + env : Scope + engine : str + parser : str + term_type : type + eval_in_python : list + + Returns + ------- + term_type + The "pre-evaluated" expression as an instance of ``term_type`` + """ + if engine == 'python': + res = self(env) + else: + # recurse over the left/right nodes + left = self.lhs.evaluate(env, engine=engine, parser=parser, + term_type=term_type, + eval_in_python=eval_in_python) + right = self.rhs.evaluate(env, engine=engine, parser=parser, + term_type=term_type, + eval_in_python=eval_in_python) + + # base cases + if self.op in eval_in_python: + res = self.func(left.value, right.value) + else: + res = pd.eval(self, local_dict=env, engine=engine, + parser=parser) + + name = env.add_tmp(res) + return term_type(name, env=env) def convert_values(self): + """Convert datetimes to a comparable value in an expression. + """ def stringify(value): if self.encoding is not None: encoder = partial(com.pprint_thing_encoded, @@ -376,8 +427,19 @@ def stringify(value): class Div(BinOp): + """Div operator to special case casting. 
+ + Parameters + ---------- + lhs, rhs : Term or Op + The Terms or Ops in the ``/`` expression. + truediv : bool + Whether or not to use true division. With Python 3 this happens + regardless of the value of ``truediv``. + """ def __init__(self, lhs, rhs, truediv=True, *args, **kwargs): super(Div, self).__init__('/', lhs, rhs, *args, **kwargs) + if truediv or PY3: _cast_inplace(com.flatten(self), np.float_) @@ -389,6 +451,18 @@ def __init__(self, lhs, rhs, truediv=True, *args, **kwargs): class UnaryOp(Op): """Hold a unary operator and its operands + + Parameters + ---------- + op : str + The token used to represent the operator. + operand : Term or Op + The Term or Op operand to the operator. + + Raises + ------ + ValueError + * If no function associated with the passed operator token is found. """ def __init__(self, op, operand): super(UnaryOp, self).__init__(op, (operand,)) @@ -397,27 +471,12 @@ def __init__(self, op, operand): try: self.func = _unary_ops_dict[op] except KeyError: - raise UnaryOperatorError('Invalid unary operator {0}, valid ' - 'operators are ' - '{1}'.format(op, _unary_ops_syms)) + raise ValueError('Invalid unary operator {0!r}, valid operators ' + 'are {1}'.format(op, _unary_ops_syms)) def __call__(self, env): - operand = self.operand - - # recurse if operand is an Op - try: - operand = self.operand(env) - except TypeError: - operand = self.operand - - v = operand.value if is_term(operand) else operand - - try: - res = self.func(v) - except TypeError: - res = self.func(v.values) - - return res + operand = self.operand(env) + return self.func(operand) def __unicode__(self): return com.pprint_thing('{0}({1})'.format(self.op, self.operand)) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 4067c22beb507..53973970e039a 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -7,7 +7,8 @@ from datetime import datetime import pandas as pd -from pandas.compat import u, string_types +from 
pandas.compat import u, string_types, PY3 +from pandas.core.base import StringMixin import pandas.core.common as com from pandas.computation import expr, ops from pandas.computation.ops import is_term @@ -28,9 +29,15 @@ def __init__(self, gbls=None, lcls=None, queryables=None, level=1): class Term(ops.Term): + def __new__(cls, name, env, side=None, encoding=None): + klass = Constant if not isinstance(name, string_types) else cls + supr_new = StringMixin.__new__ + if PY3: + return supr_new(klass) + return supr_new(klass, name, env, side=side, encoding=encoding) - def __init__(self, name, env, side=None): - super(Term, self).__init__(name, env, side=side) + def __init__(self, name, env, side=None, encoding=None): + super(Term, self).__init__(name, env, side=side, encoding=encoding) def _resolve_name(self): # must be a queryables @@ -49,8 +56,9 @@ def value(self): class Constant(Term): - def __init__(self, value, env): - super(Constant, self).__init__(value, env) + def __init__(self, value, env, side=None, encoding=None): + super(Constant, self).__init__(value, env, side=side, + encoding=encoding) def _resolve_name(self): return self._name @@ -405,6 +413,9 @@ def visit_Attribute(self, node, **kwargs): def translate_In(self, op): return ast.Eq() if isinstance(op, ast.In) else op + def _rewrite_membership_op(self, node, left, right): + return self.visit(node.op), node.op, left, right + class Expr(expr.Expr): diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index df60ce427f441..8fb1b35abff37 100755 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -9,7 +9,7 @@ import nose from nose.tools import assert_raises, assert_true, assert_false, assert_equal -from numpy.random import randn, rand +from numpy.random import randn, rand, randint import numpy as np from numpy.testing import assert_array_equal, assert_allclose from numpy.testing.decorators import slow @@ -27,6 +27,7 @@ 
_special_case_arith_ops_syms, _arith_ops_syms, _bool_ops_syms) import pandas.computation.expr as expr +import pandas.util.testing as tm from pandas.util.testing import (assert_frame_equal, randbool, assertRaisesRegexp, assert_produces_warning, assert_series_equal) @@ -135,7 +136,7 @@ def setup_ops(self): self.bin_ops = expr._bool_ops_syms self.special_case_ops = _special_case_arith_ops_syms self.arith_ops = _good_arith_ops - self.unary_ops = '+', '-', '~', 'not ' + self.unary_ops = '-', '~', 'not ' def setUp(self): self.setup_ops() @@ -179,13 +180,6 @@ def test_pow(self): for lhs, rhs in product(self.lhses, self.rhses): self.check_pow(lhs, '**', rhs) - @slow - def test_unary_arith_ops(self): - for unary_op, lhs, arith_op, rhs in product(self.unary_ops, self.lhses, - self.arith_ops, - self.rhses): - self.check_unary_arith_op(lhs, arith_op, rhs, unary_op) - @slow def test_single_invert_op(self): for lhs, op, rhs in product(self.lhses, self.cmp_ops, self.rhses): @@ -434,33 +428,224 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): ev = pd.eval(ex, engine=self.engine, parser=self.parser) assert_array_equal(ev, result) - @skip_incompatible_operand - def check_unary_arith_op(self, lhs, arith1, rhs, unary_op): - # simple - ex = '{0}lhs'.format(unary_op, arith1) - f = _unary_ops_dict[unary_op] - bad_types = np.floating, float, numbers.Real + def ex(self, op, var_name='lhs'): + return '{0}{1}'.format(op, var_name) - if isinstance(lhs, bad_types): - raise nose.SkipTest("Incompatiable type for ~ operator") - if isinstance(rhs, bad_types): - raise nose.SkipTest("Incompatiable type for ~ operator") + def test_frame_invert(self): + expr = self.ex('~') - try: - expected = f(lhs.values) - except AttributeError: - expected = f(lhs) + ## ~ ## + # frame + ## float always raises + lhs = DataFrame(randn(5, 2)) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with 
tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + ## int raises on numexpr + lhs = DataFrame(randint(5, size=(5, 2))) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + ## bool always works + lhs = DataFrame(rand(5, 2) > 0.5) + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + ## object raises + lhs = DataFrame({'b': ['a', 1, 2.0], 'c': rand(3) > 0.5}) + if self.engine == 'numexpr': + with tm.assertRaises(ValueError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + def test_series_invert(self): + #### ~ #### + expr = self.ex('~') + + # series + ## float raises + lhs = Series(randn(5)) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + ## int raises on numexpr + lhs = Series(randint(5, size=5)) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + ## bool + lhs = Series(rand(5) > 0.5) + expect = ~lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # float + # int + # bool + + # object + lhs = Series(['a', 1, 2.0]) + if self.engine == 'numexpr': + with tm.assertRaises(ValueError): + result = pd.eval(expr, engine=self.engine, 
parser=self.parser) + else: + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + + def test_frame_negate(self): + expr = self.ex('-') + + # float + lhs = DataFrame(randn(5, 2)) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # int + lhs = DataFrame(randint(5, size=(5, 2))) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = DataFrame(rand(5, 2) > 0.5) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + def test_series_negate(self): + expr = self.ex('-') + + # float + lhs = Series(randn(5)) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # int + lhs = Series(randint(5, size=5)) + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = Series(rand(5) > 0.5) + if self.engine == 'numexpr': + with tm.assertRaises(NotImplementedError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = -lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) - result = pd.eval(ex, engine=self.engine, parser=self.parser) - assert_array_equal(result, expected) + def test_frame_pos(self): + expr = self.ex('+') - for engine in self.current_engines: - skip_if_no_ne(engine) - assert_array_equal(result, pd.eval(ex, engine=engine, - parser=self.parser)) + # float + lhs = DataFrame(randn(5, 2)) + if self.engine == 'python': + with 
tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) - ex = '{0}(lhs {1} rhs)'.format(unary_op, arith1) - result = pd.eval(ex, engine=self.engine, parser=self.parser) + # int + lhs = DataFrame(randint(5, size=(5, 2))) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = DataFrame(rand(5, 2) > 0.5) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_frame_equal(expect, result) + + def test_series_pos(self): + expr = self.ex('+') + + # float + lhs = Series(randn(5)) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # int + lhs = Series(randint(5, size=5)) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + # bool doesn't work with numexpr but works elsewhere + lhs = Series(rand(5) > 0.5) + if self.engine == 'python': + with tm.assertRaises(TypeError): + result = pd.eval(expr, engine=self.engine, parser=self.parser) + else: + expect = lhs + result = pd.eval(expr, engine=self.engine, parser=self.parser) + assert_series_equal(expect, result) + + def 
test_scalar_unary(self): + with tm.assertRaises(TypeError): + pd.eval('~1.0', engine=self.engine, parser=self.parser) + + self.assertEqual(pd.eval('-1.0', parser=self.parser, engine=self.engine), -1.0) + self.assertEqual(pd.eval('+1.0', parser=self.parser, engine=self.engine), +1.0) + + self.assertEqual(pd.eval('~1', parser=self.parser, engine=self.engine), ~1) + self.assertEqual(pd.eval('-1', parser=self.parser, engine=self.engine), -1) + self.assertEqual(pd.eval('+1', parser=self.parser, engine=self.engine), +1) + + self.assertEqual(pd.eval('~True', parser=self.parser, engine=self.engine), ~True) + self.assertEqual(pd.eval('~False', parser=self.parser, engine=self.engine), ~False) + self.assertEqual(pd.eval('-True', parser=self.parser, engine=self.engine), -True) + self.assertEqual(pd.eval('-False', parser=self.parser, engine=self.engine), -False) + self.assertEqual(pd.eval('+True', parser=self.parser, engine=self.engine), +True) + self.assertEqual(pd.eval('+False', parser=self.parser, engine=self.engine), +False) class TestEvalNumexprPython(TestEvalNumexprPandas): @@ -473,9 +658,11 @@ def setUpClass(cls): cls.parser = 'python' def setup_ops(self): - self.cmp_ops = expr._cmp_ops_syms + self.cmp_ops = list(filter(lambda x: x not in ('in', 'not in'), + expr._cmp_ops_syms)) self.cmp2_ops = self.cmp_ops[::-1] - self.bin_ops = (s for s in expr._bool_ops_syms if s not in ('and', 'or')) + self.bin_ops = [s for s in expr._bool_ops_syms + if s not in ('and', 'or')] self.special_case_ops = _special_case_arith_ops_syms self.arith_ops = _good_arith_ops self.unary_ops = '+', '-', '~' @@ -714,7 +901,7 @@ def test_complex_series_frame_alignment(self): for engine, parser in ENGINES_PARSERS: yield self.check_complex_series_frame_alignment, engine, parser - def check_performance_warning_for_asenine_alignment(self, engine, parser): + def check_performance_warning_for_poor_alignment(self, engine, parser): skip_if_no_ne(engine) df = DataFrame(randn(1000, 10)) s = Series(randn(10000)) 
@@ -735,9 +922,32 @@ def check_performance_warning_for_asenine_alignment(self, engine, parser): with assert_produces_warning(False): pd.eval('df + s', engine=engine, parser=parser) - def test_performance_warning_for_asenine_alignment(self): + df = DataFrame(randn(10, 10)) + s = Series(randn(10000)) + + is_python_engine = engine == 'python' + + if not is_python_engine: + wrn = pd.io.common.PerformanceWarning + else: + wrn = False + + with assert_produces_warning(wrn) as w: + pd.eval('df + s', engine=engine, parser=parser) + + if not is_python_engine: + assert_equal(len(w), 1) + msg = str(w[0].message) + expected = ("Alignment difference on axis {0} is larger" + " than an order of magnitude on term {1!r}, " + "by more than {2:.4g}; performance may suffer" + "".format(1, 's', np.log10(s.size - df.shape[1]))) + assert_equal(msg, expected) + + + def test_performance_warning_for_poor_alignment(self): for engine, parser in ENGINES_PARSERS: - yield self.check_performance_warning_for_asenine_alignment, engine, parser + yield self.check_performance_warning_for_poor_alignment, engine, parser #------------------------------------ @@ -749,6 +959,7 @@ def setUpClass(cls): skip_if_no_ne() cls.engine = 'numexpr' cls.parser = 'pandas' + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms @classmethod def tearDownClass(cls): @@ -760,7 +971,7 @@ def eval(self, *args, **kwargs): return pd.eval(*args, **kwargs) def test_simple_arith_ops(self): - ops = expr._arith_ops_syms + expr._cmp_ops_syms + ops = self.arith_ops for op in filter(lambda x: x != '//', ops): ex = '1 {0} 1'.format(op) @@ -943,6 +1154,9 @@ def setUpClass(cls): raise nose.SkipTest("numexpr engine not installed") cls.engine = 'numexpr' cls.parser = 'python' + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + cls.arith_ops = filter(lambda x: x not in ('in', 'not in'), + cls.arith_ops) def test_fails_and(self): df = DataFrame(np.random.randn(5, 3)) @@ -1011,9 +1225,11 @@ class 
TestOperationsPythonPython(TestOperationsNumExprPython): @classmethod def setUpClass(cls): cls.engine = cls.parser = 'python' + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + cls.arith_ops = filter(lambda x: x not in ('in', 'not in'), + cls.arith_ops) def test_fails_ampersand(self): - raise nose.SkipTest("known failer for now") df = DataFrame(np.random.randn(5, 3)) self.assertRaises(TypeError, pd.eval, '(df + 2)[df > 1] > 0 & (df > 0)', @@ -1021,7 +1237,6 @@ def test_fails_ampersand(self): engine=self.engine) def test_fails_pipe(self): - raise nose.SkipTest("known failer for now") df = DataFrame(np.random.randn(5, 3)) self.assertRaises(TypeError, pd.eval, '(df + 2)[df > 1] > 0 | (df > 0)', @@ -1034,6 +1249,7 @@ class TestOperationsPythonPandas(TestOperationsNumExprPandas): def setUpClass(cls): cls.engine = 'python' cls.parser = 'pandas' + cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms _var_s = randn(10) diff --git a/pandas/core/common.py b/pandas/core/common.py index c1ff6a2200225..d3fa10abc7681 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -9,6 +9,8 @@ import csv import sys +from datetime import timedelta + from distutils.version import LooseVersion from numpy.lib.format import read_array, write_array diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ed3ecd3700f31..c3504477b400b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1903,8 +1903,31 @@ def _getitem_frame(self, key): raise ValueError('Must pass DataFrame with boolean values only') return self.where(key) + def _get_index_resolvers(self, axis): + # index or columns + axis_index = getattr(self, axis) + d = dict() + + for i, name in enumerate(axis_index.names): + if name is not None: + key = level = name + else: + # prefix with 'i' or 'c' depending on the input axis + # e.g., you must do ilevel_0 for the 0th level of an unnamed + # multiiindex + level_string = '{prefix}level_{i}'.format(prefix=axis[0], i=i) + key = level_string + level = 
i + + d[key] = Series(axis_index.get_level_values(level).values, + index=axis_index, name=level) + + # put the index/columns itself in the dict + d[axis] = axis_index + return d + def query(self, expr, **kwargs): - """Query the columns of a frame with an expression. + """Query the columns of a frame with a boolean expression. Parameters ---------- @@ -1950,16 +1973,18 @@ def query(self, expr, **kwargs): For further details and examples see the ``query`` documentation in :ref:`indexing `. - Raises - ------ - NameError - * If not all identifiers in the query can be found - SyntaxError - * If a syntactically invalid Python expression is passed - See Also -------- pandas.eval + DataFrame.eval + + Examples + -------- + >>> from numpy.random import randn + >>> from pandas import DataFrame + >>> df = DataFrame(randn(10, 2), columns=list('ab')) + >>> df.query('a > b') + >>> df[df.a > df.b] # same result as the previous expression """ # need to go up at least 4 stack frames # 4 expr.Scope @@ -1972,17 +1997,56 @@ def query(self, expr, **kwargs): raise ValueError("Going up fewer than 4 stack frames will not" " capture the necessary variable scope for a " "query expression") - return self[self.eval(expr, **kwargs)] + + res = self.eval(expr, **kwargs) + + try: + return self.loc[res] + except ValueError: + # when res is multi-dimensional loc raises, but this is sometimes a + # valid query + return self[res] def eval(self, expr, **kwargs): + """Evaluate an expression in the context of the calling DataFrame + instance. + + Parameters + ---------- + expr : string + The expression string to evaluate. + kwargs : dict + See the documentation for :func:`~pandas.eval` for complete details + on the keyword arguments accepted by + :meth:`~pandas.DataFrame.query`. + + Returns + ------- + ret : ndarray, scalar, or pandas object + + See Also + -------- + pandas.DataFrame.query + pandas.eval + + Notes + ----- + For more details see the API documentation for :func:`~pandas.eval`. 
+ For detailed examples see :ref:`enhancing performance with eval + `. + + Examples + -------- + >>> from numpy.random import randn + >>> from pandas import DataFrame + >>> df = DataFrame(randn(10, 2), columns=list('ab')) + >>> df.eval('a + b') + """ resolvers = kwargs.pop('resolvers', None) if resolvers is None: - index_resolvers = {} - if self.index.name is not None: - index_resolvers[self.index.name] = self.index - index_resolvers.update({'index': self.index, - 'columns': self.columns}) - resolvers = [index_resolvers, self] + index_resolvers = self._get_index_resolvers('index') + index_resolvers.update(self._get_index_resolvers('columns')) + resolvers = [self, index_resolvers] kwargs['local_dict'] = _ensure_scope(resolvers=resolvers, **kwargs) return _eval(expr, **kwargs) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a4491a87b290d..b79408a1bf8d2 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4244,7 +4244,17 @@ def generate(self, where): if where is None: return None - return Expr(where, queryables=self.table.queryables(), encoding=self.table.encoding) + q = self.table.queryables() + try: + return Expr(where, queryables=q, encoding=self.table.encoding) + except (NameError) as detail: + + # raise a nice message, suggesting that the user should use data_columns + raise ValueError("The passed where expression: {0}\n" + " contains an invalid variable reference\n" + " all of the variable refrences must be a reference to\n" + " an axis (e.g. 
'index' or 'columns'), or a data_column\n" + " The currently defined references are: {1}\n".format(where,','.join(q.keys()))) def select(self): """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index ee42a58a38c1c..87def113266b2 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -2036,9 +2036,9 @@ def test_invalid_terms(self): store.put('p4d', p4d, format='table') # some invalid terms - self.assertRaises(NameError, store.select, 'wp', "minor=['A', 'B']") - self.assertRaises(NameError, store.select, 'wp', ["index=['20121114']"]) - self.assertRaises(NameError, store.select, 'wp', ["index=['20121114', '20121114']"]) + self.assertRaises(ValueError, store.select, 'wp', "minor=['A', 'B']") + self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114']"]) + self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114', '20121114']"]) # deprecations with tm.assert_produces_warning(expected_warning=DeprecationWarning): @@ -2054,6 +2054,22 @@ def test_invalid_terms(self): self.assertRaises(SyntaxError, store.select, 'df','index>') self.assertRaises(ValueError, store.select, 'wp', "major_axis<'20000108' & minor_axis['A', 'B']") + # from the docs + with tm.ensure_clean(self.path) as path: + dfq = DataFrame(np.random.randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) + dfq.to_hdf(path,'dfq',format='table',data_columns=True) + + # check ok + read_hdf(path,'dfq',where="index>Timestamp('20130104') & columns=['A', 'B']") + read_hdf(path,'dfq',where="A>0 or C>0") + + # catch the invalid reference + with tm.ensure_clean(self.path) as path: + dfq = DataFrame(np.random.randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) + dfq.to_hdf(path,'dfq',format='table') + + self.assertRaises(ValueError, read_hdf, path,'dfq',where="A>0 or C>0") + def test_terms(self): with ensure_clean(self.path) as store: @@ -2123,6 +2139,25 @@ def test_terms(self): for t in 
terms: store.select('p4d', t) + def test_term_compat(self): + with ensure_clean(self.path) as store: + + wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + store.append('wp',wp) + + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [Term('major_axis>20000102'), + Term('minor_axis', '=', ['A','B']) ]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']] + assert_panel_equal(result, expected) + + store.remove('wp', Term('major_axis>20000103')) + result = store.select('wp') + expected = wp.loc[:,wp.major_axis<=Timestamp('20000103'),:] + assert_panel_equal(result, expected) + def test_same_name_scoping(self): with ensure_clean(self.path) as store: diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 8145fd9c5c67d..ae37953da62c1 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -42,11 +42,12 @@ assert_series_equal, assert_frame_equal, assertRaisesRegexp, + assertRaises, makeCustomDataframe as mkdf, ensure_clean) from pandas.core.indexing import IndexingError from pandas.core.common import PandasError -from pandas.util.compat import OrderedDict +from pandas.compat import OrderedDict from pandas.computation.expr import Expr import pandas.computation as comp @@ -11113,6 +11114,7 @@ def test_isin_with_string_scalar(self): with tm.assertRaises(TypeError): df.isin('aaa') + def skip_if_no_ne(engine='numexpr'): if engine == 'numexpr': try: @@ -11288,6 +11290,187 @@ def test_chained_cmp_and_in(self): assert_frame_equal(res, expec) +class TestDataFrameQueryWithMultiIndex(object): + def check_query_with_named_multiindex(self, parser, engine): + skip_if_no_ne(engine) + a = tm.choice(['red', 'green'], size=10) + b = tm.choice(['eggs', 'ham'], size=10) + index = MultiIndex.from_arrays([a, b], names=['color', 'food']) + df = DataFrame(randn(10, 2), index=index) + ind = 
Series(df.index.get_level_values('color').values, index=index, + name='color') + + # equality + #import ipdb; ipdb.set_trace() + res1 = df.query('color == "red"', parser=parser, engine=engine) + res2 = df.query('"red" == color', parser=parser, engine=engine) + exp = df[ind == 'red'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # inequality + res1 = df.query('color != "red"', parser=parser, engine=engine) + res2 = df.query('"red" != color', parser=parser, engine=engine) + exp = df[ind != 'red'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # list equality (really just set membership) + res1 = df.query('color == ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] == color', parser=parser, engine=engine) + exp = df[ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('color != ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] != color', parser=parser, engine=engine) + exp = df[~ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # in/not in ops + res1 = df.query('["red"] in color', parser=parser, engine=engine) + res2 = df.query('"red" in color', parser=parser, engine=engine) + exp = df[ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('["red"] not in color', parser=parser, engine=engine) + res2 = df.query('"red" not in color', parser=parser, engine=engine) + exp = df[~ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + def test_query_with_named_multiindex(self): + for parser, engine in product(['pandas'], ENGINES): + yield self.check_query_with_named_multiindex, parser, engine + + def check_query_with_unnamed_multiindex(self, parser, engine): + skip_if_no_ne(engine) + a = tm.choice(['red', 'green'], size=10) + b = tm.choice(['eggs', 'ham'], size=10) + index = MultiIndex.from_arrays([a, b]) + df = DataFrame(randn(10, 2), 
index=index) + ind = Series(df.index.get_level_values(0).values, index=index) + + res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine) + res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine) + exp = df[ind == 'red'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # inequality + res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine) + res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine) + exp = df[ind != 'red'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # list equality (really just set membership) + res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine) + exp = df[ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine) + res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine) + exp = df[~ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # in/not in ops + res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine) + res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine) + exp = df[ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('["red"] not in ilevel_0', parser=parser, engine=engine) + res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine) + exp = df[~ind.isin(['red'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + #### LEVEL 1 #### + ind = Series(df.index.get_level_values(1).values, index=index) + res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine) + res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine) + exp = df[ind == 'eggs'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # inequality + res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine) + res2 = 
df.query('"eggs" != ilevel_1', parser=parser, engine=engine) + exp = df[ind != 'eggs'] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # list equality (really just set membership) + res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine) + res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine) + exp = df[ind.isin(['eggs'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine) + res2 = df.query('["eggs"] != ilevel_1', parser=parser, engine=engine) + exp = df[~ind.isin(['eggs'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + # in/not in ops + res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine) + res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine) + exp = df[ind.isin(['eggs'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + res1 = df.query('["eggs"] not in ilevel_1', parser=parser, engine=engine) + res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine) + exp = df[~ind.isin(['eggs'])] + assert_frame_equal(res1, exp) + assert_frame_equal(res2, exp) + + def test_query_with_unnamed_multiindex(self): + for parser, engine in product(['pandas'], ENGINES): + yield self.check_query_with_unnamed_multiindex, parser, engine + + def check_query_with_partially_named_multiindex(self, parser, engine): + skip_if_no_ne(engine) + a = tm.choice(['red', 'green'], size=10) + b = np.arange(10) + index = MultiIndex.from_arrays([a, b]) + index.names = [None, 'rating'] + df = DataFrame(randn(10, 2), index=index) + res = df.query('rating == 1', parser=parser, engine=engine) + ind = Series(df.index.get_level_values('rating').values, index=index, + name='rating') + exp = df[ind == 1] + assert_frame_equal(res, exp) + + res = df.query('rating != 1', parser=parser, engine=engine) + ind = Series(df.index.get_level_values('rating').values, index=index, + name='rating') + exp = 
df[ind != 1] + assert_frame_equal(res, exp) + + res = df.query('ilevel_0 == "red"', parser=parser, engine=engine) + ind = Series(df.index.get_level_values(0).values, index=index) + exp = df[ind == "red"] + assert_frame_equal(res, exp) + + res = df.query('ilevel_0 != "red"', parser=parser, engine=engine) + ind = Series(df.index.get_level_values(0).values, index=index) + exp = df[ind != "red"] + assert_frame_equal(res, exp) + + def test_query_with_partially_named_multiindex(self): + for parser, engine in product(['pandas'], ENGINES): + yield self.check_query_with_partially_named_multiindex, parser, engine + + class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): @classmethod def setUpClass(cls): @@ -11391,13 +11574,20 @@ def test_date_query_getitem(self): assert_frame_equal(res, expec) def test_query_expressions_correct_failure(self): + import random + import string + df = self.frame exprs = 'and', 'or', 'not' exprs += tuple(x + tm.rands(5) for x in exprs) - exprs += tuple(tm.rands(5) + x for x in exprs) + exprs += tuple(random.choice(string.ascii_letters) + tm.rands(5) + x + for x in exprs) + + exprs += 'inb', for e in exprs: - self.assertRaises(KeyError, df.__getitem__, e) + with self.assertRaises(KeyError): + df[e] for e in (' and ', ' or ', ' not '): self.assertRaises(SyntaxError, df.__getitem__, e) @@ -11405,6 +11595,8 @@ def test_query_expressions_correct_failure(self): x = tm.randbool(size=(self.frame.shape[0],)) self.assertRaises(KeyError, df.__getitem__, 'x') + self.assertRaises(NameError, df.__getitem__, 'not inb') + def test_query_expressions_with_index(self): df = DataFrame(np.random.randint(10, size=(10, 3)), index=Index(range(10), name='blob'), @@ -11463,13 +11655,40 @@ def test_local_syntax(self): class TestDataFrameQueryStrings(object): def check_str_query_method(self, parser, engine): - skip_if_no_pandas_parser(parser) + skip_if_no_ne(engine) df = DataFrame(randn(10, 1), columns=['b']) df['strings'] = Series(list('aabbccddee')) 
expect = df[df.strings == 'a'] - res = df.query('strings == "a"', engine=engine, parser=parser) - assert_frame_equal(res, expect) - assert_frame_equal(res, df[df.strings.isin(['a'])]) + + if parser != 'pandas': + col = 'strings' + lst = '"a"' + + lhs = [col] * 2 + [lst] * 2 + rhs = lhs[::-1] + + eq, ne = '==', '!=' + ops = 2 * ([eq] + [ne]) + + for lhs, op, rhs in zip(lhs, ops, rhs): + ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) + assertRaises(NotImplementedError, df.query, ex, engine=engine, + parser=parser, local_dict={'strings': df.strings}) + else: + res = df.query('"a" == strings', engine=engine, parser=parser) + assert_frame_equal(res, expect) + + res = df.query('strings == "a"', engine=engine, parser=parser) + assert_frame_equal(res, expect) + assert_frame_equal(res, df[df.strings.isin(['a'])]) + + expect = df[df.strings != 'a'] + res = df.query('strings != "a"', engine=engine, parser=parser) + assert_frame_equal(res, expect) + + res = df.query('"a" != strings', engine=engine, parser=parser) + assert_frame_equal(res, expect) + assert_frame_equal(res, df[~df.strings.isin(['a'])]) def test_str_query_method(self): for parser, engine in product(PARSERS, ENGINES): @@ -11480,14 +11699,45 @@ def test_str_list_query_method(self): yield self.check_str_list_query_method, parser, engine def check_str_list_query_method(self, parser, engine): - skip_if_no_pandas_parser(parser) + skip_if_no_ne(engine) df = DataFrame(randn(10, 1), columns=['b']) df['strings'] = Series(list('aabbccddee')) expect = df[df.strings.isin(['a', 'b'])] - res = df.query('strings == ["a", "b"]', engine=engine, parser=parser) - assert_frame_equal(res, expect) - def test_str_query(self): + if parser != 'pandas': + col = 'strings' + lst = '["a", "b"]' + + lhs = [col] * 2 + [lst] * 2 + rhs = lhs[::-1] + + eq, ne = '==', '!=' + ops = 2 * ([eq] + [ne]) + + for lhs, op, rhs in zip(lhs, ops, rhs): + ex = '{lhs} {op} {rhs}'.format(lhs=lhs, op=op, rhs=rhs) + assertRaises(NotImplementedError, 
df.query, ex, engine=engine, + parser=parser, local_dict={'strings': df.strings}) + else: + res = df.query('strings == ["a", "b"]', engine=engine, + parser=parser) + assert_frame_equal(res, expect) + + res = df.query('["a", "b"] == strings', engine=engine, + parser=parser) + assert_frame_equal(res, expect) + + expect = df[~df.strings.isin(['a', 'b'])] + + res = df.query('strings != ["a", "b"]', engine=engine, + parser=parser) + assert_frame_equal(res, expect) + + res = df.query('["a", "b"] != strings', engine=engine, + parser=parser) + assert_frame_equal(res, expect) + + def test_str_query_getitem(self): skip_if_no_ne() df = DataFrame(randn(10, 1), columns=['b']) df['strings'] = Series(list('aabbccddee')) @@ -11498,10 +11748,18 @@ def test_str_query(self): res = df['"a" == strings'] assert_frame_equal(res, expect) - def test_str_query_list(self): + expect = df[df.strings != 'a'] + res = df['strings != "a"'] + assert_frame_equal(res, expect) + + res = df['"a" != strings'] + assert_frame_equal(res, expect) + + def test_str_query_list_getitem(self): skip_if_no_ne() df = DataFrame(randn(10, 1), columns=['b']) df['strings'] = Series(list('aabbccddee')) + expect = df[df.strings.isin(['a', 'b'])] res = df['strings == ["a", "b"]'] assert_frame_equal(res, expect) @@ -11509,6 +11767,85 @@ def test_str_query_list(self): res = df['["a", "b"] == strings'] assert_frame_equal(res, expect) + expect = df[~df.strings.isin(['a', 'b'])] + res = df['strings != ["a", "b"]'] + assert_frame_equal(res, expect) + + res = df['["a", "b"] != strings'] + assert_frame_equal(res, expect) + + def check_query_with_string_columns(self, parser, engine): + skip_if_no_ne(engine) + df = DataFrame({'a': list('aaaabbbbcccc'), + 'b': list('aabbccddeeff'), + 'c': np.random.randint(5, size=12), + 'd': np.random.randint(9, size=12)}) + if parser == 'pandas': + res = df.query('a in b', parser=parser, engine=engine) + expec = df[df.b.isin(df.a)] + assert_frame_equal(res, expec) + + res = df.query('a in b and c 
< d', parser=parser, engine=engine) + expec = df[df.b.isin(df.a) & (df.c < df.d)] + assert_frame_equal(res, expec) + else: + with assertRaises(NotImplementedError): + df.query('a in b', parser=parser, engine=engine) + + with assertRaises(NotImplementedError): + df.query('a in b and c < d', parser=parser, engine=engine) + + def test_query_with_string_columns(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_query_with_string_columns, parser, engine + + def test_query_with_string_columns_numexpr(self): + skip_if_no_ne() + df = DataFrame({'a': list('aaaabbbbcccc'), + 'b': list('aabbccddeeff'), + 'c': np.random.randint(5, size=12), + 'd': np.random.randint(9, size=12)}) + res = df['a in b'] + expec = df[df.b.isin(df.a)] + assert_frame_equal(res, expec) + + res = df['a in b and c < d'] + expec = df[df.b.isin(df.a) & (df.c < df.d)] + assert_frame_equal(res, expec) + + def check_object_array_eq_ne(self, parser, engine): + skip_if_no_ne(engine) + df = DataFrame({'a': list('aaaabbbbcccc'), + 'b': list('aabbccddeeff'), + 'c': np.random.randint(5, size=12), + 'd': np.random.randint(9, size=12)}) + res = df.query('a == b', parser=parser, engine=engine) + exp = df[df.a == df.b] + assert_frame_equal(res, exp) + + res = df.query('a != b', parser=parser, engine=engine) + exp = df[df.a != df.b] + assert_frame_equal(res, exp) + + def test_object_array_eq_ne(self): + for parser, engine in product(PARSERS, ENGINES): + yield self.check_object_array_eq_ne, parser, engine + + def test_object_array_eq_ne_getitem(self): + skip_if_no_ne() + df = DataFrame({'a': list('aaaabbbbcccc'), + 'b': list('aabbccddeeff'), + 'c': np.random.randint(5, size=12), + 'd': np.random.randint(9, size=12)}) + res = df['a == b'] + exp = df[df.a == df.b] + assert_frame_equal(res, exp) + + res = df['a != b'] + exp = df[df.a != df.b] + assert_frame_equal(res, exp) + + class TestDataFrameEvalNumExprPandas(unittest.TestCase): @classmethod def setUpClass(cls): diff --git 
a/pandas/util/testing.py b/pandas/util/testing.py index bf895e2abd97e..0718dc8926011 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -62,6 +62,14 @@ def randu(n): choices += string.digits return ''.join([random.choice(choices) for _ in range(n)]) + +def choice(x, size=10): + """sample with replacement; uniform over the input""" + try: + return np.random.choice(x, size=size) + except AttributeError: + return np.random.randint(len(x), size=size).choose(x) + #------------------------------------------------------------------------------ # Console debugging tools From 0469fe49ce43a543f1dbcab6a0de8fe438e0405a Mon Sep 17 00:00:00 2001 From: jreback Date: Fri, 13 Sep 2013 11:49:58 -0400 Subject: [PATCH 12/16] BUG: parsing of timedelta selection syntax needed correction --- bench/bench_with_subset.py | 121 ++++++++++++++++++++---- doc/source/_static/eval-perf-small.png | Bin 0 -> 25314 bytes doc/source/_static/eval-perf.png | Bin 0 -> 18603 bytes doc/source/_static/query-perf-small.png | Bin 0 -> 25662 bytes doc/source/_static/query-perf.png | Bin 0 -> 20351 bytes doc/source/enhancingperf.rst | 12 ++- doc/source/indexing.rst | 23 ++++- doc/source/io.rst | 2 +- pandas/computation/pytables.py | 8 +- pandas/io/tests/test_pytables.py | 32 +++---- 10 files changed, 149 insertions(+), 49 deletions(-) create mode 100644 doc/source/_static/eval-perf-small.png create mode 100644 doc/source/_static/eval-perf.png create mode 100644 doc/source/_static/query-perf-small.png create mode 100644 doc/source/_static/query-perf.png diff --git a/bench/bench_with_subset.py b/bench/bench_with_subset.py index 878b9c08e62d8..99b98c9838a90 100644 --- a/bench/bench_with_subset.py +++ b/bench/bench_with_subset.py @@ -5,33 +5,112 @@ """ from __future__ import print_function -from timeit import timeit +import numpy as np +from numpy import array +from timeit import repeat as timeit +from pandas.compat import range, zip +from pandas import DataFrame -def bench_with(n=1e7, times=10, 
repeat=3): - setup = "from pandas import DataFrame\n" - setup += "from numpy.random import randn\n" - setup += "df = DataFrame(randn(%d, 3), columns=list('abc'))\n" % n - setup += "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'" - print('DataFrame.eval:') - print(timeit('df.eval(s)', setup=setup, repeat=repeat, number=times)) +setup_common = """from pandas import DataFrame +from numpy.random import randn +df = DataFrame(randn(%d, 3), columns=list('abc')) +%s""" -def bench_subset(n=1e7, times=10, repeat=3): - setup = "from pandas import DataFrame\n" - setup += "from numpy.random import randn\n" - setup += "df = DataFrame(randn(%d, 3), columns=list('abc'))\n" % n - setup += "s = 'a <= b <= (c ** 2 + b ** 2 - a) and b > c'" - print('DataFrame.query:') - print(timeit('df.query(s)', setup=setup, repeat=repeat, number=times)) - print('DataFrame.__getitem__:') - print(timeit('df[s]', setup=setup, repeat=repeat, number=times)) +setup_with = "s = 'a + b * (c ** 2 + b ** 2 - a) / (a * c) ** 3'" -def bench(): - bench_with() - bench_subset() +def bench_with(n, times=10, repeat=3, engine='numexpr'): + return np.array(timeit('df.eval(s, engine=%r)' % engine, + setup=setup_common % (n, setup_with), + repeat=repeat, number=times)) / times + + +setup_subset = "s = 'a <= b <= c ** 2 + b ** 2 - a and b > c'" + + +def bench_subset(n, times=10, repeat=3, engine='numexpr'): + return np.array(timeit('df.query(s, engine=%r)' % engine, + setup=setup_common % (n, setup_subset), + repeat=repeat, number=times)) / times + + +def bench(mn=1, mx=7, num=100, engines=('python', 'numexpr'), verbose=False): + r = np.logspace(mn, mx, num=num).round().astype(int) + + ev = DataFrame(np.empty((num, len(engines))), columns=engines) + qu = ev.copy(deep=True) + + ev['size'] = qu['size'] = r + + for engine in engines: + for i, n in enumerate(r): + if verbose: + print('engine: %r, i == %d' % (engine, i)) + ev.loc[i, engine] = bench_with(n, times=1, repeat=1, engine=engine) + qu.loc[i, engine] = 
bench_subset(n, times=1, repeat=1, + engine=engine) + + return ev, qu + + +def plot_perf(df, engines, title, filename=None): + from matplotlib.pyplot import figure, rc + + try: + from mpltools import style + except ImportError: + pass + else: + style.use('ggplot') + + rc('text', usetex=True) + + fig = figure(figsize=(4, 3), dpi=100) + ax = fig.add_subplot(111) + + for engine in engines: + ax.plot(df.size, df[engine], label=engine, lw=2) + + ax.set_xlabel('Number of Rows') + ax.set_ylabel('Time (s)') + ax.set_title(title) + ax.legend(loc='best') + ax.tick_params(top=False, right=False) + + fig.tight_layout() + + if filename is not None: + fig.savefig(filename) if __name__ == '__main__': - bench() + import os + import pandas as pd + + pandas_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) + static_path = os.path.join(pandas_dir, 'doc', 'source', '_static') + + join = lambda p: os.path.join(static_path, p) + + fn = join('eval-query-perf-data.h5') + + engines = 'python', 'numexpr' + + if not os.path.exists(fn): + ev, qu = bench(verbose=True) + ev.to_hdf(fn, 'eval') + qu.to_hdf(fn, 'query') + else: + ev = pd.read_hdf(fn, 'eval') + qu = pd.read_hdf(fn, 'query') + + plot_perf(ev, engines, 'DataFrame.eval()', filename=join('eval-perf.png')) + plot_perf(qu, engines, 'DataFrame.query()', + filename=join('query-perf.png')) + + plot_perf(ev[ev.size <= 50000], engines, 'DataFrame.eval()', + filename=join('eval-perf-small.png')) + plot_perf(qu[qu.size <= 100000], engines, 'DataFrame.query()', + filename=join('query-perf-small.png')) diff --git a/doc/source/_static/eval-perf-small.png b/doc/source/_static/eval-perf-small.png new file mode 100644 index 0000000000000000000000000000000000000000..d86018363ffdc8e6e2845fcd06f4f08af6fb00a5 GIT binary patch literal 25314 zcmb6B1yq$^^froax=XrKK)So7L`tNjyE~-28w3QAMoPN7Q|U&!J0&I0;`jf)@0@Y& zxZ{q?7|52f-*>My*No?R=6s1zR+K?QCPIclAZYJoB~>60sCe+rga{9QbKJ1q4Sv8n zOT1G>1Yh2WCgI@!NDi`}oFNbl!hGF?b`9(K5{Ke5}t(1DcA#KuQ!DA)^eA|-g 
z`i}v2xe~Tj(7MT_DX;;sUk1N=t%%w=y&HrM2!`cb9UF*i+RgMjasJ}q$3Nyg>WrA5 z6xe0JMu1AUE@TS{0xzX$<;4h4;e*)eQ~km3;fiJx$70p;d#AIa=Gu=0kOiWDHN-Elk?63(HyJ3UZ@g^rH&t|plKP8Jk-EPje z`v|Ih{P^MH$Dp;fHSyJ+S(sm+j{f{2AR}{MgEBEUr>3WuWxDSnfAT*%vT3~-6@2>? z(CNTB9MVUiUT?P?IY8!xYE?(1U2j9EqHV)DlAr+ApqMEXF+40^;>@T~PV)CAXx$f_ zKxhVCraLs`?4YDVD@j8`gPNKe#?jGnjStg4@ zor#c;Fn(wo;(v8G7qrQTvWVb025wxH0saXN4W;7Z!U0#l+LZYA?VFKc6drT6_09G1 zBD>p(&bcqJhT_%4#8tt~)z+^>^||BK4sw!#_52h)r=O?w^VOy(;G{6Iv5j3>I61LR zO--5bHjb6ydqQ69orEc&QKq$BxsuNBOh4eb;i#sD`#M-Tc5^RRD#}7#9Zx|?sW2Fm zh=@pT#)>;XX|lUpBJJC^cSw|6jEqt3cLybk4eQYvNh-}w+j1}OYOr0DMxs<@Nttq6 z@fg_mD%bDGd?jJw=_!!GYft9$a6YiV)a3kzGPu95Z-U=>XZ-foy~IiTl$0`f^gHj^ zhHo&Ymb&_^z299+du3%jxb*Yq`k6XQOUtB9KCO=*H&s_P%5^NyJwD1=f} z@U0IL;kTH)>+;03f^Xlx-T$4;X#2ESV~znXx&20#M5mNVaw)6;U2QQz71a#RkY{Lj zG~+#gY#vdJG!tjUJYANz(T9f8(r9o`u&H8dcu!%lv4H_?UHeyL=JzWaV)7ePH=JB_ zEG)4Fg@uAg7(F$W6aqce^51#Tq@<*Z%FCtYITwK#T9unapYp6SS zv$N_ZW@bjuPml8;$e+%_L?M{4Ntjgq*`a^GeG_?(kH>6r-EwLQ8Dww80Qo6yd~y<6 zBAh@>tz>3!bhLlbzJnP)37kVzlk+Y*U;IHN4~gMn*XfB@nz82|FtT=x`fwE;`lwD; zS{gFM{jjV-66y^DgM_qnD81s_eJ^KcPDpqnvW?*@d>LwnP;*1*Zi3I#R@^Va+}Oxh z#l+36mOcpr?M-te_>!D^=hv8v=7lL?BhtI%G1i_+R!>PuscuK@rep79{7sq}!c?$? zo_~r>D-VH$<-MNT3yKK40V#<6D0SziLZ|7KSVcC-P6l)3x_GYpkU39qJ9EMluh8IN z3O+u9kdP3bv9QokqwZiNKIfh2_vO4zRV%*twj?6n1UWf5xo4<#rv($~)c9)e-$yM^ z;PeO@9EuV2$bJ6Iqvw4t1@c4Wp}71dT63BXjsJ1jmog6XMF+SZkCzNmTVEgPh={$| zU>_!(C$6skRa0G8*EZR0_}ig^UFmX*+h&H_N>wvFf%od}@HgrOi#XtmP<mIm z0h;+-05Ddj^G;@gW$VtA6KS&O^}I3O#|(wkf=3YbpIRFV_Un>Vw3L+%+_X-`%6StK z5-5G`-u@f5X$u24=Hld>+PRl2W;4C*!`H9Gv+vf$ijs2aPLrd*a9p09t`)YSAz zdHMVr9mT^ZPQxQlGk*0+JC`l~@V?&IS|Oa%r{Rjs%*13~hyI;q1!Unsu*ZucsXAB! 
z)YV&lkH0iK!1I3oWN<8thyZnNdUkfBHv*fiI+h)?RZA>M?zPQN6U%3j;oAU2A9LdsuBPm(6nXOm6xn-w{aMAZZqc zF&U#Gj-mUL7@w@p?z^~56(|J#v_4H&FUdcgEyEsD)6keo(f9rHx}K9D`iZ@1aQMQ^ zcCmh>{4i!+*Zr8Ow7rIt<`vSmD#LMR zqFg@Swm=3t1l-=HAk~bGpnUh#@44kxfQAN^nVFfaHgp|TPY{}2cz7g1E}v(Jbf%6w zEne>BgJtFAgH;)DWw_PU)F6;DbwSk&LSWkI%{pkRWVtf3ErE@9cn_ElK+)4!S*sHFQs&N?un@eY=%Zafu3`0XhvbwGkIFlcp zuqG~Scz1}=F}|@OBA!A1?zWg2FQ!;Q`~QaKT0aw=TD)L~U`(a&>@<9_*f-N;WsN`M zl2l4E?g*kM5V7M8{V_QX%!AoH36CX)3{nkPnErnvL-%Ob_FHHV4-Y(36g;L*6xP-? z$|wTV(=Z~`?A5HlwS< zG-?7!qFg*Y8z2$&24RSVx3&s_`w|SLup&T6_?;w6OC~Iob#$V$o*#B!YE7lduU9m$ zU$5VuZ>F*v!$a0FUbC`dc-@@z^~Vrb*)H--&&^SDa(=Wi2Rm^GXo%fk) zEx_CWZREAJkp=_=l>1NZO`~9AvUZ<6Q2Uz^IMKs} zb(yn-#aKgPHkO4^9RK21)UQ#aQMXBeqt;rzs5ao9B;0v~JJHOWjhLoJAP& zVy#FDjM;YAwYZ8{uU;9c;Q^D}-FcRNs(}aYT2xvpsj7+-85LCp$VS*PmIRyYfojX{ z;o+g}${Y=cRoz04Eo#4PfT%ib^Udi7dy_G9W34G!?H~~?#fqfEUU*AfS!s={t83%a z-C>o(x&$l&0=1|p`NYHo1lDaznXPVWVZn<1%Y+3N#8WCr0nRMiWKW|n`}yI!yF6Y6 zY=cQM0&{hY21>X}v81Kt8ysXfkO~1$p-7NTFJD}30!ePrgri86frgfriihX({GCN? 
z$&y^pJK)r_rJtzD$OT*w0b@wzccz8BSnqZ%2m=#rGXH%F`aj=-&(6-y=Ua(Nk=q4u z5fKrY7)%@lZv(z&L7x3sVUiFa@!wMdOK+RP%3c;R7)vTx>3XOR@z<-6QB*|dv0K`E zDOSiRD9mL3AZfX8CCDtz7EVZ;n!Y}1+9hN9R1FI@KVN5!|K-b<-h}?i{PzhdDPhgc z0^d(cfSo}+z;c?8N-8U3(&wX};)7FOec0ihZ*rys0gcC^6aKMK(Gb8OP2dOwyL%O+ z@IAyit69(;){F*h5XhjbC57AbZ)=%pJ;9iU=H`VBO{#=!P%#;je-}4D)_qh_F`6il zr*&4t?}u%1JC$~E;i|X!6WHc;^OE54IE&;1Y$U>y?+;$w9gqX{3O(k2o}mmM)S`Bp2yHCAd z!IE`#t()mt+A>r5ze^x#ePU*2Y#5m*;?P&-w;--g8aL5zTr84a;ot;=N*uviW`hNe z<;M@2lD{D+_#1l@d3dHiaZrXhQ*)1X0KP{h80*{AIRgTgmy^?V!i^eQnC+^z_IyV& z{)ZB0xKuw@w8b}UbAQ0ilJa}adD%=I<%ae9E6R$xum$L zNDS1Udi&KA%Z~)86~UQh2+zCb{_p#D=aV_ROhw=~4(sB~u`2468ImPRX0(#_T?AzF zC!yw&f0>M1)qP$BpD^F%)j)7DI5gDbJSH5bq36-ZpsBB;V{=u)14?WGd3F~S6|JZr!;MZ7AsXeCcuX;evK54M-b6|U+jt|H zEZzE6T*14Q&LPX{=#ad4*za1cvgG@p9nD~4CohiAq**D(q+er(<|04}kV~mF)tfhO zK9dImci%fXsb5S?3BK9mYxa4ef^53^xY7@N#|td`djt_ z&$tqLfP4$INk2(pIVqCQj}e8n22u1{qB&oM7=-6Tx{041uzRdmUTs-fdK4IoMDR1}uV245Jc?B#0Yz|gv+u|OLjW7i35@LO52BZrJ%#H?lgi!n7k+Lo{}2&}`;WnEd4`DqLDJo&Nnk3fzX zJJOP|ws^YE+94V>W(8h+0yH4U{3}l$UM)WUY^IUX%!XVUf$Amcex`qKL0_LN5zkIJq8X-cXs-S zFx@zB_d`rju=CmWV}(0xSfH27RCaHV_J0CYEArhh1TiHHh`){>%HXA>1F;)~vtqyO zVqN4+FcXS2_GpI1|Fc`b>OW|Q9l;ziJj9cA1&YXDncOzv|0!8|Fvmxb$KljpodXCo zYT=Hricy#Bv!FjUp|dGOJy$_L6s!h4?Jc*+PiB&xwKTx2`cUIUr}*DLI#Ybi6}C=A z`JZLLYv~~Fg}7>Or5%b((1q_JQm)xGVK3!<55U}STcO1LlHpAz!Q@9a}=1FdhXh-5dnXbbBD=9)+OoW2@FjPcz zm39hj2;w2_*A=u>b(54VNXNe6c-qtnZ#_A{NA}M!mqm_PHE|e?r32#fxx8}!?MMhc zALnw@tex8Ir zui1`c+*~cVoxWWwC(CGB16zRa`@caHV#A6jBS36b?&n6T>er*;O$R?qUI13qp_8*S z79pYK%;;!_V3k1^Gz|?6g`glYq|)QUOvi266`(muAQUE}?c6BJ%D$jUip&y_(EwZ% zxm|;)DlIQZ2mBB~9M=C2RET%O)4&SgRZ~+F0Em_Ihr#Otw{ff3`iWAE>54eATZU^- z=A7YwGS@GAvCh7e!wGAjatPiBv3inUirH1@mT-3IrU=;FklVfN2aXc<-@kt^)r!Yv z-XCz|y!Q5$4RIyJ{Tp#H#IEP3)92jQRuaIa2hzB$<3Z&M4u<98;xYi7_;$qJ7r?{Y zPKju+;g<@Nno95j>#S!>yTH!-?&)Z0jYiVA-H)mU#%E{YA%nxik}4{%UP@Gb;Xh{l z9PfQ3`tPaC9f#u=vXgSbHmMAAwaTOc!d_Th#3CTzTl*{_A#rxo@l5I;f5pejI`;Dj 
zY_AjGvED;H_jNcX-DcHN=SABF!-MH!Aj`a91he5($`h?daIOHprtC@AFY=qb^HN-FS}~XT@X+ng=;5hZ~fH*|5aW zES-NpQFaM7h{81`KNP1iPfXG0oHQFa{X70WKfV1r!~NwoySAb2n{%F2e|M~d_rexr z@KBDp1;nOYOibABtFPH{qlkM`+lSlf7_?R*#yQm1A0VjzaJY-X({g_+t~k(9e6~qR$l$Ynv!e7sqx@7!4G- zTmtA1s|`@UYdT+(-s)>fqSUDQ@r_~ zeSM#@pE`#t*x{-f85l5eabG``6c>vFEe8ZS#89KRw3NwasSyhVn~a`bW*-x~7-Ii- zXAloBhuAbuP#%Vd5Aigt;r-$bBO8AI`Gh8ceJ#_I&_wUlMN7*8^I=FSpkBwNSy;=9 z=wUSzTbdifCP=u|z8d5pQqJKa*(psCEzTYe_!HC#G)g`9UVTr}5aP+r|Oo0jk&BCIaEVH5=LE80xhH@WZ6ji23Kw zANP$2I!t2Xs50#at?8FTIelCx=5TnIVGSiYkYy|Yd%Lavl z_oOEWpp0G7z%f6VB)8T9#5M=KhgfsS$I-GO?9Ru#cbFX$RWs!^m83|zki`Utt(*yq z`1p8#-c|39>9*SpclWK9Q*h&P*FiM8XdKqg&uu=BnA3C8D_T=L4x7@r*vD| z%_w|+EbyFFJT|$ZAwzVyQI;ym;E+)3%l$qe`QBrTl}v2odhs{PWI9`}trE`12BY>9 z1p66d>?QQ9x!FOfWn9$q_l#j|=eiqM2|4^jjY}0b5tsP9ge!^=e*WQhnRhMgUbx6| zla5+-naM`V`c>Q@fkTJatntP=v%4#*WU*>)Z7mMaau|}4K?5RQ*2d2u_CP@3X8bLN zc)rS~+7_ceWG3zU;T<1>=p3<@mH_gh7zClcuzRuD$I%*TG|mUZn%dV5d|CCq$(XS) zP0TPZ3od6B?Xb1QP_CS}DVeX#Eb>|Lb#(a0APV@wQB(!~UbjjVlL5?MNk`o4{Ez0V zV6+?T;2?Wfo?Ba62AlmcigT_mEW*O10FMG`4&o1kfX;Sq&=|Kc)yC-t8UHdSma>HQ z3U6IJFBD@z6Gina5&||3g)^S}G#U0*B#nd8^C!%NEC#3`D3?|1?7|M1wB*iz@Hrc> zA(gnc%0=dA*RNVR;Dwi-!hos*#0uD|!_F`vXy7%qbD~yI z1GOP0CWa->?-Jp$s=K<=0P=V$*kct$S0%6Q6s9qvLq+N25N8AZ$>wX|*7sN0InwT= zR0ubF7ZiUZGTFyb&aI%>W02(m^KF4PDk)bfcA1X_0-l5J%STRaj8^K}Z=ImW+L%J67)PsYHdnrI zN}M4Xz41l2R4*Y_76ip_;P3X(@xn*IqN*ThknWzID!XL?ARhATIP^Q}l}w@Xb~=)5 zIUTCaEU#t-E)YB;%BG=w>Tk1n+QJ-nM5=AX>?)b-F(+7}_VxAUId-ON)eHMd9Tozk zFFJUtg)$n$IHo%((?sG*98rOg7sK%OSH3dN6J`6-#zm zbh80xVXH9Q`OHn%v_&2eyV|ZPYu!+odh}kXs;VkHJK7Azu<&ri^Dcc|UFQC`Bs0eq zq>!TG;;P6*APe^kJ&8wKlMm(!Li~)Gwl1Cntx6LGV_htiFf7*ZH`w2G^aRba=|s%M zib2*AWXQyU@C8&6!2iz9dN87P(MkCHL4VNkcUfL@>?l+q3X_n~lI>%5Qta&q!~w^( zm^URdjO7P;BRYCM8|10<-m9|8h=W&A;wTu<%evWN82%W+>N0wi$!b-ikwmX{ zNN^aRLDF(?AYgXpT=k=lL@;7SU}^cCx5K6bBr$R^@=XyZv-huOagM0_`pJY5-zpE^13RB=HHLjLEn)gkN1128Su;O*8 zIBLp&@8mW6S21`=czvVt$s6B5{#ZO}1;4N@R@Fj0tM;|AO+$MX1bl4I2w$XM>RZ~- zARVt72ZwhEcfTlkwRc1;-ed1YPalY}oAm6)$@60a3 
zsueJg8^-IMW8_-}i$d0zJ+?2*Zs`|flYA$Bkpp5Z?0G9ZK8oIuwr2%2SU3R|D9B(P zZxC#V`#e=>%M0CUObF-~9j3%epwM8&L9eyV&WcO_fOx!*x1<4v0POn>`-^&+p>yXB zKWJ-5Ei7n#-=8Lot_tKl$8a9BE%B70B=102nAvw)T!MQ1iy~&L10~`tBHN2IM%3%T z$TAe7rl}eJ<;yEoRn>W+tfZvitu2!u1=W8DqN@UGUha`(VK7?PIXB?X!wu>SMR7yR zFO@m5H+A!l@hzqy2xW$-2ygM?A0E70Z&tiOYL-=0#A^uWK5odF+6Rr2my}7u=Kuj* zMYGBXZqY5B{fxh-U*ROVZDqR= z19#z|7{AmW<~HI7-h2r!@?!*rcL?WK&JjN4!Lw>mq6EBVALZfAbT>Z*5%hG&2kzDP z_SF8urs$P=K}P@wm3W>k`2o>@>+Q=D3i@FpE`o!;Mr99(x+a8_hmf6+`ov#uGui$k z=rXOTw6d~tSoOJCUlXUqbp;vVWwZmdjSWEGi=LU;b*2n77tF)K88qg*MOEyB>gweg zYLm&BGr}-l5g8KHX%G*`>orT!U%Ac!^#|admsIQB#CF<}oecpBUC&`fN9@0g-ObJb zn8(^qgJ`l)BH*O*-_dorwfP#0U?2ZpP}c2QSs02KEd5GYs)g;%%4a5IY;=J&N((Se z^GMFjL>Ei#*4EWM{;ADQpclso;?beudyIt0Jc89Q=eA{$B8ul$rNi3k>5RB<=%M2` ze_5JWj;X$&>jz%Y;~;Brj~PT0=)6RGul|hSmk$|-BQA%Ad%NUN{i=*6;zY6r@nOtH zsB|pavV67+TqHv7m>l0w5irOkjdIUSHHuaF+|ROmEZeVvhzG0^B9Y@SmoEYOT{N#E zrd;o>3xE88)hfdS*FfUahk4o!QK9fbysX=R2z+rQ$O}0Ka0l>K$L0}I!OsSNszJK!OHWT1pK*05udGzYNw%CS zFa&)G#l>F+shhuV!{AcJAR94Wky-YT$GgVE1u|Gbl*FbONQT?-B-wxEU9*Lfr^JYX zV>!P&taw|jswF=$@*n^GvYZekeAh4kf7+F$I>xaFR&`oo$6MRm>sNF72oPYsm!6#B zpI-)-eKY46Rm>8d%f7s~q+`f8|JIntn^43W*T*8{JTgJbY83fqw0iY|jufL1D8d7O z<(o**CVndA+^_f_Wbw`&=x)j%J_LYH4r7-gTY4AiIR;93I-LA>mR1pMx>A)J*kpZ} zUE<)ma}6ZUJ_6%dN~A$Bp#-07t5Gp$JeDbpez@l{doTVzU*-FD?7>2{(tomQZgVq{ zXB!!aB+%pRjl^ZFw49=)^XxXb>{2+#zz)WYa;1Dgun$lx{pynJui!-KDoQ|fwpN_b~Z)a0~Q7yKo4NJD3hnBtT8vPIh2#_ z(IjEyT|XahtNG=fOOdEMim;j}^wmuK%!9wb#~j-2UCwpQ{qT)4@v@6ZVjh;~CNr61 z9={Qjx4!PXshX%`iBL*5~l?>(JP+#XTMP({`I_17&=F|FuvKSby*MXepl^^ zZKP#ZSJ{hB_@EFU?CyqDR1QTY8*G-$MZbd@?c?_-R)GQsBO2(&VW9Vh&yp zJ?Y;r0tPj+tE3&i`)I=tryjQ76Xq!3i^aUz!NP%dI##gjpzHCXoR0Eg<>uxF-AR9m z95FCkH90lahu{a=EgPGgeLhkoAf^=aRdPV00*(X5f%-|^mewz!2SHny2bvHLwJz@R z((OgQhEU9{z@9vX>6)e;-~0HW0?Lhy;Hcq0w8E49VNm(*S4WkM&b|1FCW_s!pW=0! zcKzKqGsMyqddw8maI_S8L6j|@(c9oW*)Sktv$ha7-Lw+<8l-KHGrt8$dI;F?bnxdAHqi{yyUR(g)XIoNtA zw(1Ad=zw=y{OWQ*E`GnhFY2=!jBVLoyE}SqJD%zka*;NaLd~B;Jh4T1J|}rhbeb63 zcO(Dd0C?Vt=SqZiZ!tSu! 
z)XJ@$l3bsH4;Jw&OqWx9yfOvf^sN?4D>|Anp&KC;3vPURI!ROpLSkZKLSkZ2XQ!CZ z)%3^mR@|UiH*EcqUCcn=aey?_0*U{02DYttfyGZMJYZHb{qlr!+WuS9w;sPIy zAhFf&Bu+ztcG(OUfO^_*3jGDBf0L6cLT8;YvhUybEW4rX;{la0<|P+{Mz?00XI+Wi zfPp(&Cctpc;fhcW?}W;cBPG<7aaN*rgOOh8>j+DQmJ|-g<4}ql%uBk&{Uwx(Uhgu6 zRsjeHv==CuN1P6ffcE$@5UHs-2paAJ=gJQ~l0gO7t3P4S5lK&@;vGebERS)!^n2w)M9VX&-tM{M!JDFzrTBOOUI={IHZuy$INYChoc>oKL+Cci zAr80l_L|;($xciiDzUV>yo%{F81g?MNM;dV|b1kYOF&` z0fuloUmBps1LzYm`6|WADk_0sLQd$R(rZe9jUcmg>O9W)>e%9R0}6tY3xkDY(AIbC zE{gu^Lx5~7GOW^@q~$02yAu+yr(E^r!-M9c(=M69ncKZuqD^lkGig~zg&(LFv_qKP zI=ZODjYrxE>FLo3WbP2qh^Bw@1`6V6AH!`uTj|qJn1_uqAUeFP(a;-qshvd)i-Z-B zxP8~<4Ubt{wi=}d@YkX++Uzu}@*wxQW9yX0@CI)hE<0<2O z`|lkac^EL~h;(&on1+btz~4CsH^c61IVKiqb+q9D1e+^^z&ymLm?Pn z?B1Y={_x@!*MYm`fz+^5bg=`M5u4{1pA40)T9g-x`uy50B~k)5dOwl!`1#@~>4?HX zwaZKPGq7M|PBFNINElWvQcz0b0M?|s-gP^04h1`=`>t`Ae5gF0nuiNA<~JK2OBR{9 z>+1egBQ4}=Ow-SfMVQzqF9t@ks(_5{dm~`ZCXJFA?dN@Da<-&K6Z-!ANa4fnRAJ2C z-uj2~>CGl3;>z=WS=omrMkCbI9#c_bKt|0*N$&Ju!CXMs$8!t~*s-u~HDsjB_c4Lf zO77m?CC$O2zLSW$DLGk;gn4KwNP`dmEzi=O{MD~7nBHWe5tee+-^@O$!rkvMQSHkD*6j(LAVSr!h9 ze~u+wHZeA0(7@J zzNs?QeL5*Bf5T#2kpYHoC-lC_hXSKbEcY4AcVnN&vB1PBSuH>y?PTbgf>b&?aU+%= zUrvL?qQy~sF@_H1;M`jar`z-?lgGXnWQWbN-%+F4vQC{of?$^unA~LUwXDB zzb|QQB#?i9Q(V(b#2Sv7Ah8piwJ>NLqcm=GEYagM>##y)`tfUgZF8f9J7%6$HrVFUkKirDtvZ*6n7GtYB!{sW$qrPc6cv43PL%xS$il z#E*#~9H^&5EvsCS750%|QTJ;IV#g-&JDquDHuk$cx$0fT^N+jjuQk^GP18%lBQ##E zbPnFPTeYC_o#~>iJXgVy3b%DrUhO*UCSzA#Mk+^NY-i70HnTrIJYeGS+>*a4@3bzI z?d|W^YAgU&rlO*fgktaG!9kYla6QgQ{AB`|e~5cIU}`{$W6B<_j6>KUkFjm*x_G09 zC*&6F-O9>SD6{!37k;`}aKf_~K?!$uvuWBw9?h{MJQ`NO%bnz|3is4^4z7U7Z>+s9X5YB3KuFsYXyve{*SU2pP9)L zDe=D$C=clisRN!4Kq?q|1l;mviBr&%)6>1u6nXvumkFYmczaz!5Xh$ns3liRyN1KM zie$dqw~UarCw^q64kwH9Z|hG?ivnow^+jYeWfF3?*${G}6rvDKUOV~iKJ?-gi!h**Ay}pHFUOk8=vd~T-^HG912PNSv958>`?tA!MM=B4EmSQ&D z{!{`aI#4O1^@sd7MO5U@*Ojblz14$Tey@l@R?$vEgeeS9GQw6(*Bqa{)GSW9I#a0K zA4hK~S;;PYd(;*!JIO_DJNI|cA^yWzJy~|K(hsmV;o%L);Q@uWx%FuFsye|ci}%B8 
zhz`a!Y`$Ol3_BCiTE)dex4Jp-gC`i!QFlLBwN$PL5xEf3KkYXE(j?gYhg|3QbZ}b~R_<4Ibt9c(YGmard zO|K&Ower8x2tf+WIovt7`2f?d@kyzUT*2E}FHF>1C#+0Ay$&<(Q{oTzeW2z&+`XYz z5%Q-*27_s}X;gxE{|4sdl)n2feN-q?bK9unNFFG}$r4(<7khyUXlaTFb!}O1MeB3l ztr~x*cYd(ZIoLVYStq)KAH$ZG1&R2TD7lcAGtL*#GTY22BlCNxg~F9Ptka>+c&U9; zAy@9#iF$LS6eb+LBFxBs&HTtI3Iu<$-YiNF!?}AR$uSZ9gc+5*-uPWW&45aRnIXi zA;CK~C_=a*7h$%!%4YSk$Bk$8eEek%T9J~8&d#{ukgxX)tP9~~T)*G@ZFU@=m+*p0s7>>B!XRsbt`tBa1anxJ1I(Yn+ba{IwQZ%GyP1Y4$&f|}{B*F_XLp-qV;*2m_@48+VY4+g#)w=LwY{6O+ z^@BGenKmCBzs%UWg^!@8dnEAXzULqE{ec#zWUh6tL}#N7Be!RT>QNuu4E!kQ1#l~;za%a%%0kRfaF?l<>+q6PjJMr;&#R*}e11T5-UPijjl zOm_cTVT?EMD=AFbKO~DHC@qo!Rz(EqTt06+2*scS5~ERn0FyW7RyAyV_?vmmKM`S~ zKgBUHTJLo^%cIbHxtgqJxuW~Qn!Wi0Wh%a-lAj2*7uf~{K(pdy_4J_lAGSFlH?}YO z-}qVd1)OSU^M03QUrFg9)%aa?^jlf>L3+o<+MLj5bAx>lu526e$A>967^QX2;U2RK zioK(EDz^@>y#4T-oj1&T>mhYG4f^btux{JBL4kjp}22$1xrVqSt=x^**``}RE zHrZu?oGI8HMt%EOm$){B#SU{EYrJc)Ry=1yb^@r_9v%0!-(oaRP21`tUm)NYj&o{% zKkQq%^v)j~(@Xu45d*H|e5@Nzw9~Xr#|$g!ppoB?1Pu+5nn|NX8C>VIYE|)sg6-U# zQxbPd%CAV*Qw;i@iry%|QJxaTc%|Js|792z# z{j!HS<e?x?(hGc0bgL)7wWYksIW0a zs=q0ly8!W1k7osSU?BME2SUivpG07NMPQ#qNT9tDAACYM{K(h6!;}y%Q#_*wVB1Lu zYcYx4BXh0LR!SSg&1&|SNT4P+B`O&lp=sqFbgp=!=|gEKz)b%@t>u&s$ljRBa*jE1 zC@7d46g_V!;4?R-;l#y$Jdv<)78He}qj#EO2fK?aSkwK~7JxrKu$*<)`OCs zIKSb3I?Wy)3kzbEaVTv^+a2DBhVAT5r6^TQ3x5}oxtiFMh`PV{Z;(2!@i)3OIr`&# zV9;uV(a3N_Bm3+Vrrwk#=9K8YlUUruYwf}l$=}oKm8xs<@553RT+D)}yM4Y2Fl*|_ zs;QO{*7}2bX}RufcgVF?;gPjbJlWB;LoFhJzIA{T@9~BLMqW@wGhf{JJH?=eJW1RZ z{XoDokOc<*GH$t_-s8Dk;P^2%`7=l5cA@>9I~mYx-2La!gB5sn(ak=Z(gT8LEs}5z zr(4yKa$firE?Yw>!RUCHt+$aKYfIM1=CPHKQdzF*CaLmiZ6q=Z9jo$xZ#W?URu0m~Z=Z#2(ZBO$bjuj5c`es5B{=#8toP?ZxZc=#E7G zG#!~gmEktSUGJJZ*?4Bd1Up75j@Z>!gRA*DgoHW8b9|;JFRF0wAO{E>dt8XmZiKJ48PD#}W=*JffMY$vXA)2UUi{{1^!~dg}w<1-R zk2!Q5z)>qH8x`DGrTac^B|pY-y5Xidt{Ur+a^M(Mfr+ud^>CKr=-e5mL}-IQ4q&r= zIu7)%2<&Z0;-P?9@Oy_6_W6~Gw^cQlfgkyQH4iwPEhS=7^$MFniKJ4T_xup^lSe}W z@z_sIsT0VbXbNcXKyME^7?g^S5_sRGYf|Omxntra2Wh_t6q%13B7sCQY!jLh6t&4B0cF~giC~ms}fjV3h?PqR|{H? 
z`5R^A)T_;k(zQjg8fB6bWep#&{wdPis~E$U$6$)p|If+Y zy~`xE)yrN@uU845mEYD6=zDN#fwD0-+YW9Ul7)+qVELMkrUJL1;&xDwW-6hREOt5f4|ap+q6T6~HNi94Uw{hx^()}gpi4z6E4aPA&FA~jsBvUJ>2`|b zMIFxq!&+d6k%Cm7Ufk6q2+hB^0+&3o{W;mFwP#5*5&E%jB$#Ni`7H4d8In)&N4Jd( zd$^P&*6s5{^6M@HoUQ&oijSL22VlaHzFd%hZY9Q#Xf6^pST3K8ZWKAoBXyd5C&yOt zf6oF8=~SvbrCfAl*(~u_{t%BcMmt77B(Ep%f#viQH11uigI4)}j@SIYTVE?W5nGrz zyrKveqrHeV@$-?PV92oL4fn92>2Az^W71l2dq#u~<;Zpa8_!3EvYzb>=ULlHZ`j6D zW|=Pa=SOI9N(#5qs-LGrSVFA^91k%cgG2?&`sd8=pD8;(M*3wQJ(4|e@c(}O>ogGB zyYlH^#g(o_iJn2D9d;XCw$1 z!oZ-aV*dLv@WJspeWi5VkC8vF-uKS8wp>?djgOlK(xA8ATsk=?g<$0S!=R*~$A9kV zNb6VNZgfKQ9CEm#d7I$>mQKo2Lp=m>ad9;(4FXV@>J@6*iLK~9BXXThQ>~&?AcixG z-i3%jxnS=&#(VO{N;IRj?zLc$o?1KVdqv-D_Hr#Z+mYqRe5DQndi2x9h}1dXVN1i> z*bQU~TYP7At@8LKE_F8HrOHU*TmR-i-o9-JXqf-tuzbXpLYqYsOmT7GAp00BTBT&Y zU=tgud6v%p3F(%F7LZMkDtx`IHT zkJF;0gBi58zuO@hSO9-^4%Hw~vwDq0>CWcy3VLJ%Ba6uG>Tn3C&l^N(->|+wHLa$b z2KHjyj2)BC96kJnzFcn0qDIN&lorrqqyUextDAy}rJWtMmr*tiJw0>~>0n|A2MLx}>$@_^g9EDEQB;2gkjFHjPzDc2WLA2$jb`u-@E zhL8A{xl!ef<)k1QY(t4IZ>44^A&_?+FeXly{tI8@Zu;y;3MNnZb7~h@r4~&)7je7k z!yuZx+Uxb5t_QUS!_k<&**eoVYpZ4vM4y03C(pIQtZi&cL#*WrrzokYBycc4+PuwC zzSN#uX=*pR1b}a=eP0=6tZZMl~)s58wIE2&Y}TLV_O=Hwx){VSPDG;UV2k&RRhV zi|Ku1V`D!voRZ_4gCLFTzuzE|ik>CSzJGOjD`N>m@BalwS2KZrK=ilgRE6l-qpz%& zK&OJL*wY15yd%uBYr7=NKyj4r?cP1D&cXWm?}(_|p-9*XsjJxuty*GiQI<5aP>=J( zSTZv540YpoMl|Ko@#|A*oZUYBfr7ie6ul8Q_VkS88pss_*a)H3!-Gd>@lqu-21_6zj|CDafwnRXzsYFHwf9V8_5cGmUv@%IB zjrw1w_4w%I1nc62#NJ{cP9eo%K+WkabY())Elzniq{sF4eB(7qX`_F`Y5UT_d*zaY z<=>Z)5u0cE?9$3*HhrZ?Z7|YuZ$_XIzJ%=)5YoozP%9brvr^JTebwy(-Hh)u8>BEd< zZ82>^N;CMeo^R^dkbQF2WR{ZIpoli(f~@nPy9EC|p~1ObO?@Xo@wvRXc&nLe-JtRr z4?G70lvkk9K|?^%U>?0vczs&*e|2`&aZz>M9-kqGW<~*rhEZA?1qq1(BqgLCkWylh z&Y^n<1%U@iVWbpP5D<{=6p&Ivq(wmK?zo%heee6ZpZnjve1@9gaQ4}It-aRSd;Pvs zoaYgOAT(|akW3M>kmZqXGWfIQ?nbhXr-}SvL3)(b|H(Tvjgvt_K>uW@`ax~4yjx?# z1fR{l7vo7&g{_@21YfX{JS0gL^v&DATE5Ygu z=}bg-=y_U>C2#9LDT{Oa>5vMv!rQq2_3>zANZLJiW@vsy91Yu@`+?VJUJf*uC5ul! 
z=1Pfq!MC0CGf7vz^Ta)kRlIs!5p1meyk!Kb&eeOWjM0EVthdw?N=f}wsnbT(Xs$5v z)gu=vM2$GLa9?IHpVy=n^<5$BS{1Yc*JLoU)@dQtS9VAU-;|m>KmG9aoRq_}UI?rc zw7w2Y)g8eIM1SqSj;7I5oRT1T_hb#%`91YD>C-6zi<2A8sfsV%Y-;6l4*T9+gb)(b zzL0lr_{Ldzd_+oyJp1Apn{f1K%O*$g$&B%Oy?q#u9aPqFLDI^4enceU@37~V+|JquFuaHx(8uTHRDWOI-+kyO*zz#@Vhe02NJN_QLKyZjSVa2EAC@a- z5ke}PV8hDiBI??gbbhOwAWLN2Ut{5Nd@c_`QM;FG9@5!25(MVDS&&5-Cse78IdQQs z3i7X9i55~pF#L))@!|%{DLLg)Y&x$Bz_?a3fixCzG+m54t!=WWwuGa;V)x)c!ne zqiV8G&ra$Ck}Rn3>%6U%@OVIe@)K-AO$b>2Pvb(C^}i(Iu+D^cj$es)l* z?Yc}zaEaqm&PPHb8FvC-9`Onyjn}U&$8jnWq7hTqU$}}p<_&gMpvFHaB8TpZbl5)0 zT<4nT6kK_=KfM0H?t|kuR(9+Urz*p2ftD0_!qx7iH_thDVpX`vZZY15t9W1N+)Cj$ zuC(c9)My%zl>O_#yl*ll>Bx5-pFZ;?50ZHAZ|JX*8!~($(P(IZyPea#g@q5&;$iun zWs&u{=A0NB`P}X7uJlIoy2WQlhoku3E$a=p`m-Tp)7fU(v8dnv8@C!wYUq(+%yI$u zTbUY%q`%*ytX^5xc*JvMR?-(n$UQLo{i#?_uJ3DYZFjzCwqWGxC)Ujycbn_xWw>k& zh`-NF-hBt;)x)6#s*L!jX|2aAN0!6akMpX`pT_02`K%E#EQK@soBOdD-J8^>uZd>8 z_Uh*|KXMn{pGC{I}Cq%E#=E;a8gC{GgGZoKbP3!^Lw!w3O+6v!M|* zCiUw=5$=m9t^8M>ZAxp7&w1NcNDAV^V<)4W5ntw}y>ns(>iUzdn&{&}6^|NmHpXkU zFQ%5{;ysXIhYJt$;stosgd^w-6AD`L4LaWb^&pztKYo5HegGv()de#98{V^Wepjmz7d=1Sx*h#y7-sk5yvH{EHPiYZeA;cU5#nsJzC?)utaZT`vgT~$Q*xqm z7GiJenoAl3WM6KW#9d$mqX}JJwbutwi6>da4Fo=Z)FbFXh4l`*SZ%sTK~Jb&LR^<4 zy6Yin7O13JnGF#QZ*@Q_d+a1dG1=2Ansr>yWSR3h!t1QB-mEy%#SnA=^Vuf-<)KiI z-C~XP1@ejDMLtuhV0*T?Oc|=iL*6SZhURfKzWkiLrqB?Q@Yw-b!t;!!WYNle#-dJg z>=UdUU2=k{GSnItF`&|lK|b2Apa_S~fOd=ZAi8L!LAjO=Q=s^?$*}A`-E?{6&y!0_aBA z;5EBJ*AoNb-w(2aZZn(;6(CNrZFZYHX@q@pqk5a{94dnhsxbaE=}#AH$GwYlfA)-S zXw&p~=vW2?| znn}$W+G=AYzca$!T%2h-?T%K(W-o>SH)B#G z!$|Ii;Sw?xp&Wz{e2MjIH+Pmo<(i<=7mPj^S2AVJogMQEjnF&EFkw`=b`>Yi8b+Kz zsp9~d%OngAU}>2-C->R$nq5slq>2%Q*8MRo-LrsXh)D!7p;brqydZybkO5u^Livhp z7TmU+!Nlpb*NEoRef=f(8U!iP?7_*Ld(CC-V)SFbjg&?u8#UbFc=>FgP?$6|!B+pR zG}zHtQG0|oV@vTvD*l_q-v)^TaKEIySh_pyo;+z;pQ?jGLggWSeSLOzc4_VtAH4wm zLfOjd24ps4=9G((iRqDOIs|h6{(U<(-O>b$- zCut23kM21MO?hk0G`+v`hxzBbX2PIN33HOT-H?kMWVP_3$eK|JuG0}(TmtFUTheDj zDY59Y$dgm#Au$vb6kSy8TR+qjiPRFWI&-@}1Z9_og||CyFuV~MwCpQO?|arIN%H!j 
z#jGTx&YbxC6o#O@2V_$@7Tfc*Qx|9Lh=+v#my{=WR%>vHVta}aR7ulC!Bs0D9_MFS8%MIY%XV7TR|0O-=E)h2vkI;9UmpzbFjE7v$juebZWCj$ zpX0Hk!%sUKQ}|3o!XjL{M-V?MUhpjI=w{f=iA3{(f~T?@S=!b;;rZvyx5q*kxV2Q3 zNm28s}(?DaPc>RNyIp`M*~a4@{zl#hzeA^gHT;NzL?yiI4Mk4P#m)C=+`!qn$=P{Z z2VOyg8{jet$9wR8$-^Nd^aLBj)0xhlYlr0nJSDe3T~HcUvo8dqa0$XAg1H0Yr|E{9 zM&<;?4XnPE`NPg9V*{ab-XeEQvBcjo=hUmC0|bmzhlNos{?yQ}Y5gDN%Z$M99{eCd zlQP5*s6GS7lAmoa4h=hdZw}1jS@MVrWQl^;88$6WX(1$lh!xcK^+BnZh3XSFQWOU( zIs3cWGd&aW-ho#Nn^Apv1BE6PMD%__6^ricj-K-!8ZFM|6xbEKC9$UmJBs6sC*(DB zNrp3K>=dI7{d$sv?Cr^ZcLATp!A}PjHXCO^baYVKyy7W zZdE7wLuh)qZ$)~3SQlO27wJKLJ{hlwxiTP2B`Z#%QjI?`Aew0UP;Yw>$WZjFr@}h} zfzA&DxeU?T49T>6c+)P^EW9C)uB|Kd8(UNm8v}IX#B9~y-$H4#uI?VP%jGSBMhpHI z4v}boD^pd87z-4XS6h0=Y66= zNAu}I)T%!BI+ElPl1}ek2lL@oe`Pud_GBh3Fo+^8vW(y=diW`UYAyb5-f0iXbnFK> z3H&+Nu9yYnWL2tEdUPnA1f?_rx|Bg~@6qojD8u0_UMFW=eRg>B_jvc7ghKg(gQdrk z&D&HQLevYSLTl}=zniAzc;RN&H;te_uWw!)Nv8e5mikMCFAV9^p{%hvkVI$hs1~^- z#A%1NO0T*c+5HYD`LGjtQk2pfIMc*uUm~_C(PO{qfKrF&=__y)?lmcKLVCp}xgDzy zy<>|_`Z^g3r^!T|z|iX4K(jh66!s)S(>nrzesI~ZADc>u9~#2=`C&H&yFI;u3B3VR zGKU)4!h%gn7dNmJKJFN4ScrF@Q1WuFVlsF6y}n`9754}TL=ehu$M}mRvelGi z3p8o^n*JGo5nzJ+#eXeLlPj&y7Wgms2)16zIAw0j$MWM};BiUa$j^UbOT!NWzISk3 zeekgqF&wRGsOJRP@{!Rk`D7vhgTZEZj`D}ab*Gw}Apo~oFbq&-hXIY%Cu#hBRY64J z-p*e)xWROf9)Z3F<>v2;N=scQAy-R2aHx@0S>0m1Cs zv3*a)9>(6haiEgiT}G)8-9U-`?-=mnM_k*SJW$||7Mn>gG+z5%0%c-I-#)RW>7*ww zlNs|9jJ@*txf_^-c6P*q5Bme)ow_C_FVg=Cy=4?QfE{;wIecrU38}t*qQp zeI^-jC~O{dwD1M?*%2Uy40tv@hZk}wPCF21T>on4$kNO!vQc5Kw+=Su(p{Q%d5csaoTuO{Hvhj zKbqdCHTOrc^3Jhh$WBs-+)=l-Xl?Mwt58TtpRJ;i5mHIF9Sj3&(BY#u>p;i@xUu0K zkJC%ne7q7l=7v<*p8qIdCl$`51MU;HiAOYG}-m<9~FjGFGt*@#|_4fA0tkK_Kx$BqYGF>lepEY4>Ju9Fo=0-*Z z_5CT~{gG5m)e~!rnI6+qQ}|@7*vYKkg`a>F)#J^R(O5MX?o&^A z-FBH&!HLEy6DP+XE91My4hwX&Jd0a+|BV?PmLtAuM6%reG1&}6aOS`bn^Qw ze@!_{06ppBp7J&S6;CM?DFKMy0LX}X_Os}i{|{dghbHI4ToX?9);A;edL~FAvcIHa zQ_VytsHH%DjICn<=AovTaNmk@Lt@%uX?(X~WMxk{T@wJ7!O|<*!=t=xV`-=5 zk7w)3p6$rNsHHGZQ%o*t$ykg$yKteb206Q$PH(8YRO0>C+xTB*VyH|rQPJCOmCT+X 
zC7W_KGH0TM`81gC9voo$=tFgSh|9-|PI0}hPZ-MP{8zSobRIq|tdEmP7|0(;9FLu9TOkv>)N4?zX9H!siX;k@)a~8gT&vjb=RX3FyG;R&6(f|vgyJ;w2 zIAYrY@N)!EfGH$|NJ&crVl1I9ZTV0ChoENOYh{$T1h2&%d+$bt$y(8Ip2U2Te&U&F;q3m{-B)zLeC|KY@C= z1&F@59NGFKu5$H?`uX_{PES)qXq8meL5tou(}(jsQ!jh%n%^MY@2f?EVE2T%eY-Dl z|Gg7HGHar_VmiW3*6ZNl){IvE--k#>X%#2}?CvVIZkv98>ykn-RqIvy7r%NsnDG7m zrS~^x;E+qUH8AKF$*8uukeLk?5C~WUHWfTQJ+n(n;J}E%c3J7TqsZB1%UxWALUh)j z14x-NOA=nIz}+@`#Qu?ZMjpWoh;YfwYrx_G&qj~+wp<+l=08cwuA#KYyl9-WGC=< zEzVW*fC?YV^QJjXK8y3jKlfx|xo~F7&@UiUUw4pBzBz340z6(sp$yqr$s8xV*QO8gL#4jXdeSN+9 zM7t|lV-6xR9vD?{dYQm#8ZbP+_eK*z#7Pfq63i#}h2iKX!a$nxPW_ zIcKqTz&djC!$B?b(8vfe*xJ^mopZgDRCD%jJ=}v;Z+++J5^)2}ijs-kJ;er%lL0jNcn8Cf3fxG8KVOEJ%^g! zk1BT3o>t#qbf$AdLYYNbc=ZiIz)-j?dL4c~d1xP%=N5VzGv`}Q4NG@M9c?j~BOWG)rf;>Pv zy5Ih$50*`U6wnik{W+MU2t2RU_r1jaIQFr`edKy%EO=TQ{np2_2w@6h-f3A zT^$-6%q|Ls5+dCoC5?2Rd*1K+ z&;OtQoU_(h=Uv8&GtV>kbI0D-zOHLeq`IoyLrgME2n6y_L0(!D0zrxgzuD1I!N2U) zt#^XIpw2J_Ep+go54w2-_!+}d{*^NXf@^~Kjr3K#;57t715uEc)bhyKTk!TtJY5jK zo7((?B>$7{eGYQ|w{w?Zq6bz=h4$5jgdND=C0{SVar7QHJ2ZxC3(UKVCpaA%UHTOS zaillSXsO57YAx9Q!Fg+45X8T5^o1p-iGgMG#rv$km$wNUisPTdKMV06Y_5%={n%`L zYj{x1GSxJ-*)O_4UbD*Ve}7=XVDWBhW&wMIbz$-90^$g<9{*YxFpV zv&5o){?z=CoV*II?ob!T#AuWqRq1y^)=OfbCZ_H2&H#mBUZ#F<&d z`-~ljQaGfxmLDQvYs;SHxj|ZD*hG}=a~xSNMNcS!5#kZ8S=gOGCr9x;+wbc2?pz)2 zt5>hA?d@fim80i29o?kfPRKhEEKN6iK704>9UdhmN3z)E??h%+*3L?^KE@j~87RwGoRr@vlU+t_sNFElZ^u;Lxv?z=JkT_jwQB-1r;`US3{CL`6v%7*K#qnQF1tH#ejD8m{-7-(315jlG-jpfPFq9fm3x82X>+qhev?=Q$JDt68{I8)Yw8?1rV z`$a^cEJ#auvR#nps`iDc)9{=6RxZ2;IC}QVc5;aA|)hHs4)q#Nlob_Qpw`gv^IlH=< z%sB4ekt{Khjnn8;zZ$(C;Df$??O$n8%zuv+)SjN6ZrFGRW_;RrF~gaPoO}=q6H_)i zk-PVNZ%$z2G`=0Iz4Ek&!w`7?vtAEp=VzPqZ=D}Q1E(e@XZPwHmYxrjtjWbuJsw>2 zyS5u9A#_;PE~DrovjVehTx}UMUJf?naKrkBRf|TJi2LxKvmRmpg8PzPor9+4`|)GM z*d_5iyuF)rEmcgZ{VsABTCGO1)}y&wJ&sJDe>`f<=I%{QOr%_NP!TONx;})Osyy!! 
z@5+7u{=J8r+u7GCDP%OhPlVZN0uEGfY;1BT-f+9`8$5dSXt`vV`MuYEVj6nS65Z#OuW(1()z9L zceeY=pvg_e)_k$WTcE1Cy3%^=N#!O1gr1S{2^$+$O-;?#)>ehh2dX!s2M!JneINO3 z98b11cm)LLxViBI@~Mi7S_!rq2P*Kj(iuc5Cmvg$^l3*txi*qa)Ntw6lK zL(9wNqi;1eD~W2)J7%h_G&D7N_RSidqru^iZ6cnut6SJeaE^5ucX7K}nVp-{c<~~* zzFx3kj?~Nzqc8KkuD14xxHvUfi5{?FkA5erAy?3=_ohyx@+FK6U0wF|_Quw$`S>&& zzNZU}AwWM$17|6an~^`Kwx%Zc>sNG#meb_voMtfT>_*KV^A8KN1beMB7A{(RhjU8= z7It@S!wZsfWfgJCn#IABLmYqi<8uf8nw&Ja zBM5)b$0jGkoHvJQSy&X<2fu$OXmZ&Ar|zW;m^ssl0bNn2=mf6bjK%I|&*7C4fw}2v z-S9Vb4$Vt?F&4gWkl<76ZcnD3`(C*~NKRi5T%AY`&rS z>M?3>{Cv2QDoZe|*vEGm+@`y`TcT83WC5JC@a^$}wVTVoRGw`$WIG@tcA`=H1c2G& zK2koJEmd>Qd#PdAxaC)SCN}Z=E0& ztW?{1xIs`A%-(=~Nf29~qbmiv+0Zt>&hYEKUXgG5`_}7fVy7Prb*$KI>Xtr#rf33PKPljX~ieM;5|837Qum-`_<7~!CZb2?@EJ93TI9oifxmmP-{jG6`9J*6S zgY#wu*qgk3e351x6M9Iz&XZrlSkwfD_7lc?oNm83=9o zsi>$hKYdDUT=4m`MER6d`IPrFQCLL!+5Q3>|FZroCa>oEyX#vLQm2>gjPXZ`Dk?SU zjGqb%F(6+tqv9b+NlBp~@4!LGg|N5OpY6^B*Dd+YjU?dfsJwHcktzr)D=O+t=Fq<( z#-=q_E^j`5ci{_CZkb`zMxS*=Q_vlTIdhF(41-c~gzpRnZHULnXAfi~cCpG&pFYJW zCvP0612{qtwLk`uBYwc#R3zNhyqM4P>%<##b90F&34+b%zXuWpUH^PAOw(eOF)~{G zS{*_FL$4&Rz%bvMj(ztI^YiD=Gz<(fKA9OA(HR*OTQ0)kW&WClj}-HNZaY1D_ADVS zZQDiP#)j?c`dZB6ao|&7;aPjnX?4jdN#K%ky&Gc3k?C{N!Y!lhe~n5$x=D zadEfEKOAsGq=*ix+ksoX$ z`Jba-p!MPN3(V|n<{_Y1BE&%y%lVQ`V$dQxGZBHupxld#n0~)?mZ|;C*|4$Dn%3;g zv(x{1h1Q-w`2O;I`QV0Vi)WVK$L@JZb06)H0>$MIs(-*<5cd5s@{k2uz!(2%ObZRT zjUOc0DP=_X8ax-_*{1kCsj<(|gSXDkJyp@s*d@A^$l`ZLabTCwCSo0Iql!C`G=D)R z`Kgj(VIEjI10{OAH}1h47>`k0Ru(3kSE)o3U8yj&o`&bY`cVz8-GXc|F^BnB0M_sQ zXm>-aty>xaIFU&lnlSw?be6MmUiC4HQtU3e!p}t^MEEOp)7JD}5jG(Il)S1Qot};; z$nwJcyE)Paf*#0YP{?$R4YB2qkL!o6AoAf2CUbOESq!Qh8R5jp(oyxjbB?X5;sM(S zK3Vcgx7Ido!41O9#?~cvJ|Fz5+R|kEXTG4vVa~!Ccr*}d@g^oFR{e%!rTWdgexPvO zuNbEKU)B2_*9$%Fy!XWK^u@j6^Mrh(ipZ>-L`jRb!TeTNXLfgUDK9TC0fQkQA0Im& zt;+1q)?l%)uz0M55<8BFoiG0oSn8&9pEo(r^1VJY>3R3q<9LASsE61%=ksT1wbckJ zyy2*>u8xkD7D>O(UIy$=zq`NLbo~6<2KaBM$_&8>X%M&G%yeI^9(mpsh=LZMo*o6@ z6EY;3)9~HamNi#zh3k&CE;>OXq3HKaH}4mUvm4;KTkw 
zq8Yw?3CmePb7t=8rnQx;l*36%NwGTjx;$Qwh>YxzqxKDX%%+VPr4Q@Zva;Sx_eG(N zA9EiayB*C;A&|_>%!`}9Bbiz5U11Lq8-$lECN(~cy6@GeC-l=tBjUBu^lg)-Y|VbP z`3W~RSU1Ge%*=rvQr>K%ZWBuy_4NGwlexCT8AA0g+XP_e=V}%rwjM}wYajqIb8t|e zdPH;6)6+-S-~P;3QXR2T;jy!`%j=cxH}B9d@ARpEB+cWDs$4^ig1QHVyKW9JCK0)?+I?U)%-g_9fhPzR(HNTLuX&d zZ~m;8qNseo>EL;Wuq!4xzis>VL0jAdETZTO;$Nf3)b|&C;3R?%M9a}EjE|2GpcHY- z$;oNozN4cIkB7I57Y8>qKeBECD{q1m5>AbxbQgAyoDLyFYNyq0=roT$>h&PYXxW~oc#>*O_)Ft)@&x`9G%S!8!4ICJ)*T}vlLqmrD zXOb@%&uwj`dfPzZ)k3A1Pe2gaEr%b58Y3I5l)`BOim^MK(D(koP@!4NpLj!?IOEUElNL?TFWA%Fy*&7|jl%VM^tCiPZLHvCd&9Hj z+SpoJI}QOx36Be>iqGlbv4|h|B6E|);819Xa9Q2cqfMc{C;I!A!lI%zOWU2#(-|`e_zt zaPH1*maU$DdyI*k`j%nZ7dJ>HbE9n;84p}8-@j)0?1F{wqH+n+t2!6vDInSJjcx1p z={^Vx&Rap-ZDDS{yxb9t2;BdXhi9rRtZi){onQ6}v<3#F;m)>{my}>vTmE<}IE+Yl z0Oel(-OK~U9Hhc#ij&>2@ez0t21petY^f13|2vM& z_w(@HooODEJYa~&>m!i@Q5uIrFn}q`1DSz=0py4G3o%*T{2Mv+y(C6n+YG$Cyl`;F z`+xihRm|@MAO(@<;h^~T*sbh41fgq|ArG8fP*cJ8=NmvOk+HYu1b3v#5@%p|(9qD} z@u#q0bZiU>(mygH_vQ^NzwI;)6%(j`jr}eej67Bz*e*0aW=+BbNT)igys?q2$^D?$ z_k2N4X@lS}iz;>L^5KxkJ)z*eKxM|l9S6DyuEfO|4;PuJC~HyYaZu;^)11=crJc>l zOrqFT3uh)5>|r60*v$n@O#f?&KH{hx5R$XCEvcW@vb1E>Voj2oL1Yp{2=`eU@xOBh zNDwx^3BZ61Kt71n4l*PN#fWDW^gO0_+nWvj@Zk|CT2_bBp1EztMMhSpH+X{c10Jkz zAdv|`ag1Rh)hyA-@dCA|!S(l}J$%`n26vc_GFEIKjB#n}AiLysS5GTwq%gX%W*e`Z zCeSdtUz7|^=ruS8TSN;{Sb>}A);s1J&Wwh99 zo<m?FRY}jgC-97cAx+U%az$ZQrqqxv}JVqDW)JHTEVS z&ULb}Uj}d#q76+}sE-1IjgBG zPyxS63_FM90D;>sZRRLc{d08P0A(MNlCPl=(SwJtX+T_=zlsw$5t|IN1u+O!<4<1ZcnxTQJA1z7IVoC|Sg0~m3x+tg0LK8-OY`(8 z?mwjLU}8_w~C+89-g|@bFZ4pWBPxU96c#`(KS`BYuJ;%btpPpGA9Sn!jDq zaY>#);>(`Kq@$xXJV$Zi_<@QQcWCUGi1(vc1p1?w+y@X_+#t|UJooYFb~T#kBNaaw z@KAxYUtf3FuyzzLLA|`~5LxYTlt{U4n>_KjQBJD*hyd14a&g)Ait+i)xs1Mvt!N0E z07?|k*7kT5A`}M^2)G4NUq8H%qn^f0gSbt-tw?HskJH~_aY`e~Qt0DhA&&*>jwQdx zTefv}^73Jahh}h$0q}Y^cwLf%kr5TdmGxf=cRw`^Pt{^d9slrFyu=OmwIWT$ zOD#rG)kZM?NOA7z?MvR|&0h$%sCiUISFzD|1cG~1&W#;uqd}!f4|ns|bgbBY2_KvaR!LiJTSkW<-T>*+ z=j`&?bzL4GKO-lykkGBhx9wGK>z1KF=o3(=l1SfJhg&QyikB4w%;PW|E->~49vDUt 
zfhD*Z5f^vvPC~_ijyz;p4f1=#k%559&t>v?TU2qwo#|ox$0)T|CKdUwel@X5Ja9oC z+jjX{ULJ118E#Y_u2~o&z3Sr_uUZf+n-{3e7;IE7r>KZYKq%IfTv^T16}(`cIcv(E z>$MOVcE3{|yhovecml3X1$NFt2>B92*@Hl`P=_r%S!@x`)ad5L94D7f;N#B+JuVzj+e`hGf9$ zKHXMUhBGrWv&x^Qypbq~LA%!1q_TWKjdGvUS(#~eO9vMyxOytMS{0y5)0G%d`;2d}3lSsIf})YQF2_62N-fe^?$q3tkhN4cB_`LPSm-YZ_|GOGf$qvu5G= zTS))+@15&|DH=LDXzjJLl(gM#vbLN3-;wlNV`qheblVy63binUJK#E&ykxXVij-%3 zWHHPvEckPxAbVzc?+Z2G91qg*Hxj~{qX3e!9K8Mg`8G^{oVH{}dXFMe0sv4z-T@A$ zi6^*Y`P-FZ-WUxXos>^zdO8-M!@hiB3};FJcx{#=6cd6OicBtGFYV&u((k$u9TAc0 zVwl($TsG$oLeCfX#enT@{G5^6`WMsl=KcK71N%S77>Tt#%f0ZtHl_RkuxVsuq}#+x z02-fJ4L>4`sELv0Op(e}Uq0C$mse9GK2K512MWhqL3>NfjR>|WR~MJ)#wRQ0kBSyv zZ8wUI{2V?)J6vmXwcXTBLdz-)kwiziIdsPD84>xdL(k66E)B9jK!YDYeys4lc5~7P z818*yVgi`&?rv$oHEF$k8TnEh!2J9_Y%rgY`;jH~JqqHHIn1ukjt}R73r#r6tGcJk zLDZ#@&%1SPKB>v(UZjvpOzpAU6fN>(nR$rxactvF+CvkUmMfiAq?!E|C9k2TJZU*o zJ4leX4H;Qb^fYr1_o-!%*tM=+=b_Mwef)~@rzgga-BzN>62IQ?{rU4JXUsaoUQIA+Ud!0k#F__|uidZSc{U~_Su(!9j z)mf9`2iHyA_4CHiTa4lW(~|e;6Edh7bz8jn0N=<+Axhir?lH%gy5vEM!0z&U6ts3Q zJDRy#J1KB%fRNB9=9CMH7YhppGIH{xK@e6JUU)yK2@Jt$+&@~&2)H`0EIr4-l7Bx zj^;zxSPkWJDPyhCdk6;l>z5Wt_VA%|0vIyd{crlAA{~fSxNfigJ<+sOdWZR#UvxAS zGmh_*fPlbS!)BJ0tSle=nZi}9K$4+G4#LDvxL=ws`3c)0pL^tGg??E37A+T?>)ZH`T4n@K6Sn*coO1) zA+Ce=_ug~4#*cK`l);`X9dNWE?1mn=l)}FYU*N+;qsbpG zhbwjL+*z}IkPTcX3RpxaCjKwY65L%svZF1gqf2wtOG7#WJCqY0dU)^WzoLnEiA(Ss4Xk_ z(rRuuu`j3x&+m*NgkdT7w`6aRnHmi^wdE~mAGJ%`1FnP38T~Ab=}_y>4B}(yk1p$xPwLObB|H6y-4XO z%UFQ%(j+N48BpF0H_sCXLVo`I8JY1Bk4Lq+fPCXv2hQn7WNCAp%JZ20fE#@-JMXvT zJbx}zyyxGIm7_jt1at;_ky17tw_wa#{lyEE7+ER1Cm|P*`zd1+zuS8xscd0;#lXW` zJNPrptEXH%EXp2|EFdDS^~L4Y))IkO{&(X?Fvt@a9sd+uk@tN+j(P7v32c9%;|pBTfZMvjf)=9++2md9H&HVU}R zTtfb+W*fS)0v950-#2;0}rE zV*(z@<~p>St~ZD0amb8$cEQr9ZU=f5P07Is#4SPaUi72HyQnA)W81B|RE|l566*By6d~%zQV2asc9^a}+8oISXtZIlZ@$f69HFgb zW#Qy&My{@|;-A-)wUueHayu-F0-B`Wl`;g1JltTB8M}Fhb#rPw`yR~-C`X zW6TH92xHG(4CzvBIwmGGAU%|c0$2=81%RQLt}yAUk1LpXvuC74PXII@$?_?YQi2zS zS{{2fQxFj3DW107RFUgQM%+8?iJLq>6Fw|&n{|-CWemA)lY97^9`3;&n5xW3&%`9M 
zkPL_eLMGLE7tvB}Hf@<#1Liyh7~LOjdC49^a&mK#K$JleGQ^~%TS;jPx%41)Jo|PV z2G$;ck7<>>5k9$ya!>Zap-d6t0<~;eJ-v@zwV-NB(V+)~>A#qo2JPa|0rMbOZh!%& z9PlTALho=j>S?2kD(oh(&Jd;iIriNcTEL9$TuwcH=b!LE871pX9D;a|jkL>+Vs5X` z%@E`jUot>dhK7_t#;ydZGbNTONzsIhWcNGZQ~(a?YJJ-Ymq{(F8^C&s#WtrkS!p@Bu3%>+)X<0s^zOdCYasLd z-E{ri`rKtotMlS)XPMH@R485e;kfoXtjd&fn)g7f?dK+?u+P!2KV#w!9~2qDuWx## z)0vYC1CI&+3qQodh-X6{3E~ErGgvO(oE&(>N0_1G1#v?|%I21q!eUl(a%i9=cyi;k z01zrjRvt2Gw^tMo_YIJMfBn_gFycRwmUD?^$n9_s$o!%qg|C&y! zc34EJWyMwYYf)Rx84+=T8X^}K6~SJ-NR$%Vq=@EDnUnO1A&}oXTz5^sKelq5ki5qK zEl>dSSiXik-*+)_>yCjL2zn!ix5MmW6I}b)Oj(&GB!mEwR9+z)oy+N)%)8=I`BCUr zMBJ&x=WJF&HI0qqfDWg(lYs}hF^=K7_Y!w3m0>qqYDQr{*B}#9WqcUNfY)FU3o1gd zKaL?#hQ%eTG|OA}gXu4?vKf&EeILPQ>u$&?XsI?Tq3@HE!#;i_W7n0)vrw>JHTFgN zTkeNsK*sZMWNJg|qQk9Xo0pC+%?E5gpqEs<9%mFi>L!#|P;hD-0hw3KRTzehx=C>p zcM#ZiOEx-X!veH2{Uj{6b5_0VU}ZfjC7S&Fe6BQQV9OzlA{!ep@0rsF$s$E!t)RAZ z$4ctLgmyZE9%R}GTyP43WD-U;rXatPJ<)NKp7IagXsxEO{UKY?&jRoU-qImR-UH^p zB2;0>2m8qWJ5$L6xTT-WXl-xQDn9hwVMXIn|J6m-U`+yVVH7L7Wg2SZL_IN{!E{gv zK5Qw4Yj7jpBMxq35}liy8wiHmx><8OPxcwZabEd(5rv zsz;c})5orevMf~N+<;)#x`qsvvCNTT;#d7>)W?)9gPX9EE)=dHaQ&1EhTI=QAP?X$ zz(F7}tG|57w{bUFIE8_s)m%POrMTSWFoXI4IoU1tG%i5pbQq=<0?orYx&2q0KI|@(7RrQNY5V*>@j9LV~~rQ%2rDw$DSLgo|#E zZ%TDK#P9mGnbG0y3uwuIjv?~x$IbLhx z_9?&dUE>D9Y88OhfcyBWRitrVsk(Xhr?_18#3iIY+W2Q@iLkZ6`wWH85~`fjbtz*3 z60mvR73e2*W}1}oH#YQ#j)`Mc&AZ_T$n)&U)pKp3^&E3!!-bi_@sX*Hh!Rg$TRWKv z44tli1{=WBWrsGPLeBCJHID%fWCshMh#1$PX`_t>BMpk&_)z-4s}Td!zQ;;RN(m_` z0%g7fzdMA`?(Rs34w{h+L>=bvNMENl{n>vBU%|h_ijP-d%pvT*yS=Ui3>qZhf65Qs zf4sr=a@-gq1vTb70(tb6ANN5B=x|TMPk6%d8gKF*4>0ro$aLle>FUS|N=jV-l!`u% zqAb<6IoErVkOMpgUM+sz{KSa^&|VUAo-`fPCxPMqrwl`nE!t-Q0bt1ls|TV3mUXBE ztl0vF@wuX6^o5fk2yZx%5{!v4A0kl)6HL-=|KdgsKZ742jw&Y(F+FQNFQ+d#A zziZt;ke5ftoQA&t@Bvz*XQyMyMUtUy%*V^y4(F067)OGfc!`11EdKrbpl<1Fj3Wf? 
z`JzB|CW++{rtN$^{@B=9a+WA9{gWriz~6U=VMVJ7v07N_8>f2Ob3Mxfl(Q}K^N>gV1^-Gs= zptz)@`+E}G>kmG7s|$>&#@G2~BSkW#c-rfMhT?`bM_ecFYu4mXz0HkR$IaS_U>ni+&?mgRvcke z$SB>AlhAgvDAIHv)O$jjVGr)^H1t440j{2kcUn~EU zgLbdyzUp(*BE^E&$o(n@el9w_bYQ`QR`8{>n+f}jK1jgh;r<{IVVtTq6_i2=z4GFC zpuvrI9Mx{Gx4hJjPP-pNCh&_jd+6p<0zs)zwK_b5azU|kH=gr(F{LB0a$uFdbx&jRlE zNiLckkTyDAPZbQ(ieB;R!m9j(=|Ysz!PR^>f7{x9F7Oa^`OOs!Jv|#5_3q3V9)cH_ z1BHpk))~~&$A-n(g_@WkycEfQ@lx@CM1xLCB?UaM*av%+>}MxUW@k}i3mKRzxg2nT zp(A5Qa5We;W3)4lZgFu*FftMiKb7qQ9lXxsXS)Q9kyeq$99DnFrnHL42-sNF(Tv8G zRJN?81PcA;G8n)fe0~Ck&3tjNSXo|nbp=y$oj9$&C8nV5W&%}2iqnq*u9P;@r6yK4 z;gHifNK}fZ>tkBlAOdt!fkn3;JP!w$nEn~@OmTa1IZC=PfG>q|_$3<+TdW74`A!~zm_)xyooc$vUbU#RVX``5;I@pxc;1be)_+KX|6 zr?em_D5gUCA0M2xGm}J^)x_A`lf%Uipl` z`|$AKg{EPihZrgis_3L(u$5Xmw|6864GqECB8GC#JU#U44l5Rqpn^Ftglkx+QLV`2 z?uSx)4O{KQD(i}x^!DU=nV8=p@%S3Z7#cULo~gLhu+7Uh1Ub~+T8+HhdI~hw`yjdy zXei+HgfnqKc><`I(VbHSCnKLRQi{+xKp6!Q^uH4Y_y%A>Us8MZ<-OA7z9opbA(o^4 zTm_sv#5FF!Pz_8>{1L_$w95b94_GnKAaI^EbpwNFpok0ZTJe$rGil}9bw!8-0OfA` z^O3~HK4S$(s62~HxOXcUhlJrEb5@G(h6yd(@CW73Z}e@U({{)oxe#;jeKfjq1_h=eE-R?lC)d^ufZ5mg7n@@Q_uZT^CJvp^@IR%a)*I>qj+ z!$5!iR1ntx86S|WtVx}4uIVJw7+LxJpBhUA+QpUT0|e$47D5Yt$G;VTr{H^;a;`dj ztv3dE7-~z(h`YVS?@7Xp_rfpzpGI1+3lsfpMpnKZZh;5+mRd+lLxEe}m-5Tfm}_{l zdc!`_v+&9~YRZM%tsN9gk4qmAmw}ZxN4t3MJ2q<9*9lEf{cC7w$SWz4J7xd77+9DD zs#R<2^MW~K%C=9zmwP&a)mFBl%8=_bF*w`bQHG@4;?f^0z91{qyX}3%rF)3&vXIL# zUFO|BWU~7#ui6R{wusc^FXD{`-y)T;D)&4zo^vxRC!G`zba+3ges(+8LS_X>Sin#= z{b2?C2H=t2VRX;R@|^HNxc_agi=N(duUZByMw>U-fC->-!ow^)X_z;fU|wSZ|Bj+ z=3Eyol_S@Y0Z8$(m=PZdP+ysoe0tJFlTHv;{)42Yafxnoqr+(XK$3%lLsBEO=l+~v zC>$5b|9)PZgb3Ab3bW!dZjeIneH5W>+*uY382&O0-IoGL_ry@Pwx}fZ-P@$zR`@NA zJqxJaM{+s$79d5(j+YIiW!0>pN49~bnyO$1rP6%*GF_|m1R8=7`vONsS{mMqeNY<& zJt0J|?3*hKM4*s7vH;vmi#rgJaqV)GkQC&1$d|Xaw&lFf+m}_Ak{>l*@_lnj9i0@T z**81>IUd*!dG+~p<7tE%RO-HF{$?+mj6MN5geKiPmOvhF8F*xtbOQVxDYo`1?NT|z zEB4XaOO>pDO7Xl-yF$$oIFWSNm5QpQE;YREA_!TJ?9kss)kr`#{Srp!rAUJK%koX? 
zx{cyRJQEVqSnvQ`Px_Dn*&W0nxDOn1lZV@`){NnTy)DV-`q9D{2Z$OhFK&owXsJdQ z9Ksf-(OPnG8#|;;Zgs%NSdFw#qd5hxTRLQ;gFDYKMRXkG6@&e*H&9zrk8=Va(56o-~bf05Wz7KxNKCV zfBy^V@{5z4XD7L(b1hI>F8I|qM`m;U_`xSMDYEb|UgU8G{9rZoWGFs9zneqXg$GNS z;SQ;@_$rwq4N{2vw~m{+lz@?YQ??IF5R-92_`uSkF`WMM=}G#SY1KFJt#B=lS8Z*R z697S#3 zIlLvIOxrDq`L+5Yq*ATq?N9RS(ftiVD5P73Tp3S_Aoz4?haNuyJr9aM#2k1*djAIt z_=;+%$AMI}ay{2h+uM`u)+B9`DshlMU1;JK*>U^`Ur2Z1 zQEb1H*3*P#8H{cKIR0H5|kI@qpKfWr)egC_`0SA zxGu^`b;3l2bl=Oi&7mxlu&d#{eI1gDXNk=iMH+1JqLW23gh!F)IcOv(zo)wxTY~IK z$@Z$INa@~7EhqDmjdMpnOa*sDb-0oul*pbraxEwQ(s2;uYnke0ckO9(a+;qidsB#@6op8SIeV@SNt= z%pI5OPEN$d?jyy`-}KSJ;K2$V$G>S(WN0*~&@jBf^3wR}3C1|l5W+CJ(JpG(l3CclgQ`1CkXPBjgZ8G#T3r~Q#=eMwyIZLfU;M}2pdOW2&(Z6Ec(@!!cbCVY zMd~09#Q%B-;Xgzu!WK5|=3_wdl|R*=BUV1M!`4A-bKrYblqf7Fm}W$RVp5lUMS6HZ7H4G!hY@D&85(@NrA64uk;gd}Jh1`y z>Ds$WHHxEo)7`GeFR05tvcGWfE}GI9`oPt#iq#kW^>ut`I){R3k;UTY`q#lrYFOdy z=mnFE-BaN95a0p3>@N>+cCGEa*rBmyMxW^jn5@L4Tu4ruoozyM?D-YW*6a5z^$HZ- z;$@(fyRl85zK? znN#^Jv;MS4lVh#Mk z?3#FJC58giw=N+1JfmN!kL_i_BRFIZtdBrw=b})*M&Ql^UbiGOQf%gg%cu4Dy5za~ zP3(qRbKdM??`NB?$9l?h^796(isqKbyQH%yoP(WQgG@#7b?>;i9(`@0jYe{}H|A@KIZ4%c#+gq_a)k zf^PWaz8?t_S1m0J9+7UjlSL-}8e4E6YAZ{- z8k?UDF2mpz?pA%+Rk9^NPa^g}Kvr(GcT2wC0zV>~3};)=i>aqirK`rqFmGQ9opu=N zf;{m!~3LN9W0$Cu(qld=?Ys-A*za%0VvM(Yc3X3Gp(TQoOX?;(&&* zuF-3n0*Vu;ut6OFssZ~05wgDnWQR+w^sW2&PjXU*+b`GS+PR#m%6t~zu>}V&xeyxs z8GjQ-Pw{uP;E9kQ>qd`SkMb+%D#J=Dk?KoD;KU#T)5lnSK){wbAgfHXw&2by{ztxd zJ=zN2WM58@T4L~*Aa(`UGEcxDhgqvXa|17Oozc^tDJ$h*dYD9gJ05;77lT3j2QJGr zU%n^URV*h^NpuWV0CuBg^Q1syhO6HG_g1slanWQ`eoo_^PEuOXjb7D210tkXpYz#v zMia&nSELR7i)JCgMs4B&vQVLxaIUZ1Gl9j{F<-^{08ghETmts2Wr-nCc6ehipdsNZ zv7DvR+|tq^i@%N$5cpBKi0J%v-8rBl{?q)~@jB@P+jW+xCqaQ~W);Fl>aiS%c6j59 zC5q>Hn8r=VYlZn`2ZtlqV_|P^57@4cW{*(LMz#>>n3eh8=vYCrcmk+>aW+f2zzz(o zWaj5Jz+F!v;x;s>^C;EG|JKz%e3=+A2(9w#7=gvZKj0=9PzWX#md4AM4*;DFDjD0Q zR>OI14UNFxzs-OD{vF+L;88zr#6t>v0{?mS+}wE$sP=$wL0(xI3i(`6FlWcc&Q5XO z3c4ahpTdhP5ZxX8|LV;pIIGUS-L<;Id7=(1b$v^;7Y_{Gt#iTSEH>Ns3c34tGJ@_9 z+NdJLMgLo)uDRw_}qqqvJOX 
zow?!-Z#h840!(6%0N?|ld-`-FpXCT8R0Jy;5L?S&2taChDep50Q3vLc;0_wjaJ-UW zN5k8;(lP6925N-8RkFVWA)Vo7#4vYwfe?M``Pf~#)scjD1H z{aPk5wQp9W&mOvz87RGYLHK=euxARiY=A>1>AsN1!Nva`m=ft27<%S%J(G&t>R6K$ zGn7<0Q`Xi~jL*EH7nlFIPEAbA*pc5g@yyi!y*Fuwu?9w;hj0C$t}h4k2HfM*P449v zjfn24pfb$Ci3hzvlmkYxX3FHLhAR;q-7>?xF^POlp^`#Ec7xmdkD%GtZmw1`Uo#i5 zYQ@^6$1rJXYNB!674 z!DR{Pkyd6T1e825sesB;9he{ybv<#e*N5V^H6TFN`&{w>8WKXYl`Y!0TKM2R10#(I z&%eZx1bx1R=wH2AG;C~x=_{U~8Tq}`jQl@q4H!(xQf(>l3uxeFWvaXKc*@}8*53-z z6GUD-1O5asarKTXK~G_)%sORr*1VrUXA5ZO?TC8X8Ug(0qUZC@iH)0?Z1>a#k9jb< zlc+xl_%F9^4A)rml1T#_jm0sGB!)I&?KiNR6GYT$3u1#MtUKj{0QUt_g=i#n&Dd)4K`(enbq$S`3X-&% ztIqG{Jh@d>1Q1}0ApNjsSmrYc91EDCFdsX>gQzp_7I6iK7Eg(Zz5=Xofdca=FcAYM z4^qXAstqrh)7;_G(b%21vb4O)muW!1fWD$Kz0}N1j=T4diHYY*9R+7wJ1m>0O2@Yk zy9w2%TXa~HRws0fPvxe&AC>_D=G5!guU`_m>Jr5hrV~Xk6Ytz9Do#7AYP}hNI&86u z4W;cYJ+Xm6(D@MG3y=i7b|D_BV2k3z(2v;vb@KmDUHKSkh05r(<~%%PG1HB%#S6=$ z*>@)e&(pHP8-Z<59?!%CE4$q7% zS@Q}H4J2n}#U`=o2=9ty$yAppU0o6IeM$^a*VFss!hzXKLSp6@!OVs5B@uHPw8#0K z&<&*vI1Jr}moEYHn$y+EYYJf(bVxU8c%%a`VuNEtDNIrEfr_Ve1 zk@u#u>IooAb)vaJzvb4cSJORQJL7M>DoudJUhJGho3Ft&=c}sj+;bJ+;+ZTrj=Q?@ zQl#-SHKYP((*!J+=`o3eTHz7<#~R@I9D(j21CqI1a1&^){lmjyiuo{E+1{@{JLcB;pbs{t-Q=8;NgkpMlgT@>!@eJ)HlFk zFu$+hH2y_pbwnG8L*Kq>8SHU^j&9IBE7?%{P5|_Of2yt~R1?4D1y17~O@TQH!{4#9 z5)n(>y}(+C0{TWqxwguHk|$L>foRMIb{Np|JpJu8p`M-|qK6C!zToMic35=50m;kH z2gc$DfWriqOF0dVgnGkLZG2|E`OPGb1b?N)#=?SeeSRS4 z>njGbDWdTxD~pqg2`ns#GKh@;r0s531`z#?e2#kX?}N!>WHSVvklA#~y2BolcbkDs z7&$>PX<>yFeJxvQH^*PF4u)WgOg7R`dd<(zZ`u*`z}nhc!qoH$FaeB>js}2;TDc|# z#LiUZu;+GwK7AMEe0eQPj2@<~Kd}C`;AJfx#xI%uEsPLoG(a{0eSpoZ;vv%x4Eaq{ zn4z7yLo;jlhx|c^8}&uiys^A^J~<_osNv;x_4OWnDvo%vE2)Y5l=)M3_HNK}$mVCa z)Jh2=27tRn*%ZCb`@pFr34}c0F$u6^nFb$HS(2M8X**qqW6s9L7OYnq2D+3DIQxL? ze(RTr1?`#>1sYXCLc+?Zh|RZ`s4X=`%hID@x|u=G12`Or8}BT&^vuv>4^Lw104Kw? 
z%c^kzohuvF74fRBO{_lQ@58KTlb`vUTfLhBw4nGix%neK9Hh(@;O+$5Ywfu^@E-zU z5;$y74eA|{aK&%9ghfPp$_fCAL7dH{HunP~uwg*3SQ*Wg!r5a5^N=WOYWhj))Dm0^ z?wWBo6tFwy&)zhH#$XrY!d-1U<1cTJqet-GU>Ipf>$A3w4# zI%fNxy(;j3+4M8!=anC@%DQ*e8AUNQnd^;Ms8FnlxdW@=*}>A&=T3-r1j}ktz*vLz z1bS*!Qvz77$`?Vd4F*Yy3~szMiMTIK5&{x|O!(~^FCxMTXAb9*J6Ks2EbIV8(&S2I zwGbSx!}4ol0&IH_lV9HKr~$OfnG$83^3=HZsoxP+N)}m3%qQRnQ5`vm$x@mG&B4z4 z#Wx6yQ^dL>zX{qsL89X(1zXt!gaJ1NRsjK>jO<8d)e|V}W_X5B7JQcn*sdpDErzb@ z!1DvpVh%!{9?aD)ru^bVav&tQiL(y6+dwmj=+E(Sr^{mt#Eu4_XZ3{w=w|`WU@%3| zz>}FF?oS1RgS@J$TVa_oYtj>t^1$PR46$0Nnjwe;6x<^>5K?e5RX2Y&5_t#&jlasc*zAUA5hCX1WG0~$Ahvaeqy?Cdx`Yq9`;V9%m* z8KPGQmMa1Ls)#SyP-h1AAsd~0vBiW7v%NfZ=I#(%UtyA+`G9qn&kYSToEU;^iSqOa zllSx;lJ2Zg*xB8aL|m0kRm6AX{6hWz`>LG(ukY7ETy!rULht=@NgfBZ>Ovq2GOE&L IFnG}a0B-H1S^xk5 literal 0 HcmV?d00001 diff --git a/doc/source/_static/query-perf-small.png b/doc/source/_static/query-perf-small.png new file mode 100644 index 0000000000000000000000000000000000000000..56fcc787a66af981c839d2ae176f7a0c427f154a GIT binary patch literal 25662 zcmb@uWmuM7*EM?4-67pd3DVsnAWBMicXxL;3W5^SA>G{q(%lWxE!}%@-_Ns;cfUXO z_vhmfJ-9Be^IU74bB#IXm}40(FDr?LOoR-9K+xVxi7P@NP;ub z018hb<|keUj3#}SHJp`TL=EHR=+VrPP4Hn&Q4^l^(xHv!{+RZ|VeQn<1s4pv1&gAr z+NFpV@otq0=-LYi>>KP-*w1#pTUWiIUQ)k1@TA(^g~$ZRyi8s>(&#S_Hop<0&(&ig zK!wjvu^Uc@1a|1Lz`y$F_s=VmEKr@`y(qE{Sj6WyJYaDsP~ihRcCh}h{=yEMySw`; z&+wj{n;Xyg_;|*YMVg>Y(;3HudX-6ZRet|Y(B!1jspGgAhddoF9X-AM-(muZ#WX>v zp7yuQNz|;Y3SP|2%n~{}WG+XG;c5J?>x19eli7@=V{ez6T;F_stsWrgdAVvTeE03` z_FO#wadVaG1N-gQ=4OF|`5F|WXm|)FCZ=DFU)(-A68&Z#V$ z)AjC_M;}%a7_iErl`e-GkoHpTrosNm$jJE5p94-#?CeMog2nt_e)yWaWJ>$;rGDS% zmk_^%1YA3k(E0g!n~O1RMMcF7c{&J$QKcBtYj5Isyq8>9SZE*&{)!ve1b@^jjlv6F zNXy9J-F;&>jTv&cU2Z01Wo2D=GdDNa*2%VSb!gB^wFGMkE`^DOMOphnS{k}n{qe?< zg9zNGy1JTdXkhxT!Trnx{1_{A``e<8q<3v2Rv3SP%WBpS3^R(5y(i1t!h+%N-@nPs zzwk0pC0_74ZlS&m8U~{T6B1LW)LP38JTRpY??GR71ithk`~t>Og`&*j;d2cN{%;g zMvgRwQ+eOQKtunUd81zWe0Oj|up%5(RAN>s$;oDygoI%EuvVVZ2f@S(!O$p!@^W(2 zv>kgJU}0g^20L)Y5SN6CqZz)u4n$3nYjPdC#dl> 
z_c>4DXtCe8*>PN?BWly9{Cd)0cgaXgSE`TfkSAlOt=V;UcG`hK;l6lbIGD^T_2EO8 z_tjG9T$O1`4ZDWL)a0boys6yns2PXJ43%GoCS{P`&S<8|Ccn!;`=7oqo`PF>GAV=U zIABL;X+?p6kNgWZl#;4yr1Ux(F&70rJ<_LqSv;m#2n7cRE)o(FSOPi=OUt2zeu;?4 zNPBSWoSdB1%uq7yfrp2O3dQ_klh;4#m8 zs1~dFgE<|{R=mZf@Vq(-)?((y;p#b-6A|$n;-$w$X4mhrLkBoWBJ1qTiAb3Rt&}}R z0U&_f=Q^`MJ|;X|^y9~mNqCttJ6&YcT#v0}6Sy0B-x3mx>@`@sr(Z2An7Pt$a8&DV z8(Wh-K0X4tLcTm&LV;vaH5;}mD=R}N7&c1uahYpnjiMg?kuXEvj)7@;UcT<^>guix z3W6P@VdvyjzPCNw8OvTb87Blc!VkhMN|D!%nZw#JA;iSgvp1f`iz0SWP>7$JQB_sd z)LKx_ZftCvIjg+zf!>-?`{Db@3sFnnLnmYtPmL_`iOR=#C6J23zG2LRo9+F$-W>^5 zjRF=A*J`D8>ELI8%h*g8mbX*VBN;md1q6)x_waD7#+#&pp+P%-1Ke537?O&Dfq{X2 zhcLno<>c(GDXfhv0}$u44UV)11_rM`I!AYoF5@XXJJ$?N5M)2;K?kj|j89CQ-tLzH zD35%kklUx>>GSo&pTkh{D6fgh&^DqndZJ7EDt$4GIY{*qbbTzJIj_6PeGoek9j(RL+=1 zHa9mj?M@<~G5x}5pZdFK}ebn4ak@(>cX545#@v70F8<`<#U^kW5QQSA|;J`|{ za23Vi^#?h*V+Hc13z%IqjrlLE(I(A#CJmSUs%1LXCVN)D%r1$^$ZRf`WmC9htQ`OR z#ws#t>A4zKt8+UwAi~5H!9#{i9N085X$Db+w~&{Y_q{|0gyFTGld(=(%MXbtlTp&4 zWSRJZO%9M2Ovr;UZ^7y;kcR#TTY93uZSX{r2aAbLjE@)FAQDA~fQv-PT2JK3;8@>( z`t*q&AV2X}`wbO3T*19;erTGPFS~yVi@Wx62)SytmHY*94d6Ez!aDu!)}~m!s;79F zj*ib}IRUwq@csMu08r(@)stSUi-FYhPEz6%NUQY7@WEn~F@`dJV3-N>zQWN24+4C* zJx&L`-?Lxl9zsvfd-m?=BIcG7J4xj#u1ywxl4nSiFPf72_%TSFYS(^2RmUC9_qvPR zLjXRIRX!YS5Nm5DX6?p52Zegu!)ed>v)tl2!IFrv2T$mzOoCX&?%y8oqFYV#G=t~5fL+P@Ga3+x~$P~-RKi7YQK zmzASWKwiGh^Zg;gccy51PR?8;eq12(MT&-wfgwPnE53K zctiRA&!4f2by^xHE>pOyvGFb>qM%5v)3gX4B0r#De*0*m^6E>U6>?l@t@E#af+&A@ z%3qYcDjsj&zeg(7Y6zH|oMa9*l#`NbkHBLrx8IO}1SHP(L=#s4G;2*~$d0Tprq0wU ztE2lOovSj?G%<};X2=3hR2tV&M(gNycUafG0mq9ZeuwL#X?-ZwBh29WixLX z5-)$2Q0w$Q6Sf!l6nU8}Q9Dk2bcy{sHP0gf+-$8=RV_0!(?s>XM+=##sOb8Z&7>~) z1=)p5C)7_44i0wr*Y;Ixg-Q{|17GK&iTc5%0`ZbAVd&}U!LLIlNWYATh=@x_ps_j9 zthEvq7l-})PpxmM{z5s1{Fb#%@n-r~Q(!^(ZB*Bx(}lb~V5X2b$d{n76n zCMZtSTIV;5X*r4}d>3LWHDJ;RIfFve{VhCq{FN?SB^rn&gPwi1Dk=NVLV`# zOq#Xl1tleXEjqE!o8vt`Nj@t%Diw(0ap4`aD$iIA|tGYBgl=RI677Z_Ex`u8$2-GDHk0BG9yW|}T)V)ANhI4v?5amc0%gooqMLSqI=f={>{FGoA?PwNBZqGM!y zYhm$ocI5VAzrtjQ=wrG7A;0UKsyDE@&Y 
zTUp5#Ma0>kwIU~U^Xp)ajs&C>u$D3TKYWNRXG-B9gDKn?4h1Sv5Dcj&dB?@mxAO25klTMrg8y6c3lCVFM=?TnXk7)l`h=ulr$ zP_0}K3Z0ZM2+$uW%!I8flc7DQGcIHj`^Lvdh+ktP zUn;-r@Lqka;3dIynQm-A0JH*yGeGqW0~Wr|7oR+5DP`U=KDt$-5^qKIuoQ;km~MWr zFNVZR=i(DL02t%poV8^9(w8x@dvI9=(9X!zl$wPF6QBuX^e}Y?{jDa9+0RsV7Yrms z#Bw?mBqULFb;7UA+7W|;A93;V4JHc|o&yEY-or~t#16n_tf!L_6V*v)=!`gsqL&(2 z2`&ppxi{R@)7-yh!ubk#;1Pco!(82{)Xe*2H|I(%AV4xVH)moDP;zeztd|nQVfBw+ zzr-N{HDAT4D)=O1Wzl2Eg>6*u2?)YA2a>u*MldcF2OPCtD9b$lEv?ySLMQ3fn4B9Q zR~-D#iNbC=90UTRva%8r8yi}r-ewQ2#|Q9_L0LA95`2-SQM)|)@*;^hm!Q(=bLT~t zEItk-FY~j#`{cvIfSF%rVJ|b`N9-))cC%0Hwvicra^W&uyBo1}$jm4r_V(>iY2LYq zU_d~?uc4)aE0818YHL)le&NWimeef=h*OEWxNw0YxOkXWNQmt8^i*XZaOSbaMJq=p z?FU8vJ&i))zTdyw$8$e)Zv$A$tgXfW@=698@>;DN5{f}CAti-~|5|Ny|1|2s#K>qh z_=OJUKs0)|f`UQ|S$5rt!13p?Bs}Ec55ofr=PuqCLjvfyUtu7NyaDg(ZVW{0Q7whu zyvxmmd#4OvyqFX#Jc8~YEBUlQ{vW7*5GobaQt^Yd|5PrDOH6#ZN2^k*ML&JG(ngMj zg(W2`dsJQ0()cbCpHf(u+@gjEyqGNH?eWB9hbgb77A8sVjr)uQ=H~Rz8)s+7>@=rB zuLBz7m8|OO$-N6@5E}qKLf(IQMkc>g|sfc>x(E9r((G zb3u}J(WU3~C(?lFTt)z*H%%aX+PJqFgjp=8h;03qky(jrwWh3W4dV*FYhuVaTI~)k z37-PM7$O=P9OPF|&-bYkjhdeW0OJVZFtbeo*N4&dec%LDEJQ@!#&)q@p_rSY5{VLj zM)es5Kn%t7l6j*3@VTXbC7Wt?mTaT(*opE+?|f%$w$_@Mjh&s<%(`~oFBA0>4L~_jFLoGeo^Zhw{ID8mzzvW)jQT*UWLnFmT8m~ z78LB9BU_ehlbY3Nh)YPUKU}TUx!0J|Ff*f%jg8eseaS2GSuW*!BRQ6G;q59Ds@m2H z4+%q0SWWze8Oim6VgIx@+t|zut9DR5`{87ve$8&5mU2BpRqX(^XXg`05b`Q2Ci*S| zdiB_Oc};yACHt>?(Jne7T@kvnW%Rs0X++kwF}CHFefS2^KPLoVZjU$HU{*VTNTG&Q z(G{ADv`U}6AYR)KPmcdu3N4Wpn4>1>2)j6sPWnT4di{qfqm4RcCoE#N+^9tj$n7vZ zFu9~UUPL6&JbbDkBF9I6{lp}>HZJWHvcC?n z7U1bz2{#&br(Qi3=LOUCmNXMYWUp^+S^kN+#RfcTj_MP){RF+sCjK^TA+pq={jv^Z zet(e0G{4Hdnaao=ay)4b^Ru~e9Slcof0I|sPuk`v5q&@vou-N^1d=)nK^=`uiGWZO zoz*o7)K>JNPf$p39pj1AB7wZ?_Qofiga(H=n;|9ZEu)j5D^R7i>pSl`F6?~B!}Y*Ep5Vo z2M{{9yW!F+p6{}OsL~VNUX2w%{{AgWtgx>;o~i{A$#@$hSoFY-`rP_F>tEl9J1;&f z?#0aN^&01&+fjojMDq$>8Uks2MfS$q9*^51g@h}F@gG^QK9|QWbIIQBk*A@{P~E-3 z(e3mozTP^F+wZbC1kYpP^e;RE^PyCMVxisMBt2cCbitGb&8t^pRPjzt)~3#IaBv2# z7X<}$tPXDg@z$E;s$GvXpu9=0;}6X_Y3*BG 
z>P7uIlF|Nj2>=ZVKzp!pa4L)=4}(5j&d(PO3=cn3s%M^*ni}POcX_e<9gxGb>-*2} zG9>{UdIt5vIq5duTwlwQh#yl~kh{F$&Q<~FUQfJG&9r1EB|`-%iXIe+o?ri8s;{yz zeA_oP#JT_J)fZk1q`>chBRykDC^~5;*k`e;t9pZJ{4auogAG!Ua9-)%T^^d04gCHs zVrN(WP8eX&TP-aTDr)NZ^mKHHTAPoc$zZZS_)tz>p2lE@)4}iGQq7n1mIm5@M+XE3 z+FzgOR}@!Q;{w+8%+3H=0IwtT6`TY6w$wAc!6+1hQy1KgCNe2RH^XCpEN?7pIGeV_%9`Qb-oUyU739%q$&IGJ% z5N7kuR?6>@k>0JLlnSR^MF7QAY;4$|n(YSq&dKB_xwVtEE|X3=FiarB5wM$}KC7&` zACe5flB{2Y#G=;dM6Xh&6Fsld!OjDC6hJDsgNoi?K8l|{*&ohxHb36(yKQ|ljr;a3 z9MIM8L&t!0K4Sx!+bUF&)VF~h!rlQ31??4RNIkn+$XaXq16!%$mVXkC*y=G;s!NnQ zT8aijE2jP20MCTLmIPpOQx2ZiXz+ncKF@-zj!w+tq84O$sIX;kf^oFkyW{RNgxuBY z8-Jdvx4g6hfINH_h5tdn4!;|tK^asQUSGR`DQhH<3V9KLG89MmYTmL;zas$7 zAjSP`OCMmnl$@NTzCI;@{4l_wC~0XC0Kt<~Qo=<>L9ze)=Y7@bvwY=wWpm0)Ljwz> zLO^Fky}ShIa-Vq(V7tGZ8lwCWa&Zx4vW}P!wlhMh9|TUf=dw?--m7%YINDHnZ?z9w z>{LRFGbD=TvtuA+%YO@T^NG=F05K(cJcus4R(z+(}xrsn&<18(~zb*~dH zczJnweD3Sj)%El!zP{02HfOet*`F7c`?M19(+86xrZc&8T1t*fO)Tu4FeN2rGKcwR z^>2kD40LoNp!xskr@4`T##hP0(fE+hEDqmGZ4+^%7 z3BatxAt--L4oj?Qz_zTvJ8JS2lnb%U80TD@R9r>)w^X8Gmv0G!3uC-J#h-?^H#UtSsLYfpAf4mD?QF74Q`1xcFQv|+7Fbjhqzbt zx{>T?$lvprT{ev4sj?TRvwv02>)pur>8!hNxQTIt8Md z{TPG9#1IVbZQF5I_&WpRMMNuJ9~fLQmA8cmp)VEQrU};Atc^nh=~KyRk6rL31oHx4 z+uX9Obb~$H(&;6F6DSg&Ydr9n44*3l0KeMWX>*n(7luWH)%V>mbbo;g(SEZZZ!nb? 
z2W;4`qopQR^KlvlUH$4t=?@=ZD6~v-fvC1y3AX3yPIj!p z&hBZbGu@+$5eEYS8yWe;9S>N;4K6hjy~QEltku!T2}IPP$7gaGK}j1@=M_*f-=`}%Y{)7AqRPTY?lvDMWa2g@zQ&*YTpCmS2vv)E#_ z+^p_31fW!2Ln8v{;$hy0;CdiB0RRThh8C9fiZm(eB|HA9#J^w)XgFxed^~YveRIN# z1?6U7(WJ9M4)eN*GV?&USm%?1!dM};Hw)t%1S(L9fTZE}^l)WOBzcaj`p~THtP_*9 zajW4TQC)0MuwWqH(m;WmeczJGDcE!{T-J^Nrwj=?2y6Sn3eC>9W1*!GJ}e8`d4}>1 zdm3qU0(t03jm0MrKgsNnb*PEr5L$Q%mAC(JkBncF_M`Q@O$>n3jMEh9qr5TVe66I; zC)R}h+y{&j?alq^aW7A41VgfU98TZGL>i}!W~q**d$UKxtlVJp&t>XpfLvVfa`?UL zf*f^*V&Kntynqg?UKAw&cf*fWL_YR-x!)ydM+W)+z2>{bOv2s|aG}b=Z@3o)dm3T_ zm%6*60a8kRR6=&2-~aB`{3i>k@E;-<7}52@V)W}@n5z|h$eSzwx}kq)Ou8Oxkv^zh zmmg~v`>HC9R$Hooc%dH{f=~9;^!i$%-m!hLqO?SyF7#Iyns2i!u0)-Bta6iHhVsAD zUK=oAa*_$ySREYmkw5?BH{RMgGJxjG)@QL}@ekR9hZD%f??3DC%VH|ZWZY78Z6QPx zqsxWi!+`0vU%m1FAvi=9qeYE!v`QkIXknLKiUeU%*bQU}uKQy8hbMszBWHyy2-gRyQRas6B4Jdjd+$XM%SB~Y^ zfE0F{&d74}0 z*4A!t=(gzB0s>{D8Ii_`bNDI`-NkAg<`y5XFg!m<1Rp0xiGZY_GH_VKN$@%_UVVSN z-5=)Qyh9IFM5w7i)KRm)BPR@U?N)zK%e4c_ehr314K3l_3lFK>ucBjCHHm#jaZ)te zd)uHl^*WI$khR^+V)L}`<=Sn3Gz6swO{=%VKWt#{e`cIw9o9KZ#zblULt0_swuzfR zfBq!rioyA{w~M^iZ6yKC+`xIKiLNd~krYZ-I3R2Gt&uB@a#ce^jvA~9dZHZ!KdDNt zMLj~F^>oR3&1LP1T|#edk~Pr6o3z1LmI6Ldl%r0WA@L*ckRA(3|Hb?( z1vk{hdg@Z9Nb+o7`(#KT;N>G5hA zbwrQFMpUS+Skw>%%AEdVoO=VWyF?vx3pNF)G4^XO6ahn0DOQ6@l%7h-Bt(@Uu3m=R zPSH|icZt6GXn*N+n}DCbZl}4(2)*iWA2?8EoH51-Wgz3049`C}JRAZvUXYmpL7LsQ z1fRajU}9Qb@q(*&BTK-v+f5Pa?B(~=@a_q(M5rn|W zv%&)+8UnlY8+7&UKwqf&Z$BeYR1>m8fIMwIitz}gC9rZS(($y-G>Shnkama&7$loQ zI&EwlXXjV>bOfebr2Fl=R6#rxl2;-w2ebaqfvQ!H33OJ6vj z4U5rFx?m#sJcChnIXSRE=%h)I&NXvSOh|wgVOd+Mu}pd%u8sT7Cb=m43jUAx<6@%+ zfsjp6j8+UH9E2>AwO@pM=`lCM+N@kkwiyY~s?7fBLg*fR2nD5fmkE5``q1K3@nPfz ztd&dyB^`P^WE!8~9uK2MitLo%nY@A2{YhGd8l^_?+j@0ExX@AJ{D1v!7(T>wCtsY-!Uo(rx4>2Kw*tr&v zLpN*lh1DbG;?}$5KICM!n$Lx8_??bf(?+XBj!aD1U(=3_wE-lk?e#8HVo2q4MwZUS z2GSqs@NvP#x74Oxd`U?`Nt31Bg54cTC5&qIdvrs08u46bSAQgqQA+KF$CC;Pgh5NgjkmA!3(J1XN9i%wm z^)Z;MG+q+Y20a!dBO^Vq@GcA1OO1HM+%~wnRmKDODXRMaQ4~Tt^h7}OWzvEZJPV+2 z)#FwJ%!(O;yt#iNyTuIe8hY`^1R`?(ha&O^LP0mF>yf>%!BzM7x+05gWn~hm 
zb|T*Fv53LkJ?Ix5^_fBh)K>wXknt4so#AFwM74&FNxgNi{z>`peY0 zC`edrvg`yoFPJgg{9VyZqru3_%UJ{-&L_fRVkCg=Ah^z55`gOSKE}nx)dLlo%L4w3 z7b$`tXs|)~^Ub}{HP^b=Ylrl%Z*UMiT%@Nt`b6pGi)l@l?QXqu+UiWQd(C~Q7CrwU z@IW!49J2%Za_dI#cbIjqm~=-9rV zWrX?+x>!v|jVM4W>C%0=RRvEWC^%RYv=?VYL8Ak-V1QlA9}o#3<^WNfPq2pI)ATbx zutLVPv{q)oQ({OQ@`>u~CI}D$f-z`5f7I==L!!%FeY)XZ^(d6bnYY<{4ahYbtEH;;O2D!0p8ZiC*>?X$f%(828#LFLj9}XrbTTEl z!b+1Hp%ESxR$;$LH=>}8!!SB=mropyzjZ4qIKvDV1-Ax12(TIlRZBkbV0)Zt+`ZqB z_+|_v=C|+)kuSQ?p5if3&MbM{k`#1fx+{$b%B|+ufMX^*BLnj8-8+MFaj?IEj#R8x z0j;mEuYEMb5!SDwwJMG6*)4HsnT zHbQPGq_)OhJ|z^BFUXN&L?s0}ewIUFYMZtZG{)V^!-N)JM90R#z;=8g)s8v96^I+k zFxJ)C+4z#?K!5R%%-H^gL3(^?HQ-sG_4H3W?ORv2IF&!>r-6c##bl76Y6dW<)$7v@ z6JvjWf3;RGUQj@?B(uYKy19vD+Li3XNN!kITHP}v)ZDQLA&9MBEhMet5iTA2XprCF z$-dC(oHHRDes9|qESNNgx=&mYixApJ{b1iq=lc-_shyx9wp)Un2Q}G8^Ug6k6G^zh ziAPr_^lX#wzToFu@04y_NVVdsgEI`XD);J_NFs<|TOI!^*c&P~sgU0yx=|PjqFctT zc*=X!i}Ma9EQloy)b~Shd^R+e8*OfeZVUbx$QrWkNA%AI{dV*KPy9nu z?7vuu$jH1+>SwsO|@3on5v}Kd4e-bat z53=tC-)-IlFuHCOkDRXEujE~OWSKPAcTdubp%!}Uk&Svj5i8R6puwTRLha?1B5Md= zU_eI%;edz;d%4vY;7v-y5dHwSC}!Xvexs)%L<48gl%e| z7&p1rm%PXm!0KQd74K;%8>DBRV{6f!%dpnJFJ>&rti+j7TA@8729;o*BA;h1=91Et zF-!ie$a_Y<)bJp@r}gw}{_-_3(-1E0Io0{_n4{7lI%86|=GHyB+ zazL-2W)}M|+9jz*qk8GVvaEVaT8L{$jd*yghye`!i^tfSHMu#w6tJ_KGrH zF5L`g-L4|6cYoq{_-c*0@FTK~P-XT;eB4UfACxU>bciu$rbCQ5trS%+{05~UgzH4C z4_c?N>*LTy8SSvY+ra>S``*79l@rD$GiqJcxn}%bPSI4>$2`6oy?9wpU;|6QJ&Euo z^aHDVJuViqZGFv+h18or*_UsueUu&@?B2A&QGqI{cE#LkO+jg?G^0&P8KW?iSf|;P z2HIHr1JTsfNSq=v?iNzO#$SxB#7E>fUKCN2C5-jMi7&b5gr#&MBtF5)ii!d2>jpq& z0~Ou;q$VimU9OK~YEUp@VCoE<^E)@dBdYaXAPYNw%0umXsDl$RcsqAM=*i8pE4#W_ z=T{4Vp6web@R-mwJxbiX$oo-VnniTZ_0@TTE!^-HazOUrqATiK7}29ALa9Y7OpLFa zYZ&DZOZF(eV7DLz_BHR}5)HvQBS< zd*)qRR6`xyY9`XU2?{8?%$H zPZTK5@B%kggHR&dBCdG>P6V(dh9YOkJ^*r zKW*O8c0n{sL)Y4;UBy3iH@YZYahkGUU^9Rwq@Rr8Lx^K`#VHYHK=3=Ox7+kl*w!C? 
zd|!2b67g|BuD$F7jQ6qaR;Qs8Y`tz7+*Lkzd-w7{kR*ovED{YrKXG4QA2395Fvfzs zFD3?ESygp9YZ%LG!2@sD{#H|?y{)BC@Jwq*Ew1;+-sH0Hj5(gqPu$|_5UIs@A;+BX z8|&>j&+UNpTV%Qh#KmUh+c`-J41w_p<9d?n4#@3=X~vIZ1PjisL__M1Ax-Fy$p@c( z@L0{KCQzi=072mdEWRN7BW##YTMC};!&*_F{`H=dM9)aE3}Qc zlN1_^;OZ*s_WN+3#Yc&_eBY^Q{JwBkwBA~3dp9Wfg$&IXF5# z8%~S0c+0=cv%sSd0cM~~6zBRjwu<`#bTwnI&Pzjy`Zn;~^qNT&<+c&AHO_I&wp>oX z34Dde7Fs^C{E0d=>vX`UF`t@EPC(mxP`58N_>VS4335(D$i2jll*H_W5b<{r-^#&Z+^n-M+~Y7(q`oV(@cxEKwBMF|?SfW03&J%zWdmW}!w!b!e_AS#c*D-+X=EcNi z<+Z};;#v*~q#LJ`;4hc9Z&c`;O-^0zrUJ9F$-@S3Cg6s4jJo)a-xoZFQu6z|G0CM1 z`5zsg2(GUvN{UKG;(c9hXkrQQbdj`#@CG7ZY{u*`N#J~R_%UA$mw6Uq+*-EoJ{@+x6;eW^HIQ_Jw9~%e%sM%7qE_Q)te5QU zBBR+%%7DucCfQpd#qoNZb!@jo(BiKnfbsYRqa>90y&J^n(hIZj4NpUsZU7P6y!{S0 zD;C*vsFp1D~OJ_3Yq$Op`*z60_mx9lKHekSb~#=7O_NI)7`jS zcnCgvy6N4~SK@PIzOfKmU;IOxfRBb@b!Nwte6qv{^L1Da(=_RigY^Z7>3S~yDajK&OC-j&} zB%Hzm>(~q5K2hQs5Og$RHFAvg!${RlUb^B?sew))z=-VnAquMnpG@1IE;LZ!gl?zz z37QV$am6;k(>}7R%WHJ&)oQ3?#tKLuclZSp8ny&NWQ(Z?{(TiNTVV z_=5;kuAR8Yh{&OAW9fhD&+xtR=GEi`on`zkgI6*LfgI^~;J`uc$Za9K zt~OS2NAnV#oa2g*lNZf~nt`h&}4oXCS}3Gzc5^h<-9+`z8Zr zP*NhD8z7y#SN#KYih)1*&-Ql8%M=411Zn!mhn5rFws_qhx_E)sCx@@0*Pd%P*YR#{ z2Gq-M3Vxl|-Y+s}vvX}Pwg>48UB8POFQw(l#l1&(!jzHiyAKt;w=$i%d(0zy5?HXI zIZ0^>F^pO~m3H*Rbak~8_JzL>*@lu9tzr|+U!F2IbGMY2!P~i72}mD;W5$J)Fnx^{ zXz>gG1eb~Bj1V@-`S%M;V@Mb5Z{UV|htL`GRZUPKDyAmPQER4JZ#Bn^V=ut+Rm8b>Xt7{ZJ}U$)Ba9Sqb$hGPYP6R-+2Qt z2>3jA=Q&t#!F6Wed9GFE(G*dA;mT@n;C&MPT6 zhPR-(%QTMVSwHd^e+-ACdR2b18wd-?(zlOW@-IPWn+Nc-p?)i4?&_I;8`Hqs%UIS- z#V}TaL`(g@dhh7K|BSjd*iJ@9CQo;Jc^j6GtJ%tcAB+H1?g6rZ_@$gwL}}@1E9Kib zOv0##V1{PTJqCcaqm$C7KB9P)k^6BTs)$=r5ZpF4;o?s&9GYWouY@Gwh(7to&PQ7Y^Z1qNb6sejVwh%b;o66CovE5yBd9D7{IZvRddG1U75IIKkaQyc>gjkD=H zcwC6c$k=dU3djHUFp6YfebFDwa)Hwa(tlQVc1gMrXvSlaliNC%*3{GhrTE2(^;b#B zd4Wet8~Ouo)7aSoGh2Jov3KnXLK0Dz!6CcaEq(AisV=nacW2TgYF` zB#9WcW^A`sp(ddtj@fqg_#=(ZyfrPY$1G3ObiC@p^llE z8A@p6!H^74>>8!7B?SD=ybe8LUhyn1F@=h?Lbbb5+-sSL%!ap1!~DuHN$qoQ-FvQH}f_vv|{bc<52B)cMu2xMVi4*WUMt`RZi60Xrf{I&If-UYSYyt~=hN 
z9=BP+2+yfVGP2r@!-H;195$mOvDZ>p7bm#)T#xXp_(h5D2V07&Bb<+RjQyak0@WkA zz=5c@sB!+DLkB*a9rUK3@T)^$&F@yQ?}a@*~Pe=($>Szz#HuU0jTv`4ePy((Rl$%lnWZguwZW6M1Y4SmmC0Ey{2PkU`*rhwUQ}tRmLLV~Hpt~N1 zld>wjENX+oS}e23MaqlSlg_0M?-xSp9Xo~|HT=LHkMXa8l@FK)X3RZbi8R8?(W^X_ z;U*OkKSTW$pL1c+*rLg$h0$(~KC^k&Ea}eno!?%qSx5XQzXEfJpU5{_l@XF~@+tV{ zFZuXY$k>n~lc;tnVSMGUtaiJJ5Y}5cr~9-qew#_IHHVt=F)!^bQYUDdCMlR29TvW1 zHmwM$TuVBAPvnw9=!bLaHUEuvrkmVgk!EfBj;%Y1jTFWj9`_pbDv3M&c`5wbK-ThI z<*fk)zg?1{?#H z+iyBQL_muXn9j`WqFGb#zyYF#g*6+bNz!w&%|;V(esks+v6~8J0{X%Kc;U4y= zE7AwkF|dW8?pK~*clP|o2UfYupz8h_V+LDxKSDt?l%a0hVuKO5x$QpqtSut363BMu zI#c}X`}sXI^iD

(oJjw)w)8&&MhTzDQ(7lO~u4*uP^p3>?*nDk@j+U`6Q9Ac9Li z-tY2y{DYR^k8aRJL^MAZ{M}#W&JFyk>)4ZUC?zWV`Ji3oCUs{_3*cFnRYEoW|&;E0>*Jz zl&j<1(EV#-+RiVphb%=uL>4t>f%9UpHS=tD-&$(5cflDcKTPOvgE#3N)3U_Q(IunYH5%rJw+HmaVvZ@~ z1NRjUbY6?1BdM7C??0xhd$MuJV^U>{1qbjkUAHwuJTl8ywyaS4&sJBsSr@eHamCl~ zk5L2Hp8L`&B^trN1SC1sjr#v3n0t!CK=Ua723y1^Z-`2RpyvUNlu#2oNESAuw&^Ux zS9MyFVxo@M&6w9LRntvTkpuj9c-5R+Au&a438sZQ8G{jThZ(hKe2j%S2dWZF~v07JpJn19yt6HY+jTY?D6yNtd2%#S>{|J{?`>AUAh^&s(@Sep4hk0gT&Wl@m#f%b}b@ z5#c2ivxUkfnRIlAyBwb%FoA0p^~Kwt{>#&x_|lYr;CNc+Di#hk0*?wf8dspi0PzE@ z`)A^kmG!!G>z@;F^$i=E|N14-MWpi0|17YfjbrGv+X=rB;AdAtU3(aRp=y7iewShr zELu;c(BI4=2}*!Ef6zWDcpZAaEf@HOPXa;T zGP+p@p)2$_^RbGnO54?y&ZnmG!I45!>d93LH<6PNAL(ayJHtJQi>fBd}ggMa88cT>PW(3 zZ;0F2P^&yCMyDHCGm^QvAcd~5P%I-4SacT)p)wJB3RPd8>JepB*XfJ3M}m7qYp5PT6^M`)&M4mA4wk8h6DK|DYdpc z?Q&b7SSMMEh7N~BGIr=pK>0RdRizd6yCW>nsQp4bL^RPJhg3xkj-lND^w|U}=MN3B z9E`v|tqBKziH?uY?htH*HqsiAX(o*7K*gazU-c46mVp`{^QimT3YTXZ_B}*JiSRqC z_`3cP3>6;iySVUn(7>6)rI2K{lrP|-_;?hJBuyUF*Rs}wrL>cB8k%ifuTFa|2oy6g zo8qnYONVWi#TlF~{(~0e;5R=7JzZO-QuY}wF~vkohM}TUp+O@6D%ib~r$N{M@kFpb zw+;TAhwAnj0}tBcgRe+g>zEE*yuUy_uF>?vpi)vtIrv{b9zFt-Bk7l8JXLOkuwPw+ zGuaQY*0f8lsta`A1`84w_2PSXo%UDuh*^*Red$bojqiqe877VC8RH2G{zt>34s zo`=xQV)y@4c9v05hTo!}VSoW721S}76%de;4rxI`LSks8K^l}$7)mK6loAk0LAtvY z5TrqJNa+}K=<+=L&t2=@5BI}4Yw_is8Q$4X?EUP$-@Sh`*o^PDbnU8!XlyIUnCy6m zL2fsh`0qMNq0TFZ6Yo9d78q5U=tea9X9I=xq7cF&(Ic^ao%FWcDdb%;&xmG0iL1P4 zp|1^iCcaCQ7{Q4U!{d4x8q;--|CCz}_@)sZogYBm5_IQ*HPDlJ;^lxBr2Ai9y2a!MYuw;nw-15t554(3 zg_3hRJILzZc|X)}OdK2ALfqeJNve_z*?@sQv^;~Y>p8tcPjw6*vl}VX5%hqb!+c(f zzalb0$tF)ls{c-=d7ABeWKb?espjI}e@wd<(qKN;8vXa^8;~kXd((9t^?s|cpt-fB zf`FJ7Z#TvJ^OqL93+&yt&o`br>`aorOoCmPAh_ow_PN+0P^S$M`+;*Ia9#oQZBOo| zwObms;j1z)EZSP2$6jbhQ|XI*?)paZ(81%O_Ct*Fckk{SJyLdKS^D~A8JwTqGLbV= zkmI_4$a9B1oQU44)(j{85S~{x@VT_4v`C%^8qrL%fB0iTxCS=AMsm}aQw}#BBHn?4 zSr4$q`bNn&HJOd5MYX$mUk%=P$Eed_LQHX@-LwX5TP#i08^s9PCBDnu#jYoyej(vC z^@pvG!(+Xjqi;LK3?2VWbg#`AHNX7q+M$<90>W@{`=4YkZ&R$!d8sg< 
zF5$B_-|^Ksr85zb;O&~ve7(a|{6!sVH>RZ-Ea9JPkdqT(ExOcp+MhsHOL~)SRSJmr z?FmB=7G`;7#3|RB5Qg}RJ91H7e_{I7u}$CkMu`K%@Z>|v%srLqbosmffsdS=&{1n) zZ`$J8dWNrD?{+sX4YYW&ia$v#xd!qs=y*i^q(KTRM1rT(QP0hH8L6Y4Xn3V_LAB@Q zzr)@8bca8pqNVV2RgQ|1ltPlzOkzq1g-WFad$H;{Rj6D#8U znLU)&YWgJ$PQL}IdZ0Pg>i9!9Y&shKr;X`5pR!rIyHSfzP(cVBf-!qkDf3UBh+Bq) zMVQxu;CcuXKa% zT14CodB~1<8@iB?5QH-2A0I}oFT$>eZ3^Dk62dy4aJ*OJxK11jC`XtEZ)5fPwXL8c zlNv-;8N}V@e78Vf;pV2n-42|S3#`!xS%A$G-pLK{U`+64Aj=4$E3>dAwZLe08isyg z*E^Dvl$2a}-ni%{zThRfc;H7io^JFUJi061F8J~(`1{kK(Tz}y%L9M^v#J4!N_d6% z@?!Ih;oUB?!h@4L2E=rKq=flg&Ub>lEwcr4%-5ARY3^tIs!xok+ihVB{gCM3&!VWP z*f1eI$WH&34H;v1fFK!9Po_v;^CcxCb2!`B-X7a=Z3yJH==bS8$1$4(-Hf7<%m1ph z6RBM1RE=iNXv|6AM0b*aYP;G(#n_ z_Ib2lSnu*skz`g^-yC(nWkuvP984nayiA-x@3`OTngR{BA&lk0LVIZ~e>Afdi9YbS zfR$BJx*g4m6?@*L*1N9y3MaTw9QWs~DM6VvcbPR^nU_S}t-kg-Jv0@W1*Pt;27{as zF1Zk2zNfxf8v`=rwHh~Lx`!tw;A-(hlau1bxPg}>ub$)vgbsP=-MTD#ax37tHL2{P zgt-1K|0AOwER%#CDp3AX{_ZU)22rn1dhI2F=*JUcT@zJb%}HF|U$d`cFfxSm znh^hf*i=SWG9HGRv#-tmzNfI1etC47fA0qEO`86{W ziosyS0helXYs(ue0oVpw8M4JQx7DtL1{6YwkL*k03``k`>+P<8gcdw^yCZMjz*9yh zy(hMoecvLU0%>JCtB?J?J~MPsyX0!ceFjOm)jQEo8}c`u-S{1ui(pLdBOe|8Go4q> zk4ZG>6gQWAbMBf!8vi1{iq1tu`o9Icc- zc{X*2#iEh7Z{$A2Z71aXK*JUkQ7Q)-;Pn3^m7%_Vn0}#wrnx(HiYOX_?p~H>L7=%J z!&R~;8()+f{F<4G0uX^Z%&ywjDo^FiPI&jSF~u!L^5vd5cHaz7bIw2hxn{dhYpc5(*o?O_-{s-Td>|*Q#8_)UV}Vjz z2R1T|xj%sVcmI=}j5KFhvW;59k-IFpC_Rs)yzb`OedyT+MbOQ{w&G%f+FyyE_RIr> z#oxiCaCatg^CO$Mzy>D6K5-v}LUblX7cU0$;@WP@cOhTp@fF5E5i|e$ubj3k{;0`g z_Qs7F>TGzb8_OoA_Hv-1xg{YNGdN56K)=q^PRr6}p@6Mo8#T#Tr9nsN+eU2V0<#~j zA-9D%t2N%}T$dE{xY5_W;=$qv%f;h*CfqNU_?o^(;8IjtS_vF~t;d}G643oO zB->)N{1kc9`K%rD=g+>AyGUK*Xf3C3*T`AD07+3RnQ6%z7%}T_Li&!6(5O<2lq< z=RA`!mrMoNmrT3`tyb!_(Gh&hP#KX}NSjILa$mL`uU5~iG1-9V%F`#jhtoW&GR=X~ zK1EbI>|Usd$v_N0x<|$8 zR4#4`R)BhMe&<3)`F_MFcesHq1-_G0(0A!Qb0Hau>t}qrnEA1Zf4`d-ke57eVti>+ zqLd17kK9)b6>|fN z=*+C%O4}GcHor?pfxJeIK3gJpllu4iYD`@^kQCPKZvGYiCNKHaRtO||bK>X7?ct<3 zpVehs?u+@yBz0cM?P(E^WX-7^pN%IMqVwuBVm2M!)PK2?ntvQ}KX1m3nDv&tqLHwY 
z*aN}g?>ZQDQOP;j6T?zGET#ulaW5Se+lZnMtiv+VsYXL5M(fAwy{D!w>d2m(A<0X8 zJsZ}_?f*PZl_z3V&V~r-krZ)t4Q#=}U5I#a)N>3_rLCZaE~V3i=*A|G+&5;G$XPpC z_~9v?Ti`xtzT5dFl~&q+G){F)#;zT-Blri0lUXr-G)#mvwnH8!W!HbLgY++Z8g};hT zJZ~3&OuUA*c4bHiGAV9biME+dvOba%3W4*Ex1Bbm$cQNr3HFS=UuR-sa%;r6i8H!M zrtkSRzusKs(W4>&p2pRyjTn0u7Zrd1Mb2Ja!!fxo|3AofhrNMLiSwoab>A>dzMl_W zCwYPWnu;cS?#5@L{bzExh}cVkD1|3)Cy116gEQCn(}R2T%F@WeO;)O}Qi9BWwkTfe z&T^~np^*_hmh=%(WXzg$S5AN@`L@_vTYx)B!r3Q6_1rf7+3>*|Z^~jnsjfjT6hk)= z1~#|uAdm2{-_YZ=!G#SDj%x};td(tJ&~HzN`6FD)js6nGntI!yUi0?(9$D4SToq6{ z```Nrz?Q~}Mr#3N&vyey9tD-6Vj@drQ#^^HUi~95Wj>GD=WDrB4aY+-ht1wt4MjY? zp2yVD`_roAIzH>#p@KJfTHSt8WYKCqgCxQ)1C7KEY2L=08UK*lUFZpR%aHr({O`#j z%qeBjJtpkSHt4%YaMsY|bl|N8vsjbe=Z?gaM?{u5l^cN`ugK{~0<=ncdN)K}=hAR5 zwbCWSM!!7Ts!@v%b)KoQF&wDVOllq~xbV+B5H)SQgzEmqjxgNe31XlNqmjVmn>Wg? zm9H7=>)$je;=(SdIH&#VZGL6@l4Y_A`pveIm6mj~=GOIyv`+Oahe2Q?fGv*bYhmH= zwXf*MBolXdG*bubXFWAxV_hEF zyF1xw>w1>e#@>p0ky)Sf@>Cl43USCRE-m916&H}9;0ItRo}Qj_GhuI+b}^*8N3sb4 zw#$j$rwYcB=?EbT_vmuGLG8if#lJWnrwuM9-`}TOE?S_K2ZI|?ct;BvgENf#_UiiFMIi}`T&FBwu@&iKLq>?f_9hE%8sv;O(PL_6lP zyk3?JY|Wd%B0`R)2D6uz$@S)L0S(T5I=v>h`aaTsH+w;z=wv7p0=CvS1PMzhLV@&u zcp-T3e!YU%%FPXt#Ra_c&JC|svs<;=n5T!Mb93)P;1r2XnxP2YyBZkS#`-#XX6JS1 zA{s;l6I&vWYqe2aRKfGuZU8p*u;u3nMZ~8%z`{R%QT&dR&Kl-g`g8nYQ>%vgO=nPI zVj`WazXB0=U>-ROb(C^#|I_l{WGwZYy%o~A7YOjiCkL3SEwy|C3z0-E&DT^XqdW5f zM697ygH=!^1U?9MFrvyjEJO;!-h9;8d_M8}-3OzLw3}`$dnU>d-2U0cge`7pWQ^Lx z5D}VnWHlB{kar<6K=f`|TVG#f;){;%Sl3DEmCv^oeKN34`2RS4(_t3rGYDn(k%q9| zNN>R0NRxeKT&sz|mp)zyF>3ZZ_%I_jpUjtF%dO3@*6QIp50FLx+%N)`FFb7yX`*t z5;lV7+{MJyzAJn!(4+nXhYwWM?w34AhtSo=x#Su+UXDMjLE%lg)8{i9$e1tdK)x|_ zCSg{=sendTUxoC<7j2LK9u+(p>t{8s1}qrsstOcLOgJl*9GE>?-4jrlBP%h%DL7vY zGS(%JGJog4UCErCG1}dM{Tx@(n-5b4d#1^rDC+ zpOAULiH&KK4`pe~=^dBmgJ1E9l<@`4%}g1W`(M0gY|o@n6<8kj*WaEBt{$CDwY5QT zXBQeeAL?aun>rQ+z{OAOQDydn7$H<*b%nwXmoloL8Sp!v0jx!X=&Qa2Ezz+M$}HK0Df|?^TB<7XhO8?%gY$5eFae^Y@>kY1s=adNp|X zqT=zqAqfj%|966K;Yw`n;JxYz7!39dU~H$iPWElmqqDep8ObZbG&n#x=-+M*yb;VJ 
zNY?$3@X*478Aul`*kz5f1;UK>_O1wTZ)^-V%L2TR&YIPss1_+pa&u%=+6=WGP1>lio!Fa_$`6Z#(p4pl`TocB*&PWw4$aV^cvUy4GO zH`4l#>0T?aVg$$L-6g6ilQCH9g4WVJbB*C5l2{V}i z1wWZ}5d${V9ki>%xL&rWyE^0B{*yxWVtEJy`y6XKw3a))d`+~>1a)L6DLI_Qj!v0{l#=s zl7tCJOOvMdC+b`DtD+(+bivn14?3N#`65BS189v6j_yumuwBM0YCxZpv$Nx5B!^#- z)vSe3qRx-|!J2>g&zk={B(S~>tTkZaI<>=+FZewQ#2LxO)3tUUVDeUjek8?JP}$_N zsOb*uFm@;O+ljjqRe)GuQ&STJRW_q4gfuT;nqSCEiJMT^gQ-~y81OGozVXTWDk>@@pHkC^RW@L(dns^ueAIhTv%^z3XHgCT=ZDfNJ8 zPGOa?+x@?DUa}b!#l^)99$k}fB+-x!$vv%K1f{?(Vu=H9bJkn8p{KD;`0DuPp_D zr3lK)yM4vzk&%(1Wn@&6Mm_**FO7Wk>jcRE#L^NXCn;=hVv@R>#YhrX^$9wN11@Jn zTP@(v05&&7bxqjS$!RokG=qzlG^@0ff}Ncm2!A(8-6>VF2@U}w4pqIL@UVHt>$MF^ zqX0njst*y;_2yVfXY7Eg)^)z*!o>!qVrZC@s66ag{S>&u1os~a1rBpa3KG0q1|A1% zBwLd?HhLQeTD@rL={rENB5-YfX`QCRux5zj8MXh%(A~?87b4_BN zKB4#@9J@_8El3;4G6_mbMi!gYF2y}SLLrQjUiKrNw{bTP_y{oW@=Hq}03!jDX5Nd#<9b{lerzgHiHE^i)PABrf55Gy^z3GI0TBCfcQk|I+ z;XO8y(%!S^M%85h)thryT~jLUWa8grFa?a#KH6tffSF%pz|&-2`pc7RbrO{LE%+iA z-DulK({P2d0LuDuL*1fda*|dd6M^>9_Px@lBW4K?d>{%KI@mV@9}QVo z!tjU$Z^d+nbiScrrzH(uu1lji)kvY)-H81D6y-@)!Q1Il>qIjP9Lm-6Lcb+~dc?s+ zgp$IGcEcH=gjYBHc}3i){I^$EZ2^zQUgq8Dsb7vRH$EN|q_{z@+YIR`0&n&!0tWc_ zrRCZCBy8aY7*pL!+=q*aZ_2LUcAVRiXM z0Ww4|u^Q~NWkB-+accPYr{eBQ_}kALfAKhgEbQQ#A}@6a#-$qoy4-*!kByIOIuuzT z@ar%x;7(kTSvyIO%C zWirW}CSdP9@6Nwpl^(PLMr3_`eJH_S8tKz)?i}4*P(XY)i3i)HwCWxyad_jAbWoDH zA2BenjNR=UUXHanctb8?yE{AI0ZW&a2+#Cx(sQxV$5k~o3h^Ar7XfL~zSrY9K!#jj z%uajz{qg(pI8Z;{N`sUr-MO#%z5D(4 z{`P+NfBQM$@R?!ezOPuFP zgO%~oVm^JL_aiAYGd4)JFl1UVAgb3X*75Be&n{cYNLXE55w$2*KU@--E94XE+>6s~ z>wIH_vEpf0rnNXuSY9FD&nbkktOi?BDwhgIA{yGcRU8UQs%EreI^5Y@mz{X;E`fT~ zCbUz_GcJru7Y^HCOs>L|LZf5kei00lL32e%Mg#ArY5xEAg6!kFcNo!f^ie_Y-@mU8 zy!R!=LNO(&U3I&-t9>*5#FQiji&jwZ)s@7vXV0W_HNO-U4Gb32i$~nFKp3lsw%kf) z?acMdX7lp$5Rj3PSy))QNKE}h-b@!Yzx|7diet9*J;nQUOz?P!A+f1Gt9?1(a#>bH z;o}|ohB_o56re0T)ORefWmWM(D7Gzwx}09 zA0Gi2MbfYpXHFQ8z(;z7-PSBd?sDT+Vjmx$=(sq-x%&;0PAL-;>J{Jp3cANo>7_6e zbG*idjsQ^sr={qkq9+cE4US5KdCz&n4CUpK$FqdPmX?fK9Y@l*Xig?bTxl?18v+wz 
zr&tiP)s6sx{dxlq-EzHOA$!b4Q{l>x7*M*v!o0CSiz>i+DTcIY`$IMJNN}Gqf>wr|ho$>5+ z9?M9Vj`RI_+#B+Wt(1{18GSZZ)@qA>3`S;V%-h$mUrRH7d#qZWnL$cc3J0HS)voO5 z_$=ym5U#}7W9Zzzb<{+kjON^V-%#xYpPsU^jK6=|@&mYSNlD3<7FbX>I$1bZ$5klb zz}5E@w%cXro3lM5Mtt-U^7u%;fhHx!jnkcpo&EW$9esWMES`ZaH(_TxyNxV&69a?q z?(XiwmT^_4ou3aE8z=TGg&Gdw)(xla}FkJQqCdo)8}$1?8t-Me>zRa>!K z9cwRMyy)3^RcSN~?r#$Z|K`!}-&W?P^#+co;@|8X90)reDnFK0RoT3JNdP`T#HK6M zw!QK?=H0t*Y_b}0JRR;k z^x&Tv@ZtS?%E}xbOB9kAxg|4hwEc3yWn9U-N$XW$&~!*Qme>)WOqx0@yPwkX!A7(=EfPd0X^@+a-;qZ1PcZ_f9R zyAW~BR@!_7ZuJfAo(Txxc&oljn3|cfIyFG5o~hs-;IBSq}9fEUTnYyJDM&L&@y>~2EoV2Ps+%M$yZFwYi|Bor1ev| zY!-fRxEWIiUYVP>t3K)N2)O089KdRwz)EG?ayvWsKbul?-5w>Q6!#^}$jImkggE>W zVP$27=mjR+@H+nD2Y;B^vZp_gg~}{}3nJ`xKvgpZ)SC7Lo;_PXosd+Nmyd8cCB=G` zP-D^mPP(z@ldyY{;rP#awTY=?KR!|{mZwicj5xiTvU#itgv9loTfmTUsw`QIo~GsR z8-v4ia&l7bb!b8-g+nO>X*O~q5SYu!0j4c6QlQD=&=YrsgN)`{78!}cJrIsD2^RnN z*4Dan&~2t>t{9lgC^%u0M7ak-3PF_1SKtOXmJ z#@5@<|H=kNh>_)<`EK{~{I*>)%KN0G;JxYM{4ZZ1C}B{^o2kUV7a71VUE%{uOG-$! z+wqSPa_S%Hp@mi3HFMh;0nb+JF38}Pxmsh1Y6`3X?rwn7lnIIka~*FkcQ$F~ce{LJSbEC?EUcCun)_oHOloX=zC! 
zv72-g_)&|_9l8~1s0>QKuO)GeoM#Xr2C`{qV0$lx3~)>nV`D@j^66E>M|?MRSqM3)dNUyUH5L)Q53upi(=AK4H@5~kX=#JtGseMsjpu(`Ye3M={5SVy zH{f)8>4zW#8(Sy{CT@Lp(ZYSJQw$>hSuH(8`mUUH7#KKdHy-};;# zP)y$nC5RZftx$bQ=^H{es(?ygU-{?<;ky)fd1bmG9W)uVvxmMztmshXFq(3NE;zHAQi z@ewvKG^9&WhC<%Fd9%XGN*6eigTVOoDf-Stj-ri?P3J9G0s^~w^2J8yMtv^)m>X=G z4dX!Lm=61SPAeN51`ZBv@Yn0>>wp780|VA@RSS!}msGQ8WC#BDet906-#>jKcUWq2 z?Yj5+yFjNsW2le%w&3W9CXj-*-OxO`-@=xkeD~Zp>Xw9vh)z_L0`f(h^|)B_et7dW zBO{D^V2O9|yXMTy41796V`Ee^E`Pyc1;IJ{2V=2TerJv%`Re=8$p{zbV*-de`@2g1 z6V5qOaSHj%A~A9sF73Ky5yMuY%7rq@${48}hLIeG4c__b;V}HVI{_&Wa>1fG&XL6{ zC^6!<2Bau6^;y&$Spu%-((_bCp~o$O=nt}lWj!GJ*$5q{r zoU&I{dF}1X|K`^x2r=X}b=7}(pr)C%9aL9y$5DySsMGT+yw0X~zG6@FXWfT6MEHWMh>CGvc^z zmSO^Bca|{7mk!%baUf>L8~xGI(cq)9#>P}n z)U#~8mM&NPCaiywAv}76njrKF^FjaQz_wNOEE~6$G z7)*};2|6){K{$hAf*C-g5cZU9aQDMtL1+6lwEmP~2!w-!L&)!iW9++^k^WjJgi4?+hC z2ICdGSg_Pi_*&iz+NEEF(TBHAokJh4XBa2@>4ycS$cGKn!vX zxN#(Z_+U95`rROTjI`D-px*n~LQY=ZeD8pZlkYj}uf@gNB@pr=m&3Jy1$|)F7}&4s zOo)q%3%pbV2g}Zm)gPT%nf3{0+Uq8*b7m@P(_lv0wS1?0$Xkub;dn}XcfP#uD{*{RjGUNt}Y(p%pJJ@lt zfsT_C2jKW~4_7ogV{b*&tK-hF&Qp=eUrX)M>`=od{5(|Z3vV(IIRMHU`=-YpsgNIH zQWd3D`u2(p17~O0ir1*kyJq#DcoZJk9pW8IA$N@J#s`dDE-z;^Y~;*Vn8Ug<_j4q@ zk{##6Kq^|T(i!V)ZPtH+Qr|~L=#rKG&)_C;$q=3-)%R?*%YTh%ZE0R$|EIBgSE@LU zfWWV^t>Z-q4|%-jJ|lii?>QtXDT(&2#0TX9k z%8dkJrMi@9eBWw>(kClT{`zHCJ8sJ_L!T_P)M+#Ex=yl-94g|VPYdE9&hzuoi+FEUQ&F7{4k!suz05gF;|y0<-2rflWjzO94N&WYfI`Rwl6vKuz2E;P2aQG&>+U8;_T&a&iy zB6-hql{+6CNz)=2M~W^G^mGAnq)Xv?B4ErJ5ef1+@a7F62?-ooSsBg~NgdJ0~g4u+8ozhSW3)g5BY zqmq9wh%%%69)9L16xbQH`3yT!(p0ch8}l^eRmdMM2SL>g=+wzGGlHAXuZkt5?=fOw zGLd{MZ?W%=0*?q3p-u-Y+&s`cqDKsYH(w7i1eFMB0sAC>*MfdWgW4@AgtXK))h^A0 z;;^@AR(pVmnM9oPpHCpS)e^LoBei);PA-YR=IiMWG<4{Q5!TckLNu`$ zzRNmvl0a$FGA42AVrXgQW;nX&VEm7d9_LN7%l9+Z3dDdp};#K z&%>2?co+hd!9tB?RJGH@XL0Y17-ooGsO@x-nqHkPo`AzbaL~hP&O*DNAlyBJXj@QF zV0QjzcA?3Y0Yr4dzI5v7U$vHZ;3oW=fJMM-&i-|b;1 z*T=)5WCr+)jsDm--;Fki+P3{vM!p_?brc*`irQTdg1$_+RO?;}N9aTR9QY{q>6Zw~ zifeRUk8E-$ZJ^TNT2?*?V%A-K|CKT478WAo;o*fzsB3F~Cxpnw9XnZI-Gw+F58 
z_<7V!V&V-2VFdreXE!$rMnV$R)+VO?>Xqg9k57BbBjAx0fL^fbu775pW}c2%@%~-y zG}TLpkC%6H;{5*ods1>TbX*wxkITvL=jJU&GGY;?6{wO|*!MtkJU)gVc$4=$3_R0bl9$J*fo z_T56OfNY|aFm;~d19G~6pPt&FDs%(-VtwWMTg#?c91n zB|ZoTK`;u6Sa313sXTelThW$CfN1g>8gzwBjI!+i?V*#QIXk0DdLrE|3qoHe^4t|9 zo5j)~w_ZwDVO1SC_8z{w>(Vr3Lk|5c==BQFUXU&j=$yRV8$0l8McjU)fyhPsVYnJV zHvr{&tsfOoEfK)ES?{<#pl}mL-@_El?3TPmx?8jFdKPdQS&e7*$DtaP21;-#E@1j4 zUsumc)Aw?tUbjuOY{#aY=qFgTbRpNgqvE0>6u2oPH-k_QIH&)B#ieEfd3kw(iavo zIVo<}iAb3Z8xN$UvU0#OQ9LpNlLRa$F#wU$AnH^5?&eCHGSjuYe!y#TgSI{7Z5@t~ z1mDDV;fj2@oS32VGT7cuPhZRIxq%=nB~gqTA%O4-w?052!3|bEmU#VmkY@0!PT7sP zy6L`eWe_-@@I91#I8Fq}ITWB+b#EE71+5sQIZmKnSXHthn*DMx9E(}q9UV&=0Vh=(h$H6Ia_37v*YLP zU}MR%>>ALhLsASM&EKNIfu};ykjF2zSPI_9ZH0jwU&lR3&mN-Xx@?)aC5YKY2Jm5< ztq2TDLDpB8{DicOU;Yb@OwISxQuhT-hIC#&xf;%wvaxv-jS5HjdTd;k-C`mhpM%!_ z@8#rz-QUOBzBlj~(%Dxv7uFfnPmS!`^cCXLObUl+gl}UNTS&>Ek-1qQvJux2PU<;EtK`LT2=Y4_uSFkb1zb zkkHIeGYbryAUUWE+HuT77{ekyH6dl-*#Ct4P@UXfX=(;%Wo1eK+s7IgCd7y^Ox&{v z^t$yaZKezk1IR(i3%dDOgd%Y}agvN%J<&JGGE|@Z-r2D{-5$FSEc*5hzgRuH=SQK+ zGBs^NBo^-5Zm~LfZj(l?7cN+mfV(iLn6uVrCzj2hk^*!}skX>n+^`jbC6?CHdl>!( z*VU}=2zx-W1d^}Bd4x*Ww^R^J^VZNb0eEWe_#AFfaiMUGMu+%ijr0K^ zpE!kEbxHh~zIER&-ls2oeOroS<H9WwlF#Dt`?PAB^$&^XDua9QFG1zkdBWI@bYi70?y{gFqm@ z?bKUaTT0KLOUuX{jp8UobDdF8>Z^2Zb5LKrY(-(~#7+tiN6MVn0e6|2nvx0D+gAb{ z!jB(693)zuS0xGx3x%;VzkTD)EY*Zc6>6D}eR>8eH&`u2ndau^y_IiH%qBf>jcx9k z(pPRUD}&B(yOx%}zI$GPW_ricLt}uGoO4ErJ8;-2xXpgNPlj%9XQ!#7gIuVE9R47^!*C4UmKV4{(4agoknhk!5>po^8i!??ii;L_+om%g#m8rK!k~|PvvUL;%H)o9dq3x`m=XrPcN?P zc&=kxh@}A{C&ifebxrp?wAEq=s%2yp7lw)}!V6Ljn`N`o-(!r82B|wq@ zJL@$#WT^sDT&9bveOD*Rk2chjJL)=_H zPw!1432760@^{Rl3JVLts>2zOk56Bs+_8g+9yLB9_&=t+mX<)ttA9j)f^qDa8>*e% ztyOh=d~I#S=ds}U>Ul4>ARYcC^AIstfg-|wz=;X9w2$Ak`!sq;y+&b+hyWvy-GByUqCs7;SDO3Kx} zghY1mF4;mZdPorY-^Lts65!rtO0_XKssLvP(ff3C0BUx|#q{CcXq`@>K9>S(kz0Z$ zsRAT_ACM(DXO9Mi2L)d(x_}YC_88+{smrkwe$IpMPGFTzKoe8);$ix6(UA!+w!epA z25<$m#If1rQSV?IhQPRAnIin*GSKbo3){MzVXjgLl@1tD^PibLYv`B_L@S(H&u!wO zgI_8-o_sG1A7|7N`aV59x;F|}mNjJqEGY0IP_t5PNnlXeW$Mv6n}z*rrf=i5sW?M+ 
zW7NzT?oS7ZN(`JmH>`{?6W(B7{jtkjzDTCSs_|Xvzi_HYRC~K+85qk zn8?O)LrSMCM%9w&94(M~di@Qn*0CCqqk)OCStpge{ckNmRU1OKy3Mqlpr9ZLfI9rt z#B_n*f9N0en`9%Gc8_SDQEL6Zy0#{Ddi(1i0f8{WDgJBt5?J<`OR#KBHn$5Dc|9hq zd=STzxkEkN4ff&kAby)h(-KK|T{4(NFD{>e)!lxa>Ko74KNMVLO0E?QiZ!&%%!7Ny zn81^P+@%~~Y?fl?a4fu_eXcIyKkPz;h}ci@SuN6u82azglhY+k-f2o!&ASXFAu#gr z;7Q*8oJjig=^eb-wX)p~hs=Pu1^?)^e+nF->q{HWNeM{YA7k7rLlB#Mr zV4Ah<$k4xmBm%?;P^Jev0jLzc&Oq_m7}$pFog3Aasqb52|CvWoOt*2@((5OmoZot` zE{vs8_+5@IFKre%xBtxSmZ{9G|K~d|P^$(*+xKN}6-iU5voW*rCiX9zN01#FS3quRn2WFZEhIBU5YDr$i9 zQpNpp%#=~41TBy(Dxn!Mk?Ut!B}zJr0pBxM-(_D|veM^y!@BTZM}p%+hP$?rz{bC- z%5|0Ye;>L_G$XHvUkr5^ht$09`kPC|zC=w6_8Anexyj?fR@*u{7QXO7Kc;(_E9+{m zf!Kk5Ws_7o9DdT^h%Q^sDke83|g@M64oidci6`iu#U+l5} ztHh;2%+-GOe$3bIvtlTjGhcUG@!)7^?M~?O<(K*KppIh^_~L@d<5=|iHN;x&qs7mH zsipvjrupx(V!e(~3*L;RtgOE-hHh@WD?U3^#RKfRVrlzRQTaGDRFJYJ#I<3C80t=%LMpsx~b)~GF7M$#j|+7XG1VoGdIjfZAi zCw%Zo%~jyXZjg;vQj9*DtZ-v*(@7|Cu{#r9YWi2W`R(?zxDIr_a*K42n%34jx30*=3a-^s)9*RC38*PI{a^%~w1 zQ|=_Y8y|9SE;~~k_lX+ZUG%-qJknH_IdmI$wEus{GQk#!=y6O6wcSD_0yz>n!y;Sqd zGjj=z<0jpUI_I_>PK$YeQfu(bKS@9@nG-L{e7N8&H|!nS?9Z()Emr^z0k~o~Yi`Rw^!+*>oE~$Hyt3C(1kz^E*YSFSg^!c< z1fGX@HwWQ9<2CA79YpM$|~!Y`l^(<(AJa7Nh2OV z3F7_0<6ly$b~g-$yQrT(f6hFH!f@BlN#JB@;2+Y-sr{!u-?5^p{YjfV-Yo{bD06O( zu~FF%r~Zw8l6>_n0(Ebr0rG-1C0)ehYb%Z-8nkO70QpoB2f;=X*|)YfIL}EMk%!9lYc+x1W8NVZwn6D6fM_Eb zaZTfl_{rHpFaZMt{buiYsM&gTEJX!qDtc!tkJ4EvLU#C=>EHK+ovk%Mz^-*C z2Hjc0iO#1_pWbT*hV+ zOsOiuD($IH(32UenfQob> zH?S)NMUPEsOr+HEnZyX!a~HlQ$iGDz`zejG-2egG5TebByh8~h-hl`^9W>VTYuL35 zolDV@XYYb-^|ea3wTrfOgg2o(x1l^7P^9YWWQ#|NrakQz)qUlDaCsU6KPDjD41kljq(d!pM|^0D`uH31$m)K~QNt46W!zi?@IL1 zUi6m6GL={-mIjf6;$JJmMSkz-@ghOm(NCtuqjY{BisE9~IxC`uE4@`hCz!|>W>i)ozYW#iz)HF0T6r)v|N{J?oQK~wTb~$sK*I7ll zvQW>`LNWzWB&1HLU>tswf9Hm&x%M?ZJ_$c)1|kpIXF?&l^pU13f>5u7(;Q-RD{oS_ zV1OBb0>|rP1gvq5iYnD)ZKvZ;;3r$2g};1eR&iLUdCbA`8W5FMnshO7+U+_X_XNr8 z%0b0vb4L(?v5ero%|Z!IGO~n7N?IT#92y;ctEi7g#``c!mTvk;v-UkA#0v)s#{^@~lbn4lp=c zgeR&fMDle$tmaSex!jz+0h0qFBj8-3`x7#0bU|7MVkyK0ur|Mc|DG(Xr|qh01V!a8 
z-b8Q3MD;J|jWTtAU1=YPt6gq$GBhI)97wgF%s*R}(l_Q)pJ_n*ARR+`q1VyK(MSsEDC^Z&&9t*C2p#hK2SzP5+NQ1*(CP19}`mdodNJ| z;a)4!i=9>OD6Mbo-kF9RY=| zjXv~D51^E?pDUNPvSJK%blCukD4--aCCS$ioCu7ZlY9)~HWlM4w*-9Os|V6`%+#=m zW>3WP;{D1u-0S&=RqG4QG@8?s+b%gXu6XHwBcw}d1zmcj`|e&>(;j{=0ktgnDfbp= zz?hhF>*{8#jqUC2JF5!6e*Fr=!<#zX?II&18yXsNX{u;$&WLosBv@L&I4ujRwoe^_ z7mzQwnm9jf`l`f^3|^jlm~t<5#k3G-l{hM7hcRP+*+17lWRA~N;O7?##Gt`4HEpay z+Vr@+cRBd{@&5@spHIAo$aurE8gcRV3G2?jAmC}XAFnwyji=gqj|HNsjZsrwVh=F{ zk@q z-m<>8=`L1%FKm;#Wclzfr1CpP!?oeudoKl~zY zJ5@uhi@(sIXiVI55`U(<(LQ<9c-YtorjB~f#*XiQs{0ck{k<8U$_Nl0~4mmjvGk}3F_DR){BfRnN#Jwq>TJ1jp!+P-3MMbXjQKun{?JCq6Q zi3k?QgQNugRX2h`J1kp={XSb)+xZ;sIN~UaI3dD~YFJu=A4oq}t{dXGiyap~ z2>n!IZ!u1|-C*G1{b=$sc})r`8r9z=@z2BeZOhu?CPTnL)OTRJJ%{vsR@-=){rX>)R)po7@nu zex;f#0bwm|3tu()P@n)mCVSW-I|^!Z)JH>)uwCZLqJDn)sg#NduzW)E;r2xdA(D_U z6}@r41_Wj$Z;jC_g8qZeqfC%E|*u>qlXwh}e!)SKTuS_C+{G z`vx5=tkbQdM0HyL`WcLk8QpD484k;ef9gAndbC75X*xOFL2`}#&lH3O%} zgm-NRkYXFMjsy=60R)wG3oA?VSn0yj?@&MFb33*nRUGa>el|7`6A8JG-0ff4Zpzy_HZ(?Oe1RV~?9MqZ3On#Wa%sHf{$o0&AfjxG;B zx#U$M|7)if{GBh8ltLK+m>uamTLT{1-i2Qc_;I;t|!1mI?h zuHzw}4jrOc_3x3=-LjTDR*f8cD?hvn-a5Bg+g4mGRN z&${tRf#e1N!Yk4&%gf?dn#<3WaviMD0GdSHQzCx?v|HeeW%Vqo0>T6NF8}hMpu$ks zK?e!@zrc~WP{g)S-`X2;i5USz2w*YA{DQEwwf?M1D*-J&UD&{*K92j8$(KEY%kOv$ zT?bi?=<%WlNSI_x)4hYwPzZG%XxShAZiJQup0wp?z+~>Qk*qEi`aS0P%>R~o|=BakjDUOGTwWC}WaHaP=M?5zkDP_vlxW}skw zGV1|^?6nKt(Q?}Ie#=7#u{d=5xWSc`KF7n zr67}Vq%rN=s`L@tuwtm;v20UR{J|9Now|w=ty@;FPaNA1xD&YOkoBKwHmHq4As2_M zhbbq>2$k0Z_>`tO|J-TZ@0q?l;CuH!bx{Aj9aQA6hhA`||DC`xW4AOBkN;a84B zuPhLpttWQ<;Omy53M$iALeqPT1%6y!MUL!L0zSp^8H#_2W8{jhWY+LiG#=pbuzoyI zX!6IAi4Fo#@A-d|c5`!cbe$cnY;3(ipW`rBs?7>gH4fHFAgSVn9RBXpTYE)I;H3(;>zO<>$r(i!y+dYUg)UdpjkH~(}Cvm z`>*x0nw6qSQy!?e&UZnl;I>Li)jr0r`N{F9s5p?K>l6BZN(~PUNhw5buGVE{W=^df zm~l%<4jN){AKp7&zSC*8ZM~JKtFODb&*j93K)}Yv4wHBVBA!D#9!-w(Kxqr{(rz8L z5sJ#`l1~ip=(}XkG1`TMfq?<(oL-8o_Jj4%N>T6dhzN6_5~3E5T17#Fh~1w}vjD;j zP=N~h-7hu7nH)sl?!rL1Bsn`98&dN1tBjgjd^HCX6Ea{LB%U@l9^d-y-5^pU@#0 
zIde~y`|?IzQKdI2^-CAkJI#WUK3LRG-D)@zJw6>13hW>|uedNJd7h5u4j={Bi%3Jo z#FMgD+KA`_t4h$812gDPEfw*V0vb9JAuY80 zvaF12)7oOu96L^M%=7kZf%#B$yp&xX(oUuEU6&@#LT5zS*Cd1c_Vk@G4J6!%6cx%#h^J}t$- zeh6YDL3*kh^|xubI2wFB&JBhfuWr5;X8hH2WjQ&BmoC7Z?n#Ix<=IWu2mK|`jU8~} zO|HqVNSH&o*n%%ImnjPISaL2c34_eh6E*RwUB{8qdct5bS@O^g03cDXM61GrjszyB z%sXYnv0hO>BAbXT(wE;1V-YzOzA;CqGXWQk0!o+k%X{LI>v2z6E5e1dpJvn3uHwfy zf7zR(8*o%!E`n*-tfjKhd`yMmm$OLuhYQ|8W=xR=G>~G+Im!j$MIRTzN$YRrK)MMy z$pvFFsT0sAjV83?ef^tghMavJkBuTEXFMzL`UtMr^Xu?==nYAVYXSX_wxvPic<>|^U_h!jl@P;$FOJQMa=RjngZJ%7ine9(> zx;}toxYh=A(}N!EyCyB=exZ>q)css*QAusV*)I=fPz(QFS2G_H>m3$UAn6Yy;uyrFv+J?k0-X=0~N#g{NZ}gt{@BDFbAyWkp}2H%t{;F zjJ|nF+38O}MZyuH^_53TfEOv_Iq#C!qJw{_bFZX&HVNxcg$yY-vjo11(<|B+>;Dx0zp|1h_?_(1Ul}2a13K`^vl;8E_EpG8o0DM%X z8SIUAxC;ToPt73o*WXK1k9O(F=z0)`U!<24ZUW5+yh=G`4_SV0*?TPIM)<>$p~d=~ z0SHBa-B{nDQe7z?hUDDAYD@XkaW`YaND}6qHQ#6^tZgQR z4aKMV|3vQOBcd$8nbb>|@*HFa;%k9E!#41M0Cf0s0py{)NOymWxk=gAW@>_cMS%1TD_Gnc&lqOGc>Ohlo1#rC6n7qT68U}z$u(pq&A8zeCB5(#JdpU< zH`e`ZMV$QxP=f+1qM!fpqS5tYAV+_=iXcDwgOM&NnvljU5&i!74J0^*3WYyh*-VaQ zGJkOZP9=8OP@WS5Er2L!hFW{J+IzRbCv~fK7jWl5Ck9!QDN(&@jE3?Js3hT$#I#x# zz@nz^D8rN1qY$hhZDUA*7yM=p5qtNtEIMO7!!=K@BzScfn58A~pT!H(T6BR@pROu( zyzQr-i7?*L7r|#??g<)~?V*ASOOdcsG$U0X{36Xo^vqN>Ewiz=Om8a$@rda|>9aZt z!jsn4=DkG!-%W$KKC9h!64yL|Tq=#2*pTQL%8*+E`r{+2)S^b#r{b=l?=998bqGix zs{?L5A;H5|GJ$~-0?sR9zXNVx-l-uz803a!nMP4h37F=-pil7Ic&+C=eEF-_+5}O9v-BXmnOWAmk@2 zF@luRDdo5uq+$LVO z|LMy3-{p4I>b)5O|BE@ok3mWlJ3$Mq^~R|7en+89qc@O1wZBh)g=FmIfK#B!g zmJttk={o~TbKryw#(&oVkt=lBW?ZrAoHgM5&&Wva_37^q8XFajDc?|<1>`J7z^-XbAD0IkOlx(;$d2jP_M(acK?^hF#hk#Nw754tvSn-0ye z3Og8+Kxy8Tq?>sc%%s|+vG~v)2o%xL(Up{xAP@j#e>(U9xfE(U7ii17JY9UH2Lj0* z#eY4#BK+=VzVo5J3Ui$cv^1fvydllo$Kiat+};0SeoSb`e{C85j0;21y#tepR)MQI z;T?J40`M0@)dkagnxGbZa^ePhCwi@TvH!S(Cr^Z}q``u!Uyr0hFI~?cd!_DL7;;;$ zeIsQ}UAGMY7pD30B_c$~>wpqy2C_cA>^E)V?bZVl0`+W>h(eV#l>SR{2|G@p7uO3ZDJX8*N#1NAHV+e%)Uvzc>k17AwT(* zuI>rw!uqy)f0xZC*$DcR?|>{S%YEEKgvzwmI(~3u=jphSm$`ZF*vVnwrD$Tx2nBfV 
z#BjWVWzM4U^&_{Q^eIMJr#@!Xx2jI}Z3ED$3DO5H_Tqv9pe;pR@@VDlUKswsoc#X% z#$KM!$6Cooz}WZbxq)hV(9Lv?n2Cah%SFq`$cSUZImkAh#Dv#PG?AkvM_qOu$Bk6} z?~E&2(<{Akb#A~_M}T$?4U&<~ta~z4_IVTV1}K)d3VR?#)IOSmCmS4{<~&k@`vCMg z0A#T3FI01m`@;QnrC*|M?33XH^#8PvNlYYpFb13Gu|MbY)s}j56v=Y#d!JOr>F#jG zVf}yi(1vQ2%4uog1MT+F4;3z-{1T_FzZ2fp)=&QI@4qlH!GS;@e+$FB)S7mbCdERo zEFxut*8bE30@4DslmOcQ%<5Q-{^`?UCu38BI2~PGnc``{kP`lvWRQi6i-a5>3E!q; z9xR7Njo#HK&{GU(`7Ftx6$*I|Hk(SYtrmwX?eOP$nE3870kd8Q;y{`?j>D|$$F=e= z=rEg(=+Aedewl3ZT3{XEU65F~UCqNo|D_s8VOoN(%N*9D(gFfVHF~cLY2G}`}at=B1^6u|r zvH=|!5YQn)aA+hblg^FD{i}3XiL_Z$svUsvQ>)a$#2f{~rmEFT5A@7N(fvMzTfUJI zR@Vt1+?XBq@A+0%AER>7zIGG-mvokC)F3@2h3PJ7YiDP1)Ma6{BgDFTzG8Z>+PoJf zb3&OB|L%Gs=t90G1~coLY<~Y7{s%NQdDhbdWElu6xtxS_27 zIj_YZi#-RHw|Kj%EpxzG7M z&$+G(cMS)y*RZ!g{EuO#mn{m5m#_k(2nUnhU+fl#7KM!A@QX5^ z-`gR{tOfJeVz9}4ey{Z}?T}V6Q|5fY|J+34WCeG*`u%K$O&LEPKYmDDxD&NPX6l@^ z9`*~*;};=R9|PbtE;gkP&VLpb7M7lt=7we3l#ya$4rht|%rMUt9`Dm+^S!n-7mWo9 zp!CEb^?uEk2C9 zRHM5j)n1svV1Uo%?IJ9pqdUy=Sy2&=8wT+)mdau^HZ`5#AT#Z0d{g`0wwz09S@}!5 z5pw+-aXl&;7n~t5@oq9SvdF5IduA^-Dv>$ot!NrsIHBlJY(ia{ibR{_M8Bxg0F4lg zhJ-SN*53;}El-x$_Ntpj1ebq=ThL8n_ID^o#=>G}{Yw&=FhMg@-Te5Q1t8ZVMt|=O zq>oKZ_!__D*XIbd)_ew3Pe?4TB-URyd5$E~*FcJnPK`vTacjMqIuftkHUGMYdFzX~ zQGgmL4r)wM#>PNiSGwj6F0xGiXd==x&D3S%O4a=y);g-!-4p>~4K9vv?PS6Gy57AL zHKIy^ICS&? 
z9(2%@I04{TBr+L@ln7fm4TYdG`vb{gVF|#*d^%_}QP(zRBcK zjd$I>PR~8RIDXO2yxz^fS^xI5sjNkIUjXXP(=y&=)i_7>_;7~g?2L0=sd|GSFT7B5 zg@X0hlMu)a=nV5*L#I=z)CBMO+|K!eB=r=K-&e;85%|;hE2`p-gi=j=ovmq{I=J}4 zo&5d|tZawf_xnK!;&Bibc})M!%C`5HmzV3s+qawd2eNHzHML0}Cc;%RkEs~v08>wL|J&bS zfGfZYKLZkM1=q8QzGAcXexacJos48}aZ-6*?{Ko^3(zTt!W7d!+m@|w?H$zb(P*@X z6YE!`i=H{x5y6!ZaMRllG8?KZHI)O2eg;r`#nr#>85kJE!u$RRXBe26B&u46xID3^T`y1rE~r{} z4HjeBA*lNHb`$Qk2L{l@sw%Jz$uqBQbbTEmYq85)J^^+lKWmwgN@9l0-A6v^Ge#Lz zWvGgRhycr%x}55)M4?D-M5@PTB((DKzLYouu^g!!n}8Lv9mi2CZyZWND}lQ7TLj}j z9jN_@n4?1s5L5z3sFMtpwYYx165`gnzP&mF{5U0*lxeF*KylE+OGr>k+q|sk()``, where float may be signed (and fra dftd['C'] = dftd['A']-dftd['B'] dftd store.append('dftd',dftd,data_columns=True) - store.select('dftd',Term("C","<","-3.5D")) + store.select('dftd',"C<'-3.5D'") Indexing ~~~~~~~~ diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 53973970e039a..753613e5caea9 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -14,7 +14,7 @@ from pandas.computation.ops import is_term from pandas.computation.expr import BaseExprVisitor from pandas.computation.common import _ensure_decoded - +from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type class Scope(expr.Scope): __slots__ = 'globals', 'locals', 'queryables' @@ -79,6 +79,9 @@ def __init__(self, op, lhs, rhs, queryables, encoding): self.filter = None self.condition = None + def _disallow_scalar_only_bool_ops(self): + pass + def prune(self, klass): def pr(left, right): @@ -177,6 +180,9 @@ def stringify(value): elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u('date'): v = time.mktime(v.timetuple()) return TermValue(v, pd.Timestamp(v), kind) + elif kind == u('timedelta64') or kind == u('timedelta'): + v = _coerce_scalar_to_timedelta_type(v,unit='s').item() + return TermValue(int(v), v, kind) elif kind == u('integer'): v = int(float(v)) return TermValue(v, v, kind) diff --git a/pandas/io/tests/test_pytables.py 
b/pandas/io/tests/test_pytables.py index 87def113266b2..322b626acc0ad 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1864,16 +1864,16 @@ def test_append_with_timedelta(self): result = store.select('df',Term("C","<",-3*86400)) assert_frame_equal(result,df.iloc[3:]) - result = store.select('df',Term("C","<",'-3D')) + result = store.select('df',"C<'-3D'") assert_frame_equal(result,df.iloc[3:]) # a bit hacky here as we don't really deal with the NaT properly - result = store.select('df',Term("C","<",'-500000s')) + result = store.select('df',"C<'-500000s'") result = result.dropna(subset=['C']) assert_frame_equal(result,df.iloc[6:]) - result = store.select('df',Term("C","<",'-3.5D')) + result = store.select('df',"C<'-3.5D'") result = result.iloc[1:] assert_frame_equal(result,df.iloc[4:]) @@ -2039,14 +2039,6 @@ def test_invalid_terms(self): self.assertRaises(ValueError, store.select, 'wp', "minor=['A', 'B']") self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114']"]) self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114', '20121114']"]) - - # deprecations - with tm.assert_produces_warning(expected_warning=DeprecationWarning): - Term('index','==') - - with tm.assert_produces_warning(expected_warning=DeprecationWarning): - Term('index', '>', 5) - self.assertRaises(TypeError, Term) # more invalid @@ -2086,11 +2078,10 @@ def test_terms(self): assert_panel_equal(result, expected) # with deprecation - with tm.assert_produces_warning(expected_warning=DeprecationWarning): - result = store.select('wp', [Term( - 'major_axis','<',"20000108"), Term("minor_axis=['A', 'B']")]) - expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) - tm.assert_panel_equal(result, expected) + result = store.select('wp', [Term( + 'major_axis','<',"20000108"), Term("minor_axis=['A', 'B']")]) + expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) + tm.assert_panel_equal(result, expected) # p4d result = 
store.select('p4d', [Term('major_axis<"20000108"'), @@ -2147,11 +2138,10 @@ def test_term_compat(self): minor_axis=['A', 'B', 'C', 'D']) store.append('wp',wp) - with tm.assert_produces_warning(expected_warning=DeprecationWarning): - result = store.select('wp', [Term('major_axis>20000102'), - Term('minor_axis', '=', ['A','B']) ]) - expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']] - assert_panel_equal(result, expected) + result = store.select('wp', [Term('major_axis>20000102'), + Term('minor_axis', '=', ['A','B']) ]) + expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']] + assert_panel_equal(result, expected) store.remove('wp', Term('major_axis>20000103')) result = store.select('wp') From b8a3ba3bcecd635a07646274a5db04a7b82f98b0 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 14 Sep 2013 16:42:55 -0400 Subject: [PATCH 13/16] CLN: correct in and not in Also added tests for nan in and not in and disallowed ops like pd.eval('1 or 2') since that should be performed in regular Python --- doc/source/enhancingperf.rst | 24 ++- doc/source/indexing.rst | 29 ++-- pandas/computation/expr.py | 15 +- pandas/computation/ops.py | 44 +++++- pandas/computation/tests/test_eval.py | 216 ++++++++++++++++++-------- pandas/tests/test_frame.py | 8 +- 6 files changed, 234 insertions(+), 102 deletions(-) diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index b876b6fe7d24a..6d5717e420d1e 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -384,6 +384,14 @@ Now let's do the same thing but with comparisons: %timeit pd.eval('df1 + df2 + df3 + df4 + s') +.. note:: + + Operations such as ``1 and 2`` should be performed in Python. An exception + will be raised if you try to perform any boolean or bitwise operations + with scalar operands that are not of type ``bool`` or ``np.bool_``. *This + includes bitwise operations on scalars.* You should perform these kinds of + operations in Python. 
+ The ``DataFrame.eval`` method ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -393,7 +401,7 @@ evaluate an expression in the "context" of a ``DataFrame``. .. ipython:: python - df = DataFrame(randn(10, 2), columns=['a', 'b']) + df = DataFrame(randn(5, 2), columns=['a', 'b']) df.eval('a + b') @@ -410,7 +418,7 @@ You can refer to local variables the same way you would in vanilla Python .. ipython:: python - df = DataFrame(randn(10, 2), columns=['a', 'b']) + df = DataFrame(randn(5, 2), columns=['a', 'b']) newcol = randn(len(df)) df.eval('b + newcol') @@ -419,16 +427,22 @@ You can refer to local variables the same way you would in vanilla Python The one exception is when you have a local (or global) with the same name as a column in the ``DataFrame`` - .. ipython:: python - :okexcept: + .. code-block:: python - df = DataFrame(randn(10, 2), columns=['a', 'b']) + df = DataFrame(randn(5, 2), columns=['a', 'b']) a = randn(len(df)) df.eval('a + b') + NameResolutionError: resolvers and locals overlap on names ['a'] + To deal with these conflicts, a special syntax exists for referring variables with the same name as a column + .. ipython:: python + :suppress: + + a = randn(len(df)) + .. ipython:: python df.eval('@a + b') diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 003be5a795d95..7d22fb5335ae6 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1014,8 +1014,7 @@ The :meth:`~pandas.DataFrame.query` Method .. versionadded:: 0.13 :class:`~pandas.DataFrame` objects have a :meth:`~pandas.DataFrame.query` -method that allows selection using a string consisting of columns of the -calling :class:`~pandas.DataFrame`. +method that allows selection using a boolean expression. You can get the value of the frame where column ``b`` has values between the values of columns ``a`` and ``c``. @@ -1027,7 +1026,7 @@ between the values of columns ``a`` and ``c``. .. 
ipython:: python - n = 20 + n = 10 df = DataFrame(rand(n, 3), columns=list('abc')) df df[(df.a < df.b) & (df.b < df.c)] @@ -1038,7 +1037,7 @@ with the name ``a``. .. ipython:: python - df = DataFrame(randint(n, size=(n, 2)), columns=list('bc')) + df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc')) df.index.name = 'a' df df.query('a < b and b < c') @@ -1075,13 +1074,14 @@ You can also use the levels of a ``DataFrame`` with a import pandas.util.testing as tm - colors = tm.choice(['red', 'green'], size=10) - foods = tm.choice(['eggs', 'ham'], size=10) + n = 10 + colors = tm.choice(['red', 'green'], size=n) + foods = tm.choice(['eggs', 'ham'], size=n) colors foods index = MultiIndex.from_arrays([colors, foods], names=['color', 'food']) - df = DataFrame(randn(10, 2), index=index) + df = DataFrame(randn(n, 2), index=index) df df.query('color == "red"') @@ -1091,8 +1091,7 @@ special names: .. ipython:: python - index.names = [None, None] - df = DataFrame(randn(10, 2), index=index) + df.index.names = [None, None] df df.query('ilevel_0 == "red"') @@ -1111,9 +1110,9 @@ having to specify which frame you're interested in querying .. ipython:: python - df = DataFrame(randint(n, size=(n, 2)), columns=list('bc')) + df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc')) df.index.name = 'a' - df2 = DataFrame(randint(n + 10, size=(n + 10, 3)), columns=list('abc')) + df2 = DataFrame(randint(n + 5, size=(n + 5, 3)), columns=list('abc')) df2 expr = 'a < b & b < c' map(lambda frame: frame.query(expr), [df, df2]) @@ -1141,7 +1140,7 @@ Full numpy-like syntax .. 
ipython:: python - df = DataFrame(randint(n, size=(n, 3)), columns=list('abc')) + df = DataFrame(randint(n / 2, size=(n, 3)), columns=list('abc')) df df['(a < b) & (b < c)'] df[(df.a < df.b) & (df.b < df.c)] @@ -1164,10 +1163,6 @@ Pretty close to how you might write it on paper df['a < b < c'] -As you can see, these are all equivalent ways to express the same operation (in -fact, they are all ultimately parsed into something very similar to the first -example of the indexing syntax above). - The ``in`` and ``not in`` operators ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1184,7 +1179,7 @@ The ``in`` and ``not in`` operators .. ipython:: python # get all rows where columns "a" and "b" have overlapping values - df = DataFrame({'a': list('aaaabbbbcccc'), 'b': list('aabbccddeeff'), + df = DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'), 'c': randint(5, size=12), 'd': randint(9, size=12)}) df df['a in b'] diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index d8969e1297cd4..db92e7f57677b 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -461,10 +461,10 @@ def _rewrite_membership_op(self, node, left, right): name = self.env.add_tmp([right.value]) right = self.term_type(name, self.env) - # swap the operands so things like a == [1, 2] are translated to - # [1, 2] in a -> a.isin([1, 2]) - if right_list or right_str: - left, right = right, left + if left_str: + self.env.remove_tmp(left.name) + name = self.env.add_tmp([left.value]) + left = self.term_type(name, self.env) op = self.visit(op_instance) return op, op_instance, left, right @@ -662,13 +662,14 @@ def visitor(x, y): return reduce(visitor, operands) -_python_not_supported = frozenset(['Assign', 'Tuple', 'Dict', 'Call', - 'BoolOp', 'In', 'NotIn']) +_python_not_supported = frozenset(['Assign', 'Dict', 'Call', 'BoolOp', + 'In', 'NotIn']) _numexpr_supported_calls = frozenset(_reductions + _mathops) @disallow((_unsupported_nodes | _python_not_supported) - - 
(_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn']))) + (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn', + 'Tuple']))) class PandasExprVisitor(BaseExprVisitor): def __init__(self, env, engine, parser, preparser=lambda x: _replace_locals(_replace_booleans(x))): diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 14f67a3ab6723..debc79e33968c 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -193,6 +193,13 @@ def name(self): def name(self, new_name): self._name = new_name + @property + def ndim(self): + try: + return self._value.ndim + except AttributeError: + return 0 + class Constant(Term): def __init__(self, value, env, side=None, encoding=None): @@ -207,6 +214,7 @@ def name(self): return self.value + _bool_op_map = {'not': '~', 'and': '&', 'or': '|'} @@ -236,17 +244,24 @@ def return_type(self): return np.bool_ return np.result_type(*(term.type for term in com.flatten(self))) + @property + def isscalar(self): + return all(operand.isscalar for operand in self.operands) + def _in(x, y): """Compute the vectorized membership of ``x in y`` if possible, otherwise use Python. """ try: - return y.isin(x) + return x.isin(y) except AttributeError: + if com.is_list_like(x): + try: + return y.isin(x) + except AttributeError: + pass return x in y - except TypeError: - return y.isin([x]) def _not_in(x, y): @@ -254,11 +269,14 @@ def _not_in(x, y): otherwise use Python. 
""" try: - return ~y.isin(x) + return ~x.isin(y) except AttributeError: + if com.is_list_like(x): + try: + return ~y.isin(x) + except AttributeError: + pass return x not in y - except TypeError: - return ~y.isin([x]) _cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', 'in', 'not in' @@ -322,14 +340,17 @@ def __init__(self, op, lhs, rhs, **kwargs): self.lhs = lhs self.rhs = rhs + self._disallow_scalar_only_bool_ops() + self.convert_values() try: self.func = _binary_ops_dict[op] except KeyError: - keys = _binary_ops_dict.keys() + # has to be made a list for python3 + keys = list(_binary_ops_dict.keys()) raise ValueError('Invalid binary operator {0!r}, valid' - ' operators are {1}'.format(op, keys)) + ' operators are {1}'.format(op, keys)) def __call__(self, env): """Recursively evaluate an expression in Python space. @@ -425,6 +446,13 @@ def stringify(value): v = v.tz_convert('UTC') self.lhs.update(v) + def _disallow_scalar_only_bool_ops(self): + if ((self.lhs.isscalar or self.rhs.isscalar) and + self.op in _bool_ops_dict and + (not (issubclass(self.rhs.return_type, (bool, np.bool_)) and + issubclass(self.lhs.return_type, (bool, np.bool_))))): + raise NotImplementedError("cannot evaluate scalar only bool ops") + class Div(BinOp): """Div operator to special case casting. 
diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 8fb1b35abff37..d5bcf85d4de03 100755 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -85,7 +85,12 @@ def wrapper(self, lhs, arith1, rhs, *args, **kwargs): if _series_and_2d_ndarray(lhs, rhs): self.assertRaises(Exception, pd.eval, 'lhs {0} rhs'.format(arith1), local_dict={'lhs': lhs, 'rhs': rhs}, - engine=self.engine) + engine=self.engine, parser=self.parser) + elif (np.isscalar(lhs) and np.isscalar(rhs) and arith1 in + _bool_ops_syms): + with tm.assertRaises(NotImplementedError): + pd.eval('lhs {0} rhs'.format(arith1), engine=self.engine, + parser=self.parser) else: f(self, lhs, arith1, rhs, *args, **kwargs) return wrapper @@ -215,6 +220,17 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): self.assertRaises(TypeError, pd.eval, ex, local_dict={'lhs': lhs, 'rhs': rhs}, engine=self.engine, parser=self.parser) + elif _bool_and_frame(lhs, rhs): + self.assertRaises(TypeError, _eval_single_bin, lhs_new, '&', + rhs_new, self.engine) + self.assertRaises(TypeError, pd.eval, ex, + local_dict={'lhs': lhs, 'rhs': rhs}, + engine=self.engine, parser=self.parser) + elif (np.isscalar(lhs) and np.isnan(lhs) and + not np.isscalar(rhs) and (cmp1 in skip_these or cmp2 in + skip_these)): + with tm.assertRaises(TypeError): + _eval_single_bin(lhs, binop, rhs, self.engine) else: lhs_new = _eval_single_bin(lhs, cmp1, rhs, self.engine) rhs_new = _eval_single_bin(lhs, cmp2, rhs, self.engine) @@ -231,7 +247,16 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): #except AssertionError: #import ipdb; ipdb.set_trace() #raise - + elif (np.isscalar(lhs_new) and np.isnan(lhs_new) and + not np.isscalar(rhs_new) and binop in skip_these): + with tm.assertRaises(TypeError): + _eval_single_bin(lhs_new, binop, rhs_new, self.engine) + elif _bool_and_frame(lhs_new, rhs_new): + with tm.assertRaises(TypeError): + _eval_single_bin(lhs_new, binop, 
rhs_new, self.engine) + with tm.assertRaises(TypeError): + pd.eval('lhs_new & rhs_new'.format(binop), + engine=self.engine, parser=self.parser) else: expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine) result = pd.eval(ex, engine=self.engine, parser=self.parser) @@ -242,6 +267,21 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): skip_these = 'in', 'not in' def check_operands(left, right, cmp_op): + if (np.isscalar(left) and np.isnan(left) and not np.isscalar(right) + and cmp_op in skip_these): + ex = 'left {0} right'.format(cmp_op) + with tm.assertRaises(ValueError): + pd.eval(ex, engine=self.engine, parser=self.parser) + return + if (np.isscalar(left) and np.isscalar(right) and + cmp_op in _bool_ops_syms): + ex1 = 'lhs {0} mid {1} rhs'.format(cmp1, cmp2) + ex2 = 'lhs {0} mid and mid {1} rhs'.format(cmp1, cmp2) + ex3 = '(lhs {0} mid) & (mid {1} rhs)'.format(cmp1, cmp2) + for ex in (ex1, ex2, ex3): + with assertRaises(NotImplementedError): + pd.eval(ex, engine=self.engine, parser=self.parser) + return if (np.isscalar(right) and not np.isscalar(left) and cmp_op in skip_these): self.assertRaises(Exception, _eval_single_bin, left, cmp_op, @@ -294,8 +334,8 @@ def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) if cmp1 in ('in', 'not in') and not com.is_list_like(rhs): self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, - parser=self.parser, local_dict={'lhs': lhs, - 'rhs': rhs}) + parser=self.parser, local_dict={'lhs': lhs, + 'rhs': rhs}) else: expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) result = pd.eval(ex, engine=self.engine, parser=self.parser) @@ -414,11 +454,19 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, parser=self.parser, local_dict={'lhs': lhs, 'rhs': rhs}) + elif (np.isscalar(lhs) and np.isnan(lhs) and not np.isscalar(rhs) + and cmp1 in skip_these): + with tm.assertRaises(ValueError): + pd.eval(ex, 
engine=self.engine, parser=self.parser) else: # compound if np.isscalar(lhs) and np.isscalar(rhs): lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) - expected = ~_eval_single_bin(lhs, cmp1, rhs, self.engine) + expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) + if np.isscalar(expected): + expected = not expected + else: + expected = ~expected result = pd.eval(ex, engine=self.engine, parser=self.parser) assert_array_equal(expected, result) @@ -647,6 +695,18 @@ def test_scalar_unary(self): self.assertEqual(pd.eval('+True', parser=self.parser, engine=self.engine), +True) self.assertEqual(pd.eval('+False', parser=self.parser, engine=self.engine), +False) + def test_disallow_scalar_bool_ops(self): + exprs = '1 or 2', '1 and 2' + exprs += 'a and b', 'a or b' + exprs += '1 or 2 and (3 + 2) > 3', + exprs += '2 * x > 2 or 1 and 2', + exprs += '2 * df > 3 and 1 or a', + + x, a, b, df = np.random.randn(3), 1, 2, DataFrame(randn(3, 2)) + for ex in exprs: + with tm.assertRaises(NotImplementedError): + pd.eval(ex, engine=self.engine, parser=self.parser) + class TestEvalNumexprPython(TestEvalNumexprPandas): @classmethod @@ -999,21 +1059,18 @@ def test_simple_arith_ops(self): def test_simple_bool_ops(self): for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): - expec = _eval_single_bin(lhs, op, rhs, self.engine) - x = self.eval('lhs {0} rhs'.format(op), local_dict={'lhs': lhs, - 'rhs': rhs}, - engine=self.engine, parser=self.parser) - assert_equal(x, expec) + ex = '{0} {1} {2}'.format(lhs, op, rhs) + res = self.eval(ex) + exp = eval(ex) + self.assertEqual(res, exp) def test_bool_ops_with_constants(self): - asteval = ast.literal_eval for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), ('True', 'False')): - expec = _eval_single_bin(asteval(lhs), op, asteval(rhs), - self.engine) - x = self.eval('{0} {1} {2}'.format(lhs, op, rhs), - local_dict={'lhs': lhs, 'rhs': rhs}) - assert_equal(x, expec) + ex = '{0} {1} {2}'.format(lhs, op, 
rhs) + res = self.eval(ex) + exp = eval(ex) + self.assertEqual(res, exp) def test_panel_fails(self): x = Panel(randn(3, 4, 5)) @@ -1142,10 +1199,69 @@ def test_nested_period_index_subscript_expression(self): def test_date_boolean(self): df = DataFrame(randn(5, 3)) df['dates1'] = date_range('1/1/2012', periods=5) - res = self.eval('df.dates1 < 20130101', local_dict={'df': df}) + res = self.eval('df.dates1 < 20130101', local_dict={'df': df}, + engine=self.engine, parser=self.parser) expec = df.dates1 < '20130101' assert_series_equal(res, expec) + def test_simple_in_ops(self): + if self.parser != 'python': + res = pd.eval('1 in [1, 2]', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('2 in (1, 2)', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('3 in (1, 2)', engine=self.engine, + parser=self.parser) + self.assertFalse(res) + + res = pd.eval('3 not in (1, 2)', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('[3] not in (1, 2)', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('[3] in ([3], 2)', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('[[3]] in [[[3]], 2]', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('(3,) in [(3,), 2]', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + + res = pd.eval('(3,) not in [(3,), 2]', engine=self.engine, + parser=self.parser) + self.assertFalse(res) + + res = pd.eval('[(3,)] in [[(3,)], 2]', engine=self.engine, + parser=self.parser) + self.assertTrue(res) + else: + with tm.assertRaises(NotImplementedError): + pd.eval('1 in [1, 2]', engine=self.engine, parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('2 in (1, 2)', engine=self.engine, parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('3 in (1, 2)', engine=self.engine, parser=self.parser) + with 
tm.assertRaises(NotImplementedError): + pd.eval('3 not in (1, 2)', engine=self.engine, + parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('[(3,)] in (1, 2, [(3,)])', engine=self.engine, + parser=self.parser) + with tm.assertRaises(NotImplementedError): + pd.eval('[3] not in (1, 2, [[3]])', engine=self.engine, + parser=self.parser) + class TestOperationsNumExprPython(TestOperationsNumExprPandas): @classmethod @@ -1178,47 +1294,39 @@ def test_fails_not(self): def test_fails_ampersand(self): df = DataFrame(np.random.randn(5, 3)) - self.assertRaises(TypeError, pd.eval, - '(df + 2)[df > 1] > 0 & (df > 0)', - local_dict={'df': df}, parser=self.parser, - engine=self.engine) + ex = '(df + 2)[df > 1] > 0 & (df > 0)' + with tm.assertRaises(NotImplementedError): + pd.eval(ex, parser=self.parser, engine=self.engine) def test_fails_pipe(self): df = DataFrame(np.random.randn(5, 3)) - self.assertRaises(TypeError, pd.eval, - '(df + 2)[df > 1] > 0 | (df > 0)', - local_dict={'df': df}, parser=self.parser, - engine=self.engine) + ex = '(df + 2)[df > 1] > 0 | (df > 0)' + with tm.assertRaises(NotImplementedError): + pd.eval(ex, parser=self.parser, engine=self.engine) def test_bool_ops_with_constants(self): - from ast import literal_eval as asteval for op, lhs, rhs in product(expr._bool_ops_syms, ('True', 'False'), ('True', 'False')): - if op not in ('and', 'or'): - expec = _eval_single_bin(asteval(lhs), op, asteval(rhs), - self.engine) - x = self.eval('{0} {1} {2}'.format(lhs, op, rhs), - local_dict={'lhs': lhs, 'rhs': rhs}) - assert_equal(x, expec) + ex = '{0} {1} {2}'.format(lhs, op, rhs) + if op in ('and', 'or'): + with tm.assertRaises(NotImplementedError): + self.eval(ex) else: - self.assertRaises(NotImplementedError, - self.eval, - '{0} {1} {2}'.format(lhs, op, rhs), - local_dict={'lhs': lhs, 'rhs': rhs}) + res = self.eval(ex) + exp = eval(ex) + self.assertEqual(res, exp) def test_simple_bool_ops(self): - for op, lhs, rhs in product(expr._bool_ops_syms, 
(True, False), (True, - False)): - if op not in ('and', 'or'): - expec = _eval_single_bin(lhs, op, rhs, self.engine) - x = self.eval('lhs {0} rhs'.format(op), local_dict={'lhs': lhs, - 'rhs': rhs}) - assert_equal(x, expec) + for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), + (True, False)): + ex = 'lhs {0} rhs'.format(op) + if op in ('and', 'or'): + with tm.assertRaises(NotImplementedError): + pd.eval(ex, engine=self.engine, parser=self.parser) else: - self.assertRaises(NotImplementedError, - self.eval, - 'lhs {0} rhs'.format(op), - local_dict={'lhs': lhs, 'rhs': rhs}) + res = pd.eval(ex, engine=self.engine, parser=self.parser) + exp = eval(ex) + self.assertEqual(res, exp) class TestOperationsPythonPython(TestOperationsNumExprPython): @@ -1229,20 +1337,6 @@ def setUpClass(cls): cls.arith_ops = filter(lambda x: x not in ('in', 'not in'), cls.arith_ops) - def test_fails_ampersand(self): - df = DataFrame(np.random.randn(5, 3)) - self.assertRaises(TypeError, pd.eval, - '(df + 2)[df > 1] > 0 & (df > 0)', - local_dict={'df': df}, parser=self.parser, - engine=self.engine) - - def test_fails_pipe(self): - df = DataFrame(np.random.randn(5, 3)) - self.assertRaises(TypeError, pd.eval, - '(df + 2)[df > 1] > 0 | (df > 0)', - local_dict={'df': df}, parser=self.parser, - engine=self.engine) - class TestOperationsPythonPandas(TestOperationsNumExprPandas): @classmethod diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index ae37953da62c1..b871a43e68b7a 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11782,11 +11782,11 @@ def check_query_with_string_columns(self, parser, engine): 'd': np.random.randint(9, size=12)}) if parser == 'pandas': res = df.query('a in b', parser=parser, engine=engine) - expec = df[df.b.isin(df.a)] + expec = df[df.a.isin(df.b)] assert_frame_equal(res, expec) res = df.query('a in b and c < d', parser=parser, engine=engine) - expec = df[df.b.isin(df.a) & (df.c < df.d)] + expec = df[df.a.isin(df.b) & 
(df.c < df.d)] assert_frame_equal(res, expec) else: with assertRaises(NotImplementedError): @@ -11806,11 +11806,11 @@ def test_query_with_string_columns_numexpr(self): 'c': np.random.randint(5, size=12), 'd': np.random.randint(9, size=12)}) res = df['a in b'] - expec = df[df.b.isin(df.a)] + expec = df[df.a.isin(df.b)] assert_frame_equal(res, expec) res = df['a in b and c < d'] - expec = df[df.b.isin(df.a) & (df.c < df.d)] + expec = df[df.a.isin(df.b) & (df.c < df.d)] assert_frame_equal(res, expec) def check_object_array_eq_ne(self, parser, engine): From e55fa078337ddcd4b862b0538f2f46c3f9dc71f1 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sun, 15 Sep 2013 22:47:43 -0400 Subject: [PATCH 14/16] CLN: remove __getitem__ queries --- pandas/core/frame.py | 8 +- pandas/tests/test_frame.py | 488 ++++++++++++------------------------- 2 files changed, 161 insertions(+), 335 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c3504477b400b..0f6323aef96f0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1831,13 +1831,7 @@ def __getitem__(self, key): elif is_mi_columns: return self._getitem_multilevel(key) else: - try: - return self._getitem_column(key) - except KeyError: - if maybe_expression(key): - env = _ensure_scope(level=2) - return self.query(key, local_dict=env) - raise + return self._getitem_column(key) def _getitem_column(self, key): """ return the actual column """ diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index b871a43e68b7a..423707e0016d8 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -11129,167 +11129,6 @@ def skip_if_no_pandas_parser(parser): raise nose.SkipTest("cannot evaluate with parser {0!r}".format(parser)) -class TestDataFrameQueryNumExprPandas(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.engine = 'numexpr' - cls.parser = 'pandas' - skip_if_no_ne() - - @classmethod - def tearDownClass(cls): - del cls.engine, cls.parser - - def 
test_date_query_method(self): - engine, parser = self.engine, self.parser - df = DataFrame(randn(5, 3)) - df['dates1'] = date_range('1/1/2012', periods=5) - df['dates2'] = date_range('1/1/2013', periods=5) - df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('dates1 < 20130101 < dates3', engine=engine, - parser=parser) - expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] - assert_frame_equal(res, expec) - - def test_query_scope(self): - engine, parser = self.engine, self.parser - from pandas.computation.common import NameResolutionError - - df = DataFrame({"i": lrange(10), "+": lrange(3, 13), - "r": lrange(4, 14)}) - i, s = 5, 6 - self.assertRaises(NameResolutionError, df.query, 'i < 5', - engine=engine, parser=parser, local_dict={'i': i}) - self.assertRaises(SyntaxError, df.query, 'i - +', engine=engine, - parser=parser) - self.assertRaises(NameResolutionError, df.query, 'i == s', - engine=engine, parser=parser, local_dict={'i': i, - 's': s}) - - def test_query_scope_index(self): - engine, parser = self.engine, self.parser - from pandas.computation.common import NameResolutionError - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=Index(range(10), name='blob'), - columns=['a', 'b', 'c']) - from numpy import sin - df.index.name = 'sin' - self.assertRaises(NameResolutionError, df.query, 'sin > 5', - engine=engine, parser=parser, local_dict={'sin': - sin}) - - def test_query(self): - engine, parser = self.engine, self.parser - df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) - - assert_frame_equal(df.query('a < b', engine=engine, parser=parser), - df[df.a < df.b]) - assert_frame_equal(df.query('a + b > b * c', engine=engine, - parser=parser), - df[df.a + df.b > df.b * df.c]) - - local_dict = dict(df.iteritems()) - local_dict.update({'df': df}) - self.assertRaises(NameError, df.query, 'a < d & b < f', - local_dict=local_dict, engine=engine, parser=parser) - - # make sure that it's not just because we didn't pass 
the locals in - self.assertRaises(AssertionError, self.assertRaises, NameError, - df.query, 'a < b', local_dict={'df': df}, - engine=engine, parser=parser) - - def test_query_index_with_name(self): - engine, parser = self.engine, self.parser - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=Index(range(10), name='blob'), - columns=['a', 'b', 'c']) - res = df.query('(blob < 5) & (a < b)', engine=engine, parser=parser) - expec = df[(df.index < 5) & (df.a < df.b)] - assert_frame_equal(res, expec) - - res = df.query('blob < b', engine=engine, parser=parser) - expec = df[df.index < df.b] - - assert_frame_equal(res, expec) - - def test_query_index_without_name(self): - engine, parser = self.engine, self.parser - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=range(10), columns=['a', 'b', 'c']) - - # "index" should refer to the index - res = df.query('index < b', engine=engine, parser=parser) - expec = df[df.index < df.b] - assert_frame_equal(res, expec) - - # test against a scalar - res = df.query('index < 5', engine=engine, parser=parser) - expec = df[df.index < 5] - assert_frame_equal(res, expec) - - def test_nested_scope(self): - engine = self.engine - parser = self.parser - # smoke test - x = 1 - result = pd.eval('x + 1', engine=engine, parser=parser) - self.assertEqual(result, 2) - - df = DataFrame(np.random.randn(5, 3)) - df2 = DataFrame(np.random.randn(5, 3)) - expected = df[(df>0) & (df2>0)] - - result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) - assert_frame_equal(result, expected) - - result = pd.eval('df[(df > 0) and (df2 > 0)]', engine=engine, - parser=parser) - assert_frame_equal(result, expected) - - result = pd.eval('df[(df > 0) and (df2 > 0) and df[df > 0] > 0]', - engine=engine, parser=parser) - expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] - assert_frame_equal(result, expected) - - result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) - expected = df.query('(df>0) & (df2>0)', 
engine=engine, parser=parser) - assert_frame_equal(result, expected) - - def test_local_syntax(self): - skip_if_no_pandas_parser(self.parser) - - from pandas.computation.common import NameResolutionError - - engine, parser = self.engine, self.parser - df = DataFrame(randn(100, 10), columns=list('abcdefghij')) - b = 1 - expect = df[df.a < b] - result = df.query('a < @b', engine=engine, parser=parser) - assert_frame_equal(result, expect) - - # scope issue with self.assertRaises so just catch it and let it pass - try: - df.query('a < @b', engine=engine, parser=parser) - except NameResolutionError: - pass - - del b - expect = df[df.a < df.b] - result = df.query('a < b', engine=engine, parser=parser) - assert_frame_equal(result, expect) - - def test_chained_cmp_and_in(self): - skip_if_no_pandas_parser(self.parser) - engine, parser = self.engine, self.parser - cols = list('abc') - df = DataFrame(randn(100, len(cols)), columns=cols) - res = df.query('a < b < c and a not in b not in c', engine=engine, - parser=parser) - ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) - expec = df[ind] - assert_frame_equal(res, expec) - - class TestDataFrameQueryWithMultiIndex(object): def check_query_with_named_multiindex(self, parser, engine): skip_if_no_ne(engine) @@ -11471,17 +11310,16 @@ def test_query_with_partially_named_multiindex(self): yield self.check_query_with_partially_named_multiindex, parser, engine -class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): +class TestDataFrameQueryNumExprPandas(unittest.TestCase): @classmethod def setUpClass(cls): cls.engine = 'numexpr' - cls.parser = 'python' - skip_if_no_ne(cls.engine) - cls.frame = _frame.copy() + cls.parser = 'pandas' + skip_if_no_ne() @classmethod def tearDownClass(cls): - del cls.frame, cls.engine, cls.parser + del cls.engine, cls.parser def test_date_query_method(self): engine, parser = self.engine, self.parser @@ -11489,11 +11327,87 @@ def test_date_query_method(self): 
df['dates1'] = date_range('1/1/2012', periods=5) df['dates2'] = date_range('1/1/2013', periods=5) df['dates3'] = date_range('1/1/2014', periods=5) - res = df.query('(df.dates1 < 20130101) & (20130101 < df.dates3)', - engine=engine, parser=parser) + res = df.query('dates1 < 20130101 < dates3', engine=engine, + parser=parser) expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] assert_frame_equal(res, expec) + def test_query_scope(self): + engine, parser = self.engine, self.parser + from pandas.computation.common import NameResolutionError + + df = DataFrame({"i": lrange(10), "+": lrange(3, 13), + "r": lrange(4, 14)}) + i, s = 5, 6 + self.assertRaises(NameResolutionError, df.query, 'i < 5', + engine=engine, parser=parser, local_dict={'i': i}) + self.assertRaises(SyntaxError, df.query, 'i - +', engine=engine, + parser=parser) + self.assertRaises(NameResolutionError, df.query, 'i == s', + engine=engine, parser=parser, local_dict={'i': i, + 's': s}) + + def test_query_scope_index(self): + engine, parser = self.engine, self.parser + from pandas.computation.common import NameResolutionError + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=Index(range(10), name='blob'), + columns=['a', 'b', 'c']) + from numpy import sin + df.index.name = 'sin' + self.assertRaises(NameResolutionError, df.query, 'sin > 5', + engine=engine, parser=parser, local_dict={'sin': + sin}) + + def test_query(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) + + assert_frame_equal(df.query('a < b', engine=engine, parser=parser), + df[df.a < df.b]) + assert_frame_equal(df.query('a + b > b * c', engine=engine, + parser=parser), + df[df.a + df.b > df.b * df.c]) + + local_dict = dict(df.iteritems()) + local_dict.update({'df': df}) + self.assertRaises(NameError, df.query, 'a < d & b < f', + local_dict=local_dict, engine=engine, parser=parser) + + # make sure that it's not just because we didn't pass the locals in + 
self.assertRaises(AssertionError, self.assertRaises, NameError, + df.query, 'a < b', local_dict={'df': df}, + engine=engine, parser=parser) + + def test_query_index_with_name(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=Index(range(10), name='blob'), + columns=['a', 'b', 'c']) + res = df.query('(blob < 5) & (a < b)', engine=engine, parser=parser) + expec = df[(df.index < 5) & (df.a < df.b)] + assert_frame_equal(res, expec) + + res = df.query('blob < b', engine=engine, parser=parser) + expec = df[df.index < df.b] + + assert_frame_equal(res, expec) + + def test_query_index_without_name(self): + engine, parser = self.engine, self.parser + df = DataFrame(np.random.randint(10, size=(10, 3)), + index=range(10), columns=['a', 'b', 'c']) + + # "index" should refer to the index + res = df.query('index < b', engine=engine, parser=parser) + expec = df[df.index < df.b] + assert_frame_equal(res, expec) + + # test against a scalar + res = df.query('index < 5', engine=engine, parser=parser) + expec = df[df.index < 5] + assert_frame_equal(res, expec) + def test_nested_scope(self): engine = self.engine parser = self.parser @@ -11509,11 +11423,11 @@ def test_nested_scope(self): result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine, + result = pd.eval('df[(df > 0) and (df2 > 0)]', engine=engine, parser=parser) assert_frame_equal(result, expected) - result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]', + result = pd.eval('df[(df > 0) and (df2 > 0) and df[df > 0] > 0]', engine=engine, parser=parser) expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] assert_frame_equal(result, expected) @@ -11522,131 +11436,114 @@ def test_nested_scope(self): expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) assert_frame_equal(result, expected) + def test_local_syntax(self): + 
skip_if_no_pandas_parser(self.parser) -class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas): - @classmethod - def setUpClass(cls): - cls.engine = 'python' - cls.parser = 'pandas' - cls.frame = _frame.copy() + from pandas.computation.common import NameResolutionError - @classmethod - def tearDownClass(cls): - del cls.frame, cls.engine, cls.parser + engine, parser = self.engine, self.parser + df = DataFrame(randn(100, 10), columns=list('abcdefghij')) + b = 1 + expect = df[df.a < b] + result = df.query('a < @b', engine=engine, parser=parser) + assert_frame_equal(result, expect) + # scope issue with self.assertRaises so just catch it and let it pass + try: + df.query('a < @b', engine=engine, parser=parser) + except NameResolutionError: + pass -class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython): - @classmethod - def setUpClass(cls): - cls.engine = cls.parser = 'python' - cls.frame = _frame.copy() + del b + expect = df[df.a < df.b] + result = df.query('a < b', engine=engine, parser=parser) + assert_frame_equal(result, expect) - @classmethod - def tearDownClass(cls): - del cls.frame, cls.engine, cls.parser + def test_chained_cmp_and_in(self): + skip_if_no_pandas_parser(self.parser) + engine, parser = self.engine, self.parser + cols = list('abc') + df = DataFrame(randn(100, len(cols)), columns=cols) + res = df.query('a < b < c and a not in b not in c', engine=engine, + parser=parser) + ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) + expec = df[ind] + assert_frame_equal(res, expec) -class TestDataFrameQueryGetitem(unittest.TestCase): +class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas): @classmethod def setUpClass(cls): - skip_if_no_ne() + cls.engine = 'numexpr' + cls.parser = 'python' + skip_if_no_ne(cls.engine) cls.frame = _frame.copy() @classmethod def tearDownClass(cls): - del cls.frame - - def test_nested_scope(self): - df = DataFrame(np.random.randn(5, 3)) - df2 = 
DataFrame(np.random.randn(5, 3)) - expected = df[(df > 0) & (df2 > 0)] - - result = df['(df>0) & (df2>0)'] - assert_frame_equal(result, expected) + del cls.frame, cls.engine, cls.parser - def test_date_query_getitem(self): + def test_date_query_method(self): + engine, parser = self.engine, self.parser df = DataFrame(randn(5, 3)) df['dates1'] = date_range('1/1/2012', periods=5) df['dates2'] = date_range('1/1/2013', periods=5) df['dates3'] = date_range('1/1/2014', periods=5) - res = df['dates1 < 20130101 < dates3'] + res = df.query('(df.dates1 < 20130101) & (20130101 < df.dates3)', + engine=engine, parser=parser) expec = df[(df.dates1 < '20130101') & ('20130101' < df.dates3)] assert_frame_equal(res, expec) - def test_query_expressions_correct_failure(self): - import random - import string - - df = self.frame - exprs = 'and', 'or', 'not' - exprs += tuple(x + tm.rands(5) for x in exprs) - exprs += tuple(random.choice(string.ascii_letters) + tm.rands(5) + x - for x in exprs) - - exprs += 'inb', + def test_nested_scope(self): + engine = self.engine + parser = self.parser + # smoke test + x = 1 + result = pd.eval('x + 1', engine=engine, parser=parser) + self.assertEqual(result, 2) - for e in exprs: - with self.assertRaises(KeyError): - df[e] + df = DataFrame(np.random.randn(5, 3)) + df2 = DataFrame(np.random.randn(5, 3)) + expected = df[(df>0) & (df2>0)] - for e in (' and ', ' or ', ' not '): - self.assertRaises(SyntaxError, df.__getitem__, e) + result = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + assert_frame_equal(result, expected) - x = tm.randbool(size=(self.frame.shape[0],)) - self.assertRaises(KeyError, df.__getitem__, 'x') + result = pd.eval('df[(df > 0) & (df2 > 0)]', engine=engine, + parser=parser) + assert_frame_equal(result, expected) - self.assertRaises(NameError, df.__getitem__, 'not inb') + result = pd.eval('df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]', + engine=engine, parser=parser) + expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)] 
+ assert_frame_equal(result, expected) - def test_query_expressions_with_index(self): - df = DataFrame(np.random.randint(10, size=(10, 3)), - index=Index(range(10), name='blob'), - columns=['a', 'b', 'c']) - assert_frame_equal(df['index < b'], df[df.index < df.b]) - assert_frame_equal(df['index < 5'], df[df.index < 5]) - assert_frame_equal(df['(blob < 5) & (a < b)'], - df[(df.index < 5) & (df.a < df.b)]) - assert_frame_equal(df['blob < b'], df[df.index < df.b]) + result = pd.eval('df[(df>0) & (df2>0)]', engine=engine, parser=parser) + expected = df.query('(df>0) & (df2>0)', engine=engine, parser=parser) + assert_frame_equal(result, expected) - def test_query_expressions(self): - df = DataFrame(np.random.randn(10, 3), columns=['a', 'b', 'c']) - assert_frame_equal(df['a < b'], df[df.a < df.b]) - assert_frame_equal(df['a + b > b * c'], - df[df.a + df.b > df.b * df.c]) - def test_simple_not_expression(self): - df = DataFrame(randn(10, 3), columns=list('abc')) - df['bools'] = rand(len(df)) > 0.5 - res = df['not bools'] - res2 = df['~bools'] - expec = df[~df.bools] - assert_frame_equal(res, expec) - assert_frame_equal(res2, expec) +class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas): + @classmethod + def setUpClass(cls): + cls.engine = 'python' + cls.parser = 'pandas' + cls.frame = _frame.copy() - def test_complex_boolean_expression(self): - df = DataFrame(randn(10, 3), columns=list('abc')) - df['bools'] = rand(len(df)) > 0.5 - res = df['a < b < c and (not bools) or bools > 2'] - expec = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] - assert_frame_equal(res, expec) + @classmethod + def tearDownClass(cls): + del cls.frame, cls.engine, cls.parser - def test_local_syntax(self): - from pandas.computation.common import NameResolutionError - df = DataFrame(randn(1000, 10), columns=list('abcdefghij')) - b = 1 - expect = df[df.a < b] - result = df['a < @b'] - assert_frame_equal(result, expect) - # scope issue with self.assertRaises so just 
catch it and let it pass - try: - df['a < b'] - except NameResolutionError: - pass +class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython): + @classmethod + def setUpClass(cls): + cls.engine = cls.parser = 'python' + cls.frame = _frame.copy() - del b - expect = df[df.a < df.b] - result = df['a < b'] - assert_frame_equal(result, expect) + @classmethod + def tearDownClass(cls): + del cls.frame, cls.engine, cls.parser PARSERS = 'python', 'pandas' @@ -11737,43 +11634,6 @@ def check_str_list_query_method(self, parser, engine): parser=parser) assert_frame_equal(res, expect) - def test_str_query_getitem(self): - skip_if_no_ne() - df = DataFrame(randn(10, 1), columns=['b']) - df['strings'] = Series(list('aabbccddee')) - expect = df[df.strings == 'a'] - res = df['strings == "a"'] - assert_frame_equal(res, expect) - - res = df['"a" == strings'] - assert_frame_equal(res, expect) - - expect = df[df.strings != 'a'] - res = df['strings != "a"'] - assert_frame_equal(res, expect) - - res = df['"a" != strings'] - assert_frame_equal(res, expect) - - def test_str_query_list_getitem(self): - skip_if_no_ne() - df = DataFrame(randn(10, 1), columns=['b']) - df['strings'] = Series(list('aabbccddee')) - - expect = df[df.strings.isin(['a', 'b'])] - res = df['strings == ["a", "b"]'] - assert_frame_equal(res, expect) - - res = df['["a", "b"] == strings'] - assert_frame_equal(res, expect) - - expect = df[~df.strings.isin(['a', 'b'])] - res = df['strings != ["a", "b"]'] - assert_frame_equal(res, expect) - - res = df['["a", "b"] != strings'] - assert_frame_equal(res, expect) - def check_query_with_string_columns(self, parser, engine): skip_if_no_ne(engine) df = DataFrame({'a': list('aaaabbbbcccc'), @@ -11799,20 +11659,6 @@ def test_query_with_string_columns(self): for parser, engine in product(PARSERS, ENGINES): yield self.check_query_with_string_columns, parser, engine - def test_query_with_string_columns_numexpr(self): - skip_if_no_ne() - df = DataFrame({'a': 
list('aaaabbbbcccc'), - 'b': list('aabbccddeeff'), - 'c': np.random.randint(5, size=12), - 'd': np.random.randint(9, size=12)}) - res = df['a in b'] - expec = df[df.a.isin(df.b)] - assert_frame_equal(res, expec) - - res = df['a in b and c < d'] - expec = df[df.a.isin(df.b) & (df.c < df.d)] - assert_frame_equal(res, expec) - def check_object_array_eq_ne(self, parser, engine): skip_if_no_ne(engine) df = DataFrame({'a': list('aaaabbbbcccc'), @@ -11831,20 +11677,6 @@ def test_object_array_eq_ne(self): for parser, engine in product(PARSERS, ENGINES): yield self.check_object_array_eq_ne, parser, engine - def test_object_array_eq_ne_getitem(self): - skip_if_no_ne() - df = DataFrame({'a': list('aaaabbbbcccc'), - 'b': list('aabbccddeeff'), - 'c': np.random.randint(5, size=12), - 'd': np.random.randint(9, size=12)}) - res = df['a == b'] - exp = df[df.a == df.b] - assert_frame_equal(res, exp) - - res = df['a != b'] - exp = df[df.a != df.b] - assert_frame_equal(res, exp) - class TestDataFrameEvalNumExprPandas(unittest.TestCase): @classmethod From 50502fbd99f897aaa8c89c7f9e2fdabdd2888863 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 16 Sep 2013 10:38:09 -0400 Subject: [PATCH 15/16] CLN/DOC: remove __getitem__ from docs --- doc/source/comparison_with_r.rst | 1 - doc/source/enhancingperf.rst | 12 +- doc/source/indexing.rst | 914 ++++++++++++++++--------------- doc/source/release.rst | 10 +- doc/source/v0.13.0.txt | 8 +- pandas/computation/pytables.py | 2 - pandas/core/frame.py | 4 +- 7 files changed, 486 insertions(+), 465 deletions(-) diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index 012a6fe6baf96..ef609aaa7d70c 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -49,7 +49,6 @@ index/slice as well as standard boolean indexing: df = DataFrame({'a': randn(10), 'b': randn(10)}) df.query('a <= b') - df['a <= b'] df[df.a <= df.b] df.loc[df.a <= df.b] diff --git a/doc/source/enhancingperf.rst 
b/doc/source/enhancingperf.rst index 6d5717e420d1e..87b68248c3e9e 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -292,8 +292,8 @@ Read more in the `cython docs `__. .. _enhancingperf.eval: -Expression Evaluation via :func:`~pandas.eval` ----------------------------------------------- +Expression Evaluation via :func:`~pandas.eval` (Experimental) +------------------------------------------------------------- .. versionadded:: 0.13 @@ -392,8 +392,8 @@ Now let's do the same thing but with comparisons: includes bitwise operations on scalars.* You should perform these kinds of operations in Python. -The ``DataFrame.eval`` method -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The ``DataFrame.eval`` method (Experimental) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In addition to the top level :func:`~pandas.eval` function you can also evaluate an expression in the "context" of a ``DataFrame``. @@ -447,13 +447,11 @@ You can refer to local variables the same way you would in vanilla Python df.eval('@a + b') - The same is true for :meth:`~pandas.DataFrame.query` and - :meth:`~pandas.DataFrame.__getitem__` passed an expression + The same is true for :meth:`~pandas.DataFrame.query` .. ipython:: python df.query('@a < b') - df['@a < b'] .. ipython:: python :suppress: diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 7d22fb5335ae6..2f2a47d4b0bf2 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -26,48 +26,58 @@ The axis labeling information in pandas objects serves many purposes: - Enables automatic and explicit data alignment - Allows intuitive getting and setting of subsets of the data set -In this section / chapter, we will focus on the final point: namely, how to -slice, dice, and generally get and set subsets of pandas objects. The primary -focus will be on Series and DataFrame as they have received more development -attention in this area. 
Expect more work to be invested higher-dimensional data -structures (including Panel) in the future, especially in label-based advanced +In this section, we will focus on the final point: namely, how to slice, dice, +and generally get and set subsets of pandas objects. The primary focus will be +on Series and DataFrame as they have received more development attention in +this area. Expect more work to be invested in higher-dimensional data structures +(including ``Panel``) in the future, especially in label-based advanced indexing. .. note:: - The Python and NumPy indexing operators ``[]`` and attribute operator ``.`` provide quick and easy access to pandas data structures - across a wide range of use cases. This makes interactive work intuitive, as - there's little new to learn if you already know how to deal with Python - dictionaries and NumPy arrays. However, since the type of the data to be accessed - isn't known in advance, directly using - standard operators has some optimization limits. For production code, we recommended - that you take advantage of the optimized pandas data access methods exposed in this chapter. + The Python and NumPy indexing operators ``[]`` and attribute operator ``.`` + provide quick and easy access to pandas data structures across a wide range + of use cases. This makes interactive work intuitive, as there's little new + to learn if you already know how to deal with Python dictionaries and NumPy + arrays. However, since the type of the data to be accessed isn't known in + advance, directly using standard operators has some optimization limits. For + production code, we recommend that you take advantage of the optimized + pandas data access methods exposed in this chapter. .. warning:: - Whether a copy or a reference is returned for a setting operation, may depend on the context. - This is sometimes called ``chained assignment`` and should be avoided.
- See :ref:`Returning a View versus Copy ` + Whether a copy or a reference is returned for a setting operation, may + depend on the context. This is sometimes called ``chained assignment`` and + should be avoided. See :ref:`Returning a View versus Copy + ` See the :ref:`cookbook` for some advanced strategies -Choice ------- +Different Choices for Indexing (``loc``, ``iloc``, and ``ix``) +-------------------------------------------------------------- -Starting in 0.11.0, object selection has had a number of user-requested additions in -order to support more explicit location based indexing. Pandas now supports -three types of multi-axis indexing. +.. versionadded:: 0.11.0 -- ``.loc`` is strictly label based, will raise ``KeyError`` when the items are not found, allowed inputs are: +Object selection has had a number of user-requested additions in order to +support more explicit location based indexing. Pandas now supports three types +of multi-axis indexing. - - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a *label* of the index. This use is **not** an integer position along the index) +- ``.loc`` is strictly label based, will raise ``KeyError`` when the items are + not found, allowed inputs are: + + - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is interpreted as a + *label* of the index. This use is **not** an integer position along the + index) - A list or array of labels ``['a', 'b', 'c']`` - - A slice object with labels ``'a':'f'``, (note that contrary to usual python slices, **both** the start and the stop are included!) + - A slice object with labels ``'a':'f'``, (note that contrary to usual python + slices, **both** the start and the stop are included!) - A boolean array See more at :ref:`Selection by Label ` -- ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of the axis), will raise ``IndexError`` when the requested indicies are out of bounds. 
Allowed inputs are: +- ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of + the axis), will raise ``IndexError`` when the requested indices are out of + bounds. Allowed inputs are: - An integer e.g. ``5`` - A list or array of integers ``[4, 3, 0]`` @@ -75,20 +85,24 @@ three types of multi-axis indexing. See more at :ref:`Selection by Position ` -- ``.ix`` supports mixed integer and label based access. It is primarily label based, but will fallback to integer positional access. ``.ix`` is the most general - and will support any of the inputs to ``.loc`` and ``.iloc``, as well as support for floating point label schemes. ``.ix`` is especially useful when dealing with mixed positional and label - based hierarchial indexes. - - As using integer slices with ``.ix`` have different behavior depending on whether the slice is interpreted as position based or label based, it's +- ``.ix`` supports mixed integer and label based access. It is primarily label + based, but will fall back to integer positional access. ``.ix`` is the most + general and will support any of the inputs to ``.loc`` and ``.iloc``, as well + as support for floating point label schemes. ``.ix`` is especially useful + when dealing with mixed positional and label based hierarchical indexes. + As using integer slices with ``.ix`` has different behavior depending on + whether the slice is interpreted as position based or label based, it's usually better to be explicit and use ``.iloc`` or ``.loc``. - See more at :ref:`Advanced Indexing `, :ref:`Advanced Hierarchical ` and :ref:`Fallback Indexing ` + See more at :ref:`Advanced Indexing `, :ref:`Advanced + Hierarchical ` and :ref:`Fallback Indexing + ` Getting values from an object with multi-axes selection uses the following notation (using ``.loc`` as an example, but applies to ``.iloc`` and ``.ix`` as well). Any of the axes accessors may be the null slice ``:``. Axes left out of the specification are assumed to be ``:``. (e.g.
``p.loc['a']`` is equiv to -``p.loc['a',:,:]``) +``p.loc['a', :, :]``) .. csv-table:: :header: "Object Type", "Indexers" @@ -100,7 +114,7 @@ the specification are assumed to be ``:``. (e.g. ``p.loc['a']`` is equiv to Panel; ``p.loc[item_indexer,major_indexer,minor_indexer]`` Deprecations -~~~~~~~~~~~~ +------------ Beginning with version 0.11.0, it's recommended that you transition away from the following methods as they *may* be deprecated in future versions. @@ -168,7 +182,7 @@ You may find this useful for applying a transform (in-place) to a subset of the columns. Attribute Access -~~~~~~~~~~~~~~~~ +---------------- .. _indexing.columns.multiple: @@ -213,7 +227,7 @@ If you are using the IPython environment, you may also use tab-completion to see these accessable attributes. Slicing ranges -~~~~~~~~~~~~~~ +-------------- The most robust and consistent way of slicing ranges along arbitrary axes is described in the :ref:`Selection by Position ` section @@ -247,7 +261,7 @@ largely as a convenience since it is such a common operation. .. _indexing.label: Selection By Label -~~~~~~~~~~~~~~~~~~ +------------------ .. warning:: @@ -318,7 +332,7 @@ For getting a value explicity (equiv to deprecated ``df.get_value('a','A')``) .. _indexing.integer: Selection By Position -~~~~~~~~~~~~~~~~~~~~~ +--------------------- .. warning:: @@ -415,7 +429,7 @@ Pandas will detect this and raise ``IndexError``, rather than return an empty st .. _indexing.basics.partial_setting: Setting With Enlargement -~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------ .. versionadded:: 0.13 @@ -450,7 +464,7 @@ This is like an ``append`` operation on the ``DataFrame``. .. 
_indexing.basics.get_value: Fast scalar value getting and setting -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------- Since indexing with ``[]`` must handle a lot of cases (single-label access, slicing, boolean indexing, etc.), it has a bit of overhead in order to figure @@ -481,7 +495,7 @@ You can also set using these same indexers. df Boolean indexing -~~~~~~~~~~~~~~~~ +---------------- .. _indexing.boolean: @@ -572,8 +586,8 @@ You can also describe columns using integer location: df.isin(values, iloc=True) -Where and Masking -~~~~~~~~~~~~~~~~~ +The :meth:`~pandas.DataFrame.where` Method and Masking +------------------------------------------------------ Selecting values from a Series with a boolean vector generally returns a subset of the data. To guarantee that selection output has the same shape as @@ -673,634 +687,634 @@ This is equivalent (but faster than) the following. s.mask(s >= 0) df.mask(df >= 0) -Take Methods -~~~~~~~~~~~~ +.. _indexing.query: -.. _indexing.take: +The :meth:`~pandas.DataFrame.query` Method (Experimental) +--------------------------------------------------------- -Similar to numpy ndarrays, pandas Index, Series, and DataFrame also provides -the ``take`` method that retrieves elements along a given axis at the given -indices. The given indices must be either a list or an ndarray of integer -index positions. ``take`` will also accept negative integers as relative positions to the end of the object. +.. versionadded:: 0.13 + +:class:`~pandas.DataFrame` objects have a :meth:`~pandas.DataFrame.query` +method that allows selection using an expression. + +You can get the value of the frame where column ``b`` has values +between the values of columns ``a`` and ``c``. For example: .. ipython:: python + :suppress: - index = Index(randint(0, 1000, 10)) - index + from numpy.random import randint, rand + np.random.seed(1234) - positions = [0, 9, 3] +.. 
ipython:: python - index[positions] - index.take(positions) + n = 10 + df = DataFrame(rand(n, 3), columns=list('abc')) + df - ser = Series(randn(10)) + # pure python + df[(df.a < df.b) & (df.b < df.c)] - ser.ix[positions] - ser.take(positions) + # query + df.query('(a < b) & (b < c)') -For DataFrames, the given indices should be a 1d list or ndarray that specifies -row or column positions. +Do the same thing but fallback on a named index if there is no column +with the name ``a``. .. ipython:: python - frm = DataFrame(randn(5, 3)) + df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc')) + df.index.name = 'a' + df + df.query('a < b and b < c') - frm.take([1, 4, 3]) +If instead you don't want to or cannot name your index, you can use the name +``index`` in your query expression: - frm.take([0, 2], axis=1) +.. ipython:: python + :suppress: -It is important to note that the ``take`` method on pandas objects are not -intended to work on boolean indices and may return unexpected results. + old_index = index + del index .. ipython:: python - arr = randn(10) - arr.take([False, False, True, True]) - arr[[0, 1]] + df = DataFrame(randint(n, size=(n, 2)), columns=list('bc')) + df + df.query('index < b < c') - ser = Series(randn(10)) - ser.take([False, False, True, True]) - ser.ix[[0, 1]] +.. ipython:: python + :suppress: -Finally, as a small note on performance, because the ``take`` method handles -a narrower range of inputs, it can offer performance that is a good deal -faster than fancy indexing. + index = old_index + del old_index -.. 
ipython:: - arr = randn(10000, 5) - indexer = np.arange(10000) - random.shuffle(indexer) +:class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - timeit arr[indexer] - timeit arr.take(indexer, axis=0) +You can also use the levels of a ``DataFrame`` with a +:class:`~pandas.MultiIndex` as if they were columns in the frame: - ser = Series(arr[:, 0]) - timeit ser.ix[indexer] - timeit ser.take(indexer) +.. ipython:: python -Duplicate Data -~~~~~~~~~~~~~~ + import pandas.util.testing as tm -.. _indexing.duplicate: + n = 10 + colors = tm.choice(['red', 'green'], size=n) + foods = tm.choice(['eggs', 'ham'], size=n) + colors + foods -If you want to identify and remove duplicate rows in a DataFrame, there are -two methods that will help: ``duplicated`` and ``drop_duplicates``. Each -takes as an argument the columns to use to identify duplicated rows. + index = MultiIndex.from_arrays([colors, foods], names=['color', 'food']) + df = DataFrame(randn(n, 2), index=index) + df + df.query('color == "red"') -- ``duplicated`` returns a boolean vector whose length is the number of rows, and which indicates whether a row is duplicated. -- ``drop_duplicates`` removes duplicate rows. +If the levels of the ``MultiIndex`` are unnamed, you can refer to them using +special names: -By default, the first observed row of a duplicate set is considered unique, but -each method has a ``take_last`` parameter that indicates the last observed row -should be taken instead. .. ipython:: python - df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], - 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'], - 'c' : np.random.randn(7)}) - df2.duplicated(['a','b']) - df2.drop_duplicates(['a','b']) - df2.drop_duplicates(['a','b'], take_last=True) - -.. 
_indexing.dictionarylike: + df.index.names = [None, None] + df + df.query('ilevel_0 == "red"') -Dictionary-like ``get`` method -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Each of Series, DataFrame, and Panel have a ``get`` method which can return a -default value. +The convention is ``ilevel_0``, which means "index level 0" for the 0th level +of the ``index``. -.. ipython:: python - s = Series([1,2,3], index=['a','b','c']) - s.get('a') # equivalent to s['a'] - s.get('x', default=-1) +:meth:`~pandas.DataFrame.query` Use Cases +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _indexing.advanced: +A use case for :meth:`~pandas.DataFrame.query` is when you have a collection of +:class:`~pandas.DataFrame` objects that have a subset of column names (or index +levels/names) in common. You can pass the same query to both frames *without* +having to specify which frame you're interested in querying -Advanced Indexing with ``.ix`` ------------------------------- +.. ipython:: python -.. note:: + df = DataFrame(rand(n, 3), columns=list('abc')) + df + df2 = DataFrame(rand(n + 2, 3), columns=df.columns) + df2 + expr = '0.0 <= a <= c <= 0.5' + map(lambda frame: frame.query(expr), [df, df2]) - The recent addition of ``.loc`` and ``.iloc`` have enabled users to be quite - explicit about indexing choices. ``.ix`` allows a great flexibility to - specify indexing locations by *label* and/or *integer position*. Pandas will - attempt to use any passed *integer* as *label* locations first (like what - ``.loc`` would do, then to fall back on *positional* indexing, like what - ``.iloc`` would do). See :ref:`Fallback Indexing ` for - an example. +:meth:`~pandas.DataFrame.query` Python versus pandas Syntax Comparison +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The syntax of using ``.ix`` is identical to ``.loc``, in :ref:`Selection by -Label `, and ``.iloc`` in :ref:`Selection by Position `. +Full numpy-like syntax -The ``.ix`` attribute takes the following inputs: +.. 
ipython:: python -- An integer or single label, e.g. ``5`` or ``'a'`` -- A list or array of labels ``['a', 'b', 'c']`` or integers ``[4, 3, 0]`` -- A slice object with ints ``1:7`` or labels ``'a':'f'`` -- A boolean array + df = DataFrame(randint(n, size=(n, 3)), columns=list('abc')) + df + df.query('(a < b) & (b < c)') + df[(df.a < df.b) & (df.b < df.c)] -We'll illustrate all of these methods. First, note that this provides a concise -way of reindexing on multiple axes at once: +Slightly nicer by removing the parentheses (by making comparison +operators bind tighter than ``&``/``|``) .. ipython:: python - subindex = dates[[3,4,5]] - df.reindex(index=subindex, columns=['C', 'B']) - df.ix[subindex, ['C', 'B']] + df.query('a < b & b < c') -Assignment / setting values is possible when using ``ix``: +Use English instead of symbols .. ipython:: python - df2 = df.copy() - df2.ix[subindex, ['C', 'B']] = 0 - df2 + df.query('a < b and b < c') -Indexing with an array of integers can also be done: +Pretty close to how you might write it on paper .. ipython:: python - df.ix[[4,3,1]] - df.ix[dates[[4,3,1]]] + df.query('a < b < c') -**Slicing** has standard Python semantics for integer slices: +The ``in`` and ``not in`` operators +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. ipython:: python +:meth:`~pandas.DataFrame.query` also supports special use of Python's ``in`` and +``not in`` comparison operators, providing a succinct syntax for calling the +``isin`` method of a ``Series`` or ``DataFrame``. - df.ix[1:7, :2] +.. ipython:: python + :suppress: -Slicing with labels is semantically slightly different because the slice start -and stop are **inclusive** in the label-based case: + try: + old_d = d + del d + except NameError: + pass ..
ipython:: python - date1, date2 = dates[[2, 4]] - print date1, date2 - df.ix[date1:date2] - df['A'].ix[date1:date2] + # get all rows where columns "a" and "b" have overlapping values + df = DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'), + 'c': randint(5, size=12), 'd': randint(9, size=12)}) + df + df.query('a in b') -Getting and setting rows in a DataFrame, especially by their location, is much -easier: + # How you'd do it in pure Python + df[df.a.isin(df.b)] -.. ipython:: python + df.query('a not in b') - df2 = df[:5].copy() - df2.ix[3] - df2.ix[3] = np.arange(len(df2.columns)) - df2 + # pure Python + df[~df.a.isin(df.b)] + + +You can combine this with other expressions for very succinct queries: -Column or row selection can be combined as you would expect with arrays of -labels or even boolean vectors: .. ipython:: python - df.ix[df['A'] > 0, 'B'] - df.ix[date1:date2, 'B'] - df.ix[date1, 'B'] + # rows where cols a and b have overlapping values and col c's values are less than col d's + df.query('a in b and c < d') -Slicing with labels is closely related to the ``truncate`` method which does -precisely ``.ix[start:stop]`` but returns a copy (for legacy reasons). + # pure Python + df[df.b.isin(df.a) & (df.c < df.d)] -The ``select`` method -~~~~~~~~~~~~~~~~~~~~~ -Another way to extract slices from an object is with the ``select`` method of -Series, DataFrame, and Panel. This method should be used only when there is no -more direct way. ``select`` takes a function which operates on labels along -``axis`` and returns a boolean. For instance: +.. note:: -.. ipython:: python + Note that ``in`` and ``not in`` are evaluated in Python, since ``numexpr`` + has no equivalent of this operation. However, **only the** ``in``/``not in`` + **expression itself** is evaluated in vanilla Python. For example, in the + expression - df.select(lambda x: x == 'A', axis=1) + .. 
code-block:: python -The ``lookup`` method -~~~~~~~~~~~~~~~~~~~~~ + df.query('a in b + c + d') -Sometimes you want to extract a set of values given a sequence of row labels -and column labels, and the ``lookup`` method allows for this and returns a -numpy array. For instance, + ``(b + c + d)`` is evaluated by ``numexpr`` and *then* the ``in`` + operation is evaluated in plain Python. In general, any operations that can + be evaluated using ``numexpr`` will be. + +Special use of the ``==`` operator with ``list`` objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Comparing a ``list`` of values to a column using ``==``/``!=`` works similarly +to ``in``/``not in`` .. ipython:: python - dflookup = DataFrame(np.random.rand(20,4), columns = ['A','B','C','D']) - dflookup.lookup(list(range(0,10,2)), ['B','C','A','B','D']) + df.query('b == ["a", "b", "c"]') -Setting values in mixed-type DataFrame -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # pure Python + df[df.b.isin(["a", "b", "c"])] -.. _indexing.mixed_type_setting: + df.query('c == [1, 2]') -Setting values on a mixed-type DataFrame or Panel is supported when using -scalar values, though setting arbitrary vectors is not yet supported: + df.query('c != [1, 2]') -.. ipython:: python + # using in/not in + df.query('[1, 2] in c') - df2 = df[:4] - df2['foo'] = 'bar' - print df2 - df2.ix[2] = np.nan - print df2 - print df2.dtypes + df.query('[1, 2] not in c') -.. _indexing.view_versus_copy: + # pure Python + df[df.c.isin([1, 2])] -Returning a view versus a copy -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The rules about when a view on the data is returned are entirely dependent on -NumPy. Whenever an array of labels or a boolean vector are involved in the -indexing operation, the result will be a copy. With single label / scalar -indexing and slicing, e.g. ``df.ix[3:6]`` or ``df.ix[:, 'A']``, a view will be -returned. 
+Boolean Operators +~~~~~~~~~~~~~~~~~ -In chained expressions, the order may determine whether a copy is returned or not: +You can negate boolean expressions with the word ``not`` or the ``~`` operator. .. ipython:: python + df = DataFrame(rand(n, 3), columns=list('abc')) + df['bools'] = rand(len(df)) > 0.5 + df.query('~bools') + df.query('not bools') + df.query('not bools') == df[~df.bools] - dfb = DataFrame({'a' : ['one', 'one', 'two', - 'three', 'two', 'one', 'six'], - 'b' : ['x', 'y', 'y', - 'x', 'y', 'x', 'x'], - 'c' : randn(7)}) - - - # goes to copy (will be lost) - dfb[dfb.a.str.startswith('o')]['c'] = 42 +Of course, expressions can be arbitrarily complex too - # passed via reference (will stay) - dfb['c'][dfb.a.str.startswith('o')] = 42 +.. ipython:: python -A chained assignment can also crop up in setting in a mixed dtype frame. + # short query syntax + shorter = df.query('a < b < c and (not bools) or bools > 2') -.. note:: + # equivalent in pure Python + longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] - These setting rules apply to all of ``.loc/.iloc/.ix`` + shorter + longer -This is the correct access method + shorter == longer .. ipython:: python + :suppress: - dfc = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) - dfc_copy = dfc.copy() - dfc_copy.loc[0,'A'] = 11 - dfc_copy - -This *can* work at times, but is not guaranteed, and so should be avoided + try: + d = old_d + del old_d + except NameError: + pass -.. ipython:: python - dfc_copy = dfc.copy() - dfc_copy['A'][0] = 111 - dfc_copy +Performance of :meth:`~pandas.DataFrame.query` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This will **not** work at all, and so should be avoided - -.. ipython:: python +``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for +large frames - dfc_copy = dfc.copy() - dfc_copy.loc[0]['A'] = 1111 - dfc_copy +.. 
image:: _static/query-perf.png -When assigning values to subsets of your data, thus, make sure to either use the -pandas access methods or explicitly handle the assignment creating a copy. +.. note:: -Fallback indexing -~~~~~~~~~~~~~~~~~~~~ + You will only see the performance benefits of using the ``numexpr`` engine + with ``DataFrame.query()`` if your frame has more than approximately 50,000 + rows -.. _indexing.fallback: + .. image:: _static/query-perf-small.png -Float indexes should be used only with caution. If you have a float indexed -``DataFrame`` and try to select using an integer, the row that Pandas returns -might not be what you expect. Pandas first attempts to use the *integer* -as a *label* location, but fails to find a match (because the types -are not equal). Pandas then falls back to back to positional indexing. +This plot was created using a ``DataFrame`` with 3 columns each containing +floating point values generated using ``numpy.random.randn()``. .. ipython:: python + :suppress: - df = pd.DataFrame(np.random.randn(4,4), - columns=list('ABCD'), index=[1.0, 2.0, 3.0, 4.0]) - df - df.ix[1] - -To select the row you do expect, instead use a float label or -use ``iloc``. + df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + df2 = df.copy() -.. ipython:: python +Take Methods +------------ - df.ix[1.0] - df.iloc[0] +.. _indexing.take: -Instead of using a float index, it is often better to -convert to an integer index: +Similar to numpy ndarrays, pandas Index, Series, and DataFrame also provides +the ``take`` method that retrieves elements along a given axis at the given +indices. The given indices must be either a list or an ndarray of integer +index positions. ``take`` will also accept negative integers as relative positions to the end of the object. .. 
ipython:: python - df_new = df.reset_index() - df_new[df_new['index'] == 1.0] - # now you can also do "float selection" - df_new[(df_new['index'] >= 1.0) & (df_new['index'] < 2)] - + index = Index(randint(0, 1000, 10)) + index -.. _indexing.query: + positions = [0, 9, 3] -The :meth:`~pandas.DataFrame.query` Method -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + index[positions] + index.take(positions) -.. versionadded:: 0.13 + ser = Series(randn(10)) -:class:`~pandas.DataFrame` objects have a :meth:`~pandas.DataFrame.query` -method that allows selection using a boolean expression. + ser.ix[positions] + ser.take(positions) -You can get the value of the frame where column ``b`` has values -between the values of columns ``a`` and ``c``. +For DataFrames, the given indices should be a 1d list or ndarray that specifies +row or column positions. .. ipython:: python - :suppress: - from numpy.random import randint, rand + frm = DataFrame(randn(5, 3)) -.. ipython:: python + frm.take([1, 4, 3]) - n = 10 - df = DataFrame(rand(n, 3), columns=list('abc')) - df - df[(df.a < df.b) & (df.b < df.c)] - df.query('(a < b) & (b < c)') + frm.take([0, 2], axis=1) -Do the same thing but fallback on a named index if there is no column -with the name ``a``. +It is important to note that the ``take`` method on pandas objects are not +intended to work on boolean indices and may return unexpected results. .. ipython:: python - df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc')) - df.index.name = 'a' - df - df.query('a < b and b < c') + arr = randn(10) + arr.take([False, False, True, True]) + arr[[0, 1]] -If instead you don't want to or cannot name your index, you can use the name -``index`` in your query expression: + ser = Series(randn(10)) + ser.take([False, False, True, True]) + ser.ix[[0, 1]] -.. 
ipython:: python - :suppress: +Finally, as a small note on performance, because the ``take`` method handles +a narrower range of inputs, it can offer performance that is a good deal +faster than fancy indexing. - old_index = index - del index +.. ipython:: -.. ipython:: python + arr = randn(10000, 5) + indexer = np.arange(10000) + random.shuffle(indexer) - df = DataFrame(randint(n, size=(n, 2)), columns=list('bc')) - df - df.query('index < b < c') + timeit arr[indexer] + timeit arr.take(indexer, axis=0) -.. ipython:: python - :suppress: + ser = Series(arr[:, 0]) + timeit ser.ix[indexer] + timeit ser.take(indexer) - index = old_index - del old_index +Duplicate Data +-------------- +.. _indexing.duplicate: -:class:`~pandas.MultiIndex` :meth:`~pandas.DataFrame.query` Syntax -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +If you want to identify and remove duplicate rows in a DataFrame, there are +two methods that will help: ``duplicated`` and ``drop_duplicates``. Each +takes as an argument the columns to use to identify duplicated rows. -You can also use the levels of a ``DataFrame`` with a -:class:`~pandas.MultiIndex` as if they were columns in the frame: +- ``duplicated`` returns a boolean vector whose length is the number of rows, and which indicates whether a row is duplicated. +- ``drop_duplicates`` removes duplicate rows. -.. ipython:: python +By default, the first observed row of a duplicate set is considered unique, but +each method has a ``take_last`` parameter that indicates the last observed row +should be taken instead. - import pandas.util.testing as tm +.. 
ipython:: python - n = 10 - colors = tm.choice(['red', 'green'], size=n) - foods = tm.choice(['eggs', 'ham'], size=n) - colors - foods + df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], + 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'], + 'c' : np.random.randn(7)}) + df2.duplicated(['a','b']) + df2.drop_duplicates(['a','b']) + df2.drop_duplicates(['a','b'], take_last=True) - index = MultiIndex.from_arrays([colors, foods], names=['color', 'food']) - df = DataFrame(randn(n, 2), index=index) - df - df.query('color == "red"') +.. _indexing.dictionarylike: -If the levels of the ``MultiIndex`` are unnamed, you can refer to them using -special names: +Dictionary-like :meth:`~pandas.DataFrame.get` method +---------------------------------------------------- +Each of Series, DataFrame, and Panel have a ``get`` method which can return a +default value. .. ipython:: python - df.index.names = [None, None] - df - df.query('ilevel_0 == "red"') + s = Series([1,2,3], index=['a','b','c']) + s.get('a') # equivalent to s['a'] + s.get('x', default=-1) +.. _indexing.advanced: -The convention is ``ilevel_0``, which means "index level 0" for the 0th level -of the ``index``. +Advanced Indexing with ``.ix`` +------------------------------ +.. note:: -:meth:`~pandas.DataFrame.query` Use Cases -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + The recent addition of ``.loc`` and ``.iloc`` have enabled users to be quite + explicit about indexing choices. ``.ix`` allows a great flexibility to + specify indexing locations by *label* and/or *integer position*. Pandas will + attempt to use any passed *integer* as *label* locations first (like what + ``.loc`` would do, then to fall back on *positional* indexing, like what + ``.iloc`` would do). See :ref:`Fallback Indexing ` for + an example. -One use case for :meth:`~pandas.DataFrame.query` is when you have a collection of -:class:`~pandas.DataFrame` objects that have a subset of column names (or index -levels/names) in common. 
You can pass the same query to both frames *without* -having to specify which frame you're interested in querying +The syntax of using ``.ix`` is identical to ``.loc``, in :ref:`Selection by +Label `, and ``.iloc`` in :ref:`Selection by Position `. -.. ipython:: python +The ``.ix`` attribute takes the following inputs: - df = DataFrame(randint(n / 2, size=(n, 2)), columns=list('bc')) - df.index.name = 'a' - df2 = DataFrame(randint(n + 5, size=(n + 5, 3)), columns=list('abc')) - df2 - expr = 'a < b & b < c' - map(lambda frame: frame.query(expr), [df, df2]) +- An integer or single label, e.g. ``5`` or ``'a'`` +- A list or array of labels ``['a', 'b', 'c']`` or integers ``[4, 3, 0]`` +- A slice object with ints ``1:7`` or labels ``'a':'f'`` +- A boolean array -A chained comparison would also work in this situation, yielding slightly -cleaner syntax +We'll illustrate all of these methods. First, note that this provides a concise +way of reindexing on multiple axes at once: .. ipython:: python - expr = 'a < b < c' - map(lambda frame: frame.query(expr), [df, df2]) - -One neat feature of :meth:`~pandas.DataFrame.query` is that you can pass an -expression ``expr`` into ``df[]``, e.g., ``df[expr]``. + subindex = dates[[3,4,5]] + df.reindex(index=subindex, columns=['C', 'B']) + df.ix[subindex, ['C', 'B']] -This functionality can of course be combined with a slightly modified and more -readable Python syntax implemented in the workhorse function that underlies -:meth:`~pandas.DataFrame.query`--:func:`~pandas.eval`. +Assignment / setting values is possible when using ``ix``: +.. ipython:: python -:meth:`~pandas.DataFrame.query` Python versus pandas Syntax Comparison -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + df2 = df.copy() + df2.ix[subindex, ['C', 'B']] = 0 + df2 -Full numpy-like syntax +Indexing with an array of integers can also be done: .. 
ipython:: python - df = DataFrame(randint(n / 2, size=(n, 3)), columns=list('abc')) - df - df['(a < b) & (b < c)'] - df[(df.a < df.b) & (df.b < df.c)] + df.ix[[4,3,1]] + df.ix[dates[[4,3,1]]] -Slightly nicer by removing the parentheses +**Slicing** has standard Python semantics for integer slices: .. ipython:: python - df['a < b & b < c'] + df.ix[1:7, :2] -Use English instead of symbols +Slicing with labels is semantically slightly different because the slice start +and stop are **inclusive** in the label-based case: .. ipython:: python - df['a < b and b < c'] + date1, date2 = dates[[2, 4]] + print date1, date2 + df.ix[date1:date2] + df['A'].ix[date1:date2] -Pretty close to how you might write it on paper +Getting and setting rows in a DataFrame, especially by their location, is much +easier: .. ipython:: python - df['a < b < c'] - -The ``in`` and ``not in`` operators -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + df2 = df[:5].copy() + df2.ix[3] + df2.ix[3] = np.arange(len(df2.columns)) + df2 -:meth:`~pandas.DataFrame.query` also supports special use of Python's ``in`` and -``not in`` comparison operators, providing a succint syntax for calling the -``isin`` method of a ``Series`` or ``DataFrame``. +Column or row selection can be combined as you would expect with arrays of +labels or even boolean vectors: .. ipython:: python - :suppress: - old_d = d - del d - -.. ipython:: python + df.ix[df['A'] > 0, 'B'] + df.ix[date1:date2, 'B'] + df.ix[date1, 'B'] - # get all rows where columns "a" and "b" have overlapping values - df = DataFrame({'a': list('aabbccddeeff'), 'b': list('aaaabbbbcccc'), - 'c': randint(5, size=12), 'd': randint(9, size=12)}) - df - df['a in b'] +Slicing with labels is closely related to the ``truncate`` method which does +precisely ``.ix[start:stop]`` but returns a copy (for legacy reasons). 
- # How you'd do it in pure Python - df[df.a.isin(df.b)] +The :meth:`~pandas.DataFrame.select` Method +------------------------------------------- - df['a not in b'] +Another way to extract slices from an object is with the ``select`` method of +Series, DataFrame, and Panel. This method should be used only when there is no +more direct way. ``select`` takes a function which operates on labels along +``axis`` and returns a boolean. For instance: - # pure Python - df[~df.a.isin(df.b)] +.. ipython:: python + df.select(lambda x: x == 'A', axis=1) -You can, of course, combine this with other expressions for very succinct -queries: +The :meth:`~pandas.DataFrame.lookup` Method +------------------------------------------- +Sometimes you want to extract a set of values given a sequence of row labels +and column labels, and the ``lookup`` method allows for this and returns a +numpy array. For instance, .. ipython:: python - # rows where cols a and b have overlapping values and col c's values are less than col d's - df['a in b and c < d'] + dflookup = DataFrame(np.random.rand(20,4), columns = ['A','B','C','D']) + dflookup.lookup(list(range(0,10,2)), ['B','C','A','B','D']) - # pure Python - df[df.b.isin(df.a) & (df.c < df.d)] +Setting values in mixed-type DataFrame +-------------------------------------- +.. _indexing.mixed_type_setting: -.. note:: +Setting values on a mixed-type DataFrame or Panel is supported when using +scalar values, though setting arbitrary vectors is not yet supported: - Note that ``in`` and ``not in`` are evaluated in Python, since ``numexpr`` - has no equivalent of this operation. However, **only the** ``in``/``not in`` - **expression itself** is evaluated in vanilla Python. For example, in the - expression +.. ipython:: python - .. code-block:: python + df2 = df[:4] + df2['foo'] = 'bar' + print df2 + df2.ix[2] = np.nan + print df2 + print df2.dtypes - df['a in b + c + d'] +.. 
_indexing.view_versus_copy: - ``(b + c + d)`` is evaluated by ``numexpr`` and *then* the ``in`` - operation is evaluated in plain Python. In general, any operations that can - be evaluated using ``numexpr`` will be. +Returning a view versus a copy +------------------------------ -Special use of the ``==`` operator with ``list`` objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The rules about when a view on the data is returned are entirely dependent on +NumPy. Whenever an array of labels or a boolean vector are involved in the +indexing operation, the result will be a copy. With single label / scalar +indexing and slicing, e.g. ``df.ix[3:6]`` or ``df.ix[:, 'A']``, a view will be +returned. -Comparing a ``list`` of values to a column using ``==``/``!=`` works similarly -to ``in``/``not in`` +In chained expressions, the order may determine whether a copy is returned or not: .. ipython:: python - df['b == ["a", "b", "c"]'] - # pure Python - df[df.b.isin(["a", "b", "c"])] + dfb = DataFrame({'a' : ['one', 'one', 'two', + 'three', 'two', 'one', 'six'], + 'b' : ['x', 'y', 'y', + 'x', 'y', 'x', 'x'], + 'c' : randn(7)}) - df['c == [1, 2]'] - df['c != [1, 2]'] + # goes to copy (will be lost) + dfb[dfb.a.str.startswith('o')]['c'] = 42 - # using in/not in - df['[1, 2] in c'] + # passed via reference (will stay) + dfb['c'][dfb.a.str.startswith('o')] = 42 - df['[1, 2] not in c'] +A chained assignment can also crop up in setting in a mixed dtype frame. - # pure Python - df[df.c.isin([1, 2])] +.. note:: + These setting rules apply to all of ``.loc/.iloc/.ix`` -Boolean Operators -~~~~~~~~~~~~~~~~~ +This is the correct access method -You can negate boolean expressions with the word ``not`` or the ``~`` operator. +.. ipython:: python + + dfc = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) + dfc_copy = dfc.copy() + dfc_copy.loc[0,'A'] = 11 + dfc_copy + +This *can* work at times, but is not guaranteed, and so should be avoided .. 
ipython:: python - df = DataFrame(rand(n, 3), columns=list('abc')) - df['bools'] = rand(len(df)) > 0.5 - df['~bools'] - df['not bools'] - df['not bools'] == df['~bools'] - df['not bools'] == df[~df.bools] + dfc_copy = dfc.copy() + dfc_copy['A'][0] = 111 + dfc_copy -Of course, expressions can be arbitrarily complex too +This will **not** work at all, and so should be avoided .. ipython:: python - # short query syntax - shorter = df['a < b < c and (not bools) or bools > 2'] + dfc_copy = dfc.copy() + dfc_copy.loc[0]['A'] = 1111 + dfc_copy - # equivalent in pure Python - longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] +When assigning values to subsets of your data, thus, make sure to either use the +pandas access methods or explicitly handle the assignment creating a copy. - shorter - longer +Fallback indexing +----------------- - shorter == longer +.. _indexing.fallback: -.. ipython:: python - :suppress: +Float indexes should be used only with caution. If you have a float indexed +``DataFrame`` and try to select using an integer, the row that Pandas returns +might not be what you expect. Pandas first attempts to use the *integer* +as a *label* location, but fails to find a match (because the types +are not equal). Pandas then falls back to positional indexing. - d = old_d - del old_d +.. ipython:: python + df = pd.DataFrame(np.random.randn(4,4), + columns=list('ABCD'), index=[1.0, 2.0, 3.0, 4.0]) + df + df.ix[1] -Perfomance of ``DataFrame.query()`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +To select the row you do expect, instead use a float label or +use ``iloc``. -``DataFrame.query()`` using ``numexpr`` is slightly faster than Python for -large frames +.. ipython:: python -.. image:: _static/query-perf.png -..
note:: +Instead of using a float index, it is often better to +convert to an integer index: - You will only see the performance benefits of using the ``numexpr`` engine - with ``DataFrame.query()`` if your frame has more than approximately 50,000 - rows +.. ipython:: python - .. image:: _static/query-perf-small.png + df_new = df.reset_index() + df_new[df_new['index'] == 1.0] + # now you can also do "float selection" + df_new[(df_new['index'] >= 1.0) & (df_new['index'] < 2)] -This plot was created using a ``DataFrame`` with 3 columns each containing -floating point values generated using ``numpy.random.randn()``. .. _indexing.class: @@ -1361,8 +1375,8 @@ operators: a & b a - b -``isin`` method of Index objects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The ``isin`` method of Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ One additional operation is the ``isin`` method that works analogously to the ``Series.isin`` method found :ref:`here `. @@ -1567,7 +1581,7 @@ mailing list. .. _indexing.xs: Cross-section with hierarchical index -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The ``xs`` method of ``DataFrame`` additionally takes a level argument to make selecting data at a particular level of a MultiIndex easier. @@ -1599,8 +1613,8 @@ instance: print df2_aligned -The need for sortedness -~~~~~~~~~~~~~~~~~~~~~~~ +The need for sortedness with :class:`~pandas.MultiIndex` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ **Caveat emptor**: the present implementation of ``MultiIndex`` requires that the labels be sorted for some of the slicing / indexing routines to work @@ -1672,8 +1686,8 @@ However: ... 
KeyError: Key length (3) was greater than MultiIndex lexsort depth (2) -Swapping levels with ``swaplevel`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Swapping levels with :meth:`~pandas.MultiIndex.swaplevel` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The ``swaplevel`` function can switch the order of two levels: @@ -1684,8 +1698,8 @@ The ``swaplevel`` function can switch the order of two levels: .. _indexing.reorderlevels: -Reordering levels with ``reorder_levels`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Reordering levels with :meth:`~pandas.MultiIndex.reorder_levels` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The ``reorder_levels`` function generalizes the ``swaplevel`` function, allowing you to permute the hierarchical index levels in one step: @@ -1717,7 +1731,7 @@ if you compute the levels and labels yourself, please be careful. Setting index metadata (``name(s)``, ``levels``, ``labels``) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------------------------------ .. _indexing.set_metadata: @@ -1746,7 +1760,7 @@ add an index after you've already done so. There are a couple of different ways. Add an index using DataFrame columns -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------ .. _indexing.set_index: @@ -1789,7 +1803,7 @@ the index in-place (without creating a new object): data Remove / reset the index, ``reset_index`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +------------------------------------------ As a convenience, there is a new function on DataFrame called ``reset_index`` which transfers the index values into the DataFrame's columns and sets a simple @@ -1820,7 +1834,7 @@ discards the index, instead of putting index values in the DataFrame's columns. deprecated. 
Adding an ad hoc index -~~~~~~~~~~~~~~~~~~~~~~ +---------------------- If you create an index yourself, you can just assign it to the ``index`` field: @@ -1833,9 +1847,9 @@ Indexing internal details .. note:: - The following is largely relevant for those actually working on the pandas - codebase. And the source code is still the best place to look at the - specifics of how things are implemented. + The following is largely relevant for those actually working on the pandas + codebase. The source code is still the best place to look at the specifics + of how things are implemented. In pandas there are a few objects implemented which can serve as valid containers for the axis labels: @@ -1847,6 +1861,8 @@ containers for the axis labels: - ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer data, such as time stamps - ``MultiIndex``: the standard hierarchical index object + - ``PeriodIndex``: An Index object with Period elements + - ``DatetimeIndex``: An Index object with Timestamp elements - ``date_range``: fixed frequency date range generated from a time rule or DateOffset. An ndarray of Python datetime objects diff --git a/doc/source/release.rst b/doc/source/release.rst index 0ed1f39d72cb5..b8a817a00403c 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -294,7 +294,15 @@ See :ref:`Internal Refactoring` Experimental Features ~~~~~~~~~~~~~~~~~~~~~ -.. _release:bug_fixes-0.13.0: +- The new :func:`~pandas.eval` function implements expression evaluation using + ``numexpr`` behind the scenes. This results in large speedups for complicated + expressions involving large DataFrames/Series. +- :class:`~pandas.DataFrame` has a new :meth:`~pandas.DataFrame.eval` that + evaluates an expression in the context of the ``DataFrame``. +- A :meth:`~pandas.DataFrame.query` method has been added that allows + you to select elements of a ``DataFrame`` using a natural query syntax nearly + identical to Python syntax. 
+ Bug Fixes ~~~~~~~~~ diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 4f43cd5e0120c..694281b813c3b 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -362,10 +362,10 @@ Enhancements - DataFrame constructor now accepts a numpy masked record array (:issue:`3478`) -.. _whatsnew_0130.enhancingperf: +.. _whatsnew_0130.experimental: -Performance Enhancments -~~~~~~~~~~~~~~~~~~~~~~~ +Experimental +~~~~~~~~~~~~ - :func:`~pandas.eval`: @@ -439,7 +439,7 @@ Performance Enhancments n = 20 df = DataFrame(randint(n, size=(n, 3)), columns=['a', 'b', 'c']) - df['a < b < c'] + df.query('a < b < c') selects all the rows of ``df`` where ``a < b < c`` evaluates to ``True``. For more details see the :ref:`indexing documentation on query diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 753613e5caea9..9ffae5edd93bc 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -571,5 +571,3 @@ def maybe_expression(s): # make sure we have an op at least return any(op in s for op in ops) - - diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0f6323aef96f0..4a1155abf436e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1927,7 +1927,9 @@ def query(self, expr, **kwargs): ---------- expr : string The query string to evaluate. The result of the evaluation of this - expression is passed to + expression is first passed to :attr:`~pandas.DataFrame.loc` and if + that fails because of a multidimensional key (e.g., a DataFrame) + then the result will be passed to :meth:`~pandas.DataFrame.__getitem__`. 
kwargs : dict See the documentation for :func:`~pandas.eval` for complete details From ab60f4b1b05760f0b996cc0d9b553301c91a6dee Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 16 Sep 2013 14:23:04 -0400 Subject: [PATCH 16/16] CLN: remove expr maybe_expression --- pandas/computation/expr.py | 40 -------------------------------------- pandas/core/frame.py | 2 +- 2 files changed, 1 insertion(+), 41 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index db92e7f57677b..ff9adc26b8201 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -749,46 +749,6 @@ def add_resolvers_to_locals(self): self.env.locals.update(self.env.resolver_dict) -# these we don't look for since column names can have these characters -_needs_filter = frozenset(['and', 'or', 'not', 'not in', 'in']) - -# these OTOH can only be operators, so you cannot create column names that are -# valid expressions -_ops_to_filter = frozenset([' and ', ' or ', 'not ', ' in ']) - -# if you don't filter out the above expressions you'll get a stack overflow, -# because DataFrame.__getitem__ will continue to search for a column name then -# an expression then a column name then an expression, and so on, until you -# blow up the stack and kill a kitten. - - -def maybe_expression(s, kind='pandas'): - """Loose checking if ``s`` is an expression. - - Parameters - ---------- - s : str or unicode - The expression to check - kind : str or unicode - The parser whose ops to check - - Returns - ------- - bool - ``True`` the expression contains some operators that would be valid - when parsed with the ``kind`` parser, otherwise ``False``. 
- """ - if not isinstance(s, string_types): - return False - - visitor = _parsers[kind] - ops = visitor.binary_ops + visitor.unary_ops - filtered = (frozenset(ops) | _ops_to_filter) - _needs_filter - - # make sure we have an op at least - return any(op in s for op in filtered) - - def isexpr(s, check_names=True): """Strict checking for a valid expression.""" try: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4a1155abf436e..86565a3a1d9e5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -37,7 +37,7 @@ from pandas.core.series import Series, _radd_compat import pandas.computation.expressions as expressions from pandas.computation.eval import eval as _eval -from pandas.computation.expr import maybe_expression, _ensure_scope +from pandas.computation.expr import _ensure_scope from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback)