From 89a03bea1e3846e0af520d8760a6be7f2516bfa3 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 15 Jun 2013 21:34:56 -0400 Subject: [PATCH 01/48] ENH: add new computation module and toplevel eval function --- pandas/__init__.py | 1 + pandas/computation/__init__.py | 0 pandas/computation/api.py | 1 + pandas/computation/common.py | 11 + pandas/computation/engines.py | 290 ++++++++++ pandas/computation/eval.py | 75 +++ pandas/computation/expr.py | 135 +++++ pandas/{core => computation}/expressions.py | 72 +-- pandas/computation/ops.py | 188 +++++++ pandas/computation/tests/__init__.py | 0 pandas/computation/tests/test_eval.py | 552 +++++++++++++++++++ pandas/computation/tests/test_expressions.py | 157 ++++++ pandas/core/frame.py | 4 +- pandas/core/internals.py | 2 +- pandas/tests/test_expressions.py | 203 ------- setup.py | 3 +- vb_suite/binary_ops.py | 12 +- vb_suite/indexing.py | 4 +- 18 files changed, 1465 insertions(+), 245 deletions(-) create mode 100644 pandas/computation/__init__.py create mode 100644 pandas/computation/api.py create mode 100644 pandas/computation/common.py create mode 100644 pandas/computation/engines.py create mode 100644 pandas/computation/eval.py create mode 100644 pandas/computation/expr.py rename pandas/{core => computation}/expressions.py (75%) create mode 100644 pandas/computation/ops.py create mode 100644 pandas/computation/tests/__init__.py create mode 100644 pandas/computation/tests/test_eval.py create mode 100644 pandas/computation/tests/test_expressions.py delete mode 100644 pandas/tests/test_expressions.py diff --git a/pandas/__init__.py b/pandas/__init__.py index a0edb397c28c1..bec0877b13bb8 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -29,6 +29,7 @@ from pandas.stats.api import * from pandas.tseries.api import * from pandas.io.api import * +from pandas.computation.api import eval from pandas.util.testing import debug diff --git a/pandas/computation/__init__.py b/pandas/computation/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/computation/api.py b/pandas/computation/api.py new file mode 100644 index 0000000000000..86f72902a52c8 --- /dev/null +++ b/pandas/computation/api.py @@ -0,0 +1 @@ +from pandas.computation.eval import eval diff --git a/pandas/computation/common.py b/pandas/computation/common.py new file mode 100644 index 0000000000000..4061984dd5e08 --- /dev/null +++ b/pandas/computation/common.py @@ -0,0 +1,11 @@ +import collections +from pandas.core.common import is_string + + +def flatten(l): + for el in l: + if isinstance(el, collections.Iterable) and not is_string(el): + for s in flatten(el): + yield s + else: + yield el diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py new file mode 100644 index 0000000000000..0eb9875b85549 --- /dev/null +++ b/pandas/computation/engines.py @@ -0,0 +1,290 @@ +import abc +from functools import partial +from itertools import izip + +import numpy as np + +import pandas as pd +import pandas.core.common as com +from pandas.computation.ops import _resolve_name, _update_names +from pandas.computation.common import flatten + + +def _align_core_single_unary_op(term): + if isinstance(term, np.ndarray) and not com.is_series(term): + typ = np.asanyarray + else: + typ = type(term) + ret = typ, [term] + + if not hasattr(term, 'axes'): + ret += None, + else: + ret += _zip_axes_from_type(typ, term.axes), + return ret + + +def _zip_axes_from_type(typ, new_axes): + axes = {} + for ax_ind, ax_name in typ._AXIS_NAMES.iteritems(): + 
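        # map each positional axis back to its name (e.g. 0 -> 'index',
        # 1 -> 'columns' for a DataFrame) so the realigned axes can later be
        # fed back to the constructor as keyword arguments, i.e.
        # typ(obj, **axes) in _reconstruct_object below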
axes[ax_name] = new_axes[ax_ind] + return axes + + +def _maybe_promote_shape(values, naxes): + # test to see if we have an array else leave since must be a number + if not isinstance(values, np.ndarray): + return values + + ndims = values.ndim + if ndims > naxes: + raise AssertionError('cannot have more dims than axes, ' + '{0} > {1}'.format(ndims, naxes)) + if ndims == naxes: + return values + + ndim = set(xrange(ndims)) + nax = set(xrange(naxes)) + + axes_slice = [slice(None)] * naxes + + # symmetric difference + slices = nax - ndim + + if ndims == naxes: + if slices: + raise AssertionError('slices should be empty if ndims == naxes ' + '{0}'.format(slices)) + else: + if not slices: + raise AssertionError('slices should NOT be empty if ndim != naxes ' + '{0}'.format(slices)) + + for sl in slices: + axes_slice[sl] = np.newaxis + + return values[tuple(axes_slice)] + + +def _align_core(terms): + # need to ensure that terms is not an iterator + terms = list(terms) + + ## special cases + + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) + # only scalars + elif all(np.isscalar(term) for term in terms): + return np.result_type(*terms), terms, None + + # single dim ndarrays + all_has_size = all(hasattr(term, 'size') for term in terms) + if (all_has_size and all(term.size == 1 for term in terms)): + return np.result_type(*terms), terms, None + + # made it past the special cases + term_index = [i for i, term in enumerate(terms) if hasattr(term, 'axes')] + term_dims = [terms[i].ndim for i in term_index] + ndims = pd.Series(dict(zip(term_index, term_dims))) + + # initial axes are the axes of the largest-axis'd term + biggest = terms[ndims.idxmax()] + typ = biggest._constructor + axes = biggest.axes + naxes = len(axes) + + for i in term_index: + for axis, items in enumerate(terms[i].axes): + if com.is_series(terms[i]) and naxes > 1: + axes[naxes - 1] = axes[naxes - 1].join(terms[i].index, + how='outer') + else: + axes[axis] = axes[axis].join(items, how='outer') + + for i, ndim in ndims.iteritems(): + for axis, items in izip(xrange(ndim), axes): + ti = terms[i] # needed here because we modify it in the inner loop + + if hasattr(ti, 'reindex_axis'): + transpose = com.is_series(ti) and naxes > 1 + + if transpose: + f = partial(ti.reindex, index=axes[naxes - 1], copy=False) + else: + f = partial(ti.reindex_axis, items, axis=axis, copy=False) + + if pd.lib.is_bool_array(ti.values): + r = f(fill_value=True) + else: + r = f() + + terms[i] = r + + res = _maybe_promote_shape(terms[i].T if transpose else terms[i], + naxes) + res = res.T if transpose else res + + try: + terms[i] = res.values + except AttributeError: + terms[i] = res + + return typ, terms, _zip_axes_from_type(typ, axes) + + +def _filter_terms(flat): + # numeric literals + literals = filter(lambda string: not com.is_string(string), flat) + literals_set = set(literals) + + # these are strings which are variable names + names = filter(com.is_string, flat) + names_set = set(names) + + # literals are not names and names are not literals, by definition + if literals_set & names_set: + raise AssertionError('literals cannot be names and names cannot be ' + 'literals') + return names, literals + + +def _align(terms, env): + # flatten the parse tree (a nested list) + flat = list(flatten(terms)) + + names, literals = _filter_terms(flat) + + # given an expression consisting of literals + if not names: + return np.result_type(*literals).type, None + + # get the variables out + resolve_in_env = partial(_resolve_name, 
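                             # bind the scope up front so each name resolves
                             # with a single call; a rough standalone sketch:
                             #   >>> from functools import partial
                             #   >>> resolve = partial(lambda env, k: env[k],
                             #   ...                   {'x': 1})
                             #   >>> resolve('x')
                             #   1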
env) + resolved = map(resolve_in_env, names) + + # if all resolved variables are numeric scalars + if all(map(np.isscalar, resolved)): + return np.result_type(*resolved).type, None + + # perform the main alignment + typ, resolved, axes = _align_core(resolved) + + # put them back in the symbol table + _update_names(env, dict(izip(names, resolved))) + + # we need this to reconstruct things after evaluation since we CANNOT + # depend on the array interface + return typ, axes + + +def _reconstruct_object(typ, obj, axes): + """Reconstruct an object given its type, raw value, and possibly empty + (None) axes. + + Parameters + ---------- + typ : object + A type + obj : object + The value to use in the type constructor + axes : dict + The axes to use to construct the resulting pandas object + + Returns + ------- + reconst : typ + An object of type ``typ`` with the value `obj` and possible axes + `axes`. + """ + try: + # handle numpy dtypes + typ = typ.type + except AttributeError: + pass + + if typ != np.asanyarray and issubclass(typ, pd.core.generic.PandasObject): + return typ(obj, **axes) + + ret_value = typ(obj) + + try: + return ret_value.item() + except (AttributeError, ValueError): + return ret_value + + +class AbstractEngine(object): + """""" + __metaclass__ = abc.ABCMeta + + has_neg_frac = False + + def __init__(self, expr): + self.expr = expr + self.aligned_axes = None + self.result_type = None + + @abc.abstractmethod + def convert(self): + """Convert an expression for evaluation.""" + pass + + def evaluate(self, env): + if not self._is_aligned: + self.result_type, self.aligned_axes = _align(self.expr.terms, env) + + res = self._evaluate(env) + return _reconstruct_object(self.result_type, res, self.aligned_axes) + + @property + def _is_aligned(self): + return self.aligned_axes is not None and self.result_type is not None + + @abc.abstractmethod + def _evaluate(self, env): + """Return an evaluated expression.""" + pass + + +class NumExprEngine(AbstractEngine): + """NumExpr engine class""" + has_neg_frac = True + + def __init__(self, expr): + super(NumExprEngine, self).__init__(expr) + + def convert(self): + """Return a string""" + return str(self.expr) + + def _evaluate(self, env): + import numexpr as ne + + try: + return ne.evaluate(self.convert(), local_dict=env.locals, + global_dict=env.globals, + truediv=self.expr.truediv) + except KeyError as e: + raise NameError('{0!r} is not defined'.format(e.message)) + + +class PythonEngine(AbstractEngine): + """Use NumPy even if numexpr is installed""" + has_neg_frac = False + + def __init__(self, expr): + super(PythonEngine, self).__init__(expr) + + def convert(self): + pass + + def evaluate(self, env): + return self.expr(env) + + def _evaluate(self, env): + pass + + +_engines = {'numexpr': NumExprEngine, 'python': PythonEngine} diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py new file mode 100644 index 0000000000000..21348f221bc99 --- /dev/null +++ b/pandas/computation/eval.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python + +import sys +import numbers +import collections +import itertools + +import numpy as np + +Scope = collections.namedtuple('Scope', 'globals locals') + +import pandas.core.common as com +from pandas.computation.expr import Expr +from pandas.computation.engines import _engines + + +def _scope_has_series_and_frame_datetime_index(env): + from pandas import DatetimeIndex + series_index = frame_index = 0 + + for v in itertools.chain(env.locals.itervalues(), + env.globals.itervalues()): + series_index += 
com.is_series(v) and isinstance(v.index, DatetimeIndex) + frame_index += com.is_frame(v) and isinstance(v.index, DatetimeIndex) + return series_index, frame_index + + +def _maybe_convert_engine(env, engine): + assert isinstance(env, Scope), 'environment must be an instance of Scope' + assert isinstance(engine, basestring), 'engine name must be a string' + + ret = engine + + if all(_scope_has_series_and_frame_datetime_index(env)): + ret = 'python' + return ret + + +def eval(expr, engine='numexpr', truediv=True, local_dict=None, + global_dict=None): + # make sure we're passed a valid engine + if not engine in _engines: + raise KeyError('Invalid engine {0} passed, valid engines are' + ' {1}'.format(_engines.keys())) + + # 1 up in the call stack for locals/globals; see the documentation for the + # inspect module for why you must decrease the refcount of frame + frame = sys._getframe(1) + + try: + # get the globals and locals + gbl, lcl = global_dict or frame.f_globals, local_dict or frame.f_locals + + # shallow copy the scope so we don't overwrite everything + env = Scope(gbl.copy(), lcl.copy()) + + engine = _maybe_convert_engine(env, engine) + + # parse the expression + parsed_expr = Expr(expr, engine, truediv) + + # choose the engine + eng = _engines[engine] + + # construct the engine and evaluate + ret = eng(parsed_expr).evaluate(env) + finally: + del frame + + # sanity check for a number + if np.isscalar(ret): + if not isinstance(ret, (np.number, numbers.Number, np.bool_, bool)): + raise TypeError('scalar result must be numeric or bool, type is ' + '{0!r}'.format(ret.__class__.__name__)) + return ret diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py new file mode 100644 index 0000000000000..105c0a020a2ad --- /dev/null +++ b/pandas/computation/expr.py @@ -0,0 +1,135 @@ +import ast +from functools import partial + +from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops +from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms +from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms + + +class ExprParserError(Exception): + pass + + +class ExprVisitor(ast.NodeVisitor): + """Custom ast walker + """ + bin_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms + bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', 'BitOr', + 'Add', 'Sub', 'Mult', 'Div', 'Pow', 'FloorDiv') + bin_op_nodes_map = dict(zip(bin_ops, bin_op_nodes)) + + unary_ops = _unary_ops_syms + unary_op_nodes = 'UAdd', 'USub', 'Invert' + unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + + def __init__(self): + for bin_op in self.bin_ops: + setattr(self, 'visit_{0}'.format(self.bin_op_nodes_map[bin_op]), + lambda node, bin_op=bin_op: partial(BinOp, bin_op)) + + for unary_op in self.unary_ops: + setattr(self, + 'visit_{0}'.format(self.unary_op_nodes_map[unary_op]), + lambda node, unary_op=unary_op: partial(UnaryOp, unary_op)) + + def visit(self, node): + if not (isinstance(node, ast.AST) or isinstance(node, basestring)): + raise AssertionError('"node" must be an AST node or a string, you' + ' passed a(n) {0}'.format(node.__class__)) + if isinstance(node, basestring): + node = ast.fix_missing_locations(ast.parse(node)) + return super(ExprVisitor, self).visit(node) + + def visit_Module(self, node): + if len(node.body) != 1: + raise ExprParserError('only a single expression is allowed') + + expr = node.body[0] + if not isinstance(expr, ast.Expr): + raise SyntaxError('only expressions are allowed') + + return self.visit(expr) + + def visit_Expr(self, node): 
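    # ast.NodeVisitor dispatches on the node's class name, so an Expr node is
    # simply unwrapped into its value. A minimal sketch of the same pattern,
    # independent of pandas:
    #   >>> import ast
    #   >>> class V(ast.NodeVisitor):
    #   ...     def visit_Num(self, node):
    #   ...         return node.n
    #   >>> V().visit(ast.parse('42').body[0].value)
    #   42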
+ return self.visit(node.value) + + def visit_BinOp(self, node): + op = self.visit(node.op) + left = self.visit(node.left) + right = self.visit(node.right) + return op(left, right) + + def visit_UnaryOp(self, node): + op = self.visit(node.op) + return op(self.visit(node.operand)) + + def visit_Name(self, node): + return node.id + + def visit_Num(self, node): + return node.n + + def visit_Compare(self, node): + ops = node.ops + comps = node.comparators + if len(ops) != 1: + raise ExprParserError('chained comparisons not supported') + return self.visit(ops[0])(self.visit(node.left), self.visit(comps[0])) + + def visit_Call(self, node): + if not isinstance(node.func, ast.Name): + raise TypeError("Only named functions are supported") + + valid_ops = _reductions + _mathops + + if node.func.id not in valid_ops: + raise ValueError("Only {0} are supported".format(valid_ops)) + + raise NotImplementedError("function calls not yet supported") + + def visit_Attribute(self, node): + raise NotImplementedError("attribute access is not yet supported") + + def visit_Mod(self, node): + raise NotImplementedError("modulo operator not yet supported") + + +class Expr(object): + """Expr object for pandas + """ + def __init__(self, expr, engine, truediv): + self.expr = expr + self._visitor = ExprVisitor() + self.terms = self.parse() + self.engine = engine + self.truediv = truediv + + def __call__(self, env): + env.locals['truediv'] = self.truediv + return self.terms(env) + + def __repr__(self): + return '{0} -> {1}'.format(self.expr, self.terms) + + def __str__(self): + return self.expr + + def parse(self): + """return a Termset""" + try: + visited = self._visitor.visit(self.expr) + except SyntaxError as e: + raise e + return visited + + def align(self, env): + """align a set of Terms""" + return self.terms.align(env) + + +def isexpr(s): + try: + Expr(s, engine=None) + except SyntaxError: + return False + return True diff --git a/pandas/core/expressions.py b/pandas/computation/expressions.py similarity index 75% rename from pandas/core/expressions.py rename to pandas/computation/expressions.py index abe891b82410c..e1551f9b0548e 100644 --- a/pandas/core/expressions.py +++ b/pandas/computation/expressions.py @@ -5,6 +5,7 @@ Offer fast expression evaluation thru numexpr """ + import numpy as np try: @@ -14,17 +15,19 @@ _NUMEXPR_INSTALLED = False _USE_NUMEXPR = _NUMEXPR_INSTALLED -_evaluate = None -_where = None +_evaluate = None +_where = None # the set of dtypes that we will allow pass to numexpr -_ALLOWED_DTYPES = dict(evaluate = set(['int64','int32','float64','float32','bool']), - where = set(['int64','float64','bool'])) +_ALLOWED_DTYPES = dict( + evaluate=set(['int64', 'int32', 'float64', 'float32', 'bool']), + where=set(['int64', 'float64', 'bool'])) # the minimum prod shape that we will use numexpr -_MIN_ELEMENTS = 10000 +_MIN_ELEMENTS = 10000 + -def set_use_numexpr(v = True): +def set_use_numexpr(v=True): # set/unset to use numexpr global _USE_NUMEXPR if _NUMEXPR_INSTALLED: @@ -34,12 +37,13 @@ def set_use_numexpr(v = True): global _evaluate, _where if not _USE_NUMEXPR: _evaluate = _evaluate_standard - _where = _where_standard + _where = _where_standard else: _evaluate = _evaluate_numexpr - _where = _where_numexpr + _where = _where_numexpr + -def set_numexpr_threads(n = None): +def set_numexpr_threads(n=None): # if we are using numexpr, set the threads to n # otherwise reset try: @@ -53,24 +57,25 @@ def set_numexpr_threads(n = None): def _evaluate_standard(op, op_str, a, b, raise_on_error=True, **eval_kwargs): 
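    # plain-Python fallback used when numexpr is disabled, unavailable, or
    # not worthwhile for the operands; with op=operator.add this is just
    # a + b, and op_str/eval_kwargs are only meaningful on the numexpr path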
""" standard evaluation """ - return op(a,b) + return op(a, b) + def _can_use_numexpr(op, op_str, a, b, dtype_check): """ return a boolean if we WILL be using numexpr """ if op_str is not None: - + # required min elements (otherwise we are adding overhead) if np.prod(a.shape) > _MIN_ELEMENTS: # check for dtype compatiblity dtypes = set() - for o in [ a, b ]: - if hasattr(o,'get_dtype_counts'): + for o in [a, b]: + if hasattr(o, 'get_dtype_counts'): s = o.get_dtype_counts() if len(s) > 1: return False dtypes |= set(s.index) - elif isinstance(o,np.ndarray): + elif isinstance(o, np.ndarray): dtypes |= set([o.dtype.name]) # allowed are a superset @@ -85,9 +90,9 @@ def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False, **eval_kwargs): if _can_use_numexpr(op, op_str, a, b, 'evaluate'): try: a_value, b_value = a, b - if hasattr(a_value,'values'): + if hasattr(a_value, 'values'): a_value = a_value.values - if hasattr(b_value,'values'): + if hasattr(b_value, 'values'): b_value = b_value.values result = ne.evaluate('a_value %s b_value' % op_str, local_dict={ 'a_value' : a_value, @@ -98,33 +103,35 @@ def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False, **eval_kwargs): pass except (Exception), detail: if raise_on_error: - raise TypeError(str(detail)) + raise if result is None: - result = _evaluate_standard(op,op_str,a,b,raise_on_error) + result = _evaluate_standard(op, op_str, a, b, raise_on_error) return result -def _where_standard(cond, a, b, raise_on_error=True): + +def _where_standard(cond, a, b, raise_on_error=True): return np.where(cond, a, b) -def _where_numexpr(cond, a, b, raise_on_error = False): + +def _where_numexpr(cond, a, b, raise_on_error=False): result = None if _can_use_numexpr(None, 'where', a, b, 'where'): try: cond_value, a_value, b_value = cond, a, b - if hasattr(cond_value,'values'): + if hasattr(cond_value, 'values'): cond_value = cond_value.values - if hasattr(a_value,'values'): + if hasattr(a_value, 'values'): a_value = a_value.values - if hasattr(b_value,'values'): + if hasattr(b_value, 'values'): b_value = b_value.values result = ne.evaluate('where(cond_value,a_value,b_value)', - local_dict={ 'cond_value' : cond_value, - 'a_value' : a_value, - 'b_value' : b_value }, + local_dict={'cond_value': cond_value, + 'a_value': a_value, + 'b_value': b_value}, casting='safe') except (ValueError), detail: if 'unknown type object' in str(detail): @@ -134,7 +141,7 @@ def _where_numexpr(cond, a, b, raise_on_error = False): raise TypeError(str(detail)) if result is None: - result = _where_standard(cond,a,b,raise_on_error) + result = _where_standard(cond, a, b, raise_on_error) return result @@ -152,8 +159,9 @@ def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kw op_str: the string version of the op a : left operand b : right operand - raise_on_error : pass the error to the higher level if indicated (default is False), - otherwise evaluate the op with and return the results + raise_on_error : pass the error to the higher level if indicated + (default is False), otherwise evaluate the op with and + return the results use_numexpr : whether to try to use numexpr (default True) """ @@ -161,6 +169,7 @@ def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kw return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, **eval_kwargs) return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error) + def where(cond, a, b, raise_on_error=False, use_numexpr=True): """ evaluate the where condition cond on a and b @@ 
-170,8 +179,9 @@ def where(cond, a, b, raise_on_error=False, use_numexpr=True): cond : a boolean array a : return if cond is True b : return if cond is False - raise_on_error : pass the error to the higher level if indicated (default is False), - otherwise evaluate the op with and return the results + raise_on_error : pass the error to the higher level if indicated + (default is False), otherwise evaluate the op with and + return the results use_numexpr : whether to try to use numexpr (default True) """ diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py new file mode 100644 index 0000000000000..fb1965f45c52b --- /dev/null +++ b/pandas/computation/ops.py @@ -0,0 +1,188 @@ +import operator as op +from functools import partial + +from pandas.util.py3compat import PY3 + + +_reductions = 'sum', 'prod' +_mathops = 'sin', 'cos', 'tan' + + +class OperatorError(Exception): + pass + + +class UnaryOperatorError(OperatorError): + pass + + +class BinaryOperatorError(OperatorError): + pass + + +def _resolve_name(env, key): + res = env.locals.get(key, env.globals.get(key)) + + if res is None: + if not isinstance(key, basestring): + return key + + raise NameError('{0!r} is undefined'.format(key)) + + return res + + +def _update_name(env, key, value): + if isinstance(key, basestring): + try: + del env.locals[key] + env.locals[key] = value + except KeyError: + try: + del env.globals[key] + env.globals[key] = value + except KeyError: + raise NameError('{0!r} is undefined'.format(key)) + + +def _update_names(env, mapping): + updater = partial(_update_name, env) + for key, value in mapping.iteritems(): + updater(key, value) + + +class Op(object): + """Hold an operator of unknown arity + """ + def __init__(self, op, operands): + self.op = op + self.operands = operands + + def __iter__(self): + return iter(self.operands) + + @property + def name(self): + return self.__class__.__name__ + + +_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=' +_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne +_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) + +_bool_ops_syms = '&', '|' +_bool_ops_funcs = op.and_, op.or_ +_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) + +_arith_ops_syms = '+', '-', '*', '/', '**', '//' +_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv if PY3 else op.div, + op.pow, op.floordiv) +_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) + +_binary_ops_dict = {} + +for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict): + _binary_ops_dict.update(d) + + +class BinOp(Op): + """Hold a binary operator and its operands + + Parameters + ---------- + op : str or Op + left : str or Op + right : str or Op + """ + def __init__(self, op, lhs, rhs): + super(BinOp, self).__init__(op, (lhs, rhs)) + self.lhs = lhs + self.rhs = rhs + + try: + self.func = _binary_ops_dict[op] + except KeyError: + keys = _binary_ops_dict.keys() + raise BinaryOperatorError('Invalid binary operator {0}, valid' + ' operators are {1}'.format(op, keys)) + + def __repr__(self): + return '{0}(op={1!r}, lhs={2!r}, rhs={3!r})'.format(self.name, self.op, + self.lhs, self.rhs) + + __str__ = __repr__ + + def __call__(self, env): + # handle truediv + if self.op == '/' and env.locals['truediv']: + self.func = op.truediv + + # recurse over the left nodes + try: + left = self.lhs(env) + except TypeError: + left = self.lhs + + # recursve over the right nodes + try: + right = self.rhs(env) + except TypeError: + right = self.rhs + + # base cases + if not (isinstance(left, basestring) or 
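            # a string operand here is an unresolved variable name (numeric
            # literals were already substituted), so each string branch below
            # looks the name up in the scope before applying self.func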
isinstance(right, basestring)): + res = self.func(left, right) + elif isinstance(left, basestring) and not isinstance(right, + basestring): + res = self.func(_resolve_name(env, left), right) + elif not isinstance(left, basestring) and isinstance(right, + basestring): + res = self.func(left, _resolve_name(env, right)) + elif isinstance(left, basestring) and isinstance(right, basestring): + res = self.func(_resolve_name(env, left), _resolve_name(env, + right)) + + return res + + +_unary_ops_syms = '+', '-', '~' +_unary_ops_funcs = op.pos, op.neg, op.invert +_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) + + +class UnaryOp(Op): + """Hold a unary operator and its operands + """ + def __init__(self, op, operand): + super(UnaryOp, self).__init__(op, (operand,)) + self.operand = operand + + try: + self.func = _unary_ops_dict[op] + except KeyError: + raise UnaryOperatorError('Invalid unary operator {0}, valid ' + 'operators are ' + '{1}'.format(op, _unary_ops_syms)) + + def __call__(self, env): + operand = self.operand + try: + operand = self.operand(env) + except TypeError: + operand = self.operand + + if isinstance(operand, basestring): + v = _resolve_name(env, operand) + else: + v = operand + + try: + res = self.func(v) + except TypeError: + res = self.func(v.values) + + return res + + def __repr__(self): + return '{0}(op={1!r}, operand={2!r})'.format(self.name, self.op, + self.operand) diff --git a/pandas/computation/tests/__init__.py b/pandas/computation/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py new file mode 100644 index 0000000000000..2d7bf4392cfea --- /dev/null +++ b/pandas/computation/tests/test_eval.py @@ -0,0 +1,552 @@ +#!/usr/bin/env python + +import itertools +from itertools import product + +import nose +from nose.tools import assert_raises, assert_tuple_equal, assert_equal +from nose.tools import assert_true + +from numpy.random import randn +import numpy as np +from numpy.testing import assert_array_equal +from numpy.testing.decorators import slow + +import pandas as pd +from pandas import DataFrame, Series +from pandas.util.testing import makeCustomDataframe as mkdf +from pandas.computation.engines import (_engines, _align_core, + _reconstruct_object) +from pandas.computation.ops import _binary_ops_dict, _unary_ops_dict +import pandas.computation.expr as expr +from pandas.computation.expressions import _USE_NUMEXPR +from pandas.computation.eval import Scope +from pandas.computation.eval import _scope_has_series_and_frame_datetime_index +from pandas.computation.eval import _maybe_convert_engine +from pandas.util.testing import assert_frame_equal, randbool + + +def skip_numexpr_engine(engine): + if not _USE_NUMEXPR and engine == 'numexpr': + raise nose.SkipTest + + +def engine_has_neg_frac(engine): + return _engines[engine].has_neg_frac + + +def fractional(x): + frac, _ = np.modf(np.asanyarray(x)) + return frac + + +def hasfractional(x): + return np.any(fractional(x) != 0.0) + + +def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2): + f1 = _binary_ops_dict[cmp1] + f2 = _binary_ops_dict[cmp2] + bf = _binary_ops_dict[binop] + typ, (lhs, rhs), axes = _align_core((lhs, rhs)) + return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes) + + +def _eval_single_bin(lhs, cmp1, rhs, has_neg_frac): + c = _binary_ops_dict[cmp1] + if has_neg_frac: + try: + result = c(lhs, rhs) + except ValueError: + result = np.nan + else: + result = c(lhs, rhs) + 
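    # engines flagged ``has_neg_frac`` swallow the error from a negative base
    # with a fractional exponent and produce NaN, so the expected value above
    # mirrors that by catching ValueError instead of propagating it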
return result + + +def isframe(x): + return isinstance(x, pd.DataFrame) + + +def isseries(x): + return isinstance(x, pd.Series) + + +def are_compatible_types(op, lhs, rhs): + if op in ('&', '|'): + if isframe(lhs) and isseries(rhs) or isframe(rhs) and isseries(lhs): + return False + return True + + +def _eval_bin_and_unary(unary, lhs, arith1, rhs): + binop = _binary_ops_dict[arith1] + unop = expr._unary_ops_dict[unary] + return unop(binop(lhs, rhs)) + + +# Smoke testing +class TestBasicEval(object): + + @classmethod + def setUpClass(self): + self.cmp_ops = expr._cmp_ops_syms + self.cmp2_ops = self.cmp_ops[::-1] + self.bin_ops = expr._bool_ops_syms + self.arith_ops = tuple(o for o in expr._arith_ops_syms if o != '//') + self.unary_ops = '+', '-' + + def set_current_engine(self): + self.engine = 'numexpr' + + def setup_data(self): + self.lhses = (DataFrame(randn(10, 5)), Series(randn(5)), randn(), + np.float64(randn())) + self.rhses = (DataFrame(randn(10, 5)), Series(randn(5)), randn(), + np.float64(randn())) + + def setUp(self): + try: + import numexpr as ne + self.ne = ne + except ImportError: + raise nose.SkipTest + self.set_current_engine() + self.setup_data() + self.current_engines = filter(lambda x: x != self.engine, + _engines.iterkeys()) + + @slow + def test_complex_cmp_ops(self): + self.setUp() + lhses, rhses = self.lhses, self.rhses + args = itertools.product(lhses, self.cmp_ops, rhses, self.bin_ops, + self.cmp2_ops) + for lhs, cmp1, rhs, binop, cmp2 in args: + self._create_cmp_op_t(lhs, cmp1, rhs, binop, cmp2) + + def test_simple_cmp_ops(self): + bool_lhses = (DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), randbool()) + bool_rhses = (DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), randbool()) + args = itertools.product(bool_lhses, bool_rhses, self.cmp_ops) + for lhs, rhs, cmp_op in args: + self._create_simple_cmp_op_t(lhs, rhs, cmp_op) + + def test_binary_arith_ops(self): + self.setUp() + lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + args = itertools.product(lhses, self.arith_ops, rhses) + for lhs, op, rhs in args: + self._create_arith_op_t(lhs, op, rhs) + + def test_unary_arith_ops(self): + self.setUp() + lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + aops = tuple(aop for aop in self.arith_ops if aop not in '+-') + args = itertools.product(self.unary_ops, lhses, aops, rhses) + for unary_op, lhs, arith_op, rhs in args: + self._create_unary_arith_op_t(unary_op, lhs, arith_op, rhs) + + def test_invert(self): + self.setUp() + lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + args = itertools.product(lhses, self.cmp_ops, rhses) + for lhs, op, rhs in args: + self._create_invert_op_t(lhs, op, rhs) + + def _create_cmp_op_t(self, lhs, cmp1, rhs, binop, cmp2): + ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, + binop=binop, + cmp2=cmp2) + expected = _eval_from_expr(lhs, cmp1, rhs, binop, cmp2) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + + def _create_simple_cmp_op_t(self, lhs, rhs, cmp1): + ex = 'lhs {0} rhs'.format(cmp1) + + if are_compatible_types(cmp1, lhs, rhs): + expected = _eval_single_bin(lhs, cmp1, rhs, + engine_has_neg_frac(self.engine)) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + else: + assert_raises(TypeError, _eval_single_bin, lhs, 
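                          # '&'/'|' between a frame and a series is the
                          # incompatible pairing detected above, so the raw
                          # binary op itself is expected to raise TypeError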
cmp1, rhs, + engine_has_neg_frac(self.engine)) + + def _create_arith_op_t(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + nan_frac_neg = (arith1 == '**' and np.any(lhs < 0) and + hasfractional(rhs) and np.isscalar(lhs) and + np.isscalar(rhs) and + not (isinstance(lhs, tuple(np.typeDict.values())) + or isinstance(rhs, tuple(np.typeDict.values())))) + if nan_frac_neg and not engine_has_neg_frac(self.engine): + assert_raises(ValueError, pd.eval, ex, engine=self.engine, + local_dict=locals(), global_dict=globals()) + else: + result = pd.eval(ex, engine=self.engine) + + if arith1 != '//': + expected = _eval_single_bin(lhs, arith1, rhs, + engine_has_neg_frac(self.engine)) + assert_array_equal(result, expected) + + # sanity check on recursive parsing + try: + ghs = rhs.copy() + except AttributeError: + ghs = rhs + + if nan_frac_neg and not engine_has_neg_frac(self.engine): + assert_raises(ValueError, pd.eval, ex, engine=self.engine, + local_dict=locals(), global_dict=globals()) + else: + if arith1 == '**': + ex = '(lhs {0} rhs) {0} ghs'.format(arith1) + else: + ex = 'lhs {0} rhs {0} ghs'.format(arith1) + result = pd.eval(ex, engine=self.engine) + + try: + nlhs = _eval_single_bin(lhs, arith1, rhs, + engine_has_neg_frac(self.engine)) + except ValueError: + assert_raises(ValueError, _eval_single_bin, lhs, arith1, rhs, + engine_has_neg_frac(self.engine)) + else: + try: + nlhs, ghs = nlhs.align(ghs) + except: + pass + if arith1 != '//': + expected = self.ne.evaluate('nlhs {0} ghs'.format(arith1)) + assert_array_equal(result, expected) + + def _create_invert_op_t(self, lhs, cmp1, rhs): + # simple + for el in (lhs, rhs): + try: + elb = el.astype(bool) + except AttributeError: + elb = np.array([bool(el)]) + expected = ~elb + result = pd.eval('~elb', engine=self.engine) + assert_array_equal(expected, result) + + for engine in self.current_engines: + assert_array_equal(result, pd.eval('~elb', engine=engine)) + + # compound + ex = '~(lhs {0} rhs)'.format(cmp1) + if np.isscalar(lhs) and np.isscalar(rhs): + lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) + expected = ~_eval_single_bin(lhs, cmp1, rhs, + engine_has_neg_frac(self.engine)) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(expected, result) + + # make sure the other engines work + for engine in self.current_engines: + ev = pd.eval(ex, engine=self.engine) + assert_array_equal(ev, result) + + def _create_unary_arith_op_t(self, unary_op, lhs, arith1, rhs): + # simple + ex = '{0}lhs'.format(unary_op, arith1) + f = _unary_ops_dict[unary_op] + bad_types = tuple(np.typeDict.values()) + + nan_frac_neg = (arith1 == '**' and + np.any(lhs < 0) and + hasfractional(rhs) and + np.isscalar(lhs) and np.isscalar(rhs) and + not (isinstance(lhs, bad_types) or + isinstance(rhs, bad_types)) + and not engine_has_neg_frac(self.engine)) + try: + expected = f(lhs.values) + except AttributeError: + expected = f(lhs) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + + for engine in self.current_engines: + assert_array_equal(result, pd.eval(ex, engine=engine)) + + ex = '{0}(lhs {1} rhs)'.format(unary_op, arith1) + + if nan_frac_neg: + assert_raises(ValueError, pd.eval, ex, engine=self.engine, + local_dict=locals(), global_dict=globals()) + else: + # compound + result = pd.eval(ex, engine=self.engine) + + #(lhs, rhs), _ = _align((lhs, rhs)) + #if arith1 != '//': + #expected = self.ne.evaluate(ex) + #assert_array_equal(result, expected) + #else: + #assert_raises(TypeError, self.ne.evaluate, ex) + + #for engine 
in self.current_engines: + #if arith1 != '//': + #if engine_has_neg_frac(engine): + #assert_array_equal(result, pd.eval(ex, engine=engine)) + #else: + #assert_raises(TypeError, pd.eval, ex, engine=engine, + #local_dict=locals(), global_dict=globals()) + + +class TestBasicEvalPython(TestBasicEval): + + @classmethod + def setUpClass(cls): + cls.cmp_ops = expr._cmp_ops_syms + cls.cmp2_ops = cls.cmp_ops[::-1] + cls.bin_ops = expr._bool_ops_syms + cls.arith_ops = expr._arith_ops_syms + cls.unary_ops = '+', '-' + + def set_current_engine(self): + self.engine = 'python' + + +def test_syntax_error_exprs(): + for engine in _engines: + e = 's +' + assert_raises(SyntaxError, pd.eval, e, engine=engine) + + +def test_name_error_exprs(): + for engine in _engines: + e = 's + t' + assert_raises(NameError, pd.eval, e, engine=engine) + + +def test_align_nested_unary_op(): + for engine in _engines: + yield check_align_nested_unary_op, engine + + +f = lambda *args, **kwargs: np.random.randn() + + +def check_align_nested_unary_op(engine): + skip_numexpr_engine(engine) + s = 'df * ~2' + df = mkdf(10, 10, data_gen_f=f) + res = pd.eval(s, engine) + assert_frame_equal(res, df * ~2) + + +def check_basic_frame_alignment(engine): + df = mkdf(10, 10, data_gen_f=f) + df2 = mkdf(20, 10, data_gen_f=f) + res = pd.eval('df + df2', engine=engine) + assert_frame_equal(res, df + df2) + + +def test_basic_frame_alignment(): + for engine in _engines: + yield check_basic_frame_alignment, engine + + +def check_medium_complex_frame_alignment(engine, r1, r2, c1, c2): + skip_numexpr_engine(engine) + df = mkdf(5, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(10, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + df3 = mkdf(15, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + res = pd.eval('df + df2 + df3', engine=engine) + assert_frame_equal(res, df + df2 + df3) + + +@slow +def test_medium_complex_frame_alignment(): + args = product(_engines, *([INDEX_TYPES[:4]] * 4)) + for engine, r1, r2, c1, c2 in args: + check_medium_complex_frame_alignment(engine, r1, r2, c1, c2) + + +def check_basic_frame_series_alignment(engine, r_idx_type, c_idx_type, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 'df + s', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('df + s', engine=engine) + expected = df + s + assert_frame_equal(res, expected) + + +def check_not_both_period_fails_otherwise_succeeds(lhs, rhs, r_idx_type, + c_idx_type, index_name, s, + df, *terms): + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, lhs, local_dict=locals()) + assert_raises(ValueError, pd.eval, rhs, local_dict=locals()) + else: + a, b = pd.eval(lhs), pd.eval(rhs) + assert_frame_equal(a, b) + + +def check_basic_series_frame_alignment(engine, r_idx_type, c_idx_type, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 's + df', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('s + df', engine=engine) 
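        # pd.eval should reproduce the semantics of the plain expression:
        # the Series aligns against the frame exactly as 's + df' does when
        # evaluated by Python, so compare against that result directly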
+ expected = s + df + assert_frame_equal(res, expected) + + +@slow +def check_basic_series_frame_alignment_datetime(engine, r_idx_type, c_idx_type, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 's + df', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('s + df', engine=engine) + expected = s + df + assert_frame_equal(res, expected) + + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 'df + s', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('df + s', engine=engine) + expected = df + s + assert_frame_equal(res, expected) + + +def check_series_frame_commutativity(engine, r_idx_type, c_idx_type, op, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + lhs = 's {0} df'.format(op) + rhs = 'df {0} s'.format(op) + check_not_both_period_fails_otherwise_succeeds(lhs, rhs, r_idx_type, + c_idx_type, index_name, s, + df) + + +INDEX_TYPES = 'i', 'f', 's', 'u', 'dt', # 'p' + + +@slow +def test_series_frame_commutativity(): + args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('+', '*'), ('index', + 'columns')) + for engine, r_idx_type, c_idx_type, op, index_name in args: + check_series_frame_commutativity(engine, r_idx_type, c_idx_type, op, + index_name) + + +def test_basic_frame_series_alignment(): + args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + check_basic_frame_series_alignment(engine, r_idx_type, c_idx_type, + index_name) + + +@slow +def test_basic_series_frame_alignment_datetime(): + idx_types = INDEX_TYPES + args = product(_engines, idx_types, idx_types, ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + check_basic_series_frame_alignment_datetime(engine, r_idx_type, + c_idx_type, index_name) + + +def test_basic_series_frame_alignment(): + args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + check_basic_series_frame_alignment(engine, r_idx_type, c_idx_type, + index_name) + + +def check_complex_series_frame_alignment(engine, index_name, obj, r1, r2, c1, + c2): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + index = getattr(locals()[obj], index_name) + s = Series(np.random.randn(5), index[:5]) + if engine != 'python': + expected = df2.add(s, axis=1).add(df) + else: + expected = df2 + s + df + res = pd.eval('df2 + s + df', engine=engine) + expected = df2 + s + df + assert_tuple_equal(res.shape, expected.shape) + assert_frame_equal(res, expected) + + +@slow +def test_complex_series_frame_alignment(): + args = product(_engines, ('index', 'columns'), ('df', 'df2'), + *([INDEX_TYPES[:4]] * 4)) + for engine, index_name, obj, r1, r2, c1, c2 in args: + check_complex_series_frame_alignment(engine, index_name, obj, r1, r2, + c1, c2) + + +def check_datetime_index_rows_punts_to_python(engine): + df = mkdf(10, 10, data_gen_f=f, r_idx_type='dt', 
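              # datetime labels on both axes: the scope inspection below
              # should see them and _maybe_convert_engine should punt to the
              # 'python' engine regardless of the engine requested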
c_idx_type='dt') + index = getattr(df, 'index') + s = Series(np.random.randn(5), index[:5]) + env = Scope(globals(), locals()) + assert_true(_scope_has_series_and_frame_datetime_index(env)) + assert_equal(_maybe_convert_engine(env, engine), 'python') + + +def test_datetime_index_rows_punts_to_python(): + for engine in _engines: + check_datetime_index_rows_punts_to_python(engine) + + +__var_s = randn(10) + + +def check_global_scope(engine): + e = '__var_s * 2' + assert_array_equal(__var_s * 2, pd.eval(e, engine=engine)) + + +def test_global_scope(): + for engine in _engines: + yield check_global_scope, engine + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/computation/tests/test_expressions.py b/pandas/computation/tests/test_expressions.py new file mode 100644 index 0000000000000..f197b8ef7a0ac --- /dev/null +++ b/pandas/computation/tests/test_expressions.py @@ -0,0 +1,157 @@ +# pylint: disable-msg=W0612,E1101 + +import unittest +import operator + +import nose + + +import numpy as np +from numpy.testing import assert_array_equal + +from pandas.core.api import DataFrame +from pandas.computation import expressions as expr + +if not expr._USE_NUMEXPR: + raise nose.SkipTest + +import numexpr as ne + + +_frame = DataFrame(np.random.randn(10000, 4), columns=list('ABCD'), + dtype='float64') +_frame2 = DataFrame(np.random.randn(100, 4), columns=list('ABCD'), + dtype='float64') +_mixed = DataFrame({'A': _frame['A'].copy(), + 'B': _frame['B'].astype('float32'), + 'C': _frame['C'].astype('int64'), + 'D': _frame['D'].astype('int32')}) +_mixed2 = DataFrame({'A': _frame2['A'].copy(), + 'B': _frame2['B'].astype('float32'), + 'C': _frame2['C'].astype('int64'), + 'D': _frame2['D'].astype('int32')}) + + +class TestExpressions(unittest.TestCase): + + _multiprocess_can_split_ = False + + def setUp(self): + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.mixed = _mixed.copy() + self.mixed2 = _mixed2.copy() + + def test_invalid(self): + # no op + result = expr._can_use_numexpr(operator.add, None, self.frame, + self.frame, 'evaluate') + self.assertFalse(result) + + # mixed + result = expr._can_use_numexpr( + operator.add, '+', self.mixed, self.frame, 'evaluate') + self.assertFalse(result) + + # min elements + result = expr._can_use_numexpr( + operator.add, '+', self.frame2, self.frame2, 'evaluate') + self.assertFalse(result) + + # ok, we only check on first part of expression + result = expr._can_use_numexpr( + operator.add, '+', self.frame, self.frame2, 'evaluate') + self.assert_(result) + + def test_binary_ops(self): + def testit(): + + for f, f2 in [(self.frame, self.frame2), + (self.mixed, self.mixed2)]: + + for op, op_str in [('add', '+'), ('sub', '-'), ('mul', '*'), + ('div', '/'), ('pow', '**')]: + + op = getattr(operator, op, None) + if op is not None: + result = expr._can_use_numexpr( + op, op_str, f, f, 'evaluate') + self.assert_(result == (not f._is_mixed_type)) + + result = expr.evaluate( + op, op_str, f, f, use_numexpr=True) + expected = expr.evaluate( + op, op_str, f, f, use_numexpr=False) + assert_array_equal(result, expected.values) + + result = expr._can_use_numexpr( + op, op_str, f2, f2, 'evaluate') + self.assertFalse(result) + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() + + def test_boolean_ops(self): + def testit(): + for f, f2 in [(self.frame, self.frame2), + (self.mixed, 
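                       # each pair holds a large frame (numexpr-eligible when
                       # not mixed-dtype) and a small one below _MIN_ELEMENTS,
                       # so both the accelerated and fallback paths run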
self.mixed2)]: + + f11 = f + f12 = f + 1 + + f21 = f2 + f22 = f2 + 1 + + for op, op_str in [('gt', '>'), ('lt', '<'), ('ge', '>='), + ('le', '<='), ('eq', '=='), ('ne', '!=')]: + + op = getattr(operator, op) + + result = expr._can_use_numexpr( + op, op_str, f11, f12, 'evaluate') + self.assert_(result == (not f11._is_mixed_type)) + + result = expr.evaluate( + op, op_str, f11, f12, use_numexpr=True) + expected = expr.evaluate( + op, op_str, f11, f12, use_numexpr=False) + assert_array_equal(result, expected.values) + + result = expr._can_use_numexpr( + op, op_str, f21, f22, 'evaluate') + self.assertFalse(result) + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() + + def test_where(self): + def testit(): + for f in [self.frame, self.frame2, self.mixed, self.mixed2]: + + for cond in [True, False]: + + c = np.empty(f.shape, dtype=np.bool_) + c.fill(cond) + result = expr.where(c, f.values, f.values + 1) + expected = np.where(c, f.values, f.values + 1) + assert_array_equal(result, expected) + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() + + diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 401a7746953cb..a8bb74f86a43e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -35,7 +35,7 @@ create_block_manager_from_arrays, create_block_manager_from_blocks) from pandas.core.series import Series, _radd_compat -import pandas.core.expressions as expressions +import pandas.computation.expressions as expressions from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.util.compat import OrderedDict from pandas.util import py3compat @@ -2652,6 +2652,8 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, passed MultiIndex level limit : int, default None Maximum size gap to forward or backward fill + fill_value : object, default NA + The value to use to fill in missing data. 
Examples -------- diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f23a89635aaf2..ab29a38760a51 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -14,7 +14,7 @@ import pandas.core.common as com import pandas.lib as lib import pandas.tslib as tslib -import pandas.core.expressions as expressions +import pandas.computation.expressions as expressions from pandas.tslib import Timestamp from pandas.util import py3compat diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py deleted file mode 100644 index ba0a9926dfa78..0000000000000 --- a/pandas/tests/test_expressions.py +++ /dev/null @@ -1,203 +0,0 @@ -# pylint: disable-msg=W0612,E1101 - -import unittest -import nose - -import operator -from numpy import random, nan -from numpy.random import randn -import numpy as np -from numpy.testing import assert_array_equal - -import pandas as pan -from pandas.core.api import DataFrame, Series, notnull, isnull -from pandas.core import expressions as expr - -from pandas.util.testing import (assert_almost_equal, - assert_series_equal, - assert_frame_equal) -from pandas.util import py3compat - -import pandas.util.testing as tm -import pandas.lib as lib - -from numpy.testing.decorators import slow - -if not expr._USE_NUMEXPR: - raise nose.SkipTest - -_frame = DataFrame(np.random.randn(10000, 4), columns = list('ABCD'), dtype='float64') -_frame2 = DataFrame(np.random.randn(100, 4), columns = list('ABCD'), dtype='float64') -_mixed = DataFrame({ 'A' : _frame['A'].copy(), 'B' : _frame['B'].astype('float32'), 'C' : _frame['C'].astype('int64'), 'D' : _frame['D'].astype('int32') }) -_mixed2 = DataFrame({ 'A' : _frame2['A'].copy(), 'B' : _frame2['B'].astype('float32'), 'C' : _frame2['C'].astype('int64'), 'D' : _frame2['D'].astype('int32') }) -_integer = DataFrame(np.random.randint(1, 100, size=(10001, 4)), columns = list('ABCD'), dtype='int64') - -class TestExpressions(unittest.TestCase): - - _multiprocess_can_split_ = False - - def setUp(self): - - self.frame = _frame.copy() - self.frame2 = _frame2.copy() - self.mixed = _mixed.copy() - self.mixed2 = _mixed2.copy() - self.integer = _integer.copy() - self._MIN_ELEMENTS = expr._MIN_ELEMENTS - - def tearDown(self): - expr._MIN_ELEMENTS = self._MIN_ELEMENTS - - #TODO: add test for Panel - #TODO: add tests for binary operations - @nose.tools.nottest - def run_arithmetic_test(self, df, assert_func, check_dtype=False): - expr._MIN_ELEMENTS = 0 - operations = ['add', 'sub', 'mul','mod','truediv','floordiv','pow'] - if not py3compat.PY3: - operations.append('div') - for arith in operations: - op = getattr(operator, arith) - expr.set_use_numexpr(False) - expected = op(df, df) - expr.set_use_numexpr(True) - result = op(df, df) - try: - if check_dtype: - if arith == 'div': - assert expected.dtype.kind == df.dtype.kind - if arith == 'truediv': - assert expected.dtype.kind == 'f' - assert_func(expected, result) - except Exception: - print("Failed test with operator %r" % op.__name__) - raise - - def test_integer_arithmetic(self): - self.run_arithmetic_test(self.integer, assert_frame_equal) - self.run_arithmetic_test(self.integer.icol(0), assert_series_equal, - check_dtype=True) - - def test_float_arithemtic(self): - self.run_arithmetic_test(self.frame, assert_frame_equal) - self.run_arithmetic_test(self.frame.icol(0), assert_series_equal, - check_dtype=True) - - def test_mixed_arithmetic(self): - self.run_arithmetic_test(self.mixed, assert_frame_equal) - for col in self.mixed.columns: - 
self.run_arithmetic_test(self.mixed[col], assert_series_equal) - - def test_integer_with_zeros(self): - self.integer *= np.random.randint(0, 2, size=np.shape(self.integer)) - self.run_arithmetic_test(self.integer, assert_frame_equal) - self.run_arithmetic_test(self.integer.icol(0), assert_series_equal) - - def test_invalid(self): - - # no op - result = expr._can_use_numexpr(operator.add, None, self.frame, self.frame, 'evaluate') - self.assert_(result == False) - - # mixed - result = expr._can_use_numexpr(operator.add, '+', self.mixed, self.frame, 'evaluate') - self.assert_(result == False) - - # min elements - result = expr._can_use_numexpr(operator.add, '+', self.frame2, self.frame2, 'evaluate') - self.assert_(result == False) - - # ok, we only check on first part of expression - result = expr._can_use_numexpr(operator.add, '+', self.frame, self.frame2, 'evaluate') - self.assert_(result == True) - - def test_binary_ops(self): - - def testit(): - - for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]: - - for op, op_str in [('add','+'),('sub','-'),('mul','*'),('div','/'),('pow','**')]: - - op = getattr(operator,op,None) - if op is not None: - result = expr._can_use_numexpr(op, op_str, f, f, 'evaluate') - self.assert_(result == (not f._is_mixed_type)) - - result = expr.evaluate(op, op_str, f, f, use_numexpr=True) - expected = expr.evaluate(op, op_str, f, f, use_numexpr=False) - assert_array_equal(result,expected.values) - - result = expr._can_use_numexpr(op, op_str, f2, f2, 'evaluate') - self.assert_(result == False) - - - expr.set_use_numexpr(False) - testit() - expr.set_use_numexpr(True) - expr.set_numexpr_threads(1) - testit() - expr.set_numexpr_threads() - testit() - - def test_boolean_ops(self): - - - def testit(): - for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]: - - f11 = f - f12 = f + 1 - - f21 = f2 - f22 = f2 + 1 - - for op, op_str in [('gt','>'),('lt','<'),('ge','>='),('le','<='),('eq','=='),('ne','!=')]: - - op = getattr(operator,op) - - result = expr._can_use_numexpr(op, op_str, f11, f12, 'evaluate') - self.assert_(result == (not f11._is_mixed_type)) - - result = expr.evaluate(op, op_str, f11, f12, use_numexpr=True) - expected = expr.evaluate(op, op_str, f11, f12, use_numexpr=False) - assert_array_equal(result,expected.values) - - result = expr._can_use_numexpr(op, op_str, f21, f22, 'evaluate') - self.assert_(result == False) - - expr.set_use_numexpr(False) - testit() - expr.set_use_numexpr(True) - expr.set_numexpr_threads(1) - testit() - expr.set_numexpr_threads() - testit() - - def test_where(self): - - def testit(): - for f in [ self.frame, self.frame2, self.mixed, self.mixed2 ]: - - - for cond in [ True, False ]: - - c = np.empty(f.shape,dtype=np.bool_) - c.fill(cond) - result = expr.where(c, f.values, f.values+1) - expected = np.where(c, f.values, f.values+1) - assert_array_equal(result,expected) - - expr.set_use_numexpr(False) - testit() - expr.set_use_numexpr(True) - expr.set_numexpr_threads(1) - testit() - expr.set_numexpr_threads() - testit() - -if __name__ == '__main__': - # unittest.main() - import nose - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/setup.py b/setup.py index 7d59e0f95f0e8..3984dc075d4f7 100755 --- a/setup.py +++ b/setup.py @@ -85,7 +85,7 @@ except ImportError: cython = False -from os.path import splitext, basename, join as pjoin +from os.path import join as pjoin class build_ext(_build_ext): @@ -502,6 +502,7 @@ def pxd(name): maintainer=AUTHOR, 
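      # new subpackages must be registered here or they are silently omitted
      # from installed and built distributions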
packages=['pandas', 'pandas.compat', + 'pandas.computation', 'pandas.core', 'pandas.io', 'pandas.rpy', diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py index 54774344520c9..3f076f9f922a3 100644 --- a/vb_suite/binary_ops.py +++ b/vb_suite/binary_ops.py @@ -21,7 +21,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_numexpr_threads(1) @@ -32,7 +32,7 @@ start_date=datetime(2013, 2, 26)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_use_numexpr(False) @@ -53,7 +53,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_numexpr_threads(1) @@ -63,7 +63,7 @@ start_date=datetime(2013, 2, 26)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_use_numexpr(False) @@ -84,7 +84,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_numexpr_threads(1) @@ -94,7 +94,7 @@ start_date=datetime(2013, 2, 26)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_use_numexpr(False) diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py index 9f07cc6ed15c3..2edb7548ebeef 100644 --- a/vb_suite/indexing.py +++ b/vb_suite/indexing.py @@ -103,7 +103,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(50000, 100)) df2 = DataFrame(np.random.randn(50000, 100)) expr.set_numexpr_threads(1) @@ -115,7 +115,7 @@ setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(50000, 100)) df2 = DataFrame(np.random.randn(50000, 100)) expr.set_use_numexpr(False) From bcd17b090a32afd43de0a21f3829f281635a8b51 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 15 Jun 2013 21:35:22 -0400 Subject: [PATCH 02/48] ENH/TST: add new instance testing functions and their tests --- pandas/core/common.py | 24 ++++++++++++++ pandas/tests/test_common.py | 65 +++++++++++++++++++++++++++++++++++-- pandas/util/testing.py | 5 ++- 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index ddacb98a2ddf3..4615571c5d86c 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -21,6 +21,7 @@ from pandas.core.config import get_option from pandas.core import array as pa +import pandas as pd # XXX: HACK for NumPy 1.5.1 to suppress warnings try: @@ -1509,6 +1510,29 @@ def is_bool(obj): return isinstance(obj, (bool, np.bool_)) +def is_string(obj): + return isinstance(obj, (basestring, np.str_, np.unicode_)) + + 
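+# thin isinstance wrappers so call sites read uniformly; a sketch of the
+# intended usage (values here purely illustrative):
+#   >>> import pandas as pd
+#   >>> is_series(pd.Series([1, 2]))
+#   True
+#   >>> is_frame(pd.Series([1, 2]))
+#   False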
+def is_series(obj): + return isinstance(obj, pd.Series) + + +def is_frame(obj): + return isinstance(obj, pd.DataFrame) + + +def is_panel(obj): + return isinstance(obj, pd.Panel) + + +def is_pd_obj(obj): + return isinstance(obj, pd.core.generic.PandasObject) + + +def is_ndframe(obj): + return isinstance(obj, pd.core.generic.NDFrame) + def is_integer(obj): return isinstance(obj, (int, long, np.integer)) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index db01545fb3c9d..974e301c5d303 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,20 +1,19 @@ from datetime import datetime -import sys import re import nose import unittest -from pandas import Series, DataFrame, date_range, DatetimeIndex +from pandas import Series, DataFrame, date_range, DatetimeIndex, Panel from pandas.core.common import notnull, isnull import pandas.core.common as com import pandas.util.testing as tm import pandas.core.config as cf import numpy as np +from numpy.random import randn from pandas.tslib import iNaT -from pandas.util import py3compat _multiprocess_can_split_ = True @@ -33,6 +32,7 @@ def __getitem__(self): assert(not is_seq(A())) + def test_notnull(): assert notnull(1.) assert not notnull(None) @@ -98,6 +98,61 @@ def test_isnull_lists(): assert(not result.any()) +def test_is_string(): + class MyString(str): + pass + + class MyUnicode(unicode): + pass + + strings = ('s', np.str_('a'), np.unicode_('unicode_string'), + MyString('a _string blah'), u'asdf', MyUnicode(u'asdf')) + not_strings = [], 1, {}, set(), np.array(['1']), np.array([u'1']) + + for string in strings: + assert com.is_string(string), '{0} is not a string'.format(string) + + for not_string in not_strings: + assert not com.is_string(not_string), ('{0} is a ' + 'string'.format(not_string)) + + +def test_is_frame(): + df = DataFrame(randn(2, 1)) + assert com.is_frame(df) + assert not com.is_frame('s') + + +def test_is_series(): + s = Series(randn(2)) + assert com.is_series(s) + assert not com.is_series(s.values) + + +def test_is_panel(): + p = Panel(randn(2, 3, 4)) + assert com.is_panel(p) + assert not com.is_panel(2) + + +def test_is_pd_obj(): + df = DataFrame(randn(2, 1)) + s = Series(randn(2)) + p = Panel(randn(2, 3, 4)) + for obj in (df, s, p): + assert com.is_pd_obj(obj) + assert not com.is_pd_obj(obj.values) + + +def test_is_ndframe(): + df = DataFrame(randn(2, 1)) + p = Panel(randn(2, 3, 4)) + # should add series after @jreback's ndframe to series pr + for obj in (df, p): + assert com.is_ndframe(obj) + assert not com.is_ndframe(obj.values) + + def test_isnull_datetime(): assert (not isnull(datetime.now())) assert notnull(datetime.now()) @@ -112,11 +167,13 @@ def test_isnull_datetime(): assert(mask[0]) assert(not mask[1:].any()) + def test_datetimeindex_from_empty_datetime64_array(): for unit in [ 'ms', 'us', 'ns' ]: idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) assert(len(idx) == 0) + def test_any_none(): assert(com._any_none(1, 2, 3, None)) assert(not com._any_none(1, 2, 3, 4)) @@ -266,6 +323,7 @@ def test_ensure_int32(): result = com._ensure_int32(values) assert(result.dtype == np.int32) + def test_ensure_platform_int(): # verify that when we create certain types of indices @@ -748,6 +806,7 @@ def test_2d_datetime64(self): expected[:, [2, 4]] = datetime(2007, 1, 1) tm.assert_almost_equal(result, expected) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/util/testing.py 
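# A sketch of the remaining predicates working together, mirroring the
# tests above (illustrative only):
import numpy as np
import pandas as pd
import pandas.core.common as com

df = pd.DataFrame(np.random.randn(2, 1))
assert com.is_frame(df)
assert com.is_pd_obj(df) and com.is_ndframe(df)
assert not com.is_pd_obj(df.values)  # a raw ndarray is not a pandas object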
b/pandas/util/testing.py index 47bde4ecb32a7..e1b2950b5c8d3 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -17,7 +17,7 @@ from urllib2 import urlopen from distutils.version import LooseVersion -from numpy.random import randn +from numpy.random import randn, rand import numpy as np from pandas.core.common import isnull, _is_sequence @@ -45,6 +45,9 @@ _RAISE_NETWORK_ERROR_DEFAULT = False +def randbool(size=(), p=0.5): + return rand(*size) <= p + def rands(n): choices = string.ascii_letters + string.digits return ''.join(random.choice(choices) for _ in xrange(n)) From 81bacd1d9a8dbec90cbdf3d92d45b3180d0eeee2 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 15 Jun 2013 21:58:28 -0400 Subject: [PATCH 03/48] BUG: prevent certain index types from joining with DatetimeIndex --- pandas/tseries/index.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 7fdb6d9d2603d..4c75ef66feb08 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -912,7 +912,8 @@ def join(self, other, how='left', level=None, return_indexers=False): See Index.join """ if (not isinstance(other, DatetimeIndex) and len(other) > 0 and - other.inferred_type != 'mixed-integer'): + other.inferred_type not in ('floating', 'mixed-integer', + 'mixed-integer-float', 'mixed')): try: other = DatetimeIndex(other) except TypeError: From e380271278cba82d669cd07312d4f37106a4c47d Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 15 Jun 2013 23:26:01 -0400 Subject: [PATCH 04/48] TST/ENH: add 2d bare numpy array and nan support --- pandas/computation/engines.py | 60 +++++++++++++++++---------- pandas/computation/ops.py | 4 +- pandas/computation/tests/test_eval.py | 35 ++++++++++++---- 3 files changed, 70 insertions(+), 29 deletions(-) diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 0eb9875b85549..5bb43efec3e15 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -1,4 +1,5 @@ import abc +import functools from functools import partial from itertools import izip @@ -66,25 +67,42 @@ def _maybe_promote_shape(values, naxes): return values[tuple(axes_slice)] -def _align_core(terms): - # need to ensure that terms is not an iterator - terms = list(terms) +def _any_pandas_objects(terms): + """Check a sequence of terms for instances of PandasObject.""" + return any(com.is_pd_obj(term) for term in terms) + + +def _filter_special_cases(f): + @functools.wraps(f) + def wrapper(terms): + # need to ensure that terms is not an iterator + terms = list(terms) + + ## special cases - ## special cases + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) - # single unary operand - if len(terms) == 1: - return _align_core_single_unary_op(terms[0]) - # only scalars - elif all(np.isscalar(term) for term in terms): - return np.result_type(*terms), terms, None + # only scalars + elif all(np.isscalar(term) for term in terms): + return np.result_type(*terms), terms, None - # single dim ndarrays - all_has_size = all(hasattr(term, 'size') for term in terms) - if (all_has_size and all(term.size == 1 for term in terms)): - return np.result_type(*terms), terms, None + # single element ndarrays + all_has_size = all(hasattr(term, 'size') for term in terms) + if (all_has_size and all(term.size == 1 for term in terms)): + return np.result_type(*terms), terms, None - # made it past the special cases + # no pandas so just punt to the evaluator + if not 
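# Seen from eval's side, the special-case filter above means scalar-only
# expressions never reach the alignment machinery (illustrative sketch;
# assumes numexpr is installed):
import pandas as pd

assert pd.eval('1 + 2', engine='numexpr') == 3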
_any_pandas_objects(terms): + return np.result_type(*terms), terms, None + + return f(terms) + return wrapper + + +@_filter_special_cases +def _align_core(terms): term_index = [i for i, term in enumerate(terms) if hasattr(term, 'axes')] term_dims = [terms[i].ndim for i in term_index] ndims = pd.Series(dict(zip(term_index, term_dims))) @@ -145,8 +163,8 @@ def _filter_terms(flat): # literals are not names and names are not literals, by definition if literals_set & names_set: - raise AssertionError('literals cannot be names and names cannot be ' - 'literals') + raise ValueError('literals cannot be names and names cannot be ' + 'literals') return names, literals @@ -154,10 +172,10 @@ def _align(terms, env): # flatten the parse tree (a nested list) flat = list(flatten(terms)) + # separate names and literals names, literals = _filter_terms(flat) - # given an expression consisting of literals - if not names: + if not names: # only literals so just promote to a common type return np.result_type(*literals).type, None # get the variables out @@ -165,13 +183,13 @@ def _align(terms, env): resolved = map(resolve_in_env, names) # if all resolved variables are numeric scalars - if all(map(np.isscalar, resolved)): + if all(np.isscalar(rsv) for rsv in resolved): return np.result_type(*resolved).type, None # perform the main alignment typ, resolved, axes = _align_core(resolved) - # put them back in the symbol table + # put the aligned arrays back in the table _update_names(env, dict(izip(names, resolved))) # we need this to reconstruct things after evaluation since we CANNOT diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index fb1965f45c52b..f79acc412023a 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -123,7 +123,7 @@ def __call__(self, env): except TypeError: left = self.lhs - # recursve over the right nodes + # recurse over the right nodes try: right = self.rhs(env) except TypeError: @@ -166,6 +166,8 @@ def __init__(self, op, operand): def __call__(self, env): operand = self.operand + + # recurse if operand is an Op try: operand = self.operand(env) except TypeError: diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 2d7bf4392cfea..cb52025e45df1 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +import unittest import itertools from itertools import product @@ -7,12 +8,13 @@ from nose.tools import assert_raises, assert_tuple_equal, assert_equal from nose.tools import assert_true -from numpy.random import randn +from numpy.random import randn, rand import numpy as np from numpy.testing import assert_array_equal from numpy.testing.decorators import slow import pandas as pd +from pandas.core import common as com from pandas import DataFrame, Series from pandas.util.testing import makeCustomDataframe as mkdf from pandas.computation.engines import (_engines, _align_core, @@ -85,8 +87,14 @@ def _eval_bin_and_unary(unary, lhs, arith1, rhs): return unop(binop(lhs, rhs)) +def _series_and_2d_ndarray(lhs, rhs): + return (com.is_series(lhs) and isinstance(rhs, np.ndarray) and rhs.ndim > 1 + or com.is_series(rhs) and isinstance(lhs, np.ndarray) and lhs.ndim + > 1) + + # Smoke testing -class TestBasicEval(object): +class TestBasicEval(unittest.TestCase): @classmethod def setUpClass(self): @@ -100,10 +108,14 @@ def set_current_engine(self): self.engine = 'numexpr' def setup_data(self): + nan_df = DataFrame(rand(10, 5)) + nan_df[nan_df > 0.5] = 
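# The operand matrix above sweeps expressions like the following, where
# a NaN-laced frame meets a NaN-laced series (illustrative sketch):
import numpy as np
from numpy.random import rand
import pandas as pd
from pandas import DataFrame, Series

nan_df = DataFrame(rand(10, 5))
nan_df[nan_df > 0.5] = np.nan
s = Series([1, 2, np.nan, np.nan, 5])
res = pd.eval('nan_df + s', engine='numexpr')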
np.nan self.lhses = (DataFrame(randn(10, 5)), Series(randn(5)), randn(), - np.float64(randn())) + np.float64(randn()), randn(10, 5), randn(5), np.nan, + Series([1, 2, np.nan, np.nan, 5]), nan_df) self.rhses = (DataFrame(randn(10, 5)), Series(randn(5)), randn(), - np.float64(randn())) + np.float64(randn()), randn(10, 5), randn(5), np.nan, + Series([1, 2, np.nan, np.nan, 5]), nan_df) def setUp(self): try: @@ -163,9 +175,14 @@ def _create_cmp_op_t(self, lhs, cmp1, rhs, binop, cmp2): ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, binop=binop, cmp2=cmp2) - expected = _eval_from_expr(lhs, cmp1, rhs, binop, cmp2) - result = pd.eval(ex, engine=self.engine) - assert_array_equal(result, expected) + if _series_and_2d_ndarray(lhs, rhs): + self.assertRaises(Exception, _eval_from_expr, lhs, cmp1, rhs, + binop, cmp2) + self.assertRaises(Exception, pd.eval, ex, engine=self.engine) + else: + expected = _eval_from_expr(lhs, cmp1, rhs, binop, cmp2) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) def _create_simple_cmp_op_t(self, lhs, rhs, cmp1): ex = 'lhs {0} rhs'.format(cmp1) @@ -534,6 +551,10 @@ def test_datetime_index_rows_punts_to_python(): check_datetime_index_rows_punts_to_python(engine) +def check_truediv(engine): + s = randn(10) + + __var_s = randn(10) From 99a3d280d86d6b6141086eef2fde29d979b9dc4f Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 17 Jun 2013 07:37:46 -0400 Subject: [PATCH 05/48] ENH: add modulus support --- pandas/computation/eval.py | 2 +- pandas/computation/expr.py | 13 +++++++------ pandas/computation/ops.py | 22 ++++++++++++++++++++-- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 21348f221bc99..64345e8d3a143 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -57,7 +57,7 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, engine = _maybe_convert_engine(env, engine) # parse the expression - parsed_expr = Expr(expr, engine, truediv) + parsed_expr = Expr(expr, engine, env, truediv) # choose the engine eng = _engines[engine] diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 105c0a020a2ad..f6d4ca39788ab 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -1,7 +1,7 @@ import ast from functools import partial -from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops +from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops, Mod from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms @@ -15,14 +15,14 @@ class ExprVisitor(ast.NodeVisitor): """ bin_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', 'BitOr', - 'Add', 'Sub', 'Mult', 'Div', 'Pow', 'FloorDiv') + 'Add', 'Sub', 'Mult', 'Div', 'Pow', 'FloorDiv', 'Mod') bin_op_nodes_map = dict(zip(bin_ops, bin_op_nodes)) unary_ops = _unary_ops_syms unary_op_nodes = 'UAdd', 'USub', 'Invert' unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) - def __init__(self): + def __init__(self, env): for bin_op in self.bin_ops: setattr(self, 'visit_{0}'.format(self.bin_op_nodes_map[bin_op]), lambda node, bin_op=bin_op: partial(BinOp, bin_op)) @@ -31,6 +31,7 @@ def __init__(self): setattr(self, 'visit_{0}'.format(self.unary_op_nodes_map[unary_op]), lambda node, unary_op=unary_op: partial(UnaryOp, unary_op)) + self.env = env def visit(self, node): if not 
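# The intended modulus semantics: operands are cast to float before
# evaluation, so results are compared with allclose rather than exact
# equality (illustrative sketch; the casting fix itself lands a few
# patches later in this series):
import numpy as np
import pandas as pd

x = np.arange(10.0)
res = pd.eval('x % 3', engine='numexpr')
np.testing.assert_allclose(res, x % 3)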
(isinstance(node, ast.AST) or isinstance(node, basestring)): @@ -91,15 +92,15 @@ def visit_Attribute(self, node): raise NotImplementedError("attribute access is not yet supported") def visit_Mod(self, node): - raise NotImplementedError("modulo operator not yet supported") + return partial(Mod, env=self.env) class Expr(object): """Expr object for pandas """ - def __init__(self, expr, engine, truediv): + def __init__(self, expr, engine, env, truediv): self.expr = expr - self._visitor = ExprVisitor() + self._visitor = ExprVisitor(env) self.terms = self.parse() self.engine = engine self.truediv = truediv diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index f79acc412023a..f81844d787a5a 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -1,6 +1,7 @@ import operator as op from functools import partial +import numpy as np from pandas.util.py3compat import PY3 @@ -74,9 +75,9 @@ def name(self): _bool_ops_funcs = op.and_, op.or_ _bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) -_arith_ops_syms = '+', '-', '*', '/', '**', '//' +_arith_ops_syms = '+', '-', '*', '/', '**', '//', '%' _arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv if PY3 else op.div, - op.pow, op.floordiv) + op.pow, op.floordiv, op.mod) _arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) _binary_ops_dict = {} @@ -85,6 +86,17 @@ def name(self): _binary_ops_dict.update(d) +def _cast(terms, env, dtype): + resolver = partial(_resolve_name, env) + updater = partial(_update_name, env) + for term in terms: + t = resolver(term) + try: + new_value = t.astype(dtype) + except AttributeError: + new_value = dtype.type(t) + updater(term, t) + class BinOp(Op): """Hold a binary operator and its operands @@ -145,6 +157,12 @@ def __call__(self, env): return res +class Mod(BinOp): + def __init__(self, lhs, rhs, env=None): + super(Mod, self).__init__('%', lhs, rhs) + _cast(env, (lhs, rhs), np.float_) + + _unary_ops_syms = '+', '-', '~' _unary_ops_funcs = op.pos, op.neg, op.invert _unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) From 4db95fe90b529e2f25294acfad0408cdfe60f8ec Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 17 Jun 2013 08:02:44 -0400 Subject: [PATCH 06/48] TST: add failing modulus tests --- pandas/computation/tests/test_eval.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index cb52025e45df1..4e062d6a4e99b 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -10,7 +10,7 @@ from numpy.random import randn, rand import numpy as np -from numpy.testing import assert_array_equal +from numpy.testing import assert_array_equal, assert_allclose from numpy.testing.decorators import slow import pandas as pd @@ -212,7 +212,11 @@ def _create_arith_op_t(self, lhs, arith1, rhs): if arith1 != '//': expected = _eval_single_bin(lhs, arith1, rhs, engine_has_neg_frac(self.engine)) - assert_array_equal(result, expected) + # roundoff error with modulus + if arith1 == '%': + assert_allclose(result, expected) + else: + assert_array_equal(result, expected) # sanity check on recursive parsing try: @@ -243,7 +247,12 @@ def _create_arith_op_t(self, lhs, arith1, rhs): pass if arith1 != '//': expected = self.ne.evaluate('nlhs {0} ghs'.format(arith1)) - assert_array_equal(result, expected) + + # roundoff error with modulus + if arith1 == '%': + assert_allclose(result, expected) + else: + assert_array_equal(result, 
expected) def _create_invert_op_t(self, lhs, cmp1, rhs): # simple @@ -551,6 +560,11 @@ def test_datetime_index_rows_punts_to_python(): check_datetime_index_rows_punts_to_python(engine) +def test_truediv(): + for engine in _engines: + check_truediv(engine) + + def check_truediv(engine): s = randn(10) From 6000c89fe9af20c974ef9b5ff19ea13c4f49178a Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 17 Jun 2013 20:34:24 -0400 Subject: [PATCH 07/48] CLN: use format string for unicode --- pandas/computation/engines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 5bb43efec3e15..11843ffef1705 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -275,7 +275,7 @@ def __init__(self, expr): def convert(self): """Return a string""" - return str(self.expr) + return '%s' % self.expr def _evaluate(self, env): import numexpr as ne From c25a1d4b0853578183e75d341aaab051941bdce7 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 17 Jun 2013 20:35:45 -0400 Subject: [PATCH 08/48] CLN: remove engine detection and manip for datetimes --- pandas/computation/eval.py | 34 ++----------------- pandas/computation/expressions.py | 48 +++++++++++++-------------- pandas/computation/ops.py | 35 ++++++++++++++++--- pandas/computation/tests/test_eval.py | 23 +++++++++---- 4 files changed, 74 insertions(+), 66 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 64345e8d3a143..298554005d6ed 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -2,40 +2,13 @@ import sys import numbers -import collections -import itertools import numpy as np -Scope = collections.namedtuple('Scope', 'globals locals') - -import pandas.core.common as com -from pandas.computation.expr import Expr +from pandas.computation.expr import Expr, Scope from pandas.computation.engines import _engines -def _scope_has_series_and_frame_datetime_index(env): - from pandas import DatetimeIndex - series_index = frame_index = 0 - - for v in itertools.chain(env.locals.itervalues(), - env.globals.itervalues()): - series_index += com.is_series(v) and isinstance(v.index, DatetimeIndex) - frame_index += com.is_frame(v) and isinstance(v.index, DatetimeIndex) - return series_index, frame_index - - -def _maybe_convert_engine(env, engine): - assert isinstance(env, Scope), 'environment must be an instance of Scope' - assert isinstance(engine, basestring), 'engine name must be a string' - - ret = engine - - if all(_scope_has_series_and_frame_datetime_index(env)): - ret = 'python' - return ret - - def eval(expr, engine='numexpr', truediv=True, local_dict=None, global_dict=None): # make sure we're passed a valid engine @@ -44,7 +17,8 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, ' {1}'.format(_engines.keys())) # 1 up in the call stack for locals/globals; see the documentation for the - # inspect module for why you must decrease the refcount of frame + # inspect module for why you must decrease the refcount of frame at all + # costs frame = sys._getframe(1) try: @@ -54,8 +28,6 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, # shallow copy the scope so we don't overwrite everything env = Scope(gbl.copy(), lcl.copy()) - engine = _maybe_convert_engine(env, engine) - # parse the expression parsed_expr = Expr(expr, engine, env, truediv) diff --git a/pandas/computation/expressions.py b/pandas/computation/expressions.py index e1551f9b0548e..0c13a50d15618 100644 --- 
a/pandas/computation/expressions.py +++ b/pandas/computation/expressions.py @@ -7,6 +7,7 @@ """ import numpy as np +import pandas.core.common as com try: import numexpr as ne @@ -46,13 +47,10 @@ def set_use_numexpr(v=True): def set_numexpr_threads(n=None): # if we are using numexpr, set the threads to n # otherwise reset - try: - if _NUMEXPR_INSTALLED and _USE_NUMEXPR: - if n is None: - n = ne.detect_number_of_cores() - ne.set_num_threads(n) - except: - pass + if _NUMEXPR_INSTALLED and _USE_NUMEXPR: + if n is None: + n = ne.detect_number_of_cores() + ne.set_num_threads(n) def _evaluate_standard(op, op_str, a, b, raise_on_error=True, **eval_kwargs): @@ -84,7 +82,8 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): return False -def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False, **eval_kwargs): + +def _evaluate_numexpr(op, op_str, a, b, raise_on_error=False, **eval_kwargs): result = None if _can_use_numexpr(op, op_str, a, b, 'evaluate'): @@ -94,15 +93,13 @@ def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False, **eval_kwargs): a_value = a_value.values if hasattr(b_value, 'values'): b_value = b_value.values - result = ne.evaluate('a_value %s b_value' % op_str, - local_dict={ 'a_value' : a_value, - 'b_value' : b_value }, + result = ne.evaluate('a_value %s b_value' % op_str, + local_dict={'a_value': a_value, + 'b_value': b_value}, casting='safe', **eval_kwargs) - except (ValueError), detail: - if 'unknown type object' in str(detail): - pass - except (Exception), detail: - if raise_on_error: + except Exception as detail: + if ('unknown type object' not in com.pprint_thing(detail) and + raise_on_error): raise if result is None: @@ -128,17 +125,15 @@ def _where_numexpr(cond, a, b, raise_on_error=False): a_value = a_value.values if hasattr(b_value, 'values'): b_value = b_value.values - result = ne.evaluate('where(cond_value,a_value,b_value)', + result = ne.evaluate('where(cond_value, a_value, b_value)', local_dict={'cond_value': cond_value, 'a_value': a_value, 'b_value': b_value}, casting='safe') - except (ValueError), detail: - if 'unknown type object' in str(detail): - pass - except (Exception), detail: - if raise_on_error: - raise TypeError(str(detail)) + except Exception as detail: + if ('unknown type object' not in com.pprint_thing(detail) and + raise_on_error): + raise if result is None: result = _where_standard(cond, a, b, raise_on_error) @@ -149,7 +144,9 @@ def _where_numexpr(cond, a, b, raise_on_error=False): # turn myself on set_use_numexpr(True) -def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kwargs): + +def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, + **eval_kwargs): """ evaluate and return the expression of the op on a and b Parameters @@ -166,7 +163,8 @@ def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kw """ if use_numexpr: - return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, **eval_kwargs) + return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, + **eval_kwargs) return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error) diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index f81844d787a5a..1a6d3fd1672ba 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -52,6 +52,32 @@ def _update_names(env, mapping): updater(key, value) +class Term(object): + def __init__(self, value, name, env): + self.value = value + self.name = name + self.env = env + self.type = type(value) + + def __iter__(self): + yield 
self.value + raise StopIteration + + def __str__(self): + return '{0}({1!r})'.format(self.__class__.__name__, self.name) + + __repr__ = __str__ + + def update(self, env, value): + _update_name(self.env, self.name, value) + self.value = value + + +class Constant(Term): + def __init__(self, value, env): + super(Constant, self).__init__(value, value, env) + + class Op(object): """Hold an operator of unknown arity """ @@ -89,13 +115,14 @@ def name(self): def _cast(terms, env, dtype): resolver = partial(_resolve_name, env) updater = partial(_update_name, env) + dt = np.dtype(dtype) for term in terms: t = resolver(term) try: - new_value = t.astype(dtype) + new_value = t.astype(dt) except AttributeError: - new_value = dtype.type(t) - updater(term, t) + new_value = dt.type(t) + updater(term, new_value) class BinOp(Op): """Hold a binary operator and its operands @@ -160,7 +187,7 @@ def __call__(self, env): class Mod(BinOp): def __init__(self, lhs, rhs, env=None): super(Mod, self).__init__('%', lhs, rhs) - _cast(env, (lhs, rhs), np.float_) + _cast((lhs, rhs), env, np.float_) _unary_ops_syms = '+', '-', '~' diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 4e062d6a4e99b..417fb106f90fa 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -5,8 +5,8 @@ from itertools import product import nose -from nose.tools import assert_raises, assert_tuple_equal, assert_equal -from nose.tools import assert_true +from nose.tools import assert_raises, assert_tuple_equal +from nose.tools import assert_true, assert_false from numpy.random import randn, rand import numpy as np @@ -23,8 +23,6 @@ import pandas.computation.expr as expr from pandas.computation.expressions import _USE_NUMEXPR from pandas.computation.eval import Scope -from pandas.computation.eval import _scope_has_series_and_frame_datetime_index -from pandas.computation.eval import _maybe_convert_engine from pandas.util.testing import assert_frame_equal, randbool @@ -551,8 +549,6 @@ def check_datetime_index_rows_punts_to_python(engine): index = getattr(df, 'index') s = Series(np.random.randn(5), index[:5]) env = Scope(globals(), locals()) - assert_true(_scope_has_series_and_frame_datetime_index(env)) - assert_equal(_maybe_convert_engine(env, engine), 'python') def test_datetime_index_rows_punts_to_python(): @@ -582,6 +578,21 @@ def test_global_scope(): yield check_global_scope, engine +def check_is_expr(engine): + s = 1 + valid = 's + 1' + invalid = 's +' + assert_true(expr.isexpr(valid, check_names=True)) + assert_false(expr.isexpr(valid, check_names=False)) + assert_false(expr.isexpr(invalid, check_names=False)) + assert_false(expr.isexpr(invalid, check_names=True)) + + +def test_is_expr(): + for engine in _engines: + check_is_expr(engine) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 1132bc40843626fd2eb1afbb6755490a79924337 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 19 Jun 2013 21:40:11 -0400 Subject: [PATCH 09/48] CLN/ENH: add new interface to encapsulate Terms and Constants --- pandas/computation/engines.py | 100 +++++++++++++++------------------- pandas/computation/expr.py | 38 ++++++++++--- pandas/computation/ops.py | 72 +++++++++++++----------- 3 files changed, 112 insertions(+), 98 deletions(-) diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 11843ffef1705..4ebb4a15fdee4 100644 --- a/pandas/computation/engines.py +++ 
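# Term and Constant pair a resolved value with its name and environment
# so that alignment can write results back in place. A minimal sketch,
# using the interface as of this refactor:
from pandas.computation.expr import Scope
from pandas.computation.ops import Term, Constant

env = Scope(globals(), locals())
t = Term(42, 'answer', env)  # a named value
c = Constant(3.14, env)      # a literal: its name is the value itself
assert t.isscalar and c.isscalar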
b/pandas/computation/engines.py @@ -1,27 +1,26 @@ import abc -import functools -from functools import partial +from functools import partial, wraps from itertools import izip import numpy as np import pandas as pd import pandas.core.common as com -from pandas.computation.ops import _resolve_name, _update_names +from pandas.computation.ops import is_const from pandas.computation.common import flatten def _align_core_single_unary_op(term): - if isinstance(term, np.ndarray) and not com.is_series(term): - typ = np.asanyarray + if isinstance(term.value, np.ndarray) and not com.is_series(term.value): + typ = partial(np.asanyarray, dtype=term.value.dtype) else: - typ = type(term) - ret = typ, [term] + typ = type(term.value) + ret = typ, - if not hasattr(term, 'axes'): + if not hasattr(term.value, 'axes'): ret += None, else: - ret += _zip_axes_from_type(typ, term.axes), + ret += _zip_axes_from_type(typ, term.value.axes), return ret @@ -69,33 +68,28 @@ def _maybe_promote_shape(values, naxes): def _any_pandas_objects(terms): """Check a sequence of terms for instances of PandasObject.""" - return any(com.is_pd_obj(term) for term in terms) + return any(com.is_pd_obj(term.value) for term in terms) def _filter_special_cases(f): - @functools.wraps(f) + @wraps(f) def wrapper(terms): - # need to ensure that terms is not an iterator - terms = list(terms) - - ## special cases - # single unary operand if len(terms) == 1: return _align_core_single_unary_op(terms[0]) # only scalars - elif all(np.isscalar(term) for term in terms): - return np.result_type(*terms), terms, None + elif all(term.isscalar for term in terms): + return np.result_type(*(term.value for term in terms)), None # single element ndarrays - all_has_size = all(hasattr(term, 'size') for term in terms) - if (all_has_size and all(term.size == 1 for term in terms)): - return np.result_type(*terms), terms, None + all_has_size = all(hasattr(term.value, 'size') for term in terms) + if (all_has_size and all(term.value.size == 1 for term in terms)): + return np.result_type(*(term.value for term in terms)), None # no pandas so just punt to the evaluator if not _any_pandas_objects(terms): - return np.result_type(*terms), terms, None + return np.result_type(*(term.value for term in terms)), None return f(terms) return wrapper @@ -103,27 +97,28 @@ def wrapper(terms): @_filter_special_cases def _align_core(terms): - term_index = [i for i, term in enumerate(terms) if hasattr(term, 'axes')] - term_dims = [terms[i].ndim for i in term_index] + term_index = [i for i, term in enumerate(terms) if hasattr(term.value, + 'axes')] + term_dims = [terms[i].value.ndim for i in term_index] ndims = pd.Series(dict(zip(term_index, term_dims))) # initial axes are the axes of the largest-axis'd term - biggest = terms[ndims.idxmax()] + biggest = terms[ndims.idxmax()].value typ = biggest._constructor axes = biggest.axes naxes = len(axes) for i in term_index: - for axis, items in enumerate(terms[i].axes): - if com.is_series(terms[i]) and naxes > 1: - axes[naxes - 1] = axes[naxes - 1].join(terms[i].index, + for axis, items in enumerate(terms[i].value.axes): + if com.is_series(terms[i].value) and naxes > 1: + axes[naxes - 1] = axes[naxes - 1].join(terms[i].value.index, how='outer') else: axes[axis] = axes[axis].join(items, how='outer') for i, ndim in ndims.iteritems(): for axis, items in izip(xrange(ndim), axes): - ti = terms[i] # needed here because we modify it in the inner loop + ti = terms[i].value # needed here because we modify it in the inner loop if hasattr(ti, 
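# What the alignment above buys: frame operands are outer-joined on
# their axes before the engine ever sees raw ndarrays, so eval matches
# plain pandas arithmetic (illustrative sketch):
import pandas as pd
from numpy.random import randn

a = pd.DataFrame(randn(4, 2), index=list('abcd'))
b = pd.DataFrame(randn(3, 2), index=list('bcd'))
res = pd.eval('a + b', engine='numexpr')
assert res.index.equals(a.index.union(b.index))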
'reindex_axis'): transpose = com.is_series(ti) and naxes > 1 @@ -138,31 +133,31 @@ def _align_core(terms): else: r = f() - terms[i] = r + terms[i].update(r) - res = _maybe_promote_shape(terms[i].T if transpose else terms[i], - naxes) + res = _maybe_promote_shape(terms[i].value.T if transpose else + terms[i].value, naxes) res = res.T if transpose else res try: - terms[i] = res.values + v = res.values except AttributeError: - terms[i] = res + v = res + terms[i].update(v) - return typ, terms, _zip_axes_from_type(typ, axes) + return typ, _zip_axes_from_type(typ, axes) def _filter_terms(flat): # numeric literals - literals = filter(lambda string: not com.is_string(string), flat) - literals_set = set(literals) + literals = set(filter(is_const, flat)) # these are strings which are variable names - names = filter(com.is_string, flat) - names_set = set(names) + names = set(flat) - literals - # literals are not names and names are not literals, by definition - if literals_set & names_set: + # literals are not names and names are not literals, so intersection should + # be empty + if literals & names: raise ValueError('literals cannot be names and names cannot be ' 'literals') return names, literals @@ -170,30 +165,20 @@ def _filter_terms(flat): def _align(terms, env): # flatten the parse tree (a nested list) - flat = list(flatten(terms)) + terms = list(flatten(terms)) # separate names and literals - names, literals = _filter_terms(flat) + names, literals = _filter_terms(terms) if not names: # only literals so just promote to a common type return np.result_type(*literals).type, None - # get the variables out - resolve_in_env = partial(_resolve_name, env) - resolved = map(resolve_in_env, names) - # if all resolved variables are numeric scalars - if all(np.isscalar(rsv) for rsv in resolved): - return np.result_type(*resolved).type, None + if all(term.isscalar for term in terms): + return np.result_type(*(term.value for term in terms)).type, None # perform the main alignment - typ, resolved, axes = _align_core(resolved) - - # put the aligned arrays back in the table - _update_names(env, dict(izip(names, resolved))) - - # we need this to reconstruct things after evaluation since we CANNOT - # depend on the array interface + typ, axes = _align_core(terms) return typ, axes @@ -222,7 +207,8 @@ def _reconstruct_object(typ, obj, axes): except AttributeError: pass - if typ != np.asanyarray and issubclass(typ, pd.core.generic.PandasObject): + if (not isinstance(typ, partial) and + issubclass(typ, pd.core.generic.PandasObject)): return typ(obj, **axes) ret_value = typ(obj) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index f6d4ca39788ab..f0ed6b5de9ed6 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -1,9 +1,16 @@ import ast +import sys from functools import partial +import collections + from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops, Mod from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms +from pandas.computation.ops import _resolve_name, Term, Constant + + +Scope = collections.namedtuple('Scope', 'globals locals') class ExprParserError(Exception): @@ -65,10 +72,11 @@ def visit_UnaryOp(self, node): return op(self.visit(node.operand)) def visit_Name(self, node): - return node.id + name = node.id + return Term(_resolve_name(self.env, name), name, self.env) def visit_Num(self, node): - return node.n + return Constant(node.n, self.env) def visit_Compare(self, 
node): ops = node.ops @@ -92,19 +100,29 @@ def visit_Attribute(self, node): raise NotImplementedError("attribute access is not yet supported") def visit_Mod(self, node): - return partial(Mod, env=self.env) + return Mod class Expr(object): """Expr object for pandas """ - def __init__(self, expr, engine, env, truediv): + def __init__(self, expr, engine='numexpr', env=None, truediv=True): self.expr = expr - self._visitor = ExprVisitor(env) + self.env = env or self._get_calling_scope() + self._visitor = ExprVisitor(self.env) self.terms = self.parse() self.engine = engine self.truediv = truediv + def _get_calling_scope(self): + frame = sys._getframe(1) + gbl, lcl = frame.f_globals, frame.f_locals + + try: + return Scope(gbl, lcl) + finally: + del frame + def __call__(self, env): env.locals['truediv'] = self.truediv return self.terms(env) @@ -123,14 +141,16 @@ def parse(self): raise e return visited - def align(self, env): + def align(self): """align a set of Terms""" - return self.terms.align(env) + return self.terms.align(self.env) -def isexpr(s): +def isexpr(s, check_names=True): try: - Expr(s, engine=None) + Expr(s) except SyntaxError: return False + except NameError: + return not check_names return True diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 1a6d3fd1672ba..8c66fd0d122d5 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -3,6 +3,7 @@ import numpy as np from pandas.util.py3compat import PY3 +import pandas.core.common as com _reductions = 'sum', 'prod' @@ -59,19 +60,19 @@ def __init__(self, value, name, env): self.env = env self.type = type(value) - def __iter__(self): - yield self.value - raise StopIteration - def __str__(self): return '{0}({1!r})'.format(self.__class__.__name__, self.name) __repr__ = __str__ - def update(self, env, value): + def update(self, value): _update_name(self.env, self.name, value) self.value = value + @property + def isscalar(self): + return np.isscalar(self.value) + class Constant(Term): def __init__(self, value, env): @@ -112,17 +113,28 @@ def name(self): _binary_ops_dict.update(d) -def _cast(terms, env, dtype): - resolver = partial(_resolve_name, env) - updater = partial(_update_name, env) +def _cast(terms, dtype): dt = np.dtype(dtype) for term in terms: - t = resolver(term) + # cast all the way down the tree since operands must be try: - new_value = t.astype(dt) + _cast(term.operands, dtype) except AttributeError: - new_value = dt.type(t) - updater(term, new_value) + # we've bottomed out so cast + try: + new_value = term.value.astype(dt) + except AttributeError: + new_value = dt.type(term.value) + term.update(new_value) + + +def is_term(obj): + return isinstance(obj, Term) + + +def is_const(obj): + return isinstance(obj, Constant) + class BinOp(Op): """Hold a binary operator and its operands @@ -146,8 +158,9 @@ def __init__(self, op, lhs, rhs): ' operators are {1}'.format(op, keys)) def __repr__(self): - return '{0}(op={1!r}, lhs={2!r}, rhs={3!r})'.format(self.name, self.op, - self.lhs, self.rhs) + return com.pprint_thing('{0}(op={1!r}, lhs={2!r}, ' + 'rhs={3!r})'.format(self.name, self.op, + self.lhs, self.rhs)) __str__ = __repr__ @@ -169,25 +182,22 @@ def __call__(self, env): right = self.rhs # base cases - if not (isinstance(left, basestring) or isinstance(right, basestring)): + if is_term(left) and is_term(right): + res = self.func(left.value, right.value) + elif not is_term(left) and is_term(right): + res = self.func(left, right.value) + elif is_term(left) and not is_term(right): + res = 
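# isexpr separates syntax errors from merely unresolvable names; a
# minimal sketch of the intended behavior:
import pandas.computation.expr as expr

s = 1
assert expr.isexpr('s + 1')                       # parses and names resolve
assert not expr.isexpr('s +', check_names=False)  # syntax error regardless
assert not expr.isexpr('undefined_name + 1')      # NameError when checking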
self.func(left.value, right) + elif not (is_term(left) or is_term(right)): res = self.func(left, right) - elif isinstance(left, basestring) and not isinstance(right, - basestring): - res = self.func(_resolve_name(env, left), right) - elif not isinstance(left, basestring) and isinstance(right, - basestring): - res = self.func(left, _resolve_name(env, right)) - elif isinstance(left, basestring) and isinstance(right, basestring): - res = self.func(_resolve_name(env, left), _resolve_name(env, - right)) return res class Mod(BinOp): - def __init__(self, lhs, rhs, env=None): + def __init__(self, lhs, rhs): super(Mod, self).__init__('%', lhs, rhs) - _cast((lhs, rhs), env, np.float_) + _cast(self.operands, np.float_) _unary_ops_syms = '+', '-', '~' @@ -218,10 +228,7 @@ def __call__(self, env): except TypeError: operand = self.operand - if isinstance(operand, basestring): - v = _resolve_name(env, operand) - else: - v = operand + v = operand.value if is_term(operand) else operand try: res = self.func(v) @@ -231,5 +238,6 @@ def __call__(self, env): return res def __repr__(self): - return '{0}(op={1!r}, operand={2!r})'.format(self.name, self.op, - self.operand) + return com.pprint_thing('{0}(op={1!r}, ' + 'operand={2!r})'.format(self.name, self.op, + self.operand)) From 54f1897da4c119dec68b5f215981cd12787d5c77 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 19 Jun 2013 23:56:00 -0400 Subject: [PATCH 10/48] ENH: allow an already-parsed expression to be passed to eval --- pandas/computation/engines.py | 2 +- pandas/computation/eval.py | 22 ++++++++++++++-------- pandas/computation/expr.py | 5 +++-- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 4ebb4a15fdee4..342bde7b2beeb 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -48,7 +48,7 @@ def _maybe_promote_shape(values, naxes): axes_slice = [slice(None)] * naxes - # symmetric difference + # symmetric difference of numaxes and ndims slices = nax - ndim if ndims == naxes: diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 298554005d6ed..828ee334d71f9 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -22,14 +22,20 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, frame = sys._getframe(1) try: - # get the globals and locals - gbl, lcl = global_dict or frame.f_globals, local_dict or frame.f_locals - - # shallow copy the scope so we don't overwrite everything - env = Scope(gbl.copy(), lcl.copy()) - - # parse the expression - parsed_expr = Expr(expr, engine, env, truediv) + # parse the expression from a string + if isinstance(expr, basestring): + # get the globals and locals + gbl, lcl = (global_dict or frame.f_globals, + local_dict or frame.f_locals) + + # shallow copy the scope so we don't overwrite everything + env = Scope(gbl.copy(), lcl.copy()) + parsed_expr = Expr(expr, engine, env, truediv) + elif isinstance(expr, Expr): + parsed_expr = expr + else: + raise TypeError("eval only accepts strings and Expr objects, you " + "passed a {0!r}".format(expr.__class__.__name__)) # choose the engine eng = _engines[engine] diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index f0ed6b5de9ed6..63779da24394f 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -115,8 +115,9 @@ def __init__(self, expr, engine='numexpr', env=None, truediv=True): self.truediv = truediv def _get_calling_scope(self): - frame = sys._getframe(1) - gbl, lcl = 
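# With this change eval also accepts an already-parsed Expr, so parsing
# can be hoisted out of hot loops (illustrative sketch; relies on the
# environment handling finished a couple of patches later in the series):
import pandas as pd
from pandas.computation.expr import Expr

x = 10
parsed = Expr('x + 1')     # parsed once, scope captured at construction
for _ in range(3):
    res = pd.eval(parsed)  # reused without re-parsing
assert res == 11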
frame.f_globals, frame.f_locals + # call this method **only** in the constructor + frame = sys._getframe(2) + gbl, lcl = frame.f_globals.copy(), frame.f_locals.copy() try: return Scope(gbl, lcl) From e20900ac61f7c22f5b78e6388f519a01eb53c12e Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Tue, 25 Jun 2013 20:30:09 -0400 Subject: [PATCH 11/48] CLN: add automatic scope creating object --- pandas/computation/expr.py | 31 +++++++++++++-------------- pandas/computation/tests/test_eval.py | 27 ++++++++++++++++++----- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 63779da24394f..987f694bf0904 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -1,7 +1,6 @@ import ast import sys from functools import partial -import collections from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops, Mod @@ -10,7 +9,17 @@ from pandas.computation.ops import _resolve_name, Term, Constant -Scope = collections.namedtuple('Scope', 'globals locals') +class Scope(object): + __slots__ = 'globals', 'locals' + + def __init__(self, gbls=None, lcls=None, frame_level=1): + frame = sys._getframe(frame_level) + + try: + self.globals = gbls or frame.f_globals.copy() + self.locals = lcls or frame.f_locals.copy() + finally: + del frame class ExprParserError(Exception): @@ -104,26 +113,15 @@ def visit_Mod(self, node): class Expr(object): - """Expr object for pandas - """ + """Expr object""" def __init__(self, expr, engine='numexpr', env=None, truediv=True): self.expr = expr - self.env = env or self._get_calling_scope() + self.env = env or Scope(frame_level=2) self._visitor = ExprVisitor(self.env) self.terms = self.parse() self.engine = engine self.truediv = truediv - def _get_calling_scope(self): - # call this method **only** in the constructor - frame = sys._getframe(2) - gbl, lcl = frame.f_globals.copy(), frame.f_locals.copy() - - try: - return Scope(gbl, lcl) - finally: - del frame - def __call__(self, env): env.locals['truediv'] = self.truediv return self.terms(env) @@ -154,4 +152,5 @@ def isexpr(s, check_names=True): return False except NameError: return not check_names - return True + else: + return True diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 417fb106f90fa..18fe641db5ed2 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -19,11 +19,12 @@ from pandas.util.testing import makeCustomDataframe as mkdf from pandas.computation.engines import (_engines, _align_core, _reconstruct_object) -from pandas.computation.ops import _binary_ops_dict, _unary_ops_dict +from pandas.computation.ops import _binary_ops_dict, _unary_ops_dict, Term import pandas.computation.expr as expr from pandas.computation.expressions import _USE_NUMEXPR from pandas.computation.eval import Scope from pandas.util.testing import assert_frame_equal, randbool +from pandas.util.py3compat import PY3 def skip_numexpr_engine(engine): @@ -48,7 +49,9 @@ def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2): f1 = _binary_ops_dict[cmp1] f2 = _binary_ops_dict[cmp2] bf = _binary_ops_dict[binop] - typ, (lhs, rhs), axes = _align_core((lhs, rhs)) + env = Scope() + typ, axes = _align_core((Term(lhs, 'lhs', env), Term(rhs, 'rhs', env))) + lhs, rhs = env.locals['lhs'], env.locals['rhs'] return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes) @@ -483,7 +486,7 @@ def check_series_frame_commutativity(engine, r_idx_type, c_idx_type, op, df) -INDEX_TYPES = 
'i', 'f', 's', 'u', 'dt', # 'p' +INDEX_TYPES = 'i', 'f', 's', 'u', # 'dt', # 'p' @slow @@ -562,7 +565,21 @@ def test_truediv(): def check_truediv(engine): - s = randn(10) + s = np.array([1]) + ex = 's / 1' + + if PY3: + res = pd.eval(ex, truediv=False) + assert_array_equal(res, np.array([1.0])) + + res = pd.eval(ex, truediv=True) + assert_array_equal(res, np.array([1.0])) + else: + res = pd.eval(ex, truediv=False) + assert_array_equal(res, np.array([1])) + + res = pd.eval(ex, truediv=True) + assert_array_equal(res, np.array([1.0])) __var_s = randn(10) @@ -583,7 +600,7 @@ def check_is_expr(engine): valid = 's + 1' invalid = 's +' assert_true(expr.isexpr(valid, check_names=True)) - assert_false(expr.isexpr(valid, check_names=False)) + assert_true(expr.isexpr(valid, check_names=False)) assert_false(expr.isexpr(invalid, check_names=False)) assert_false(expr.isexpr(invalid, check_names=True)) From 51d80f6ca4febdcef4c11c65fa77b6861dae10bf Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 28 Jun 2013 16:15:37 -0400 Subject: [PATCH 12/48] CLN: make the environment an implementation detail --- pandas/computation/engines.py | 11 ++++--- pandas/computation/eval.py | 61 +++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 342bde7b2beeb..39155ad112847 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -235,11 +235,12 @@ def convert(self): """Convert an expression for evaluation.""" pass - def evaluate(self, env): + def evaluate(self): if not self._is_aligned: - self.result_type, self.aligned_axes = _align(self.expr.terms, env) + self.result_type, self.aligned_axes = _align(self.expr.terms, + self.expr.env) - res = self._evaluate(env) + res = self._evaluate(self.expr.env) return _reconstruct_object(self.result_type, res, self.aligned_axes) @property @@ -284,8 +285,8 @@ def __init__(self, expr): def convert(self): pass - def evaluate(self, env): - return self.expr(env) + def evaluate(self): + return self.expr(self.expr.env) def _evaluate(self, env): pass diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 828ee334d71f9..7788eddf96f87 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -11,39 +11,50 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, global_dict=None): + """Evaluate a Python expression as a string. 
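# The truediv flag, sketched against the test above: on Python 2 integer
# division is preserved unless truediv=True; on Python 3 division is
# always true division (illustrative):
import numpy as np
import pandas as pd

s = np.array([1])
res = pd.eval('s / 1', truediv=False)  # array([1]) on Python 2
res = pd.eval('s / 1', truediv=True)   # array([1.0]) everywhere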
+ + Parameters + ---------- + expr : string or Expr object + engine : string, optional, default 'numexpr' + The engine to use to evaluate the passed expression + truediv : bool, optional, default True + local_dict : dict or None, optional, default None + global_dict : dict or None, optional, default None + + Returns + ------- + obj : ndarray, scalar, DataFrame, Series, or Panel + """ # make sure we're passed a valid engine if not engine in _engines: raise KeyError('Invalid engine {0} passed, valid engines are' ' {1}'.format(_engines.keys())) - # 1 up in the call stack for locals/globals; see the documentation for the - # inspect module for why you must decrease the refcount of frame at all - # costs - frame = sys._getframe(1) + eng = _engines[engine] + + if isinstance(expr, basestring): + frame = sys._getframe(1) - try: - # parse the expression from a string - if isinstance(expr, basestring): - # get the globals and locals - gbl, lcl = (global_dict or frame.f_globals, - local_dict or frame.f_locals) + # get the globals and locals + gbl, lcl = (global_dict or frame.f_globals, + local_dict or frame.f_locals) - # shallow copy the scope so we don't overwrite everything + try: + # shallow copy the scope so we don't overwrite anything env = Scope(gbl.copy(), lcl.copy()) - parsed_expr = Expr(expr, engine, env, truediv) - elif isinstance(expr, Expr): - parsed_expr = expr - else: - raise TypeError("eval only accepts strings and Expr objects, you " - "passed a {0!r}".format(expr.__class__.__name__)) - - # choose the engine - eng = _engines[engine] - - # construct the engine and evaluate - ret = eng(parsed_expr).evaluate(env) - finally: - del frame + finally: + del frame + parsed_expr = Expr(expr, engine, env, truediv) + elif isinstance(expr, Expr): + parsed_expr = expr + else: + raise TypeError("eval only accepts strings and Expr objects, you " + "passed a {0!r}".format(expr.__class__.__name__)) + + + # construct the engine and evaluate + ret = eng(parsed_expr).evaluate() # sanity check for a number if np.isscalar(ret): From 038d79c25cf2c8968a176e37e8b6f2d14e44414a Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 28 Jun 2013 16:26:58 -0400 Subject: [PATCH 13/48] DOC: add docstring to eval --- pandas/computation/eval.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 7788eddf96f87..38248c26f88e3 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -11,20 +11,44 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, global_dict=None): - """Evaluate a Python expression as a string. + """Evaluate a Python expression as a string using various backends. + + The following arithmetic operations are supported: +, -, *, /, **, %, // + (python engine only) along with the following boolean operations: | (or), & + (and), and ~ (not). All Pandas objects are supported and behave as they + would with in-Python evaluation. Parameters ---------- expr : string or Expr object - engine : string, optional, default 'numexpr' - The engine to use to evaluate the passed expression + The expression to evaluate. This can be either a string or an ``Expr`` + object. + engine : string, optional, default 'numexpr', {'python', 'numexpr', 'pytables'} + The engine used to evaluate the expression. Supported engines are + + - 'numexpr': This default engine evaluates pandas objects using numexpr + for large speed ups in complex expressions with large + frames. 
+ - 'python': Performs operations as if you had eval'd in top level + python + - 'pytables': Engine used for evaluating expressions for selection of + objects from PyTables HDF5 tables. + truediv : bool, optional, default True + Whether to use true division, like in Python >= 3 local_dict : dict or None, optional, default None + A dictionary of local variables, taken from locals() by default. global_dict : dict or None, optional, default None + A dictionary of global variables, taken from globals() by default. Returns ------- obj : ndarray, scalar, DataFrame, Series, or Panel + + Notes + ----- + The benefits of using ``eval`` are that very large frames that are terms in + long expressions are sped up, sometimes by as much as 10x. """ # make sure we're passed a valid engine if not engine in _engines: From 599cf32bdaaaf65e26478a6a9ae2e669f6ab7014 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 28 Jun 2013 17:09:42 -0400 Subject: [PATCH 14/48] CLN: cleanup pytables.py a bit --- pandas/io/pytables.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index fdb86c43b7160..a53907c518aab 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -219,7 +219,7 @@ def read_hdf(path_or_buf, key, **kwargs): # a passed store; user controls open/close f(path_or_buf, False) -class HDFStore(StringMixin): +class HDFStore(object): """ dict-like IO interface for storing pandas objects in PyTables format. @@ -421,7 +421,8 @@ def get(self, key): raise KeyError('No object named %s in the file' % key) return self._read_group(group) - def select(self, key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, auto_close=False, **kwargs): + def select(self, key, where=None, start=None, stop=None, columns=None, + iterator=False, chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas object stored in file, optionally based on where criteria @@ -448,14 +449,18 @@ def select(self, key, where=None, start=None, stop=None, columns=None, iterator= # what we are actually going to do for a chunk def func(_start, _stop): - return s.read(where=where, start=_start, stop=_stop, columns=columns, **kwargs) + return s.read(where=where, start=_start, stop=_stop, + columns=columns, **kwargs) if iterator or chunksize is not None: if not s.is_table: raise TypeError("can only use an iterator or chunksize on a table") - return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close) + return TableIterator(self, func, nrows=s.nrows, start=start, + stop=stop, chunksize=chunksize, + auto_close=auto_close) - return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, auto_close=auto_close).get_values() + return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, + auto_close=auto_close).get_values() def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): """ @@ -1620,6 +1625,9 @@ def __unicode__(self): return "%-12.12s (shape->%s)" % (self.pandas_type,s) return self.pandas_type + def __str__(self): + return self.__repr__() + def set_object_info(self): """ set my pandas type & version """ self.attrs.pandas_type = self.pandas_kind From ea769e664d32e413bd44fedd5849ab077e8812f3 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 29 Jun 2013 11:08:00 -0400 Subject: [PATCH 15/48] CLN: clean up engines --- pandas/computation/engines.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff 
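# A sketch of the select() signature tidied in the previous patch; the
# store key and column names here are hypothetical:
from pandas.io.pytables import HDFStore

store = HDFStore('example.h5', mode='r')
df = store.select('df', columns=['A', 'B'], start=0, stop=1000)
store.close()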
--git a/pandas/computation/engines.py b/pandas/computation/engines.py index 39155ad112847..64582192a9874 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -108,17 +108,17 @@ def _align_core(terms): axes = biggest.axes naxes = len(axes) - for i in term_index: - for axis, items in enumerate(terms[i].value.axes): - if com.is_series(terms[i].value) and naxes > 1: - axes[naxes - 1] = axes[naxes - 1].join(terms[i].value.index, - how='outer') + for term in (terms[i] for i in term_index): + for axis, items in enumerate(term.value.axes): + if com.is_series(term.value) and naxes > 1: + ax, itm = naxes - 1, term.value.index else: - axes[axis] = axes[axis].join(items, how='outer') + ax, itm = axis, items + axes[ax] = axes[ax].join(itm, how='outer') for i, ndim in ndims.iteritems(): for axis, items in izip(xrange(ndim), axes): - ti = terms[i].value # needed here because we modify it in the inner loop + ti = terms[i].value if hasattr(ti, 'reindex_axis'): transpose = com.is_series(ti) and naxes > 1 From ff78c08139f2c5f7e632827f313ccbf88aba9100 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 3 Jul 2013 21:21:49 -0400 Subject: [PATCH 16/48] CLN: clean up eval and have the Scope instance auto create the scope if none exists --- pandas/computation/eval.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 38248c26f88e3..591993bc4f228 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -58,17 +58,9 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, eng = _engines[engine] if isinstance(expr, basestring): - frame = sys._getframe(1) - - # get the globals and locals - gbl, lcl = (global_dict or frame.f_globals, - local_dict or frame.f_locals) - - try: - # shallow copy the scope so we don't overwrite anything - env = Scope(gbl.copy(), lcl.copy()) - finally: - del frame + # need to go 2 up in the call stack from the constructor since we want + # the calling scope's variables + env = Scope(global_dict, local_dict, frame_level=2) parsed_expr = Expr(expr, engine, env, truediv) elif isinstance(expr, Expr): parsed_expr = expr @@ -80,7 +72,7 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, # construct the engine and evaluate ret = eng(parsed_expr).evaluate() - # sanity check for a number + # sanity check for a number TODO: eventually take out if np.isscalar(ret): if not isinstance(ret, (np.number, numbers.Number, np.bool_, bool)): raise TypeError('scalar result must be numeric or bool, type is ' From f9f7fd7b6f841eae34ac1795f02320646b15708c Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 3 Jul 2013 21:29:19 -0400 Subject: [PATCH 17/48] CLN: add six.string_types checking instead of basestring --- pandas/computation/eval.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 591993bc4f228..b7d15d1d009bc 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -5,6 +5,8 @@ import numpy as np +import six + from pandas.computation.expr import Expr, Scope from pandas.computation.engines import _engines @@ -57,7 +59,7 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, eng = _engines[engine] - if isinstance(expr, basestring): + if isinstance(expr, six.string_types): # need to go 2 up in the call stack from the constructor since we want # the calling scope's variables env = Scope(global_dict, local_dict, 
frame_level=2) @@ -72,9 +74,11 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, # construct the engine and evaluate ret = eng(parsed_expr).evaluate() - # sanity check for a number TODO: eventually take out + # sanity check for a number + # TODO: eventually take out + # TODO: pytables engine will probably need a string check if np.isscalar(ret): - if not isinstance(ret, (np.number, numbers.Number, np.bool_, bool)): - raise TypeError('scalar result must be numeric or bool, type is ' - '{0!r}'.format(ret.__class__.__name__)) + if not isinstance(ret, (np.number, np.bool_, numbers.Number)): + raise TypeError('scalar result must be numeric or bool, passed ' + 'type is {0!r}'.format(ret.__class__.__name__)) return ret From 48eff13c0418c146dbca43f4893d027d0624ffe2 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 3 Jul 2013 22:13:08 -0400 Subject: [PATCH 18/48] TST: clean up some tests, add minor assertions where none existed --- pandas/io/tests/test_pytables.py | 33 +++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 00d8089ad2ee7..6737408081f3d 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1,8 +1,9 @@ import nose import unittest -import os import sys +import os import warnings +from contextlib import contextmanager import datetime import numpy as np @@ -19,7 +20,6 @@ from pandas import concat, Timestamp from pandas.util import py3compat -from numpy.testing.decorators import slow try: import tables @@ -36,12 +36,12 @@ # contextmanager to ensure the file cleanup def safe_remove(path): if path is not None: - import os try: os.remove(path) except: pass + def safe_close(store): try: if store is not None: @@ -49,7 +49,6 @@ def safe_close(store): except: pass -from contextlib import contextmanager @contextmanager def ensure_clean(path, mode='a', complevel=None, complib=None, @@ -620,7 +619,6 @@ def test_append_with_different_block_ordering(self): store.append('df',df) - def test_ndim_indexables(self): """ test using ndim tables in new ways""" @@ -1011,6 +1009,7 @@ def test_big_table_frame(self): store.append('df', df) rows = store.root.df.table.nrows recons = store.select('df') + assert isinstance(recons, DataFrame) print ("\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x)) @@ -1064,7 +1063,7 @@ def test_big_put_frame(self): with ensure_clean(self.path, mode='w') as store: start_time = time.time() - store = HDFStore(fn, mode='w') + store = HDFStore(self.path, mode='w') store.put('df', df) print (df.get_dtype_counts()) @@ -1092,6 +1091,7 @@ def test_big_table_panel(self): store.append('wp', wp) rows = store.root.wp.table.nrows recons = store.select('wp') + assert isinstance(recons, Panel) print ("\nbig_table panel [%s] -> %5.2f" % (rows, time.time() - x)) @@ -1254,7 +1254,6 @@ def test_table_values_dtypes_roundtrip(self): expected.sort() tm.assert_series_equal(result,expected) - def test_table_mixed_dtypes(self): # frame @@ -2352,7 +2351,6 @@ def test_string_select(self): expected = df[df.int!=2] assert_frame_equal(result,expected) - def test_read_column(self): df = tm.makeTimeDataFrame() @@ -2580,7 +2578,6 @@ def _check_double_roundtrip(self, obj, comparator, compression=False, again = store['obj'] comparator(again, obj, **kwargs) - def _check_roundtrip_table(self, obj, comparator, compression=False): options = {} if compression: @@ -2597,6 +2594,7 @@ def test_pytables_native_read(self): try: store = 
HDFStore(tm.get_data_path('legacy_hdf/pytables_native.h5'), 'r') d2 = store['detector/readout'] + assert isinstance(d2, DataFrame) finally: safe_close(store) @@ -2604,6 +2602,7 @@ def test_pytables_native_read(self): store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native2.h5'), 'r') str(store) d1 = store['detector'] + assert isinstance(d1, DataFrame) finally: safe_close(store) @@ -2653,11 +2652,18 @@ def test_legacy_0_10_read(self): def test_legacy_0_11_read(self): # legacy from 0.11 try: - store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table_0.11.h5'), 'r') + path = os.path.join('legacy_hdf', 'legacy_table_0.11.h5') + store = HDFStore(tm.get_data_path(path), 'r') str(store) + assert 'df' in store + assert 'df1' in store + assert 'mi' in store df = store.select('df') df1 = store.select('df1') mi = store.select('mi') + assert isinstance(df, DataFrame) + assert isinstance(df1, DataFrame) + assert isinstance(mi, DataFrame) finally: safe_close(store) @@ -2665,10 +2671,9 @@ def test_copy(self): def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): try: - import os - if f is None: - f = tm.get_data_path('legacy_hdf/legacy_0.10.h5') + f = tm.get_data_path(os.path.join('legacy_hdf', + 'legacy_0.10.h5')) store = HDFStore(f, 'r') @@ -2738,6 +2743,7 @@ def test_legacy_table_write(self): df = DataFrame(dict(A = 'foo', B = 'bar'),index=range(10)) store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 }) + store.append('wp', wp) store.close() @@ -2824,6 +2830,7 @@ def _test_sort(obj): else: raise ValueError('type not supported here') + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From d87f0271669824091ec3822956011bc0e2b55900 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 08:24:45 -0400 Subject: [PATCH 19/48] CLN: clean up frame.py a bit --- pandas/core/frame.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a8bb74f86a43e..7f0a8492a4403 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5681,6 +5681,7 @@ def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): return create_block_manager_from_arrays(arrays, arr_names, axes) + def extract_index(data): from pandas.core.index import _union_indexes @@ -5941,6 +5942,7 @@ def _homogenize(data, index, dtype=None): return homogenized + def _from_nested_dict(data): # TODO: this should be seriously cythonized new_data = OrderedDict() From 5b58a08d77141ef7ef1faab4d33089bd8f71c64c Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 08:25:17 -0400 Subject: [PATCH 20/48] CLN: clean up pytables arguments a bit --- pandas/core/base.py | 1 + pandas/io/pytables.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 6122e78fa8bce..2caaf00723824 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -39,6 +39,7 @@ def __repr__(self): """ return str(self) + class PandasObject(StringMixin): """baseclass for various pandas objects""" diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a53907c518aab..4a538b22bf939 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -182,7 +182,8 @@ def get_store(path, mode='a', complevel=None, complib=None, ### interface to/from ### -def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, append=None, **kwargs): +def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, + 
append=None, **kwargs): """ store this object, close it if we opened it """ if append: f = lambda store: store.append(key, value, **kwargs) @@ -190,7 +191,8 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, app f = lambda store: store.put(key, value, **kwargs) if isinstance(path_or_buf, basestring): - with get_store(path_or_buf, mode=mode, complevel=complevel, complib=complib) as store: + with get_store(path_or_buf, mode=mode, complevel=complevel, + complib=complib) as store: f(store) else: f(path_or_buf) From 7482a277a8c0309faec6481d0a4885670deb7369 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 12:45:54 -0400 Subject: [PATCH 21/48] CLN: use shiny new string mixin to refactor repring --- pandas/computation/ops.py | 60 ++++++++++++++------------- pandas/computation/tests/test_eval.py | 2 +- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 8c66fd0d122d5..26774c17959fb 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -1,13 +1,15 @@ import operator as op -from functools import partial import numpy as np from pandas.util.py3compat import PY3 import pandas.core.common as com +from pandas.core.base import StringMixin _reductions = 'sum', 'prod' -_mathops = 'sin', 'cos', 'tan' +_mathops = ('sin', 'cos', 'exp', 'log', 'expm1', 'log1p', 'pow', 'div', 'sqrt', + 'inv', 'sinh', 'cosh', 'tanh', 'arcsin', 'arccos', 'arctan', + 'arccosh', 'arcsinh', 'arctanh', 'arctan2', 'abs') class OperatorError(Exception): @@ -47,23 +49,21 @@ def _update_name(env, key, value): raise NameError('{0!r} is undefined'.format(key)) -def _update_names(env, mapping): - updater = partial(_update_name, env) - for key, value in mapping.iteritems(): - updater(key, value) +class NamedObjectMixin(object): + @property + def typename(self): + return com.pprint_thing(self.__class__.__name__) -class Term(object): - def __init__(self, value, name, env): - self.value = value +class Term(StringMixin, NamedObjectMixin): + def __init__(self, name, env): self.name = name + self.value = _resolve_name(env, name) self.env = env - self.type = type(value) - - def __str__(self): - return '{0}({1!r})'.format(self.__class__.__name__, self.name) + self.type = type(self.value) - __repr__ = __str__ + def __unicode__(self): + return com.pprint_thing('{0}({1!r})'.format(self.typename, self.name)) def update(self, value): _update_name(self.env, self.name, value) @@ -76,10 +76,10 @@ def isscalar(self): class Constant(Term): def __init__(self, value, env): - super(Constant, self).__init__(value, value, env) + super(Constant, self).__init__(value, env) -class Op(object): +class Op(NamedObjectMixin, StringMixin): """Hold an operator of unknown arity """ def __init__(self, op, operands): @@ -89,9 +89,13 @@ def __init__(self, op, operands): def __iter__(self): return iter(self.operands) - @property - def name(self): - return self.__class__.__name__ + def __unicode__(self): + op = 'op={0!r}'.format(self.op) + operands = ', '.join('opr_{i}={opr}'.format(i=i, opr=opr) + for i, opr in enumerate(self.operands)) + return com.pprint_thing('{0}({op}, ' + '{operands})'.format(self.typename, op=op, + operands=operands)) _cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=' @@ -113,14 +117,14 @@ def name(self): _binary_ops_dict.update(d) -def _cast(terms, dtype): +def _cast_inplace(terms, dtype): dt = np.dtype(dtype) for term in terms: # cast all the way down the tree since operands must be try: - _cast(term.operands, dtype) + 
_cast_inplace(term.operands, dtype) except AttributeError: - # we've bottomed out so cast + # we've bottomed out so actually do the cast try: new_value = term.value.astype(dt) except AttributeError: @@ -157,13 +161,11 @@ def __init__(self, op, lhs, rhs): raise BinaryOperatorError('Invalid binary operator {0}, valid' ' operators are {1}'.format(op, keys)) - def __repr__(self): + def __unicode__(self): return com.pprint_thing('{0}(op={1!r}, lhs={2!r}, ' - 'rhs={3!r})'.format(self.name, self.op, + 'rhs={3!r})'.format(self.typename, self.op, self.lhs, self.rhs)) - __str__ = __repr__ - def __call__(self, env): # handle truediv if self.op == '/' and env.locals['truediv']: @@ -197,7 +199,7 @@ def __call__(self, env): class Mod(BinOp): def __init__(self, lhs, rhs): super(Mod, self).__init__('%', lhs, rhs) - _cast(self.operands, np.float_) + _cast_inplace(self.operands, np.float_) _unary_ops_syms = '+', '-', '~' @@ -237,7 +239,7 @@ def __call__(self, env): return res - def __repr__(self): + def __unicode__(self): return com.pprint_thing('{0}(op={1!r}, ' - 'operand={2!r})'.format(self.name, self.op, + 'operand={2!r})'.format(self.typename, self.op, self.operand)) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 18fe641db5ed2..15509e2e489df 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -50,7 +50,7 @@ def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2): f2 = _binary_ops_dict[cmp2] bf = _binary_ops_dict[binop] env = Scope() - typ, axes = _align_core((Term(lhs, 'lhs', env), Term(rhs, 'rhs', env))) + typ, axes = _align_core((Term('lhs', env), Term('rhs', env))) lhs, rhs = env.locals['lhs'], env.locals['rhs'] return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes) From 0d40fe182af58cceda1d4fcc99a6556ac83293d3 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:02:16 -0400 Subject: [PATCH 22/48] CLN: move align to its own file --- pandas/computation/align.py | 219 ++++++++++++++++++++++++++++++++++ pandas/computation/engines.py | 218 +-------------------------------- 2 files changed, 220 insertions(+), 217 deletions(-) create mode 100644 pandas/computation/align.py diff --git a/pandas/computation/align.py b/pandas/computation/align.py new file mode 100644 index 0000000000000..f2bf11d41e185 --- /dev/null +++ b/pandas/computation/align.py @@ -0,0 +1,219 @@ +from functools import partial, wraps +from itertools import izip + +import numpy as np + +import pandas as pd +import pandas.core.common as com +from pandas.computation.ops import is_const +from pandas.computation.common import flatten + + +def _align_core_single_unary_op(term): + if isinstance(term.value, np.ndarray) and not com.is_series(term.value): + typ = partial(np.asanyarray, dtype=term.value.dtype) + else: + typ = type(term.value) + ret = typ, + + if not hasattr(term.value, 'axes'): + ret += None, + else: + ret += _zip_axes_from_type(typ, term.value.axes), + return ret + + +def _zip_axes_from_type(typ, new_axes): + axes = {} + for ax_ind, ax_name in typ._AXIS_NAMES.iteritems(): + axes[ax_name] = new_axes[ax_ind] + return axes + + +def _maybe_promote_shape(values, naxes): + # test to see if we have an array else leave since must be a number + if not isinstance(values, np.ndarray): + return values + + ndims = values.ndim + if ndims > naxes: + raise AssertionError('cannot have more dims than axes, ' + '{0} > {1}'.format(ndims, naxes)) + if ndims == naxes: + return values + + ndim = set(xrange(ndims)) + nax = 
set(xrange(naxes)) + + axes_slice = [slice(None)] * naxes + + # symmetric difference of numaxes and ndims + slices = nax - ndim + + if ndims == naxes: + if slices: + raise AssertionError('slices should be empty if ndims == naxes ' + '{0}'.format(slices)) + else: + if not slices: + raise AssertionError('slices should NOT be empty if ndim != naxes ' + '{0}'.format(slices)) + + for sl in slices: + axes_slice[sl] = np.newaxis + + return values[tuple(axes_slice)] + + +def _any_pandas_objects(terms): + """Check a sequence of terms for instances of PandasObject.""" + return any(com.is_pd_obj(term.value) for term in terms) + + +def _filter_special_cases(f): + @wraps(f) + def wrapper(terms): + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) + + # only scalars + elif all(term.isscalar for term in terms): + return np.result_type(*(term.value for term in terms)), None + + # single element ndarrays + all_has_size = all(hasattr(term.value, 'size') for term in terms) + if (all_has_size and all(term.value.size == 1 for term in terms)): + return np.result_type(*(term.value for term in terms)), None + + # no pandas so just punt to the evaluator + if not _any_pandas_objects(terms): + return np.result_type(*(term.value for term in terms)), None + + return f(terms) + return wrapper + + +@_filter_special_cases +def _align_core(terms): + term_index = [i for i, term in enumerate(terms) if hasattr(term.value, + 'axes')] + term_dims = [terms[i].value.ndim for i in term_index] + ndims = pd.Series(dict(zip(term_index, term_dims))) + + # initial axes are the axes of the largest-axis'd term + biggest = terms[ndims.idxmax()].value + typ = biggest._constructor + axes = biggest.axes + naxes = len(axes) + + for term in (terms[i] for i in term_index): + for axis, items in enumerate(term.value.axes): + if com.is_series(term.value) and naxes > 1: + ax, itm = naxes - 1, term.value.index + else: + ax, itm = axis, items + axes[ax] = axes[ax].join(itm, how='outer') + + for i, ndim in ndims.iteritems(): + for axis, items in izip(xrange(ndim), axes): + ti = terms[i].value + + if hasattr(ti, 'reindex_axis'): + transpose = com.is_series(ti) and naxes > 1 + + if transpose: + f = partial(ti.reindex, index=axes[naxes - 1], copy=False) + else: + f = partial(ti.reindex_axis, items, axis=axis, copy=False) + + if pd.lib.is_bool_array(ti.values): + r = f(fill_value=True) + else: + r = f() + + terms[i].update(r) + + res = _maybe_promote_shape(terms[i].value.T if transpose else + terms[i].value, naxes) + res = res.T if transpose else res + + try: + v = res.values + except AttributeError: + v = res + terms[i].update(v) + + return typ, _zip_axes_from_type(typ, axes) + + +def _filter_terms(flat): + # numeric literals + literals = set(filter(is_const, flat)) + + # these are strings which are variable names + names = set(flat) - literals + + # literals are not names and names are not literals, so intersection should + # be empty + if literals & names: + raise ValueError('literals cannot be names and names cannot be ' + 'literals') + return names, literals + + +def _align(terms, env): + # flatten the parse tree (a nested list) + terms = list(flatten(terms)) + + # separate names and literals + names, literals = _filter_terms(terms) + + if not names: # only literals so just promote to a common type + return np.result_type(*literals).type, None + + # if all resolved variables are numeric scalars + if all(term.isscalar for term in terms): + return np.result_type(*(term.value for term in terms)).type, None + + 
# perform the main alignment + typ, axes = _align_core(terms) + return typ, axes + + +def _reconstruct_object(typ, obj, axes): + """Reconstruct an object given its type, raw value, and possibly empty + (None) axes. + + Parameters + ---------- + typ : object + A type + obj : object + The value to use in the type constructor + axes : dict + The axes to use to construct the resulting pandas object + + Returns + ------- + reconst : typ + An object of type ``typ`` with the value `obj` and possible axes + `axes`. + """ + try: + # handle numpy dtypes + typ = typ.type + except AttributeError: + pass + + if (not isinstance(typ, partial) and + issubclass(typ, pd.core.generic.PandasObject)): + return typ(obj, **axes) + + ret_value = typ(obj) + + try: + return ret_value.item() + except (AttributeError, ValueError): + return ret_value + diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 64582192a9874..db6beb87da3a5 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -1,222 +1,6 @@ import abc -from functools import partial, wraps -from itertools import izip -import numpy as np - -import pandas as pd -import pandas.core.common as com -from pandas.computation.ops import is_const -from pandas.computation.common import flatten - - -def _align_core_single_unary_op(term): - if isinstance(term.value, np.ndarray) and not com.is_series(term.value): - typ = partial(np.asanyarray, dtype=term.value.dtype) - else: - typ = type(term.value) - ret = typ, - - if not hasattr(term.value, 'axes'): - ret += None, - else: - ret += _zip_axes_from_type(typ, term.value.axes), - return ret - - -def _zip_axes_from_type(typ, new_axes): - axes = {} - for ax_ind, ax_name in typ._AXIS_NAMES.iteritems(): - axes[ax_name] = new_axes[ax_ind] - return axes - - -def _maybe_promote_shape(values, naxes): - # test to see if we have an array else leave since must be a number - if not isinstance(values, np.ndarray): - return values - - ndims = values.ndim - if ndims > naxes: - raise AssertionError('cannot have more dims than axes, ' - '{0} > {1}'.format(ndims, naxes)) - if ndims == naxes: - return values - - ndim = set(xrange(ndims)) - nax = set(xrange(naxes)) - - axes_slice = [slice(None)] * naxes - - # symmetric difference of numaxes and ndims - slices = nax - ndim - - if ndims == naxes: - if slices: - raise AssertionError('slices should be empty if ndims == naxes ' - '{0}'.format(slices)) - else: - if not slices: - raise AssertionError('slices should NOT be empty if ndim != naxes ' - '{0}'.format(slices)) - - for sl in slices: - axes_slice[sl] = np.newaxis - - return values[tuple(axes_slice)] - - -def _any_pandas_objects(terms): - """Check a sequence of terms for instances of PandasObject.""" - return any(com.is_pd_obj(term.value) for term in terms) - - -def _filter_special_cases(f): - @wraps(f) - def wrapper(terms): - # single unary operand - if len(terms) == 1: - return _align_core_single_unary_op(terms[0]) - - # only scalars - elif all(term.isscalar for term in terms): - return np.result_type(*(term.value for term in terms)), None - - # single element ndarrays - all_has_size = all(hasattr(term.value, 'size') for term in terms) - if (all_has_size and all(term.value.size == 1 for term in terms)): - return np.result_type(*(term.value for term in terms)), None - - # no pandas so just punt to the evaluator - if not _any_pandas_objects(terms): - return np.result_type(*(term.value for term in terms)), None - - return f(terms) - return wrapper - - -@_filter_special_cases -def 
_align_core(terms): - term_index = [i for i, term in enumerate(terms) if hasattr(term.value, - 'axes')] - term_dims = [terms[i].value.ndim for i in term_index] - ndims = pd.Series(dict(zip(term_index, term_dims))) - - # initial axes are the axes of the largest-axis'd term - biggest = terms[ndims.idxmax()].value - typ = biggest._constructor - axes = biggest.axes - naxes = len(axes) - - for term in (terms[i] for i in term_index): - for axis, items in enumerate(term.value.axes): - if com.is_series(term.value) and naxes > 1: - ax, itm = naxes - 1, term.value.index - else: - ax, itm = axis, items - axes[ax] = axes[ax].join(itm, how='outer') - - for i, ndim in ndims.iteritems(): - for axis, items in izip(xrange(ndim), axes): - ti = terms[i].value - - if hasattr(ti, 'reindex_axis'): - transpose = com.is_series(ti) and naxes > 1 - - if transpose: - f = partial(ti.reindex, index=axes[naxes - 1], copy=False) - else: - f = partial(ti.reindex_axis, items, axis=axis, copy=False) - - if pd.lib.is_bool_array(ti.values): - r = f(fill_value=True) - else: - r = f() - - terms[i].update(r) - - res = _maybe_promote_shape(terms[i].value.T if transpose else - terms[i].value, naxes) - res = res.T if transpose else res - - try: - v = res.values - except AttributeError: - v = res - terms[i].update(v) - - return typ, _zip_axes_from_type(typ, axes) - - -def _filter_terms(flat): - # numeric literals - literals = set(filter(is_const, flat)) - - # these are strings which are variable names - names = set(flat) - literals - - # literals are not names and names are not literals, so intersection should - # be empty - if literals & names: - raise ValueError('literals cannot be names and names cannot be ' - 'literals') - return names, literals - - -def _align(terms, env): - # flatten the parse tree (a nested list) - terms = list(flatten(terms)) - - # separate names and literals - names, literals = _filter_terms(terms) - - if not names: # only literals so just promote to a common type - return np.result_type(*literals).type, None - - # if all resolved variables are numeric scalars - if all(term.isscalar for term in terms): - return np.result_type(*(term.value for term in terms)).type, None - - # perform the main alignment - typ, axes = _align_core(terms) - return typ, axes - - -def _reconstruct_object(typ, obj, axes): - """Reconstruct an object given its type, raw value, and possibly empty - (None) axes. - - Parameters - ---------- - typ : object - A type - obj : object - The value to use in the type constructor - axes : dict - The axes to use to construct the resulting pandas object - - Returns - ------- - reconst : typ - An object of type ``typ`` with the value `obj` and possible axes - `axes`. 
- """ - try: - # handle numpy dtypes - typ = typ.type - except AttributeError: - pass - - if (not isinstance(typ, partial) and - issubclass(typ, pd.core.generic.PandasObject)): - return typ(obj, **axes) - - ret_value = typ(obj) - - try: - return ret_value.item() - except (AttributeError, ValueError): - return ret_value +from pandas.computation.align import _align, _reconstruct_object class AbstractEngine(object): From 87957d24f08f09f2f4a8574e435d4a9dad75ec55 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:06:20 -0400 Subject: [PATCH 23/48] CLN: clean up and use new stringmixin for Expr --- pandas/computation/expr.py | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 987f694bf0904..777ac2a03beea 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -2,11 +2,11 @@ import sys from functools import partial - -from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops, Mod +from pandas.core.base import StringMixin +from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms -from pandas.computation.ops import _resolve_name, Term, Constant +from pandas.computation.ops import Term, Constant class Scope(object): @@ -51,8 +51,8 @@ def __init__(self, env): def visit(self, node): if not (isinstance(node, ast.AST) or isinstance(node, basestring)): - raise AssertionError('"node" must be an AST node or a string, you' - ' passed a(n) {0}'.format(node.__class__)) + raise TypeError('"node" must be an AST node or a string, you' + ' passed a(n) {0}'.format(node.__class__)) if isinstance(node, basestring): node = ast.fix_missing_locations(ast.parse(node)) return super(ExprVisitor, self).visit(node) @@ -81,8 +81,7 @@ def visit_UnaryOp(self, node): return op(self.visit(node.operand)) def visit_Name(self, node): - name = node.id - return Term(_resolve_name(self.env, name), name, self.env) + return Term(node.id, self.env) def visit_Num(self, node): return Constant(node.n, self.env) @@ -108,16 +107,14 @@ def visit_Call(self, node): def visit_Attribute(self, node): raise NotImplementedError("attribute access is not yet supported") - def visit_Mod(self, node): - return Mod - -class Expr(object): +class Expr(StringMixin): """Expr object""" - def __init__(self, expr, engine='numexpr', env=None, truediv=True): + def __init__(self, expr, engine='numexpr', env=None, truediv=True, + parsing='strict'): self.expr = expr self.env = env or Scope(frame_level=2) - self._visitor = ExprVisitor(self.env) + self._visitor = ExprVisitor(self.env, parsing) self.terms = self.parse() self.engine = engine self.truediv = truediv @@ -126,19 +123,12 @@ def __call__(self, env): env.locals['truediv'] = self.truediv return self.terms(env) - def __repr__(self): - return '{0} -> {1}'.format(self.expr, self.terms) - - def __str__(self): - return self.expr + def __unicode__(self): + return unicode(self.terms) def parse(self): """return a Termset""" - try: - visited = self._visitor.visit(self.expr) - except SyntaxError as e: - raise e - return visited + return self._visitor.visit(self.expr) def align(self): """align a set of Terms""" From e35cb5cf07b38390bdd66e583d0c98a0ae154193 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:06:36 -0400 Subject: [PATCH 24/48] ENH/CLN: be more careful about unicode --- 
pandas/computation/eval.py | 1 - pandas/computation/expr.py | 5 ++--- pandas/computation/ops.py | 12 +++++------- pandas/computation/tests/test_eval.py | 4 ++-- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index b7d15d1d009bc..e08e0f28d7877 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -1,6 +1,5 @@ #!/usr/bin/env python -import sys import numbers import numpy as np diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 777ac2a03beea..60fea6e935070 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -110,11 +110,10 @@ def visit_Attribute(self, node): class Expr(StringMixin): """Expr object""" - def __init__(self, expr, engine='numexpr', env=None, truediv=True, - parsing='strict'): + def __init__(self, expr, engine='numexpr', env=None, truediv=True): self.expr = expr self.env = env or Scope(frame_level=2) - self._visitor = ExprVisitor(self.env, parsing) + self._visitor = ExprVisitor(self.env) self.terms = self.parse() self.engine = engine self.truediv = truediv diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 26774c17959fb..24000b27a033a 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -63,7 +63,7 @@ def __init__(self, name, env): self.type = type(self.value) def __unicode__(self): - return com.pprint_thing('{0}({1!r})'.format(self.typename, self.name)) + return com.pprint_thing(self.name) def update(self, value): _update_name(self.env, self.name, value) @@ -162,9 +162,8 @@ def __init__(self, op, lhs, rhs): ' operators are {1}'.format(op, keys)) def __unicode__(self): - return com.pprint_thing('{0}(op={1!r}, lhs={2!r}, ' - 'rhs={3!r})'.format(self.typename, self.op, - self.lhs, self.rhs)) + return com.pprint_thing('({0}) {1} ({2})'.format(self.lhs, self.op, + self.rhs)) def __call__(self, env): # handle truediv @@ -240,6 +239,5 @@ def __call__(self, env): return res def __unicode__(self): - return com.pprint_thing('{0}(op={1!r}, ' - 'operand={2!r})'.format(self.typename, self.op, - self.operand)) + return com.pprint_thing('{0}({1})'.format(self.op, self.operand)) + diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 15509e2e489df..0a1356915523a 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -17,8 +17,8 @@ from pandas.core import common as com from pandas import DataFrame, Series from pandas.util.testing import makeCustomDataframe as mkdf -from pandas.computation.engines import (_engines, _align_core, - _reconstruct_object) +from pandas.computation.engines import _engines, _reconstruct_object +from pandas.computation.align import _align_core from pandas.computation.ops import _binary_ops_dict, _unary_ops_dict, Term import pandas.computation.expr as expr from pandas.computation.expressions import _USE_NUMEXPR From 1ceec39bf7e983d0deec9a5dec2fe8583e411a5e Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:30:29 -0400 Subject: [PATCH 25/48] CLN: run autopep8 on pandas/io/pytables.py --- pandas/io/pytables.py | 811 +++++++++++++++++++++++++++--------------- 1 file changed, 520 insertions(+), 291 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4a538b22bf939..013e596320250 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -87,40 +87,40 @@ class AttributeConflictWarning(Warning): # map object types _TYPE_MAP = { - Series : u'series', - SparseSeries 
: u'sparse_series', - TimeSeries : u'series', - DataFrame : u'frame', - SparseDataFrame : u'sparse_frame', - Panel : u'wide', - Panel4D : u'ndim', - SparsePanel : u'sparse_panel' + Series: u'series', + SparseSeries: u'sparse_series', + TimeSeries: u'series', + DataFrame: u'frame', + SparseDataFrame: u'sparse_frame', + Panel: u'wide', + Panel4D: u'ndim', + SparsePanel: u'sparse_panel' } # storer class map _STORER_MAP = { - u'TimeSeries' : 'LegacySeriesStorer', - u'Series' : 'LegacySeriesStorer', - u'DataFrame' : 'LegacyFrameStorer', - u'DataMatrix' : 'LegacyFrameStorer', - u'series' : 'SeriesStorer', - u'sparse_series' : 'SparseSeriesStorer', - u'frame' : 'FrameStorer', - u'sparse_frame' : 'SparseFrameStorer', - u'wide' : 'PanelStorer', - u'sparse_panel' : 'SparsePanelStorer', + u'TimeSeries': 'LegacySeriesStorer', + u'Series': 'LegacySeriesStorer', + u'DataFrame': 'LegacyFrameStorer', + u'DataMatrix': 'LegacyFrameStorer', + u'series': 'SeriesStorer', + u'sparse_series': 'SparseSeriesStorer', + u'frame': 'FrameStorer', + u'sparse_frame': 'SparseFrameStorer', + u'wide': 'PanelStorer', + u'sparse_panel': 'SparsePanelStorer', } # table class map _TABLE_MAP = { - u'generic_table' : 'GenericTable', - u'appendable_frame' : 'AppendableFrameTable', - u'appendable_multiframe' : 'AppendableMultiFrameTable', - u'appendable_panel' : 'AppendablePanelTable', - u'appendable_ndim' : 'AppendableNDimTable', - u'worm' : 'WORMTable', - u'legacy_frame' : 'LegacyFrameTable', - u'legacy_panel' : 'LegacyPanelTable', + u'generic_table': 'GenericTable', + u'appendable_frame': 'AppendableFrameTable', + u'appendable_multiframe': 'AppendableMultiFrameTable', + u'appendable_panel': 'AppendablePanelTable', + u'appendable_ndim': 'AppendableNDimTable', + u'worm': 'WORMTable', + u'legacy_frame': 'LegacyFrameTable', + u'legacy_panel': 'LegacyPanelTable', } # axes map @@ -149,6 +149,7 @@ def _tables(): return _table_mod + def h5_open(path, mode): tables = _tables() return tables.openFile(path, mode) @@ -180,7 +181,7 @@ def get_store(path, mode='a', complevel=None, complib=None, store.close() -### interface to/from ### +# interface to/from ### def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, append=None, **kwargs): @@ -197,9 +198,11 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, else: f(path_or_buf) + def read_hdf(path_or_buf, key, **kwargs): """ read from the store, closeit if we opened it """ - f = lambda store, auto_close: store.select(key, auto_close=auto_close, **kwargs) + f = lambda store, auto_close: store.select( + key, auto_close=auto_close, **kwargs) if isinstance(path_or_buf, basestring): @@ -221,7 +224,9 @@ def read_hdf(path_or_buf, key, **kwargs): # a passed store; user controls open/close f(path_or_buf, False) + class HDFStore(object): + """ dict-like IO interface for storing pandas objects in PyTables format. 
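
For orientation while reading the reformatted class, a minimal sketch of the dict-like interface it exposes; the file name and frame are illustrative, while put, append, keys and the get_store context manager all appear with these signatures elsewhere in this series:

    import pandas as pd
    from pandas.io.pytables import get_store

    df = pd.DataFrame({'A': list(range(5))})

    with get_store('demo.h5', mode='w') as store:  # hypothetical path
        store.put('fixed', df)    # fixed-format storer, whole-object reads
        store.append('tbl', df)   # table format, supports iterator/chunk reads
        print(store.keys())       # ['/fixed', '/tbl']
        roundtripped = store.select('tbl')

Only table-format nodes support iterators, chunksize, and where-based selection, which is exactly the "can only use an iterator or chunksize on a table" restriction enforced in select in this series.
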
@@ -322,7 +327,7 @@ def __unicode__(self): output = '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path)) if len(self.keys()): - keys = [] + keys = [] values = [] for k in self.keys(): @@ -330,10 +335,13 @@ def __unicode__(self): s = self.get_storer(k) if s is not None: keys.append(pprint_thing(s.pathname or k)) - values.append(pprint_thing(s or 'invalid_HDFStore node')) + values.append( + pprint_thing(s or 'invalid_HDFStore node')) except Exception as detail: keys.append(k) - values.append("[invalid_HDFStore node: %s]" % pprint_thing(detail)) + values.append( + "[invalid_HDFStore node: %s]" % + pprint_thing(detail)) output += adjoin(12, keys, values) else: @@ -387,7 +395,7 @@ def open(self, mode='a', warn=True): try: self._handle = h5_open(self._path, self._mode) - except IOError, e: # pragma: no cover + except IOError as e: # pragma: no cover if 'can not be written' in str(e): print ('Opening %s in read-only mode' % self._path) self._handle = h5_open(self._path, 'r') @@ -456,7 +464,8 @@ def func(_start, _stop): if iterator or chunksize is not None: if not s.is_table: - raise TypeError("can only use an iterator or chunksize on a table") + raise TypeError( + "can only use an iterator or chunksize on a table") return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close) @@ -464,7 +473,8 @@ def func(_start, _stop): return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, auto_close=auto_close).get_values() - def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): + def select_as_coordinates( + self, key, where=None, start=None, stop=None, **kwargs): """ return the selection as a Coordinates. @@ -480,7 +490,7 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs def unique(self, key, column, **kwargs): warnings.warn("unique(key,column) is deprecated\n" "use select_column(key,column).unique() instead") - return self.get_storer(key).read_column(column = column, **kwargs).unique() + return self.get_storer(key).read_column(column=column, **kwargs).unique() def select_column(self, key, column, **kwargs): """ @@ -497,9 +507,10 @@ def select_column(self, key, column, **kwargs): raises ValueError if the column can not be extracted indivually (it is part of a data block) """ - return self.get_storer(key).read_column(column = column, **kwargs) + return self.get_storer(key).read_column(column=column, **kwargs) - def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=None, stop=None, iterator=False, chunksize=None, auto_close=False, **kwargs): + def select_as_multiple(self, keys, where=None, selector=None, columns=None, + start=None, stop=None, iterator=False, chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas objects from multiple tables Parameters @@ -533,7 +544,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star selector = keys[0] # collect the tables - tbls = [ self.get_storer(k) for k in keys ] + tbls = [self.get_storer(k) for k in keys] # validate rows nrows = None @@ -541,24 +552,32 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star if t is None: raise TypeError("Invalid table [%s]" % k) if not t.is_table: - raise TypeError("object [%s] is not a table, and cannot be used in all select as multiple" % t.pathname) + raise TypeError( + "object [%s] is not a table, and cannot be used in all select as multiple" % + t.pathname) if nrows is None: nrows = t.nrows 
elif t.nrows != nrows: - raise ValueError("all tables must have exactly the same nrows!") + raise ValueError( + "all tables must have exactly the same nrows!") # select coordinates from the selector table try: - c = self.select_as_coordinates(selector, where, start=start, stop=stop) + c = self.select_as_coordinates( + selector, + where, + start=start, + stop=stop) nrows = len(c) - except (Exception), detail: + except (Exception) as detail: raise ValueError("invalid selector [%s]" % selector) def func(_start, _stop): # collect the returns objs - objs = [t.read(where=c[_start:_stop], columns=columns) for t in tbls] + objs = [t.read(where=c[_start:_stop], columns=columns) + for t in tbls] # axis is the concentation axes axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0] @@ -571,7 +590,6 @@ def func(_start, _stop): return TableIterator(self, func, nrows=nrows, start=start, stop=stop, auto_close=auto_close).get_values() - def put(self, key, value, table=None, append=False, **kwargs): """ Store object in HDFStore @@ -617,7 +635,8 @@ def remove(self, key, where=None, start=None, stop=None): except: if where is not None: - raise ValueError("trying to remove a node with a non-None where clause!") + raise ValueError( + "trying to remove a node with a non-None where clause!") # we are actually trying to remove a node (with children) s = self.get_node(key) @@ -635,8 +654,9 @@ def remove(self, key, where=None, start=None, stop=None): # delete from the table else: if not s.is_table: - raise ValueError('can only remove with where on objects written as tables') - return s.delete(where = where, start=start, stop=stop) + raise ValueError( + 'can only remove with where on objects written as tables') + return s.delete(where=where, start=start, stop=stop) def append(self, key, value, columns=None, **kwargs): """ @@ -660,11 +680,13 @@ def append(self, key, value, columns=None, **kwargs): data in the table, so be careful """ if columns is not None: - raise Exception("columns is not a supported keyword in append, try data_columns") + raise Exception( + "columns is not a supported keyword in append, try data_columns") self._write_to_group(key, value, table=True, append=True, **kwargs) - def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs): + def append_to_multiple( + self, d, value, selector, data_columns=None, axes=None, **kwargs): """ Append to multiple tables @@ -683,13 +705,16 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, * """ if axes is not None: - raise Exception("axes is currently not accepted as a paremter to append_to_multiple; you can create the tables indepdently instead") + raise Exception( + "axes is currently not accepted as a paremter to append_to_multiple; you can create the tables indepdently instead") if not isinstance(d, dict): - raise ValueError("append_to_multiple must have a dictionary specified as the way to split the value") + raise ValueError( + "append_to_multiple must have a dictionary specified as the way to split the value") if selector not in d: - raise ValueError("append_to_multiple requires a selector that is in passed dict") + raise ValueError( + "append_to_multiple requires a selector that is in passed dict") # figure out the splitting axis (the non_index_axis) axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0] @@ -700,7 +725,8 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, * for k, v in d.items(): if v is None: if remain_key is not None: - raise 
ValueError("append_to_multiple can only have one value in d that is None") + raise ValueError( + "append_to_multiple can only have one value in d that is None") remain_key = k else: remain_values.extend(v) @@ -741,7 +767,8 @@ def create_table_index(self, key, **kwargs): raise Exception("PyTables >= 2.3 is required for table indexing") s = self.get_storer(key) - if s is None: return + if s is None: + return if not s.is_table: raise TypeError("cannot create table index on a non-table") @@ -750,8 +777,8 @@ def create_table_index(self, key, **kwargs): def groups(self): """ return a list of all the top-level nodes (that are not themselves a pandas storage object) """ _tables() - return [ g for g in self._handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr( - g,'table',None) or (isinstance(g,_table_mod.table.Table) and g._v_name != u'table') ] + return [g for g in self._handle.walkNodes() if getattr(g._v_attrs, 'pandas_type', None) or getattr( + g, 'table', None) or (isinstance(g, _table_mod.table.Table) and g._v_name != u'table')] def get_node(self, key): """ return the node with the key or None if it does not exist """ @@ -771,8 +798,9 @@ def get_storer(self, key): s.infer_axes() return s - def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None, complevel = None, - fletcher32 = False, overwrite = True): + def copy( + self, file, mode='w', propindexes=True, keys=None, complib=None, complevel=None, + fletcher32=False, overwrite=True): """ copy the existing store to a new file, upgrading in place Parameters @@ -787,13 +815,18 @@ def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None open file handle of the new store """ - new_store = HDFStore(file, mode = mode, complib = complib, complevel = complevel, fletcher32 = fletcher32) + new_store = HDFStore( + file, + mode=mode, + complib=complib, + complevel=complevel, + fletcher32=fletcher32) if keys is None: keys = self.keys() - if not isinstance(keys, (tuple,list)): - keys = [ keys ] + if not isinstance(keys, (tuple, list)): + keys = [keys] for k in keys: - s = self.get_storer(k) + s = self.get_storer(k) if s is not None: if k in new_store: @@ -805,35 +838,45 @@ def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None index = False if propindexes: - index = [ a.name for a in s.axes if a.is_indexed ] - new_store.append(k, data, index=index, data_columns=getattr(s,'data_columns',None), encoding=s.encoding) + index = [a.name for a in s.axes if a.is_indexed] + new_store.append( + k, + data, + index=index, + data_columns=getattr( + s, + 'data_columns', + None), + encoding=s.encoding) else: new_store.put(k, data, encoding=s.encoding) return new_store - ###### private methods ###### + # private methods ###### - def _create_storer(self, group, value = None, table = False, append = False, **kwargs): + def _create_storer( + self, group, value=None, table=False, append=False, **kwargs): """ return a suitable Storer class to operate """ def error(t): raise TypeError("cannot properly create the storer for: [%s] [group->%s,value->%s,table->%s,append->%s,kwargs->%s]" % - (t,group,type(value),table,append,kwargs)) + (t, group, type(value), table, append, kwargs)) - pt = _ensure_decoded(getattr(group._v_attrs,'pandas_type',None)) - tt = _ensure_decoded(getattr(group._v_attrs,'table_type',None)) + pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None)) + tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None)) # infer the pt from the passed value if pt 
is None: if value is None: _tables() - if getattr(group,'table',None) or isinstance(group,_table_mod.table.Table): + if getattr(group, 'table', None) or isinstance(group, _table_mod.table.Table): pt = u'frame_table' tt = u'generic_table' else: - raise TypeError("cannot create a storer if the object is not existing nor a value are passed") + raise TypeError( + "cannot create a storer if the object is not existing nor a value are passed") else: try: @@ -859,14 +902,14 @@ def error(t): if value is not None: if pt == u'frame_table': - index = getattr(value,'index',None) + index = getattr(value, 'index', None) if index is not None: if index.nlevels == 1: tt = u'appendable_frame' elif index.nlevels > 1: tt = u'appendable_multiframe' elif pt == u'wide_table': - tt = u'appendable_panel' + tt = u'appendable_panel' elif pt == u'ndim_table': tt = u'appendable_ndim' @@ -886,8 +929,9 @@ def error(t): except: error('_TABLE_MAP') - def _write_to_group(self, key, value, index=True, table=False, append=False, - complib=None, encoding=None, **kwargs): + def _write_to_group( + self, key, value, index=True, table=False, append=False, + complib=None, encoding=None, **kwargs): group = self.get_node(key) # remove the node if we are not appending @@ -927,16 +971,18 @@ def _write_to_group(self, key, value, index=True, table=False, append=False, if not s.is_table and complib: raise ValueError('Compression not supported on non-table') - s.write(obj = value, append=append, complib=complib, **kwargs) + s.write(obj=value, append=append, complib=complib, **kwargs) if s.is_table and index: - s.create_index(columns = index) + s.create_index(columns=index) def _read_group(self, group, **kwargs): s = self._create_storer(group) s.infer_axes() return s.read(**kwargs) + class TableIterator(object): + """ define the iteration interface on a table Parameters @@ -953,15 +999,16 @@ class TableIterator(object): kwargs : the passed kwargs """ - def __init__(self, store, func, nrows, start=None, stop=None, chunksize=None, auto_close=False): + def __init__(self, store, func, nrows, start=None, + stop=None, chunksize=None, auto_close=False): self.store = store - self.func = func + self.func = func self.nrows = nrows or 0 self.start = start or 0 if stop is None: stop = self.nrows - self.stop = min(self.nrows,stop) + self.stop = min(self.nrows, stop) if chunksize is None: chunksize = 100000 @@ -992,7 +1039,9 @@ def get_values(self): self.close() return results + class IndexCol(StringMixin): + """ an index column description class Parameters @@ -1008,11 +1057,12 @@ class IndexCol(StringMixin): is_an_indexable = True is_data_indexable = True is_searchable = False - _info_fields = ['freq','tz','index_name'] + _info_fields = ['freq', 'tz', 'index_name'] - def __init__(self, values=None, kind=None, typ=None, cname=None, itemsize=None, - name=None, axis=None, kind_attr=None, pos=None, freq=None, tz=None, - index_name=None, **kwargs): + def __init__( + self, values=None, kind=None, typ=None, cname=None, itemsize=None, + name=None, axis=None, kind_attr=None, pos=None, freq=None, tz=None, + index_name=None, **kwargs): self.values = values self.kind = kind self.typ = typ @@ -1059,7 +1109,13 @@ def set_table(self, table): return self def __unicode__(self): - temp = tuple(map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))) + temp = tuple( + map(pprint_thing, + (self.name, + self.cname, + self.axis, + self.pos, + self.kind))) return "name->%s,cname->%s,axis->%s,pos->%s,kind->%s" % temp def __eq__(self, other): @@ -1073,7 
+1129,7 @@ def __ne__(self, other): def is_indexed(self): """ return whether I am an indexed column """ try: - return getattr(self.table.cols,self.cname).is_indexed + return getattr(self.table.cols, self.cname).is_indexed except: False @@ -1095,7 +1151,7 @@ def convert(self, values, nan_rep, encoding): except: pass - values =_maybe_convert(values, self.kind, encoding) + values = _maybe_convert(values, self.kind, encoding) kwargs = dict() if self.freq is not None: @@ -1106,15 +1162,22 @@ def convert(self, values, nan_rep, encoding): self.values = Index(values, **kwargs) except: - # if the output freq is different that what we recorded, then infer it + # if the output freq is different that what we recorded, then infer + # it if 'freq' in kwargs: kwargs['freq'] = 'infer' - self.values = Index(_maybe_convert(values, self.kind, encoding), **kwargs) + self.values = Index( + _maybe_convert( + values, + self.kind, + encoding), + **kwargs) # set the timezone if indicated # we stored in utc, so reverse to local timezone if self.tz is not None: - self.values = self.values.tz_localize('UTC').tz_convert(_ensure_decoded(self.tz)) + self.values = self.values.tz_localize( + 'UTC').tz_convert(_ensure_decoded(self.tz)) return self @@ -1177,7 +1240,7 @@ def validate_col(self, itemsize=None): raise ValueError("Trying to store a string with len [%s] in [%s] column but\n" "this column has a limit of [%s]!\n" "Consider using min_itemsize to preset the sizes on these columns" - % (itemsize,self.cname, c.itemsize)) + % (itemsize, self.cname, c.itemsize)) return c.itemsize return None @@ -1196,7 +1259,7 @@ def update_info(self, info): for key in self._info_fields: - value = getattr(self,key,None) + value = getattr(self, key, None) try: idx = info[self.name] @@ -1207,18 +1270,18 @@ def update_info(self, info): if key in idx and value is not None and existing_value != value: # frequency/name just warn - if key in ['freq','index_name']: - ws = attribute_conflict_doc % (key,existing_value,value) + if key in ['freq', 'index_name']: + ws = attribute_conflict_doc % (key, existing_value, value) warnings.warn(ws, AttributeConflictWarning) # reset idx[key] = None - setattr(self,key,None) + setattr(self, key, None) else: raise ValueError("invalid info for [%s] for [%s]""" ", existing_value [%s] conflicts with new value [%s]" % (self.name, - key,existing_value,value)) + key, existing_value, value)) else: if value is not None or existing_value is not None: idx[key] = value @@ -1239,7 +1302,9 @@ def set_attr(self): """ set the kind for this colummn """ setattr(self.attrs, self.kind_attr, self.kind) + class GenericIndexCol(IndexCol): + """ an index which is not represented in the data of the table """ @property @@ -1258,7 +1323,9 @@ def get_attr(self): def set_attr(self): pass + class DataCol(IndexCol): + """ a data holding column, by definition this is not indexable Parameters @@ -1273,7 +1340,8 @@ class DataCol(IndexCol): _info_fields = ['tz'] @classmethod - def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs): + def create_for_block( + cls, i=None, name=None, cname=None, version=None, **kwargs): """ return a new datacol with the block i """ if cname is None: @@ -1293,7 +1361,8 @@ def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs) return cls(name=name, cname=cname, **kwargs) - def __init__(self, values=None, kind=None, typ=None, cname=None, data=None, block=None, **kwargs): + def __init__(self, values=None, kind=None, typ=None, + cname=None, data=None, block=None, 
**kwargs): super(DataCol, self).__init__( values=values, kind=kind, typ=typ, cname=cname, **kwargs) self.dtype = None @@ -1337,13 +1406,16 @@ def set_kind(self): elif dtype.startswith(u'bool'): self.kind = 'bool' else: - raise AssertionError("cannot interpret dtype of [%s] in [%s]" % (dtype,self)) + raise AssertionError( + "cannot interpret dtype of [%s] in [%s]" % + (dtype, self)) # set my typ if we need if self.typ is None: - self.typ = getattr(self.description,self.cname,None) + self.typ = getattr(self.description, self.cname, None) - def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=None, **kwargs): + def set_atom(self, block, existing_col, min_itemsize, + nan_rep, info, encoding=None, **kwargs): """ create and setup my atom from the block b """ self.values = list(block.items) @@ -1357,7 +1429,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No raise TypeError( "[date] is not implemented as a table column") elif inferred_type == 'datetime': - if getattr(rvalues[0],'tzinfo',None) is not None: + if getattr(rvalues[0], 'tzinfo', None) is not None: # if this block has more than one timezone, raise if len(set([r.tzinfo for r in rvalues])) != 1: @@ -1366,7 +1438,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No # convert this column to datetime64[ns] utc, and save the tz index = DatetimeIndex(rvalues) - tz = getattr(index,'tz',None) + tz = getattr(index, 'tz', None) if tz is None: raise TypeError( "invalid timezone specification") @@ -1380,7 +1452,9 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No self.tz = zone self.update_info(info) - self.set_atom_datetime64(block, values.reshape(block.values.shape)) + self.set_atom_datetime64( + block, + values.reshape(block.values.shape)) else: raise TypeError( @@ -1392,7 +1466,12 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No # this is basically a catchall; if say a datetime64 has nans then will # end up here ### elif inferred_type == 'string' or dtype == 'object': - self.set_atom_string(block, existing_col, min_itemsize, nan_rep, encoding) + self.set_atom_string( + block, + existing_col, + min_itemsize, + nan_rep, + encoding) else: self.set_atom_data(block) @@ -1401,16 +1480,18 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) - def set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding): + def set_atom_string( + self, block, existing_col, min_itemsize, nan_rep, encoding): # fill nan items with myself block = block.fillna(nan_rep) - data = block.values + data = block.values # see if we have a valid string type inferred_type = lib.infer_dtype(data.ravel()) if inferred_type != 'string': - # we cannot serialize this data, so report an exception on a column by column basis + # we cannot serialize this data, so report an exception on a column + # by column basis for item in block.items: col = block.get(item) @@ -1418,8 +1499,7 @@ def set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding): if inferred_type != 'string': raise TypeError("Cannot serialize the column [%s] because\n" "its data contents are [%s] object dtype" % - (item,inferred_type)) - + (item, inferred_type)) # itemsize is the maximum length of a string (along any dimension) itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) 
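
The string-column sizing logic in set_atom_string above is what the min_itemsize keyword feeds; a short sketch of the intended use (the file name and frames are hypothetical, while the keyword itself appears verbatim in the tests earlier in this series):

    import pandas as pd
    from pandas.io.pytables import HDFStore

    store = HDFStore('strings.h5', mode='w')
    # reserve 30 bytes per value in column A up front so that longer strings
    # can be appended later without failing validate_col's itemsize check
    store.append('df', pd.DataFrame({'A': ['ab', 'cd']}),
                 min_itemsize={'A': 30})
    store.append('df', pd.DataFrame({'A': ['a much longer string here']}))
    store.close()

Without the preset, the second append would raise the "Consider using min_itemsize" ValueError from validate_col shown above.
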
@@ -1464,7 +1544,7 @@ def set_atom_data(self, block): def get_atom_datetime64(self, block): return _tables().Int64Col(shape=block.shape[0]) - def set_atom_datetime64(self, block, values = None): + def set_atom_datetime64(self, block, values=None): self.kind = 'datetime64' self.typ = self.get_atom_datetime64(block) if values is None: @@ -1487,13 +1567,13 @@ def validate_attr(self, append): if (existing_fields is not None and existing_fields != list(self.values)): raise ValueError("appended items do not match existing items" - " in table!") + " in table!") existing_dtype = getattr(self.attrs, self.dtype_attr, None) if (existing_dtype is not None and existing_dtype != self.dtype): raise ValueError("appended items dtype do not match existing items dtype" - " in table!") + " in table!") def convert(self, values, nan_rep, encoding): """ set the data from this selection (and convert to the correct dtype if we can) """ @@ -1515,8 +1595,12 @@ def convert(self, values, nan_rep, encoding): # data should be 2-dim here # we stored as utc, so just set the tz - index = DatetimeIndex(self.data.ravel(),tz='UTC').tz_convert(self.tz) - self.data = np.array(index.tolist(),dtype=object).reshape(self.data.shape) + index = DatetimeIndex( + self.data.ravel(), + tz='UTC').tz_convert(self.tz) + self.data = np.array( + index.tolist(), + dtype=object).reshape(self.data.shape) else: self.data = np.asarray(self.data, dtype='M8[ns]') @@ -1537,14 +1621,17 @@ def convert(self, values, nan_rep, encoding): # convert nans / decode if _ensure_decoded(self.kind) == u'string': - self.data = _unconvert_string_array(self.data, nan_rep=nan_rep, encoding=encoding) + self.data = _unconvert_string_array( + self.data, + nan_rep=nan_rep, + encoding=encoding) return self def get_attr(self): """ get the data for this colummn """ self.values = getattr(self.attrs, self.kind_attr, None) - self.dtype = getattr(self.attrs, self.dtype_attr, None) + self.dtype = getattr(self.attrs, self.dtype_attr, None) self.set_kind() def set_attr(self): @@ -1555,6 +1642,7 @@ def set_attr(self): class DataIndexableCol(DataCol): + """ represent a data column that can be indexed """ is_data_indexable = True @@ -1571,13 +1659,17 @@ def get_atom_data(self, block): def get_atom_datetime64(self, block): return _tables().Int64Col() + class GenericDataIndexableCol(DataIndexableCol): + """ represent a generic pytables data column """ def get_attr(self): pass + class Storer(StringMixin): + """ represent an object in my store facilitate read/write of various types of objects this is an abstract base class @@ -1589,14 +1681,14 @@ class Storer(StringMixin): group : the group node where the table resides """ pandas_kind = None - obj_type = None - ndim = None - is_table = False + obj_type = None + ndim = None + is_table = False def __init__(self, parent, group, encoding=None, **kwargs): - self.parent = parent - self.group = group - self.encoding = _ensure_encoding(encoding) + self.parent = parent + self.group = group + self.encoding = _ensure_encoding(encoding) self.set_version() @property @@ -1605,7 +1697,11 @@ def is_old_version(self): def set_version(self): """ compute and set our version """ - version = _ensure_decoded(getattr(self.group._v_attrs,'pandas_version',None)) + version = _ensure_decoded( + getattr( + self.group._v_attrs, + 'pandas_version', + None)) try: self.version = tuple([int(x) for x in version.split('.')]) if len(self.version) == 2: @@ -1622,9 +1718,9 @@ def __unicode__(self): self.infer_axes() s = self.shape if s is not None: - if isinstance(s, 
(list,tuple)): + if isinstance(s, (list, tuple)): s = "[%s]" % ','.join([pprint_thing(x) for x in s]) - return "%-12.12s (shape->%s)" % (self.pandas_type,s) + return "%-12.12s (shape->%s)" % (self.pandas_type, s) return self.pandas_type def __str__(self): @@ -1695,14 +1791,15 @@ def is_exists(self): @property def nrows(self): - return getattr(self.storable,'nrows',None) + return getattr(self.storable, 'nrows', None) def validate(self, other): """ validate against an existing storable """ - if other is None: return + if other is None: + return return True - def validate_version(self, where = None): + def validate_version(self, where=None): """ are we trying to operate on an old version? """ return True @@ -1717,12 +1814,14 @@ def infer_axes(self): return True def read(self, **kwargs): - raise NotImplementedError("cannot read on an abstract storer: subclasses should implement") + raise NotImplementedError( + "cannot read on an abstract storer: subclasses should implement") def write(self, **kwargs): - raise NotImplementedError("cannot write on an abstract storer: sublcasses should implement") + raise NotImplementedError( + "cannot write on an abstract storer: subclasses should implement") - def delete(self, where = None, **kwargs): + def delete(self, where=None, **kwargs): """ support fully deleting the node in its entirety (only) - where specification must be None """ if where is None: self._handle.removeNode(self.group, recursive=True) @@ -1730,11 +1829,14 @@ def delete(self, where = None, **kwargs): raise TypeError("cannot delete on an abstract storer") + class GenericStorer(Storer): + """ a generified storer version """ - _index_type_map = { DatetimeIndex: 'datetime', - PeriodIndex: 'period'} - _reverse_index_map = dict([ (v,k) for k, v in _index_type_map.iteritems() ]) + _index_type_map = {DatetimeIndex: 'datetime', + PeriodIndex: 'period'} + _reverse_index_map = dict([(v, k) + for k, v in _index_type_map.iteritems()]) attributes = [] # indexer helpers @@ -1756,9 +1858,11 @@ def f(values, freq=None, tz=None): def validate_read(self, kwargs): if kwargs.get('columns') is not None: - raise TypeError("cannot pass a column specification when reading a Storer") + raise TypeError( + "cannot pass a column specification when reading a Storer") if kwargs.get('where') is not None: - raise TypeError("cannot pass a where specification when reading a Storer") + raise TypeError( + "cannot pass a where specification when reading a Storer") @property def is_exists(self): @@ -1770,9 +1874,9 @@ def set_attrs(self): def get_attrs(self): """ retrieve our attributes """ - self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None)) + self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None)) for n in self.attributes: - setattr(self,n,_ensure_decoded(getattr(self.attrs, n, None))) + setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) def write(self, obj, **kwargs): self.set_attrs() @@ -1833,7 +1937,7 @@ def write_index(self, key, index): self.write_sparse_intindex(key, index) else: setattr(self.attrs, '%s_variety' % key, 'regular') - converted = _convert_index(index,self.encoding).set_name('index') + converted = _convert_index(index, self.encoding).set_name('index') self.write_array(key, converted.values) node = getattr(self.group, key) node._v_attrs.kind = converted.kind @@ -1851,7 +1955,6 @@ def write_index(self, key, index): zone = tslib.tot_seconds(index.tz.utcoffset()) node._v_attrs.tz = zone - def write_block_index(self, key, index): self.write_array('%s_blocs' % key,
index.blocs) self.write_array('%s_blengths' % key, index.blengths) @@ -1931,10 +2034,15 @@ def read_index_node(self, node): kwargs['tz'] = node._v_attrs['tz'] if kind in (u'date', u'datetime'): - index = factory(_unconvert_index(data, kind, encoding=self.encoding), dtype=object, - **kwargs) + index = factory( + _unconvert_index(data, kind, encoding=self.encoding), dtype=object, + **kwargs) else: - index = factory(_unconvert_index(data, kind, encoding=self.encoding), **kwargs) + index = factory( + _unconvert_index(data, + kind, + encoding=self.encoding), + **kwargs) index.name = name @@ -1985,7 +2093,8 @@ def write_array(self, key, value, items=None): if value.dtype.type == np.object_: - # infer the type, warn if we have a non-string type here (for performance) + # infer the type, warn if we have a non-string type here (for + # performance) inferred_type = lib.infer_dtype(value.ravel()) if empty_array: pass @@ -1996,11 +2105,11 @@ def write_array(self, key, value, items=None): items = list(items) except: pass - ws = performance_doc % (inferred_type,key,items) + ws = performance_doc % (inferred_type, key, items) warnings.warn(ws, PerformanceWarning) vlarr = self._handle.createVLArray(self.group, key, - _tables().ObjectAtom()) + _tables().ObjectAtom()) vlarr.append(value) elif value.dtype.type == np.datetime64: self._handle.createArray(self.group, key, value.view('i8')) @@ -2013,14 +2122,16 @@ def write_array(self, key, value, items=None): getattr(self.group, key)._v_attrs.transposed = transposed + class LegacyStorer(GenericStorer): def read_index_legacy(self, key): - node = getattr(self.group,key) + node = getattr(self.group, key) data = node[:] kind = node._v_attrs.kind return _unconvert_index_legacy(data, kind, encoding=self.encoding) + class LegacySeriesStorer(LegacyStorer): def read(self, **kwargs): @@ -2029,6 +2140,7 @@ def read(self, **kwargs): values = self.read_array('values') return Series(values, index=index) + class LegacyFrameStorer(LegacyStorer): def read(self, **kwargs): @@ -2038,6 +2150,7 @@ def read(self, **kwargs): values = self.read_array('values') return DataFrame(values, index=index, columns=columns) + class SeriesStorer(GenericStorer): pandas_kind = u'series' attributes = ['name'] @@ -2045,7 +2158,7 @@ class SeriesStorer(GenericStorer): @property def shape(self): try: - return len(getattr(self.group,'values')), + return len(getattr(self.group, 'values')), except: return None @@ -2065,9 +2178,10 @@ def write(self, obj, **kwargs): self.write_array('values', obj.values) self.attrs.name = obj.name + class SparseSeriesStorer(GenericStorer): pandas_kind = u'sparse_series' - attributes = ['name','fill_value','kind'] + attributes = ['name', 'fill_value', 'kind'] def read(self, **kwargs): self.validate_read(kwargs) @@ -2087,9 +2201,10 @@ def write(self, obj, **kwargs): self.attrs.fill_value = obj.fill_value self.attrs.kind = obj.kind + class SparseFrameStorer(GenericStorer): pandas_kind = u'sparse_frame' - attributes = ['default_kind','default_fill_value'] + attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): self.validate_read(kwargs) @@ -2097,7 +2212,7 @@ def read(self, **kwargs): sdict = {} for c in columns: key = 'sparse_series_%s' % c - s = SparseSeriesStorer(self.parent, getattr(self.group,key)) + s = SparseSeriesStorer(self.parent, getattr(self.group, key)) s.infer_axes() sdict[c] = s.read() return SparseDataFrame(sdict, columns=columns, @@ -2116,12 +2231,13 @@ def write(self, obj, **kwargs): s = SparseSeriesStorer(self.parent, node) s.write(ss) 
self.attrs.default_fill_value = obj.default_fill_value - self.attrs.default_kind = obj.default_kind + self.attrs.default_kind = obj.default_kind self.write_index('columns', obj.columns) + class SparsePanelStorer(GenericStorer): pandas_kind = u'sparse_panel' - attributes = ['default_kind','default_fill_value'] + attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): self.validate_read(kwargs) @@ -2131,7 +2247,7 @@ def read(self, **kwargs): for name in items: key = 'sparse_frame_%s' % name node = getattr(self.group, key) - s = SparseFrameStorer(self.parent, getattr(self.group,key)) + s = SparseFrameStorer(self.parent, getattr(self.group, key)) s.infer_axes() sdict[name] = s.read() return SparsePanel(sdict, items=items, default_kind=self.default_kind, @@ -2140,7 +2256,7 @@ def write(self, obj, **kwargs): super(SparsePanelStorer, self).write(obj, **kwargs) self.attrs.default_fill_value = obj.default_fill_value - self.attrs.default_kind = obj.default_kind + self.attrs.default_kind = obj.default_kind self.write_index('items', obj.items) for name, sdf in obj.iterkv(): @@ -2152,8 +2268,9 @@ def write(self, obj, **kwargs): s = SparseFrameStorer(self.parent, node) s.write(sdf) + class BlockManagerStorer(GenericStorer): - attributes = ['ndim','nblocks'] + attributes = ['ndim', 'nblocks'] is_shape_reversed = False @property @@ -2165,15 +2282,15 @@ def shape(self): items = 0 for i in range(self.nblocks): node = getattr(self.group, 'block%d_items' % i) - shape = getattr(node,'shape',None) + shape = getattr(node, 'shape', None) if shape is not None: items += shape[0] # data shape node = getattr(self.group, 'block0_values') - shape = getattr(node,'shape',None) + shape = getattr(node, 'shape', None) if shape is not None: - shape = list(shape[0:(ndim-1)]) + shape = list(shape[0:(ndim - 1)]) else: shape = [] @@ -2223,20 +2340,24 @@ def write(self, obj, **kwargs): self.write_array('block%d_values' % i, blk.values, items=blk.items) self.write_index('block%d_items' % i, blk.items) + class FrameStorer(BlockManagerStorer): pandas_kind = u'frame' - obj_type = DataFrame + obj_type = DataFrame + class PanelStorer(BlockManagerStorer): pandas_kind = u'wide' - obj_type = Panel + obj_type = Panel is_shape_reversed = True def write(self, obj, **kwargs): obj._consolidate_inplace() return super(PanelStorer, self).write(obj, **kwargs) + class Table(Storer): + """ represent a table: facilitate read/write of various types of tables @@ -2254,20 +2375,20 @@ class Table(Storer): """ pandas_kind = u'wide_table' - table_type = None - levels = 1 - is_table = True + table_type = None + levels = 1 + is_table = True is_shape_reversed = False def __init__(self, *args, **kwargs): super(Table, self).__init__(*args, **kwargs) - self.index_axes = [] + self.index_axes = [] self.non_index_axes = [] - self.values_axes = [] - self.data_columns = [] - self.info = dict() - self.nan_rep = None - self.selection = None + self.values_axes = [] + self.data_columns = [] + self.info = dict() + self.nan_rep = None + self.selection = None @property def table_type_short(self): @@ -2276,18 +2397,21 @@ def table_type_short(self): def __repr__(self): """ return a pretty representation of myself """ self.infer_axes() - dc = ",dc->[%s]" % ','.join(self.data_columns) if len(self.data_columns) else '' + dc = ",dc->[%s]" % ','.join( + self.data_columns) if len( + self.data_columns) else '' ver = '' if self.is_old_version: - ver = "[%s]" % '.'.join([ str(x) for x in self.version ]) + ver = "[%s]" % '.'.join([str(x)
for x in self.version]) return "%-12.12s%s (typ->%s,nrows->%s,ncols->%s,indexers->[%s]%s)" % (self.pandas_type, ver, self.table_type_short, self.nrows, self.ncols, - ','.join([ a.name for a in self.index_axes ]), + ','.join( + [a.name for a in self.index_axes]), dc) def __getitem__(self, c): @@ -2299,30 +2423,35 @@ def __getitem__(self, c): def validate(self, other): """ validate against an existing table """ - if other is None: return + if other is None: + return if other.table_type != self.table_type: raise TypeError("incompatible table_type with existing [%s - %s]" % (other.table_type, self.table_type)) - for c in ['index_axes','non_index_axes','values_axes']: - sv = getattr(self,c,None) - ov = getattr(other,c,None) + for c in ['index_axes', 'non_index_axes', 'values_axes']: + sv = getattr(self, c, None) + ov = getattr(other, c, None) if sv != ov: # show the error for the specific axes for i, sax in enumerate(sv): oax = ov[i] if sax != oax: - raise ValueError("invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c,sax,oax)) + raise ValueError( + "invalid combination of [%s] on appending data [%s] vs current table [%s]" % + (c, sax, oax)) # should never get here - raise Exception("invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c,sv,ov)) + raise Exception( + "invalid combination of [%s] on appending data [%s] vs current table [%s]" % + (c, sv, ov)) @property def nrows_expected(self): """ based on our axes, compute the expected nrows """ - return np.prod([ i.cvalues.shape[0] for i in self.index_axes ]) + return np.prod([i.cvalues.shape[0] for i in self.index_axes]) @property def is_exists(self): @@ -2331,7 +2460,7 @@ def is_exists(self): @property def storable(self): - return getattr(self.group,'table',None) + return getattr(self.group, 'table', None) @property def table(self): @@ -2353,7 +2482,7 @@ def axes(self): @property def ncols(self): """ the number of total columns in the values axes """ - return sum([ len(a.values) for a in self.values_axes ]) + return sum([len(a.values) for a in self.values_axes]) @property def is_transposed(self): @@ -2370,7 +2499,8 @@ def queryables(self): # compute the values_axes queryables return dict([(a.cname, a.kind) for a in self.index_axes] + [(self.obj_type._AXIS_NAMES[axis], None) for axis, values in self.non_index_axes] + - [(v.cname, v.kind) for v in self.values_axes if v.name in set(self.data_columns)] + [(v.cname, v.kind) + for v in self.values_axes if v.name in set(self.data_columns)] ) def index_cols(self): @@ -2383,44 +2513,62 @@ def values_cols(self): def set_info(self): """ update our table index info """ - self.attrs.info = self.info + self.attrs.info = self.info def set_attrs(self): """ set our table type & indexables """ - self.attrs.table_type = self.table_type - self.attrs.index_cols = self.index_cols() - self.attrs.values_cols = self.values_cols() + self.attrs.table_type = self.table_type + self.attrs.index_cols = self.index_cols() + self.attrs.values_cols = self.values_cols() self.attrs.non_index_axes = self.non_index_axes self.attrs.data_columns = self.data_columns - self.attrs.nan_rep = self.nan_rep - self.attrs.encoding = self.encoding - self.attrs.levels = self.levels + self.attrs.nan_rep = self.nan_rep + self.attrs.encoding = self.encoding + self.attrs.levels = self.levels self.set_info() def get_attrs(self): """ retrieve our attributes """ - self.non_index_axes = getattr(self.attrs,'non_index_axes',None) or [] - self.data_columns = getattr(self.attrs,'data_columns',None) or [] -
self.info = getattr(self.attrs,'info',None) or dict() - self.nan_rep = getattr(self.attrs,'nan_rep',None) - self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None)) - self.levels = getattr(self.attrs,'levels',None) or [] + self.non_index_axes = getattr( + self.attrs, + 'non_index_axes', + None) or [] + self.data_columns = getattr( + self.attrs, + 'data_columns', + None) or [] + self.info = getattr( + self.attrs, + 'info', + None) or dict() + self.nan_rep = getattr(self.attrs, 'nan_rep', None) + self.encoding = _ensure_encoding( + getattr(self.attrs, 'encoding', None)) + self.levels = getattr( + self.attrs, + 'levels', + None) or [] t = self.table - self.index_axes = [ a.infer(t) for a in self.indexables if a.is_an_indexable ] - self.values_axes = [ a.infer(t) for a in self.indexables if not a.is_an_indexable ] + self.index_axes = [a.infer(t) + for a in self.indexables if a.is_an_indexable] + self.values_axes = [a.infer(t) + for a in self.indexables if not a.is_an_indexable] - def validate_version(self, where = None): + def validate_version(self, where=None): """ are we trying to operate on an old version? """ if where is not None: if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: - ws = incompatibility_doc % '.'.join([ str(x) for x in self.version ]) + ws = incompatibility_doc % '.'.join( + [str(x) for x in self.version]) warnings.warn(ws, IncompatibilityWarning) def validate_min_itemsize(self, min_itemsize): """ validate the min_itemsize doesn't contain items that are not in the axes this needs data_columns to be defined """ - if min_itemsize is None: return - if not isinstance(min_itemsize, dict): return + if min_itemsize is None: + return + if not isinstance(min_itemsize, dict): + return q = self.queryables() for k, v in min_itemsize.items(): @@ -2429,7 +2577,9 @@ def validate_min_itemsize(self, min_itemsize): if k == 'values': continue if k not in q: - raise ValueError("min_itemsize has the key [%s] which is not an axis or data_column" % k) + raise ValueError( + "min_itemsize has the key [%s] which is not an axis or data_column" % + k) @property def indexables(self): @@ -2440,7 +2590,8 @@ def indexables(self): self._indexables = [] # index columns - self._indexables.extend([ IndexCol(name=name,axis=axis,pos=i) for i, (axis, name) in enumerate(self.attrs.index_cols)]) + self._indexables.extend([IndexCol(name=name, axis=axis, pos=i) + for i, (axis, name) in enumerate(self.attrs.index_cols)]) # values columns dc = set(self.data_columns) @@ -2558,15 +2709,17 @@ def validate_data_columns(self, data_columns, min_itemsize): data_columns = [] # if min_itemsize is a dict, add the keys (exclude 'values') - if isinstance(min_itemsize,dict): + if isinstance(min_itemsize, dict): existing_data_columns = set(data_columns) - data_columns.extend([ k for k in min_itemsize.keys() if k != 'values' and k not in existing_data_columns ]) + data_columns.extend( + [k for k in min_itemsize.keys() if k != 'values' and k not in existing_data_columns]) # return valid columns in the order of our axis return [c for c in data_columns if c in axis_labels] - def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): + def create_axes(self, axes, obj, validate=True, nan_rep=None, + data_columns=None, min_itemsize=None, **kwargs): """ create and return the axes legacy tables create an indexable column, indexable index, non-indexable fields @@ -2588,7 +2741,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None,
data_columns=None, axes = _AXES_MAP[type(obj)] except: raise TypeError("cannot properly create the storer for: [group->%s,value->%s]" % - (self.group._v_name,type(obj))) + (self.group._v_name, type(obj))) # map axes to numbers axes = [obj._get_axis_number(a) for a in axes] @@ -2597,17 +2750,18 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, if self.infer_axes(): existing_table = self.copy() existing_table.infer_axes() - axes = [ a.axis for a in existing_table.index_axes] - data_columns = existing_table.data_columns - nan_rep = existing_table.nan_rep + axes = [a.axis for a in existing_table.index_axes] + data_columns = existing_table.data_columns + nan_rep = existing_table.nan_rep self.encoding = existing_table.encoding - self.info = copy.copy(existing_table.info) + self.info = copy.copy(existing_table.info) else: existing_table = None # currently support on ndim-1 axes if len(axes) != self.ndim - 1: - raise ValueError("currently only support ndim-1 indexers in an AppendableTable") + raise ValueError( + "currently only support ndim-1 indexers in an AppendableTable") # create according to the new data self.non_index_axes = [] @@ -2644,8 +2798,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, self.non_index_axes.append((i, append_axis)) # set axis positions (based on the axes) - self.index_axes = [index_axes_map[a].set_pos(j).update_info(self.info) for j, - a in enumerate(axes)] + self.index_axes = [index_axes_map[a].set_pos( + j).update_info(self.info) for j, + a in enumerate(axes)] j = len(self.index_axes) # check for column conflicts @@ -2662,17 +2817,18 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, blocks = block_obj._data.blocks if len(self.non_index_axes): axis, axis_labels = self.non_index_axes[0] - data_columns = self.validate_data_columns(data_columns, min_itemsize) + data_columns = self.validate_data_columns( + data_columns, min_itemsize) if len(data_columns): blocks = block_obj.reindex_axis(Index(axis_labels) - Index( - data_columns), axis=axis, copy=False)._data.blocks + data_columns), axis=axis, copy=False)._data.blocks for c in data_columns: blocks.extend(block_obj.reindex_axis( - [c], axis=axis, copy=False)._data.blocks) + [c], axis=axis, copy=False)._data.blocks) # reorder the blocks in the same order as the existing_table if we can if existing_table is not None: - by_items = dict([ (tuple(b.items.tolist()),b) for b in blocks ]) + by_items = dict([(tuple(b.items.tolist()), b) for b in blocks]) new_blocks = [] for ea in existing_table.values_axes: items = tuple(ea.values) @@ -2680,7 +2836,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, b = by_items.pop(items) new_blocks.append(b) except: - raise ValueError("cannot match existing table structure for [%s] on appending data" % items) + raise ValueError( + "cannot match existing table structure for [%s] on appending data" % + items) blocks = new_blocks # add my values @@ -2704,7 +2862,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, existing_col = existing_table.values_axes[i] except: raise ValueError("Incompatible appended table [%s] with existing table [%s]" % - (blocks,existing_table.values_axes)) + (blocks, existing_table.values_axes)) else: existing_col = None @@ -2721,10 +2879,12 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, col.set_pos(j) self.values_axes.append(col) - except (NotImplementedError, 
ValueError, TypeError), e: + except (NotImplementedError, ValueError, TypeError) as e: raise e - except (Exception), detail: - raise Exception("cannot find the correct atom type -> [dtype->%s,items->%s] %s" % (b.dtype.name, b.items, str(detail))) + except (Exception) as detail: + raise Exception( + "cannot find the correct atom type -> [dtype->%s,items->%s] %s" % + (b.dtype.name, b.items, str(detail))) j += 1 # validate our min_itemsize @@ -2755,29 +2915,32 @@ def process_filter(field, filt): # see if the field is the name of an axis if field == axis_name: - takers = op(axis_values,filt) - return obj.ix._getitem_axis(takers,axis=axis_number) + takers = op(axis_values, filt) + return obj.ix._getitem_axis(takers, axis=axis_number) # this might be the name of a field IN an axis elif field in axis_values: # we need to filter on this dimension - values = _ensure_index(getattr(obj,field).values) - filt = _ensure_index(filt) + values = _ensure_index(getattr(obj, field).values) + filt = _ensure_index(filt) # hack until we support reversed dim flags - if isinstance(obj,DataFrame): - axis_number = 1-axis_number - takers = op(values,filt) - return obj.ix._getitem_axis(takers,axis=axis_number) + if isinstance(obj, DataFrame): + axis_number = 1 - axis_number + takers = op(values, filt) + return obj.ix._getitem_axis(takers, axis=axis_number) - raise ValueError("cannot find the field [%s] for filtering!" % field) + raise ValueError( + "cannot find the field [%s] for filtering!" % + field) obj = process_filter(field, filt) return obj - def create_description(self, complib=None, complevel=None, fletcher32=False, expectedrows=None): + def create_description( + self, complib=None, complevel=None, fletcher32=False, expectedrows=None): """ create the description of the table from the axes & values """ # expected rows estimate @@ -2811,10 +2974,15 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return False # create the selection - self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs) + self.selection = Selection( + self, + where=where, + start=start, + stop=stop, + **kwargs) return Coordinates(self.selection.select_coords(), group=self.group, where=where) - def read_column(self, column, where = None, **kwargs): + def read_column(self, column, where=None, **kwargs): """ return a single column from the table, generally only indexables are interesting """ # validate the version @@ -2825,14 +2993,17 @@ def read_column(self, column, where = None, **kwargs): return False if where is not None: - raise Exception("read_column does not currently accept a where clause") + raise Exception( + "read_column does not currently accept a where clause") # find the axes for a in self.axes: if column == a.name: if not a.is_data_indexable: - raise ValueError("column [%s] can not be extracted individually; it is not data indexable" % column) + raise ValueError( + "column [%s] can not be extracted individually; it is not data indexable" % + column) # column must be an indexable or a data column c = getattr(self.table.cols, column) @@ -2841,7 +3012,9 @@ def read_column(self, column, where = None, **kwargs): raise KeyError("column [%s] not found in the table" % column) + class WORMTable(Table): + """ a write-once read-many table: this format DOES NOT ALLOW appending to a table.
writing is a one-time operation the data are stored in a format that allows for searching the data on disk @@ -2861,6 +3034,7 @@ def write(self, **kwargs): class LegacyTable(Table): + """ an appendable table: allow append/query/delete operations to a (possibly) already existing appendable table this table ALLOWS append (but doesn't require them), and stores the data in a format @@ -2960,6 +3134,7 @@ def read(self, where=None, columns=None, **kwargs): class LegacyFrameTable(LegacyTable): + """ support the legacy frame table """ pandas_kind = u'frame_table' table_type = u'legacy_frame' @@ -2970,12 +3145,14 @@ def read(self, *args, **kwargs): class LegacyPanelTable(LegacyTable): + """ support the legacy panel table """ table_type = u'legacy_panel' obj_type = Panel class AppendableTable(LegacyTable): + """ support the new appendable table formats """ _indexables = None table_type = u'appendable' @@ -3043,7 +3220,8 @@ def write_data(self, chunksize): values = [a.take_data() for a in self.values_axes] # transpose the values so first dimension is last - values = [ v.transpose(np.roll(np.arange(v.ndim),v.ndim-1)) for v in values ] + values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) + for v in values] # write the chunks if chunksize is None: @@ -3076,15 +3254,17 @@ def write_data_chunk(self, indexes, mask, search, values): args = list(indexes) args.extend([self.dtype, mask, search, values]) rows = func(*args) - except (Exception), detail: + except (Exception) as detail: raise Exception("cannot create row-data -> %s" % str(detail)) try: if len(rows): self.table.append(rows) self.table.flush() - except (Exception), detail: - raise Exception("tables cannot write this data -> %s" % str(detail)) + except (Exception) as detail: + raise Exception( + "tables cannot write this data -> %s" % + str(detail)) def delete(self, where=None, **kwargs): @@ -3140,6 +3320,7 @@ def delete(self, where=None, **kwargs): class AppendableFrameTable(AppendableTable): + """ support the new appendable table formats """ pandas_kind = u'frame_table' table_type = u'appendable_frame' @@ -3169,10 +3350,10 @@ def read(self, where=None, columns=None, **kwargs): if self.is_transposed: values = a.cvalues index_ = cols - cols_ = Index(index,name=getattr(index,'name',None)) + cols_ = Index(index, name=getattr(index, 'name', None)) else: values = a.cvalues.T - index_ = Index(index,name=getattr(index,'name',None)) + index_ = Index(index, name=getattr(index, 'name', None)) cols_ = cols # if we have a DataIndexableCol, its shape will only be 1 dim @@ -3195,6 +3376,7 @@ def read(self, where=None, columns=None, **kwargs): class GenericTable(AppendableFrameTable): + """ a table that reads/writes the generic pytables table format """ pandas_kind = u'frame_table' table_type = u'generic_table' @@ -3207,17 +3389,19 @@ def pandas_type(self): @property def storable(self): - return getattr(self.group,'table',None) or self.group + return getattr(self.group, 'table', None) or self.group def get_attrs(self): """ retrieve our attributes """ - self.non_index_axes = [] - self.nan_rep = None - self.levels = [] + self.non_index_axes = [] + self.nan_rep = None + self.levels = [] t = self.table - self.index_axes = [ a.infer(t) for a in self.indexables if a.is_an_indexable ] - self.values_axes = [ a.infer(t) for a in self.indexables if not a.is_an_indexable ] - self.data_columns = [ a.name for a in self.values_axes ] + self.index_axes = [a.infer(t) + for a in self.indexables if a.is_an_indexable] + self.values_axes = [a.infer(t) + for a in
self.indexables if not a.is_an_indexable] + self.data_columns = [a.name for a in self.values_axes] @property def indexables(self): @@ -3227,11 +3411,15 @@ def indexables(self): d = self.description # the index column is just a simple index - self._indexables = [ GenericIndexCol(name='index',axis=0) ] + self._indexables = [GenericIndexCol(name='index', axis=0)] for i, n in enumerate(d._v_names): - dc = GenericDataIndexableCol(name = n, pos=i, values = [ n ], version = self.version) + dc = GenericDataIndexableCol( + name=n, + pos=i, + values=[n], + version=self.version) self._indexables.append(dc) return self._indexables @@ -3239,7 +3427,9 @@ def indexables(self): def write(self, **kwargs): raise NotImplementedError("cannot write on a generic table") + class AppendableMultiFrameTable(AppendableFrameTable): + """ a frame with a multi-index """ table_type = u'appendable_multiframe' obj_type = DataFrame @@ -3265,12 +3455,17 @@ def read(self, columns=None, **kwargs): for n in self.levels: if n not in columns: columns.insert(0, n) - df = super(AppendableMultiFrameTable, self).read(columns=columns, **kwargs) + df = super( + AppendableMultiFrameTable, + self).read( + columns=columns, + **kwargs) df.set_index(self.levels, inplace=True) return df class AppendablePanelTable(AppendableTable): + """ support the new appendable table formats """ table_type = u'appendable_panel' ndim = 3 @@ -3288,23 +3483,26 @@ def is_transposed(self): class AppendableNDimTable(AppendablePanelTable): + """ support the new appendable table formats """ table_type = u'appendable_ndim' ndim = 4 obj_type = Panel4D + def _convert_index(index, encoding=None): - index_name = getattr(index,'name',None) + index_name = getattr(index, 'name', None) if isinstance(index, DatetimeIndex): converted = index.asi8 return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index,'freq',None), tz=getattr(index,'tz',None), + freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None), index_name=index_name) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() - return IndexCol(index.values, 'integer', atom, freq=getattr(index,'freq',None), - index_name=index_name) + return IndexCol( + index.values, 'integer', atom, freq=getattr(index, 'freq', None), + index_name=index_name) if isinstance(index, MultiIndex): raise Exception('MultiIndex not supported here!') @@ -3316,7 +3514,7 @@ def _convert_index(index, encoding=None): if inferred_type == 'datetime64': converted = values.view('i8') return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index,'freq',None), tz=getattr(index,'tz',None), + freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None), index_name=index_name) elif inferred_type == 'datetime': converted = np.array([(time.mktime(v.timetuple()) + @@ -3335,8 +3533,9 @@ def _convert_index(index, encoding=None): converted = _convert_string_array(values, encoding) itemsize = converted.dtype.itemsize - return IndexCol(converted, 'string', _tables().StringCol(itemsize), itemsize=itemsize, - index_name=index_name) + return IndexCol( + converted, 'string', _tables().StringCol(itemsize), itemsize=itemsize, + index_name=index_name) elif inferred_type == 'unicode': atom = _tables().ObjectAtom() return IndexCol(np.asarray(values, dtype='O'), 'object', atom, @@ -3355,6 +3554,7 @@ def _convert_index(index, encoding=None): return IndexCol(np.asarray(values, dtype='O'), 'object', atom, index_name=index_name) + def _unconvert_index(data, kind, encoding=None): kind =
_ensure_decoded(kind) if kind == u'datetime64': @@ -3374,6 +3574,7 @@ def _unconvert_index(data, kind, encoding=None): raise ValueError('unrecognized index type %s' % kind) return index + def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): kind = _ensure_decoded(kind) if kind == u'datetime': @@ -3386,6 +3587,7 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): raise ValueError('unrecognized index type %s' % kind) return index + def _convert_string_array(data, encoding, itemsize=None): # encode if needed @@ -3397,19 +3599,20 @@ def _convert_string_array(data, encoding, itemsize=None): if itemsize is None: itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) - data = np.array(data,dtype="S%d" % itemsize) + data = np.array(data, dtype="S%d" % itemsize) return data + def _unconvert_string_array(data, nan_rep=None, encoding=None): """ deserialize a string array, possibly decoding """ shape = data.shape - data = np.array(data.ravel(),dtype=object) + data = np.array(data.ravel(), dtype=object) # guard against a None encoding in PY3 (because of a legacy # where the passed encoding is actually None) encoding = _ensure_encoding(encoding) if encoding is not None and len(data): - f = np.vectorize(lambda x: x.decode(encoding),otypes=[np.object]) + f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object]) data = f(data) if nan_rep is None: @@ -3418,6 +3621,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): data = lib.string_array_replace_from_nan_rep(data, nan_rep) return data.reshape(shape) + def _maybe_convert(values, val_kind, encoding): if _need_convert(val_kind): conv = _get_converter(val_kind, encoding) @@ -3425,6 +3629,7 @@ def _maybe_convert(values, val_kind, encoding): values = conv(values) return values + def _get_converter(kind, encoding): kind = _ensure_decoded(kind) if kind == 'datetime64': @@ -3432,17 +3637,20 @@ def _get_converter(kind, encoding): elif kind == 'datetime': return lib.convert_timestamps elif kind == 'string': - return lambda x: _unconvert_string_array(x,encoding=encoding) + return lambda x: _unconvert_string_array(x, encoding=encoding) else: # pragma: no cover raise ValueError('invalid kind %s' % kind) + def _need_convert(kind): kind = _ensure_decoded(kind) if kind in (u'datetime', u'datetime64', u'string'): return True return False + class Term(StringMixin): + """create a term object that holds a field, op, and value Parameters @@ -3470,10 +3678,13 @@ class Term(StringMixin): """ _ops = ['<=', '<', '>=', '>', '!=', '==', '='] - _search = re.compile("^\s*(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)\s*$" % '|'.join(_ops)) + _search = re.compile( + "^\s*(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)\s*$" % + '|'.join(_ops)) _max_selectors = 31 - def __init__(self, field, op=None, value=None, queryables=None, encoding=None): + def __init__(self, field, op=None, + value=None, queryables=None, encoding=None): self.field = None self.op = None self.value = None @@ -3538,8 +3749,10 @@ def __init__(self, field, op=None, value=None, queryables=None, encoding=None): # we have valid conditions if self.op in ['>', '>=', '<', '<=']: - if hasattr(self.value, '__iter__') and len(self.value) > 1 and not isinstance(self.value,basestring): - raise ValueError("an inequality condition cannot have multiple values [%s]" % str(self)) + if hasattr(self.value, '__iter__') and len(self.value) > 1 and not isinstance(self.value, basestring): + raise ValueError( + "an inequality condition cannot have multiple values [%s]" % + str(self)) if not
is_list_like(self.value): self.value = [self.value] @@ -3581,7 +3794,7 @@ def eval(self): if self.is_in_table: values = [self.convert_value(v) for v in self.value] else: - values = [TermValue(v,v,self.kind) for v in self.value] + values = [TermValue(v, v, self.kind) for v in self.value] # equality conditions if self.op in ['==', '!=']: @@ -3592,21 +3805,26 @@ def eval(self): else: filter_op = lambda axis, vals: axis.isin(vals) - if self.is_in_table: # too many values to create the expression? if len(values) <= self._max_selectors: - vs = [ self.generate(v) for v in values ] + vs = [self.generate(v) for v in values] self.condition = "(%s)" % ' | '.join(vs) # use a filter after reading else: - self.filter = (self.field, filter_op, Index([v.value for v in values])) + self.filter = ( + self.field, + filter_op, + Index([v.value for v in values])) else: - self.filter = (self.field, filter_op, Index([v.value for v in values])) + self.filter = ( + self.field, + filter_op, + Index([v.value for v in values])) else: @@ -3616,7 +3834,9 @@ def eval(self): else: - raise TypeError("passing a filterable condition to a non-table indexer [%s]" % str(self)) + raise TypeError( + "passing a filterable condition to a non-table indexer [%s]" % + str(self)) def convert_value(self, v): """ convert the expression that is in the term to something that is accepted by pytables """ @@ -3628,34 +3848,37 @@ def stringify(value): return value kind = _ensure_decoded(self.kind) - if kind == u'datetime64' or kind == u'datetime' : + if kind == u'datetime64' or kind == u'datetime': v = lib.Timestamp(v) if v.tz is not None: v = v.tz_convert('UTC') - return TermValue(v,v.value,kind) + return TermValue(v, v.value, kind) elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u'date': v = time.mktime(v.timetuple()) - return TermValue(v,Timestamp(v),kind) + return TermValue(v, Timestamp(v), kind) elif kind == u'integer': v = int(float(v)) - return TermValue(v,v,kind) + return TermValue(v, v, kind) elif kind == u'float': v = float(v) - return TermValue(v,v,kind) + return TermValue(v, v, kind) elif kind == u'bool': if isinstance(v, basestring): - v = not v.strip().lower() in [u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u''] + v = not v.strip().lower() in [ + u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u''] else: v = bool(v) - return TermValue(v,v,kind) + return TermValue(v, v, kind) elif not isinstance(v, basestring): v = stringify(v) - return TermValue(v,stringify(v),u'string') + return TermValue(v, stringify(v), u'string') # string quoting - return TermValue(v,stringify(v),u'string') + return TermValue(v, stringify(v), u'string') + class TermValue(object): + """ hold a term value that we use to construct a condition/filter """ def __init__(self, value, converted, kind): @@ -3672,7 +3895,9 @@ def tostring(self, encoding): return '"%s"' % self.converted return self.converted + class Coordinates(object): + """ holds a returned coordinates list, useful to select the same rows from different tables coordinates : holds the array of coordinates @@ -3692,7 +3917,9 @@ def __getitem__(self, key): """ return a new coordinates object, sliced by the key """ return Coordinates(self.values[key], self.group, self.where) + class Selection(object): + """ Carries out a selection operation on a tables.Table object.
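Term, defined above, splits a query string of the form field op value with its _search regex; because the alternation lists longer operators first, '<=' is tried before '<'. A small illustrative sketch of that match, assuming the same operator list as _ops:

import re

ops = ['<=', '<', '>=', '>', '!=', '==', '=']
# same shape as Term._search: named groups for the field, operator, and value
search = re.compile(r"^\s*(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)\s*$" % '|'.join(ops))
m = search.match('index <= 20121114')
assert m.groupdict() == {'field': 'index', 'op': '<=', 'value': '20121114'}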
@@ -3703,6 +3930,7 @@ class Selection(object): start, stop: indices to start and/or stop selection """ + def __init__(self, table, where=None, start=None, stop=None, **kwargs): self.table = table self.where = where @@ -3720,9 +3948,10 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): # create the numexpr & the filter if self.terms: - terms = [ t for t in self.terms if t.condition is not None ] + terms = [t for t in self.terms if t.condition is not None] if len(terms): - self.condition = "(%s)" % ' & '.join([ t.condition for t in terms ]) + self.condition = "(%s)" % ' & '.join( + [t.condition for t in terms]) self.filter = [] for t in self.terms: if t.filter is not None: @@ -3767,13 +3996,13 @@ def select_coords(self): return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort=True) -### utilities ### +# utilities ### -def timeit(key,df,fn=None,remove=True,**kwargs): +def timeit(key, df, fn=None, remove=True, **kwargs): if fn is None: fn = 'timeit.h5' - store = HDFStore(fn,mode='w') - store.append(key,df,**kwargs) + store = HDFStore(fn, mode='w') + store.append(key, df, **kwargs) store.close() if remove: From c665a85b6f7422403acf684d086141d0d701f952 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:32:24 -0400 Subject: [PATCH 26/48] DOC: reference future enhancingperf.eval section --- pandas/computation/eval.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index e08e0f28d7877..1a681e37d6130 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -48,8 +48,10 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, Notes ----- - The benefits of using ``eval`` are that very large frames that are terms in - long expressions are sped up, sometimes by as much as 10x. + * The benefits of using ``eval`` are that very large frames that are terms in + long expressions are sped up, sometimes by as much as 10x. + + See :ref:`Enhancing performance <enhancingperf.eval>` for more details. """ # make sure we're passed a valid engine if not engine in _engines: From cb27934a41ebcd1085ac08b587f44202103c3413 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:33:45 -0400 Subject: [PATCH 27/48] CLN/DOC: clean up docstrings in pytables --- pandas/io/pytables.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 013e596320250..1cb465cbdf16a --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -167,9 +167,12 @@ def get_store(path, mode='a', complevel=None, complib=None, Examples -------- + >>> from pandas import DataFrame + >>> from numpy.random import randn + >>> bar = DataFrame(randn(10, 4)) >>> with get_store('test.h5') as store: - >>> store['foo'] = bar # write to HDF5 - >>> bar = store['foo'] # retrieve + ... store['foo'] = bar # write to HDF5 + ...
bar = store['foo'] # retrieve """ store = None try: @@ -262,6 +265,9 @@ class HDFStore(object): Examples -------- + >>> from pandas import DataFrame + >>> from numpy.random import randn + >>> bar = DataFrame(randn(10, 4)) >>> store = HDFStore('test.h5') >>> store['foo'] = bar # write to HDF5 >>> bar = store['foo'] # retrieve From 63ba37d0943607f679fd9d3b4715ba38e8ae9739 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:34:20 -0400 Subject: [PATCH 28/48] CLN: actually pass fletcher32 in get_store --- pandas/io/pytables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1cb465cbdf16a..21da0d58b67f7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -177,7 +177,7 @@ def get_store(path, mode='a', complevel=None, complib=None, store = None try: store = HDFStore(path, mode=mode, complevel=complevel, - complib=complib, fletcher32=False) + complib=complib, fletcher32=fletcher32) yield store finally: if store is not None: From dcde5901f7975c4aac046d4f3c0b7c6629bc4f15 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:35:32 -0400 Subject: [PATCH 29/48] CLN: remove unused variables --- pandas/io/pytables.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 21da0d58b67f7..2ac4e19a7eb7b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -278,7 +278,7 @@ class HDFStore(object): def __init__(self, path, mode=None, complevel=None, complib=None, fletcher32=False): try: - import tables as _ + import tables except ImportError: # pragma: no cover raise Exception('HDFStore requires PyTables') @@ -576,7 +576,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=start, stop=stop) nrows = len(c) - except (Exception) as detail: + except Exception: raise ValueError("invalid selector [%s]" % selector) def func(_start, _stop): @@ -1235,7 +1235,6 @@ def validate_col(self, itemsize=None): """ validate this column: return the compared against itemsize """ # validate this column for string truncation (or reset to the max size) - dtype = getattr(self, 'dtype', None) if _ensure_decoded(self.kind) == u'string': c = self.col @@ -2252,7 +2251,6 @@ def read(self, **kwargs): sdict = {} for name in items: key = 'sparse_frame_%s' % name - node = getattr(self.group, key) s = SparseFrameStorer(self.parent, getattr(self.group, key)) s.infer_axes() sdict[name] = s.read() @@ -2592,7 +2590,6 @@ def indexables(self): """ create/cache the indexables if they don't exist """ if self._indexables is None: - d = self.description self._indexables = [] # index columns From 3c4e2b3fa40df21ba477693ce647542f156f1e92 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:41:02 -0400 Subject: [PATCH 30/48] CLN: more pep8 and get rid of most raise Exception clauses --- pandas/io/pytables.py | 83 ++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2ac4e19a7eb7b..0f84884d51340 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -280,7 +280,7 @@ def __init__(self, path, mode=None, complevel=None, complib=None, try: import tables except ImportError: # pragma: no cover - raise Exception('HDFStore requires PyTables') + raise ImportError('HDFStore requires PyTables') self._path = path if mode is None: @@ -516,7 +516,8 @@ def select_column(self, key, column, **kwargs): return 
self.get_storer(key).read_column(column=column, **kwargs) def select_as_multiple(self, keys, where=None, selector=None, columns=None, - start=None, stop=None, iterator=False, chunksize=None, auto_close=False, **kwargs): + start=None, stop=None, iterator=False, + chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas objects from multiple tables Parameters @@ -538,13 +539,15 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, basestring): - return self.select(key=keys, where=where, columns=columns, start=start, stop=stop, iterator=iterator, chunksize=chunksize, **kwargs) + return self.select(key=keys, where=where, columns=columns, + start=start, stop=stop, iterator=iterator, + chunksize=chunksize, **kwargs) if not isinstance(keys, (list, tuple)): - raise Exception("keys must be a list/tuple") + raise TypeError("keys must be a list/tuple") - if len(keys) == 0: - raise Exception("keys must have a non-zero length") + if not len(keys): + raise ValueError("keys must have a non-zero length") if selector is None: selector = keys[0] @@ -686,13 +689,13 @@ def append(self, key, value, columns=None, **kwargs): data in the table, so be careful """ if columns is not None: - raise Exception( - "columns is not a supported keyword in append, try data_columns") + raise TypeError("columns is not a supported keyword in append, " + "try data_columns") self._write_to_group(key, value, table=True, append=True, **kwargs) - def append_to_multiple( - self, d, value, selector, data_columns=None, axes=None, **kwargs): + def append_to_multiple(self, d, value, selector, data_columns=None, + axes=None, **kwargs): """ Append to multiple tables Parameters @@ -711,8 +714,9 @@ def append_to_multiple( """ if axes is not None: - raise Exception( - "axes is currently not accepted as a paremter to append_to_multiple; you can create the tables indepdently instead") + raise TypeError("axes is currently not accepted as a parameter to" + " append_to_multiple; you can create the " + "tables independently instead") if not isinstance(d, dict): raise ValueError( @@ -770,7 +774,7 @@ def create_table_index(self, key, **kwargs): # version requirements _tables() if not _table_supports_index: - raise Exception("PyTables >= 2.3 is required for table indexing") + raise ValueError("PyTables >= 2.3 is required for table indexing") s = self.get_storer(key) if s is None: @@ -1005,8 +1009,8 @@ class TableIterator(object): kwargs : the passed kwargs """ - def __init__(self, store, func, nrows, start=None, - stop=None, chunksize=None, auto_close=False): + def __init__(self, store, func, nrows, start=None, stop=None, + chunksize=None, auto_close=False): self.store = store self.func = func self.nrows = nrows or 0 @@ -1928,7 +1932,7 @@ def read_index(self, key): _, index = self.read_index_node(getattr(self.group, key)) return index else: # pragma: no cover - raise Exception('unrecognized index variety: %s' % variety) + raise TypeError('unrecognized index variety: %s' % variety) def write_index(self, key, index): if isinstance(index, MultiIndex): @@ -2448,7 +2452,7 @@ def validate(self, other): (c, sax, oax)) # should never get here - raise Exception( + raise ValueError( "invalid combination of [%s] on appending data [%s] vs current table [%s]" % (c, sv, ov)) @@ -2884,10 +2888,11 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, self.values_axes.append(col) except (NotImplementedError, ValueError, TypeError) as e: raise e -
except (Exception) as detail: - raise Exception( - "cannot find the correct atom type -> [dtype->%s,items->%s] %s" % - (b.dtype.name, b.items, str(detail))) + except Exception as detail: + raise TypeError("cannot find the correct atom type -> " + "[dtype->%s,items->%s] %s" % (b.dtype.name, + b.items, + str(detail))) j += 1 # validate our min_itemsize @@ -2996,8 +3001,8 @@ def read_column(self, column, where=None, **kwargs): return False if where is not None: - raise Exception( - "read_column does not currently accept a where clause") + raise TypeError("read_column does not currently accept a where " + "clause") # find the axes for a in self.axes: @@ -3052,7 +3057,7 @@ class LegacyTable(Table): ndim = 3 def write(self, **kwargs): - raise Exception("write operations are not allowed on legacy tables!") + raise TypeError("write operations are not allowed on legacy tables!") def read(self, where=None, columns=None, **kwargs): """ we have n indexable columns, with an arbitrary number of data axes """ @@ -3257,17 +3262,15 @@ def write_data_chunk(self, indexes, mask, search, values): args = list(indexes) args.extend([self.dtype, mask, search, values]) rows = func(*args) - except (Exception) as detail: - raise Exception("cannot create row-data -> %s" % str(detail)) + except Exception as detail: + raise Exception("cannot create row-data -> %s" % detail) try: if len(rows): self.table.append(rows) self.table.flush() - except (Exception) as detail: - raise Exception( - "tables cannot write this data -> %s" % - str(detail)) + except Exception as detail: + raise TypeError("tables cannot write this data -> %s" % detail) def delete(self, where=None, **kwargs): @@ -3499,16 +3502,15 @@ def _convert_index(index, encoding=None): if isinstance(index, DatetimeIndex): converted = index.asi8 return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None), - index_name=index_name) + freq=getattr(index, 'freq', None), + tz=getattr(index, 'tz', None), index_name=index_name) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() - return IndexCol( - index.values, 'integer', atom, freq=getattr(index, 'freq', None), - index_name=index_name) + return IndexCol(index.values, 'integer', atom, freq=getattr(index, + 'freq', None), index_name=index_name) if isinstance(index, MultiIndex): - raise Exception('MultiIndex not supported here!') + raise TypeError('MultiIndex not supported here!') inferred_type = lib.infer_dtype(index) @@ -3517,8 +3519,8 @@ def _convert_index(index, encoding=None): if inferred_type == 'datetime64': converted = values.view('i8') return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None), - index_name=index_name) + freq=getattr(index, 'freq', None), + tz=getattr(index, 'tz', None), index_name=index_name) elif inferred_type == 'datetime': converted = np.array([(time.mktime(v.timetuple()) + v.microsecond / 1E6) for v in values], @@ -3536,9 +3538,8 @@ def _convert_index(index, encoding=None): converted = _convert_string_array(values, encoding) itemsize = converted.dtype.itemsize - return IndexCol( - converted, 'string', _tables().StringCol(itemsize), itemsize=itemsize, - index_name=index_name) + return IndexCol(converted, 'string', _tables().StringCol(itemsize), + itemsize=itemsize, index_name=index_name) elif inferred_type == 'unicode': atom = _tables().ObjectAtom() return IndexCol(np.asarray(values, dtype='O'), 'object', atom, From 
226c7869742582cf62af604dcab2237cfe1750c4 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 18:52:36 -0400 Subject: [PATCH 31/48] CLN: change NameError to match python --- pandas/computation/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 24000b27a033a..a35d80568b482 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -31,7 +31,7 @@ def _resolve_name(env, key): if not isinstance(key, basestring): return key - raise NameError('{0!r} is undefined'.format(key)) + raise NameError('name {0!r} is not defined'.format(key)) return res From 79871d8b9c24d7b52d8ab86897b43c2a5481c89d Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 5 Jul 2013 11:26:31 -0400 Subject: [PATCH 32/48] API: expose the Expr object to top level pandas --- pandas/__init__.py | 2 +- pandas/computation/api.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index bec0877b13bb8..5315fd770e796 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -29,7 +29,7 @@ from pandas.stats.api import * from pandas.tseries.api import * from pandas.io.api import * -from pandas.computation.api import eval +from pandas.computation.api import * from pandas.util.testing import debug diff --git a/pandas/computation/api.py b/pandas/computation/api.py index 86f72902a52c8..db8269a497768 100644 --- a/pandas/computation/api.py +++ b/pandas/computation/api.py @@ -1 +1,2 @@ from pandas.computation.eval import eval +from pandas.computation.expr import Expr From 84fdb453fb497ec73ae70cd059840d3b087fa828 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 5 Jul 2013 11:27:02 -0400 Subject: [PATCH 33/48] CLN/TST: fail with a NotImplementedError on and or not --- pandas/computation/expr.py | 4 ++++ pandas/computation/tests/test_eval.py | 33 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 60fea6e935070..666eb891f9929 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -77,6 +77,8 @@ def visit_BinOp(self, node): return op(left, right) def visit_UnaryOp(self, node): + if isinstance(node.op, ast.Not): + raise NotImplementedError("not operator not yet supported") op = self.visit(node.op) return op(self.visit(node.operand)) @@ -107,6 +109,8 @@ def visit_Call(self, node): def visit_Attribute(self, node): raise NotImplementedError("attribute access is not yet supported") + def visit_BoolOp(self, node): + raise NotImplementedError("boolean operators are not yet supported") class Expr(StringMixin): """Expr object""" diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 0a1356915523a..8e185f5b9772b 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -610,6 +610,39 @@ def test_is_expr(): check_is_expr(engine) +def check_not_fails(engine): + x = True + assert_raises(NotImplementedError, pd.eval, 'not x', engine=engine, + local_dict={'x': x}) + + +def test_not_fails(): + for engine in _engines: + check_not_fails(engine) + + +def check_and_fails(engine): + x, y = False, True + assert_raises(NotImplementedError, pd.eval, 'x and y', engine=engine, + local_dict={'x': x, 'y': y}) + + +def test_and_fails(): + for engine in _engines: + check_and_fails(engine) + + +def check_or_fails(engine): + x, y = True, False + assert_raises(NotImplementedError, pd.eval, 'x or y', engine=engine, + 
local_dict={'x': x, 'y': y}) + + +def test_or_fails(): for engine in _engines: check_or_fails(engine) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 4d9f9a7805efa54f08af7719207703e7722bb59d Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 5 Jul 2013 11:27:17 -0400 Subject: [PATCH 34/48] CLN: generalize operator/expression printing --- pandas/computation/ops.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index a35d80568b482..0d67c56ba472a --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -49,13 +49,7 @@ def _update_name(env, key, value): raise NameError('{0!r} is undefined'.format(key)) -class NamedObjectMixin(object): - @property - def typename(self): - return com.pprint_thing(self.__class__.__name__) - - -class Term(StringMixin, NamedObjectMixin): +class Term(StringMixin): def __init__(self, name, env): self.name = name self.value = _resolve_name(env, name) @@ -79,7 +73,11 @@ def __init__(self, value, env): super(Constant, self).__init__(value, env) -class Op(NamedObjectMixin, StringMixin): +def _print_operand(opr): + return opr.name if is_term(opr) else unicode(opr) + + +class Op(StringMixin): """Hold an operator of unknown arity """ def __init__(self, op, operands): @@ -90,12 +88,11 @@ def __iter__(self): return iter(self.operands) def __unicode__(self): - op = 'op={1!r}'.format(self.op) - operands = ', '.join('opr_{i}={opr}'.format(i=i, opr=opr) - for i, opr in enumerate(self.operands)) - return com.pprint_thing('{0}({op}, ' - '{operands})'.format(self.name, op=op, - operands=operands)) + """Print a generic n-ary operator and its operands""" + # recurse over the operands + parened = ('({0})'.format(_print_operand(opr)) + for opr in self.operands) + return com.pprint_thing(' {0} '.format(self.op).join(parened)) _cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=' _cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne @@ -161,10 +158,6 @@ def __init__(self, op, lhs, rhs): raise BinaryOperatorError('Invalid binary operator {0}, valid' ' operators are {1}'.format(op, keys)) - def __unicode__(self): - return com.pprint_thing('({0}) {1} ({2})'.format(self.lhs, self.op, - self.rhs)) - def __call__(self, env): # handle truediv if self.op == '/' and env.locals['truediv']: From a0d2ce0f458f18d43e87e9971d2625457e7c1814 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 5 Jul 2013 11:34:20 -0400 Subject: [PATCH 35/48] CLN: clean up testing and expr --- pandas/computation/expr.py | 1 + pandas/computation/tests/test_eval.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 666eb891f9929..6d33f6ac50a0d --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -112,6 +112,7 @@ def visit_Attribute(self, node): def visit_BoolOp(self, node): raise NotImplementedError("boolean operators are not yet supported") + class Expr(StringMixin): """Expr object""" def __init__(self, expr, engine='numexpr', env=None, truediv=True): diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 8e185f5b9772b..fc1cccf320201 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -42,7 +42,7 @@ def fractional(x): def hasfractional(x): - return np.any(fractional(x) != 0.0) + return np.any(fractional(x)) def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2): From
317a1530b1e46a61fb4c97388108fdd7e43ece77 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 6 Jul 2013 16:25:20 -0400 Subject: [PATCH 36/48] ENH: add modest type inference --- pandas/computation/align.py | 17 +++++++++-------- pandas/computation/engines.py | 3 ++- pandas/computation/ops.py | 31 +++++++++++++++++++++++++------ 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/pandas/computation/align.py b/pandas/computation/align.py index f2bf11d41e185..529fe84fd06a7 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -163,6 +163,7 @@ def _filter_terms(flat): def _align(terms, env): + # flatten the parse tree (a nested list) terms = list(flatten(terms)) @@ -181,7 +182,7 @@ def _align(terms, env): return typ, axes -def _reconstruct_object(typ, obj, axes): +def _reconstruct_object(typ, obj, axes, dtype): """Reconstruct an object given its type, raw value, and possibly empty (None) axes. @@ -200,20 +201,20 @@ def _reconstruct_object(typ, obj, axes): An object of type ``typ`` with the value `obj` and possible axes `axes`. """ + #import ipdb; ipdb.set_trace() try: - # handle numpy dtypes typ = typ.type except AttributeError: pass if (not isinstance(typ, partial) and issubclass(typ, pd.core.generic.PandasObject)): - return typ(obj, **axes) + return typ(obj, dtype=dtype, **axes) - ret_value = typ(obj) + ret_value = typ(obj).astype(dtype) try: - return ret_value.item() - except (AttributeError, ValueError): - return ret_value - + ret = ret_value.item() + except ValueError: + ret = ret_value + return ret diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index db6beb87da3a5..7f500dccb825b 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -25,7 +25,8 @@ def evaluate(self): self.expr.env) res = self._evaluate(self.expr.env) - return _reconstruct_object(self.result_type, res, self.aligned_axes) + return _reconstruct_object(self.result_type, res, self.aligned_axes, + self.expr.terms.return_type) @property def _is_aligned(self): diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 0d67c56ba472a..ca5f6d4872a72 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -4,6 +4,7 @@ from pandas.util.py3compat import PY3 import pandas.core.common as com from pandas.core.base import StringMixin +from pandas.computation.common import flatten _reductions = 'sum', 'prod' @@ -46,15 +47,25 @@ def _update_name(env, key, value): del env.globals[key] env.globals[key] = value except KeyError: - raise NameError('{0!r} is undefined'.format(key)) + raise NameError('name {0!r} is not defined'.format(key)) class Term(StringMixin): def __init__(self, name, env): self.name = name - self.value = _resolve_name(env, name) self.env = env - self.type = type(self.value) + self.value = _resolve_name(self.env, self.name) + + try: + # ndframe potentially very slow for large, mixed dtype frames + self.type = self.value.values.dtype + except AttributeError: + try: + # ndarray + self.type = self.value.dtype + except AttributeError: + # scalar + self.type = type(self.value) def __unicode__(self): return com.pprint_thing(self.name) @@ -88,15 +99,23 @@ def __iter__(self): return iter(self.operands) def __unicode__(self): - """Print a generic n-ary operator and its operands""" + """Print a generic n-ary operator and its operands using infix + notation""" # recurse over the operands parened = ('({0})'.format(_print_operand(opr)) for opr in self.operands) return com.pprint_thing(' {0} 
'.format(self.op).join(parened)) + @property + def return_type(self): + # clobber types to bool if the op is a boolean operator + if self.op in (_cmp_ops_syms + _bool_ops_syms): + return np.bool_ + return np.result_type(*(term.type for term in flatten(self))) + -_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=' -_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne +_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', '=' +_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, op.eq _cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) _bool_ops_syms = '&', '|' From 401bc288cc7e145a8a4076376204d59793e94b02 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 6 Jul 2013 16:25:58 -0400 Subject: [PATCH 37/48] ENH: rewrite assignment as equal comparison --- pandas/computation/expr.py | 47 ++++++++++++++++++++++++--- pandas/computation/tests/test_eval.py | 4 ++- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 6d33f6ac50a0d..9a9cd226278bc 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -1,7 +1,12 @@ import ast import sys +import itertools +import tokenize +import re +from cStringIO import StringIO from functools import partial + from pandas.core.base import StringMixin from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms @@ -26,12 +31,38 @@ class ExprParserError(Exception): pass +def _rewrite_assign(source): + res = [] + g = tokenize.generate_tokens(StringIO(source).readline) + for toknum, tokval, _, _, _ in g: + res.append((toknum, '==' if tokval == '=' else tokval)) + return tokenize.untokenize(res) + + +def _parenthesize_booleans(source, ops='|&'): + res = source + for op in ops: + terms = res.split(op) + + t = [] + for term in terms: + t.append('({0})'.format(term)) + + res = op.join(t) + return res + + +def preparse(source): + return _parenthesize_booleans(_rewrite_assign(source)) + + class ExprVisitor(ast.NodeVisitor): """Custom ast walker """ bin_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms - bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', 'BitOr', - 'Add', 'Sub', 'Mult', 'Div', 'Pow', 'FloorDiv', 'Mod') + bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', None, + 'BitAnd', 'BitOr', 'Add', 'Sub', 'Mult', 'Div', 'Pow', + 'FloorDiv', 'Mod') bin_op_nodes_map = dict(zip(bin_ops, bin_op_nodes)) unary_ops = _unary_ops_syms @@ -39,7 +70,7 @@ class ExprVisitor(ast.NodeVisitor): unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) def __init__(self, env): - for bin_op in self.bin_ops: + for bin_op in itertools.ifilter(lambda x: x is not None, self.bin_ops): setattr(self, 'visit_{0}'.format(self.bin_op_nodes_map[bin_op]), lambda node, bin_op=bin_op: partial(BinOp, bin_op)) @@ -54,7 +85,7 @@ def visit(self, node): raise TypeError('"node" must be an AST node or a string, you' ' passed a(n) {0}'.format(node.__class__)) if isinstance(node, basestring): - node = ast.fix_missing_locations(ast.parse(node)) + node = ast.fix_missing_locations(ast.parse(preparse(node))) return super(ExprVisitor, self).visit(node) def visit_Module(self, node): @@ -62,7 +93,7 @@ def visit_Module(self, node): raise ExprParserError('only a single expression is allowed') expr = node.body[0] - if not isinstance(expr, ast.Expr): + if not isinstance(expr, (ast.Expr, ast.Assign)): raise SyntaxError('only expressions are allowed') return self.visit(expr) @@ -95,6 +126,12 @@ def visit_Compare(self, 
node): raise ExprParserError('chained comparisons not supported') return self.visit(ops[0])(self.visit(node.left), self.visit(comps[0])) + def visit_Assign(self, node): + cmpr = ast.copy_location(ast.Compare(ops=[ast.Eq()], + left=node.targets[0], + comparators=[node.value]), node) + return self.visit(cmpr) + def visit_Call(self, node): if not isinstance(node.func, ast.Name): raise TypeError("Only named functions are supported") diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index fc1cccf320201..6ec630b80614d 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -52,7 +52,9 @@ def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2): env = Scope() typ, axes = _align_core((Term('lhs', env), Term('rhs', env))) lhs, rhs = env.locals['lhs'], env.locals['rhs'] - return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes) + ret_type = np.result_type(lhs, rhs) + return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes, + ret_type) def _eval_single_bin(lhs, cmp1, rhs, has_neg_frac): From 22dedcb87392276c057882c5e1c5e71f06c2e83d Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 6 Jul 2013 15:52:29 -0400 Subject: [PATCH 38/48] ENH: initial commit for adding Expr based terms for pytables support --- pandas/computation/engines.py | 3 +- pandas/computation/expr.py | 51 +++--- pandas/computation/ops.py | 56 +++--- pandas/computation/pytables.py | 281 +++++++++++++++++++++++++++++++ pandas/io/pytables.py | 276 +----------------------------- pandas/io/tests/test_pytables.py | 39 +++++ 6 files changed, 389 insertions(+), 317 deletions(-) create mode 100644 pandas/computation/pytables.py diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 7f500dccb825b..c28e88bdef887 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -76,5 +76,4 @@ def evaluate(self): def _evaluate(self, env): pass - -_engines = {'numexpr': NumExprEngine, 'python': PythonEngine} +_engines = {'numexpr': NumExprEngine, 'python': PythonEngine } diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 9a9cd226278bc..10ca7e1083983 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -13,7 +13,6 @@ from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms from pandas.computation.ops import Term, Constant - class Scope(object): __slots__ = 'globals', 'locals' @@ -26,7 +25,6 @@ def __init__(self, gbls=None, lcls=None, frame_level=1): finally: del frame - class ExprParserError(Exception): pass @@ -80,15 +78,28 @@ def __init__(self, env): lambda node, unary_op=unary_op: partial(UnaryOp, unary_op)) self.env = env - def visit(self, node): + def generic_visit(self, node, **kwargs): + """Called if no explicit visitor function exists for a node.""" + for field, value in iter_fields(node): + if isinstance(value, list): + for item in value: + if isinstance(item, AST): + self.visit(item, **kwargs) + elif isinstance(value, AST): + self.visit(value, **kwargs) + + def visit(self, node, **kwargs): if not (isinstance(node, ast.AST) or isinstance(node, basestring)): raise TypeError('"node" must be an AST node or a string, you' ' passed a(n) {0}'.format(node.__class__)) if isinstance(node, basestring): node = ast.fix_missing_locations(ast.parse(preparse(node))) - return super(ExprVisitor, self).visit(node) - def visit_Module(self, node): + method = 'visit_' + node.__class__.__name__ + visitor = getattr(self, method, self.generic_visit) + return 
visitor(node, **kwargs) + + def visit_Module(self, node, **kwargs): if len(node.body) != 1: raise ExprParserError('only a single expression is allowed') @@ -96,43 +107,43 @@ def visit_Module(self, node): if not isinstance(expr, (ast.Expr, ast.Assign)): raise SyntaxError('only expressions are allowed') - return self.visit(expr) + return self.visit(expr, **kwargs) - def visit_Expr(self, node): - return self.visit(node.value) + def visit_Expr(self, node, **kwargs): + return self.visit(node.value, **kwargs) - def visit_BinOp(self, node): + def visit_BinOp(self, node, **kwargs): op = self.visit(node.op) - left = self.visit(node.left) - right = self.visit(node.right) + left = self.visit(node.left,side='left') + right = self.visit(node.right,side='right') return op(left, right) - def visit_UnaryOp(self, node): + def visit_UnaryOp(self, node, **kwargs): if isinstance(node.op, ast.Not): raise NotImplementedError("not operator not yet supported") op = self.visit(node.op) return op(self.visit(node.operand)) - def visit_Name(self, node): + def visit_Name(self, node, **kwargs): return Term(node.id, self.env) - def visit_Num(self, node): + def visit_Num(self, node, **kwargs): return Constant(node.n, self.env) - def visit_Compare(self, node): + def visit_Compare(self, node, **kwargs): ops = node.ops comps = node.comparators if len(ops) != 1: raise ExprParserError('chained comparisons not supported') - return self.visit(ops[0])(self.visit(node.left), self.visit(comps[0])) + return self.visit(ops[0])(self.visit(node.left,side='left'), self.visit(comps[0],side='right')) - def visit_Assign(self, node): + def visit_Assign(self, node, **kwargs): cmpr = ast.copy_location(ast.Compare(ops=[ast.Eq()], left=node.targets[0], comparators=[node.value]), node) return self.visit(cmpr) - def visit_Call(self, node): + def visit_Call(self, node, **kwargs): if not isinstance(node.func, ast.Name): raise TypeError("Only named functions are supported") @@ -143,10 +154,10 @@ def visit_Call(self, node): raise NotImplementedError("function calls not yet supported") - def visit_Attribute(self, node): + def visit_Attribute(self, node, **kwargs): raise NotImplementedError("attribute access is not yet supported") - def visit_BoolOp(self, node): + def visit_BoolOp(self, node, **kwargs): raise NotImplementedError("boolean operators are not yet supported") diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index ca5f6d4872a72..76e5497d48175 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -25,36 +25,12 @@ class BinaryOperatorError(OperatorError): pass -def _resolve_name(env, key): - res = env.locals.get(key, env.globals.get(key)) - - if res is None: - if not isinstance(key, basestring): - return key - - raise NameError('name {0!r} is not defined'.format(key)) - - return res - - -def _update_name(env, key, value): - if isinstance(key, basestring): - try: - del env.locals[key] - env.locals[key] = value - except KeyError: - try: - del env.globals[key] - env.globals[key] = value - except KeyError: - raise NameError('name {0!r} is not defined'.format(key)) - - class Term(StringMixin): - def __init__(self, name, env): + def __init__(self, name, env, side=None): self.name = name self.env = env - self.value = _resolve_name(self.env, self.name) + self.side = side + self.value = self._resolve_name() try: # ndframe potentially very slow for large, mixed dtype frames @@ -70,8 +46,32 @@ def __init__(self, name, env): def __unicode__(self): return com.pprint_thing(self.name) + def _resolve_name(self): + env = 
self.env + key = self.name + res = env.locals.get(key, env.globals.get(key)) + + if res is None: + if not isinstance(key, basestring): + return key + + raise NameError('name {0!r} is not defined'.format(key)) + return res + def update(self, value): - _update_name(self.env, self.name, value) + env = self.env + key = self.name + if isinstance(key, basestring): + try: + del env.locals[key] + env.locals[key] = value + except KeyError: + try: + del env.globals[key] + env.globals[key] = value + except KeyError: + raise NameError('{0!r} is undefined'.format(key)) + self.value = value @property diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py new file mode 100644 index 0000000000000..7f070adff7054 --- /dev/null +++ b/pandas/computation/pytables.py @@ -0,0 +1,281 @@ +import sys +import re +import ast +from functools import partial + +from pandas.computation import expr, ops +from pandas.computation.ops import is_term +from pandas.computation.expr import ExprParserError + +class Scope(expr.Scope): + __slots__ = 'globals', 'locals', 'queryables' + + def __init__(self, gbls=None, lcls=None, queryables=None, frame_level=1): + super(Scope, self).__init__(gbls=gbls, lcls=lcls, frame_level=frame_level) + self.queryables = queryables or dict() + +class Term(ops.Term): + + def __init__(self, name, env, side=None): + super(Term, self).__init__(name, env, side=side) + + def _resolve_name(self): + + # must be a queryable + if self.side == 'left': + if self.name not in self.env.queryables: + raise NameError('name {0!r} is not defined'.format(self.name)) + return self.name + + # resolve the rhs (and allow to be None) + return self.env.locals.get(self.name, self.env.globals.get(self.name,self.name)) + +def format_value(q, lhs, v): + """ given a queryable, a lhs name and value, return a formatted value """ + return v + +class BinOp(ops.BinOp): + + def __call__(self, q): + left, right = self.lhs, self.rhs + + # base cases + if is_term(left) and is_term(right): + res = "(%s %s %s)" % (left.value,self.op,format_value(q, left.value, right.value)) + elif not is_term(left) and is_term(right): + res = "(%s %s %s)" % (left(q),self.op,right.value) + elif is_term(left) and not is_term(right): + res = "(%s %s %s)" % (left.value,self.op,right(q)) + elif not (is_term(left) or is_term(right)): + res = "(%s %s %s)" % (left(q),self.op,right(q)) + + return res + +class UnaryOp(ops.UnaryOp): + def __call__(self, q): + operand = self.operand + v = operand.value if is_term(operand) else operand + return "%s (%s)" % (operand,v) + +class ExprVisitor(expr.ExprVisitor): + + bin_ops = '>', '<', '>=', '<=', '==', '!=', '&', '|' + bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', 'BitOr') + bin_op_nodes_map = dict(zip(bin_ops, bin_op_nodes)) + + unary_ops = ['~'] + unary_op_nodes = 'Invert' + unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + + def __init__(self, env): + for bin_op in self.bin_ops: + setattr(self, 'visit_{0}'.format(self.bin_op_nodes_map[bin_op]), + lambda node, bin_op=bin_op: partial(BinOp, bin_op)) + + for unary_op in self.unary_ops: + setattr(self, + 'visit_{0}'.format(self.unary_op_nodes_map[unary_op]), + lambda node, unary_op=unary_op: partial(UnaryOp, unary_op)) + self.env = env + + def visit_Module(self, node, **kwargs): + if len(node.body) != 1: + raise ExprParserError('only a single expression is allowed') + + body = node.body[0] + return self.visit(body) + + def visit_Compare(self, node, **kwargs): + ops = node.ops + comps = node.comparators + for op, comp in 
zip(ops, comps): + node = self.visit(op)(self.visit(node.left,side='left'), self.visit(comp,side='right')) + return node + + def visit_Name(self, node, side=None, **kwargs): + return Term(node.id, self.env, side=side) + +class Expr(expr.Expr): + + """ hold a pytables like expression, comprised of possibly multiple 'terms' + + Parameters + ---------- + field : dict, string term expression, or the field to operate (must be a valid index/column type of DataFrame/Panel) + queryables : a kinds map (dict of column name -> kind), or None i column is non-indexable + encoding : an encoding that will encode the query terms + + Returns + ------- + an Expr object + + Examples + -------- + """ + + _max_selectors = 31 + + def __init__(self, expression, queryables=None, encoding=None): + self.expr = self.pre_parse(expression) + self.env = Scope(queryables=queryables,frame_level=2) + self._visitor = ExprVisitor(self.env) + self.terms = self.parse() + self.encoding = encoding + self.condition = None + self.filter = None + + def pre_parse(self, expression): + """ transform = to == """ + expression = re.sub("=+","==",expression) + return expression + + def evaluate(self): + """ create and return the numexpr condition and filter """ + import pdb; pdb.set_trace() + terms = [] + filter = [] + + self.terms(self.env) + #for t in self.terms: + + terms = [t for t in self.terms if t.condition is not None] + if len(terms): + self.condition = "(%s)" % ' & '.join( + [t.condition for t in terms]) + self.filter = [] + for t in self.terms: + if t.filter is not None: + self.filter.append(t.filter) + + + @property + def is_valid(self): + """ return True if this is a valid field """ + return self.field in self.q + + @property + def is_in_table(self): + """ return True if this is a valid column name for generation (e.g. an actual column in the table) """ + return self.q.get(self.field) is not None + + @property + def kind(self): + """ the kind of my field """ + return self.q.get(self.field) + + def generate(self, v): + """ create and return the op string for this TermValue """ + val = v.tostring(self.encoding) + return "(%s %s %s)" % (self.field, self.op, val) + + """ set the numexpr expression for this term """ + + if not self.is_valid: + raise ValueError("query term is not valid [%s]" % str(self)) + + # convert values if we are in the table + if self.is_in_table: + values = [self.convert_value(v) for v in self.value] + else: + values = [TermValue(v, v, self.kind) for v in self.value] + + # equality conditions + if self.op in ['==', '!=']: + + # our filter op expression + if self.op == '!=': + filter_op = lambda axis, vals: not axis.isin(vals) + else: + filter_op = lambda axis, vals: axis.isin(vals) + + if self.is_in_table: + + # too many values to create the expression? 
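# Note (editorial sketch, not a patch line): the branch just below carries over
# the io/pytables.py behavior: a term whose value list fits within
# _max_selectors (31) is compiled into an OR'd condition string that PyTables
# can evaluate in-kernel via numexpr, while a longer list falls back to a
# post-read filter, presumably because very wide
# "(field == v1) | (field == v2) | ..." strings get expensive. Illustrative
# output only, assuming a hypothetical string-kind field minor_axis with
# values ['A', 'B']:
#     ((minor_axis == "A") | (minor_axis == "B"))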
+ if len(values) <= self._max_selectors: + vs = [self.generate(v) for v in values] + self.condition = "(%s)" % ' | '.join(vs) + + # use a filter after reading + else: + self.filter = ( + self.field, + filter_op, + Index([v.value for v in values])) + + else: + + self.filter = ( + self.field, + filter_op, + Index([v.value for v in values])) + + else: + + if self.is_in_table: + + self.condition = self.generate(values[0]) + + else: + + raise TypeError( + "passing a filterable condition to a non-table indexer [%s]" % + str(self)) + + def convert_value(self, v): + """ convert the expression that is in the term to something that is accepted by pytables """ + + def stringify(value): + value = str(value) + if self.encoding is not None: + value = value.encode(self.encoding) + return value + + kind = _ensure_decoded(self.kind) + if kind == u'datetime64' or kind == u'datetime': + v = lib.Timestamp(v) + if v.tz is not None: + v = v.tz_convert('UTC') + return TermValue(v, v.value, kind) + elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u'date': + v = time.mktime(v.timetuple()) + return TermValue(v, Timestamp(v), kind) + elif kind == u'integer': + v = int(float(v)) + return TermValue(v, v, kind) + elif kind == u'float': + v = float(v) + return TermValue(v, v, kind) + elif kind == u'bool': + if isinstance(v, basestring): + v = not v.strip().lower() in [ + u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u''] + else: + v = bool(v) + return TermValue(v, v, kind) + elif not isinstance(v, basestring): + v = stringify(v) + return TermValue(v, stringify(v), u'string') + + # string quoting + return TermValue(v, stringify(v), u'string') + + +class TermValue(object): + + """ hold a term value the we use to construct a condition/filter """ + + def __init__(self, value, converted, kind): + self.value = value + self.converted = converted + self.kind = kind + + def tostring(self, encoding): + """ quote the string if not encoded + else encode and return """ + if self.kind == u'string': + if encoding is not None: + return self.converted + return '"%s"' % self.converted + return self.converted + + diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0f84884d51340..5e45cc4d45e3c 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -29,6 +29,7 @@ from pandas.tools.merge import concat from pandas.util import py3compat from pandas.io.common import PerformanceWarning +from pandas.computation.pytables import Expr import pandas.lib as lib import pandas.algos as algos @@ -3652,253 +3653,7 @@ def _need_convert(kind): return True return False - -class Term(StringMixin): - - """create a term object that holds a field, op, and value - - Parameters - ---------- - field : dict, string term expression, or the field to operate (must be a valid index/column type of DataFrame/Panel) - op : a valid op (defaults to '=') (optional) - >, >=, <, <=, =, != (not equal) are allowed - value : a value or list of values (required) - queryables : a kinds map (dict of column name -> kind), or None i column is non-indexable - encoding : an encoding that will encode the query terms - - Returns - ------- - a Term object - - Examples - -------- - >>> Term(dict(field = 'index', op = '>', value = '20121114')) - >>> Term('index', '20121114') - >>> Term('index', '>', '20121114') - >>> Term('index', ['20121114','20121114']) - >>> Term('index', datetime(2012,11,14)) - >>> Term('major_axis>20121114') - >>> Term('minor_axis', ['A','U']) - """ - - _ops = ['<=', '<', '>=', '>', '!=', '==', '='] - _search = 
re.compile( - "^\s*(?P\w+)\s*(?P%s)\s*(?P.+)\s*$" % - '|'.join(_ops)) - _max_selectors = 31 - - def __init__(self, field, op=None, - value=None, queryables=None, encoding=None): - self.field = None - self.op = None - self.value = None - self.q = queryables or dict() - self.filter = None - self.condition = None - self.encoding = encoding - - # unpack lists/tuples in field - while(isinstance(field, (tuple, list))): - f = field - field = f[0] - if len(f) > 1: - op = f[1] - if len(f) > 2: - value = f[2] - - # backwards compatible - if isinstance(field, dict): - self.field = field.get('field') - self.op = field.get('op') or '==' - self.value = field.get('value') - - # passed a term - elif isinstance(field, Term): - self.field = field.field - self.op = field.op - self.value = field.value - - # a string expression (or just the field) - elif isinstance(field, basestring): - - # is a term is passed - s = self._search.match(field) - if s is not None: - self.field = s.group('field') - self.op = s.group('op') - self.value = s.group('value') - - else: - self.field = field - - # is an op passed? - if isinstance(op, basestring) and op in self._ops: - self.op = op - self.value = value - else: - self.op = '==' - self.value = op - - else: - raise ValueError( - "Term does not understand the supplied field [%s]" % field) - - # we have valid fields - if self.field is None or self.op is None or self.value is None: - raise ValueError("Could not create this term [%s]" % str(self)) - - # = vs == - if self.op == '=': - self.op = '==' - - # we have valid conditions - if self.op in ['>', '>=', '<', '<=']: - if hasattr(self.value, '__iter__') and len(self.value) > 1 and not isinstance(self.value, basestring): - raise ValueError( - "an inequality condition cannot have multiple values [%s]" % - str(self)) - - if not is_list_like(self.value): - self.value = [self.value] - - if len(self.q): - self.eval() - - def __unicode__(self): - attrs = map(pprint_thing, (self.field, self.op, self.value)) - return "field->%s,op->%s,value->%s" % tuple(attrs) - - @property - def is_valid(self): - """ return True if this is a valid field """ - return self.field in self.q - - @property - def is_in_table(self): - """ return True if this is a valid column name for generation (e.g. an actual column in the table) """ - return self.q.get(self.field) is not None - - @property - def kind(self): - """ the kind of my field """ - return self.q.get(self.field) - - def generate(self, v): - """ create and return the op string for this TermValue """ - val = v.tostring(self.encoding) - return "(%s %s %s)" % (self.field, self.op, val) - - def eval(self): - """ set the numexpr expression for this term """ - - if not self.is_valid: - raise ValueError("query term is not valid [%s]" % str(self)) - - # convert values if we are in the table - if self.is_in_table: - values = [self.convert_value(v) for v in self.value] - else: - values = [TermValue(v, v, self.kind) for v in self.value] - - # equality conditions - if self.op in ['==', '!=']: - - # our filter op expression - if self.op == '!=': - filter_op = lambda axis, vals: not axis.isin(vals) - else: - filter_op = lambda axis, vals: axis.isin(vals) - - if self.is_in_table: - - # too many values to create the expression? 
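# Note (editorial sketch): this entire Term class is being deleted from
# io/pytables.py and re-homed in pandas/computation/pytables.py; the
# "Term = Expr" alias further down keeps the old name importable. For callers
# the visible shift is from structured terms toward plain string expressions,
# e.g. Term('index', '>', '20121114') giving way to
# Term('major_axis>20121114')-style strings; both forms appear in the
# docstring being removed above.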
- if len(values) <= self._max_selectors: - vs = [self.generate(v) for v in values] - self.condition = "(%s)" % ' | '.join(vs) - - # use a filter after reading - else: - self.filter = ( - self.field, - filter_op, - Index([v.value for v in values])) - - else: - - self.filter = ( - self.field, - filter_op, - Index([v.value for v in values])) - - else: - - if self.is_in_table: - - self.condition = self.generate(values[0]) - - else: - - raise TypeError( - "passing a filterable condition to a non-table indexer [%s]" % - str(self)) - - def convert_value(self, v): - """ convert the expression that is in the term to something that is accepted by pytables """ - - def stringify(value): - value = str(value) - if self.encoding is not None: - value = value.encode(self.encoding) - return value - - kind = _ensure_decoded(self.kind) - if kind == u'datetime64' or kind == u'datetime': - v = lib.Timestamp(v) - if v.tz is not None: - v = v.tz_convert('UTC') - return TermValue(v, v.value, kind) - elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u'date': - v = time.mktime(v.timetuple()) - return TermValue(v, Timestamp(v), kind) - elif kind == u'integer': - v = int(float(v)) - return TermValue(v, v, kind) - elif kind == u'float': - v = float(v) - return TermValue(v, v, kind) - elif kind == u'bool': - if isinstance(v, basestring): - v = not v.strip().lower() in [ - u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u''] - else: - v = bool(v) - return TermValue(v, v, kind) - elif not isinstance(v, basestring): - v = stringify(v) - return TermValue(v, stringify(v), u'string') - - # string quoting - return TermValue(v, stringify(v), u'string') - - -class TermValue(object): - - """ hold a term value the we use to construct a condition/filter """ - - def __init__(self, value, converted, kind): - self.value = value - self.converted = converted - self.kind = kind - - def tostring(self, encoding): - """ quote the string if not encoded - else encode and return """ - if self.kind == u'string': - if encoding is not None: - return self.converted - return '"%s"' % self.converted - return self.converted - +Term = Expr class Coordinates(object): @@ -3951,34 +3706,21 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): self.terms = self.generate(where) # create the numexpr & the filter - if self.terms: - terms = [t for t in self.terms if t.condition is not None] - if len(terms): - self.condition = "(%s)" % ' & '.join( - [t.condition for t in terms]) - self.filter = [] - for t in self.terms: - if t.filter is not None: - self.filter.append(t.filter) + if self.terms is not None: + self.condition, self.filter = self.terms.evaluate() def generate(self, where): """ where can be a : dict,list,tuple,string """ if where is None: return None - if not isinstance(where, (list, tuple)): - where = [where] - else: - - # make this a list of we think that we only have a sigle term & no - # operands inside any terms - if not any([isinstance(w, (list, tuple, Term)) for w in where]): - - if not any([isinstance(w, basestring) and Term._search.match(w) for w in where]): - where = [where] + if isinstance(where, basestring): + pass + elif isinstance(where, (list, tuple)): + where = ' & ' .join([ "(%s)" for w in where]) queryables = self.table.queryables() - return [Term(c, queryables=queryables, encoding=self.table.encoding) for c in where] + return Expr(where, queryables=queryables, encoding=self.table.encoding) def select(self): """ diff --git a/pandas/io/tests/test_pytables.py 
b/pandas/io/tests/test_pytables.py index 6737408081f3d..e339eeb69692d 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -581,6 +581,7 @@ def test_append_frame_column_oriented(self): store.append('df1', df.ix[:, 2:]) tm.assert_frame_equal(store['df1'], df) + result = store.select('df1', '(columns=A) | (columns=B)') result = store.select('df1', 'columns=A') expected = df.reindex(columns=['A']) tm.assert_frame_equal(expected, result) @@ -1605,6 +1606,44 @@ def test_terms(self): for t in terms: store.select('p4d', t) + def test_eval(self): + """ test evaluation using new terms """ + + with ensure_clean(self.path) as store: + + wp = tm.makePanel() + p4d = tm.makePanel4D() + + # valid terms + terms = [ + dict(field='major_axis', op='>', value='20121114'), + ('major_axis', '20121114'), + ('major_axis', '>', '20121114'), + (('major_axis', ['20121114', '20121114']),), + ('major_axis', datetime.datetime(2012, 11, 14)), + 'major_axis> 20121114', + 'major_axis >20121114', + 'major_axis > 20121114', + (('minor_axis', ['A', 'B']),), + (('minor_axis', ['A', 'B']),), + ((('minor_axis', ['A', 'B']),),), + (('items', ['ItemA', 'ItemB']),), + ('items=ItemA'), + ] + + for t in terms: + store.select('wp', t) + store.select('p4d', t) + + # valid for p4d only + terms = [ + (('labels', '=', ['l1', 'l2']),), + Term('labels', '=', ['l1', 'l2']), + ] + + for t in terms: + store.select('p4d', t) + def test_series(self): s = tm.makeStringSeries() From 441285c9c64cc08b701cbf81ce78a5fa567f2c67 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 6 Jul 2013 20:27:58 -0400 Subject: [PATCH 39/48] WIP: still some debugging statements in --- pandas/computation/expr.py | 6 +- pandas/computation/pytables.py | 406 +++++++++++++++++++------------ pandas/io/pytables.py | 7 +- pandas/io/tests/test_pytables.py | 11 +- 4 files changed, 268 insertions(+), 162 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 10ca7e1083983..2104a437b1ba3 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -80,12 +80,12 @@ def __init__(self, env): def generic_visit(self, node, **kwargs): """Called if no explicit visitor function exists for a node.""" - for field, value in iter_fields(node): + for field, value in ast.iter_fields(node): if isinstance(value, list): for item in value: - if isinstance(item, AST): + if isinstance(item, ast.AST): self.visit(item, **kwargs) - elif isinstance(value, AST): + elif isinstance(value, ast.AST): self.visit(value, **kwargs) def visit(self, node, **kwargs): diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 7f070adff7054..931c0ffe7e0fb 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -1,12 +1,24 @@ -import sys +import sys, inspect import re import ast from functools import partial +import numpy as np +from datetime import datetime +import time +import pandas +import pandas.core.common as com +import pandas.lib as lib from pandas.computation import expr, ops from pandas.computation.ops import is_term from pandas.computation.expr import ExprParserError +def _ensure_decoded(s): + """ if we have bytes, decode them to unicde """ + if isinstance(s, np.bytes_): + s = s.decode('UTF-8') + return s + class Scope(expr.Scope): __slots__ = 'globals', 'locals', 'queryables' @@ -21,7 +33,7 @@ def __init__(self, name, env, side=None): def _resolve_name(self): - # must be a queryable + # must be a queryables if self.side == 'left': if self.name not in self.env.queryables: 
raise NameError('name {0!r} is not defined'.format(self.name)) @@ -30,29 +42,224 @@ def _resolve_name(self): # resolve the rhs (and allow to be None) return self.env.locals.get(self.name, self.env.globals.get(self.name,self.name)) -def format_value(q, lhs, v): - """ given a queryable, a lhs name and value, return a formatted value """ - return v - class BinOp(ops.BinOp): - def __call__(self, q): + def __init__(self, op, lhs, rhs, queryables, encoding): + super(BinOp, self).__init__(op, lhs, rhs) + self.queryables = queryables + self.encoding = encoding + self.filter = None + self.condition = None + + def prune(self, klass): + + def pr(left, right): + """ create and return a new specilized BinOp from myself """ + + if left is None: + return right + elif right is None: + return left + + k = klass + if isinstance(left, ConditionBinOp): + if isinstance(left, ConditionBinOp) and isinstance(right, ConditionBinOp): + k = JointConditionBinOp + elif isinstance(left, k): + return left + elif isinstance(right, k): + return right + + elif isinstance(left, FilterBinOp): + if isinstance(left, FilterBinOp) and isinstance(right, FilterBinOp): + k = JointFilterBinOp + elif isinstance(left, k): + return left + elif isinstance(right, k): + return right + + return k(self.op, left, right, queryables=self.queryables, encoding=self.encoding).evaluate() + left, right = self.lhs, self.rhs - # base cases if is_term(left) and is_term(right): - res = "(%s %s %s)" % (left.value,self.op,format_value(q, left.value, right.value)) + res = pr(left.value,right.value) elif not is_term(left) and is_term(right): - res = "(%s %s %s)" % (left(q),self.op,right.value) + res = pr(left.prune(klass),right.value) elif is_term(left) and not is_term(right): - res = "(%s %s %s)" % (left.value,self.op,right(q)) + res = pr(left.value,right.prune(klass)) elif not (is_term(left) or is_term(right)): - res = "(%s %s %s)" % (left(q),self.op,right(q)) + res = pr(left.prune(klass),right.prune(klass)) return res + @property + def is_valid(self): + """ return True if this is a valid field """ + return self.lhs in self.queryables + + @property + def is_in_table(self): + """ return True if this is a valid column name for generation (e.g. 
an actual column in the table) """ + return self.queryables.get(self.lhs) is not None + + @property + def kind(self): + """ the kind of my field """ + return self.queryables.get(self.lhs) + + def generate(self, v): + """ create and return the op string for this TermValue """ + val = v.tostring(self.encoding) + return "(%s %s %s)" % (self.lhs, self.op, val) + + def convert_value(self, v): + """ convert the expression that is in the term to something that is accepted by pytables """ + + def stringify(value): + value = str(value) + if self.encoding is not None: + value = value.encode(self.encoding) + return value + + kind = _ensure_decoded(self.kind) + if kind == u'datetime64' or kind == u'datetime': + v = lib.Timestamp(v) + if v.tz is not None: + v = v.tz_convert('UTC') + return TermValue(v, v.value, kind) + elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u'date': + v = time.mktime(v.timetuple()) + return TermValue(v, Timestamp(v), kind) + elif kind == u'integer': + v = int(float(v)) + return TermValue(v, v, kind) + elif kind == u'float': + v = float(v) + return TermValue(v, v, kind) + elif kind == u'bool': + if isinstance(v, basestring): + v = not v.strip().lower() in [ + u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u''] + else: + v = bool(v) + return TermValue(v, v, kind) + elif not isinstance(v, basestring): + v = stringify(v) + return TermValue(v, stringify(v), u'string') + + # string quoting + return TermValue(v, stringify(v), u'string') + +class FilterBinOp(BinOp): + + def __unicode__(self): + return com.pprint_thing("[Filter : [{0}] -> [{1}]".format(self.filter[0],self.filter[1])) + + def evaluate(self): + + if not isinstance(self.lhs,basestring): + return self + + if not self.is_valid: + raise ValueError("query term is not valid [%s]" % self) + + if self.is_in_table: + return None + + import pdb; pdb.set_trace() + + if not isinstance(self.rhs, list): + self.rhs = [ self.rhs ] + values = [TermValue(v, v, self.kind) for v in self.rhs] + + # equality conditions + if self.op in ['==', '!=']: + + # our filter op expression + if self.op == '!=': + filter_op = lambda axis, vals: not axis.isin(vals) + else: + filter_op = lambda axis, vals: axis.isin(vals) + + self.filter = ( + self.lhs, + filter_op, + Index([v.value for v in values])) + + else: + + raise TypeError( + "passing a filterable condition to a non-table indexer [%s]" % + self) + + return self + +class JointFilterBinOp(FilterBinOp): + + def evaluate(self): + return self + +class ConditionBinOp(BinOp): + + _max_selectors = 31 + + def __unicode__(self): + return com.pprint_thing("[Condition : [{0}]]".format(self.condition)) + + def format(self): + """ return the actual ne format """ + return self.condition + + def evaluate(self): + + if not isinstance(self.lhs,basestring): + return self + + if not self.is_valid: + raise ValueError("query term is not valid [%s]" % self) + + # convert values if we are in the table + if not self.is_in_table: + return None + + if not isinstance(self.rhs, list): + self.rhs = [ self.rhs ] + values = [self.convert_value(v) for v in self.rhs] + + # equality conditions + if self.op in ['==', '!=']: + + # our filter op expression + if self.op == '!=': + filter_op = lambda axis, vals: not axis.isin(vals) + else: + filter_op = lambda axis, vals: axis.isin(vals) + + # too many values to create the expression? 
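# Note (editorial sketch): the selector cap reappears here, but per node
# rather than per Term. Expr.evaluate() first runs terms.prune(ConditionBinOp)
# to collapse whatever subtree can become a numexpr condition string, then
# terms.prune(FilterBinOp) for the remainder that must be applied as a
# post-read filter. A minimal usage sketch, with a made-up queryables map:
#     expr = Expr('index>20121114', queryables={'index': 'datetime64'})
#     condition, filter = expr.evaluate()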
+ if len(values) <= self._max_selectors: + vs = [self.generate(v) for v in values] + self.condition = "(%s)" % ' | '.join(vs) + + # use a filter after reading + else: + return None + + else: + + self.condition = self.generate(values[0]) + + return self + +class JointConditionBinOp(ConditionBinOp): + + def evaluate(self): + self.condition = "(%s %s %s)" % (self.lhs.condition,self.op,self.rhs.condition) + return self + class UnaryOp(ops.UnaryOp): - def __call__(self, q): + + def apply(self, func): operand = self.operand v = operand.value if is_term(operand) else operand return "%s (%s)" % (operand,v) @@ -67,10 +274,10 @@ class ExprVisitor(expr.ExprVisitor): unary_op_nodes = 'Invert' unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) - def __init__(self, env): + def __init__(self, env, **kwargs): for bin_op in self.bin_ops: setattr(self, 'visit_{0}'.format(self.bin_op_nodes_map[bin_op]), - lambda node, bin_op=bin_op: partial(BinOp, bin_op)) + lambda node, bin_op=bin_op: partial(BinOp, bin_op, **kwargs)) for unary_op in self.unary_ops: setattr(self, @@ -95,6 +302,14 @@ def visit_Compare(self, node, **kwargs): def visit_Name(self, node, side=None, **kwargs): return Term(node.id, self.env, side=side) + def visit_Attribute(self, node, **kwargs): + import pdb; pdb.set_trace() + raise NotImplementedError("attribute access is not yet supported") + + def visit_BoolOp(self, node, **kwargs): + import pdb; pdb.set_trace() + raise NotImplementedError("boolean operators are not yet supported") + class Expr(expr.Expr): """ hold a pytables like expression, comprised of possibly multiple 'terms' @@ -113,16 +328,31 @@ class Expr(expr.Expr): -------- """ - _max_selectors = 31 - - def __init__(self, expression, queryables=None, encoding=None): - self.expr = self.pre_parse(expression) - self.env = Scope(queryables=queryables,frame_level=2) - self._visitor = ExprVisitor(self.env) - self.terms = self.parse() - self.encoding = encoding + def __init__(self, expression, queryables=None, encoding=None, env=None): + self.expr = expression self.condition = None self.filter = None + self.terms = None + self._visitor = None + + if env is None: + frame = inspect.currentframe() + try: + env = Scope(lcls = frame.f_back.f_locals.copy()) + finally: + del frame + self.env = env + + if queryables is not None: + self.env.queryables.update(queryables) + self._visitor = ExprVisitor(self.env, queryables=queryables, encoding=encoding) + self.expr = self.pre_parse(self.expr) + self.terms = self.parse() + + def __unicode__(self): + if self.terms is not None: + return unicode(self.terms) + return self.expr def pre_parse(self, expression): """ transform = to == """ @@ -131,134 +361,10 @@ def pre_parse(self, expression): def evaluate(self): """ create and return the numexpr condition and filter """ - import pdb; pdb.set_trace() - terms = [] - filter = [] - - self.terms(self.env) - #for t in self.terms: - - terms = [t for t in self.terms if t.condition is not None] - if len(terms): - self.condition = "(%s)" % ' & '.join( - [t.condition for t in terms]) - self.filter = [] - for t in self.terms: - if t.filter is not None: - self.filter.append(t.filter) - - - @property - def is_valid(self): - """ return True if this is a valid field """ - return self.field in self.q - - @property - def is_in_table(self): - """ return True if this is a valid column name for generation (e.g. 
an actual column in the table) """ - return self.q.get(self.field) is not None - - @property - def kind(self): - """ the kind of my field """ - return self.q.get(self.field) - - def generate(self, v): - """ create and return the op string for this TermValue """ - val = v.tostring(self.encoding) - return "(%s %s %s)" % (self.field, self.op, val) - - """ set the numexpr expression for this term """ - - if not self.is_valid: - raise ValueError("query term is not valid [%s]" % str(self)) - - # convert values if we are in the table - if self.is_in_table: - values = [self.convert_value(v) for v in self.value] - else: - values = [TermValue(v, v, self.kind) for v in self.value] - - # equality conditions - if self.op in ['==', '!=']: - - # our filter op expression - if self.op == '!=': - filter_op = lambda axis, vals: not axis.isin(vals) - else: - filter_op = lambda axis, vals: axis.isin(vals) - - if self.is_in_table: - - # too many values to create the expression? - if len(values) <= self._max_selectors: - vs = [self.generate(v) for v in values] - self.condition = "(%s)" % ' | '.join(vs) - - # use a filter after reading - else: - self.filter = ( - self.field, - filter_op, - Index([v.value for v in values])) - - else: - - self.filter = ( - self.field, - filter_op, - Index([v.value for v in values])) - - else: - - if self.is_in_table: - - self.condition = self.generate(values[0]) - - else: - - raise TypeError( - "passing a filterable condition to a non-table indexer [%s]" % - str(self)) - - def convert_value(self, v): - """ convert the expression that is in the term to something that is accepted by pytables """ - - def stringify(value): - value = str(value) - if self.encoding is not None: - value = value.encode(self.encoding) - return value - - kind = _ensure_decoded(self.kind) - if kind == u'datetime64' or kind == u'datetime': - v = lib.Timestamp(v) - if v.tz is not None: - v = v.tz_convert('UTC') - return TermValue(v, v.value, kind) - elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u'date': - v = time.mktime(v.timetuple()) - return TermValue(v, Timestamp(v), kind) - elif kind == u'integer': - v = int(float(v)) - return TermValue(v, v, kind) - elif kind == u'float': - v = float(v) - return TermValue(v, v, kind) - elif kind == u'bool': - if isinstance(v, basestring): - v = not v.strip().lower() in [ - u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u''] - else: - v = bool(v) - return TermValue(v, v, kind) - elif not isinstance(v, basestring): - v = stringify(v) - return TermValue(v, stringify(v), u'string') - - # string quoting - return TermValue(v, stringify(v), u'string') + self.condition = self.terms.prune(ConditionBinOp) + self.filter = self.terms.prune(FilterBinOp) + return self.condition, self.filter class TermValue(object): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5e45cc4d45e3c..e229301e96ea8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3714,10 +3714,11 @@ def generate(self, where): if where is None: return None + import pdb; pd.set_trace() if isinstance(where, basestring): pass elif isinstance(where, (list, tuple)): - where = ' & ' .join([ "(%s)" for w in where]) + where = ' & ' .join([ "(%s)" % w for w in where]) queryables = self.table.queryables() return Expr(where, queryables=queryables, encoding=self.table.encoding) @@ -3727,7 +3728,7 @@ def select(self): generate the selection """ if self.condition is not None: - return self.table.table.readWhere(self.condition, start=self.start, stop=self.stop) + return 
self.table.table.readWhere(self.condition.format(), start=self.start, stop=self.stop) elif self.coordinates is not None: return self.table.table.readCoordinates(self.coordinates) return self.table.table.read(start=self.start, stop=self.stop) @@ -3739,7 +3740,7 @@ def select_coords(self): if self.condition is None: return np.arange(self.table.nrows) - return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort=True) + return self.table.table.getWhereList(self.condition.format(), start=self.start, stop=self.stop, sort=True) # utilities ### diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index e339eeb69692d..00cce4a22279d 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -581,21 +581,20 @@ def test_append_frame_column_oriented(self): store.append('df1', df.ix[:, 2:]) tm.assert_frame_equal(store['df1'], df) - result = store.select('df1', '(columns=A) | (columns=B)') result = store.select('df1', 'columns=A') expected = df.reindex(columns=['A']) tm.assert_frame_equal(expected, result) - # this isn't supported - self.assertRaises(TypeError, store.select, 'df1', ( - 'columns=A', Term('index', '>', df.index[4]))) - # selection on the non-indexable result = store.select( - 'df1', ('columns=A', Term('index', '=', df.index[0:4]))) + 'df1', ('columns=A', Term('index=df.index[0:4]'))) expected = df.reindex(columns=['A'], index=df.index[0:4]) tm.assert_frame_equal(expected, result) + # this isn't supported + self.assertRaises(TypeError, store.select, 'df1', ( + 'columns=A', Term('index', '>', df.index[4]))) + def test_append_with_different_block_ordering(self): #GH 4096; using same frames, but different block orderings From 05a005f4629cbc065728f6853fe6ae426e6fbd35 Mon Sep 17 00:00:00 2001 From: jreback Date: Sat, 6 Jul 2013 21:35:40 -0400 Subject: [PATCH 40/48] WIP: conditions working now, filtering still only ok; good parsing of attributes, subscripting, e.g. df.index[0:4] works!
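A hedged usage sketch of what now parses; store and df are the test fixtures
exercised in the test changes below, not new API:

>>> store.select('df1', ('columns=A', Term('index=df.index[0:4]')))
>>> store.select('df', [Term('B>0'), Term('index>df.index[3]')])

Attribute and subscript nodes are resolved eagerly against the captured scope
(visit_Attribute / visit_Subscript) and wrapped in a Value term, so only the
already-resolved constant reaches the condition/filter builders.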
--- pandas/computation/ops.py | 7 +++ pandas/computation/pytables.py | 81 ++++++++++++++++++++++---------- pandas/io/pytables.py | 11 +++-- pandas/io/tests/test_pytables.py | 19 ++++---- 4 files changed, 79 insertions(+), 39 deletions(-) diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 76e5497d48175..926b9bf9bc509 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -83,6 +83,13 @@ class Constant(Term): def __init__(self, value, env): super(Constant, self).__init__(value, env) +class Value(Term): + """ a resolved value """ + def __init__(self, value, env, name=None): + self.name = name + self.env = env + self.value = value + self.type = type(self.value) def _print_operand(opr): return opr.name if is_term(opr) else unicode(opr) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 931c0ffe7e0fb..f3cb4f45874c3 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -10,8 +10,10 @@ import pandas.core.common as com import pandas.lib as lib from pandas.computation import expr, ops -from pandas.computation.ops import is_term +from pandas.computation.ops import is_term, Value from pandas.computation.expr import ExprParserError +from pandas import Index +from pandas.core.common import is_list_like def _ensure_decoded(s): """ if we have bytes, decode them to unicde """ @@ -93,6 +95,14 @@ def pr(left, right): return res + def conform(self, rhs): + """ inplace conform rhs """ + if not is_list_like(rhs): + rhs = [ rhs ] + if hasattr(self.rhs,'ravel'): + rhs = rhs.ravel() + return rhs + @property def is_valid(self): """ return True if this is a valid field """ @@ -167,11 +177,8 @@ def evaluate(self): if self.is_in_table: return None - import pdb; pdb.set_trace() - - if not isinstance(self.rhs, list): - self.rhs = [ self.rhs ] - values = [TermValue(v, v, self.kind) for v in self.rhs] + rhs = self.conform(self.rhs) + values = [TermValue(v, v, self.kind) for v in rhs] # equality conditions if self.op in ['==', '!=']: @@ -223,9 +230,8 @@ def evaluate(self): if not self.is_in_table: return None - if not isinstance(self.rhs, list): - self.rhs = [ self.rhs ] - values = [self.convert_value(v) for v in self.rhs] + rhs = self.conform(self.rhs) + values = [self.convert_value(v) for v in rhs] # equality conditions if self.op in ['==', '!=']: @@ -303,8 +309,30 @@ def visit_Name(self, node, side=None, **kwargs): return Term(node.id, self.env, side=side) def visit_Attribute(self, node, **kwargs): - import pdb; pdb.set_trace() - raise NotImplementedError("attribute access is not yet supported") + attr = node.attr + value = node.value + + # resolve the value + return getattr(self.visit(value).value,attr) + + def visit_Subscript(self, node, **kwargs): + value = self.visit(node.value) + slobj = self.visit(node.slice) + + return Value(value[slobj],self.env) + + def visit_Slice(self, node, **kwargs): + lower = node.lower + if lower is not None: + lower = self.visit(lower).value + upper = node.upper + if upper is not None: + upper = self.visit(upper).value + step = node.step + if step is not None: + step = self.visit(step).value + + return slice(lower,upper,step) def visit_BoolOp(self, node, **kwargs): import pdb; pdb.set_trace() @@ -328,25 +356,33 @@ class Expr(expr.Expr): -------- """ - def __init__(self, expression, queryables=None, encoding=None, env=None): + def __init__(self, expression, queryables=None, encoding=None, lcls=None): + if isinstance(expression, Expr): + expression = str(expression) self.expr = expression 
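# Note (editorial sketch): scope capture happens a few lines below: Expr walks
# one frame up via inspect.currentframe() and merges the caller's f_locals
# into its Scope, which is how a query string like 'index>df.index[4]' can
# resolve df without it being passed in explicitly. The "finally: del frame"
# follows the inspect module's standard advice for breaking frame reference
# cycles.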
self.condition = None self.filter = None self.terms = None self._visitor = None - if env is None: - frame = inspect.currentframe() - try: - env = Scope(lcls = frame.f_back.f_locals.copy()) - finally: - del frame - self.env = env + # add current locals scope + frame = inspect.currentframe() + try: + if lcls is None: + lcls = dict() + lcls.update(frame.f_back.f_locals) + self.env = Scope(lcls = lcls) + finally: + del frame if queryables is not None: + + # if using the old format, this will raise + if not isinstance(queryables, dict): + raise TypeError("Expr must be called with a single-string expression") + self.env.queryables.update(queryables) self._visitor = ExprVisitor(self.env, queryables=queryables, encoding=encoding) - self.expr = self.pre_parse(self.expr) self.terms = self.parse() def __unicode__(self): @@ -354,11 +390,6 @@ def __unicode__(self): return unicode(self.terms) return self.expr - def pre_parse(self, expression): - """ transform = to == """ - expression = re.sub("=+","==",expression) - return expression - def evaluate(self): """ create and return the numexpr condition and filter """ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e229301e96ea8..f072d484f223e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3714,14 +3714,15 @@ def generate(self, where): if where is None: return None - import pdb; pd.set_trace() - if isinstance(where, basestring): - pass - elif isinstance(where, (list, tuple)): + lcls = dict() + if isinstance(where, (list, tuple)): + for w in where: + if isinstance(w, Term): + lcls.update(w.env.locals) where = ' & ' .join([ "(%s)" % w for w in where]) queryables = self.table.queryables() - return Expr(where, queryables=queryables, encoding=self.table.encoding) + return Expr(where, queryables=queryables, encoding=self.table.encoding, lcls=lcls) def select(self): """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 00cce4a22279d..8ca0ffee50d40 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -593,7 +593,7 @@ def test_append_frame_column_oriented(self): # this isn't supported self.assertRaises(TypeError, store.select, 'df1', ( - 'columns=A', Term('index', '>', df.index[4]))) + 'columns=A', Term('index>df.index[4]'))) def test_append_with_different_block_ordering(self): @@ -816,7 +816,7 @@ def test_append_with_data_columns(self): # data column searching (with an indexable and a data_columns) result = store.select( - 'df', [Term('B>0'), Term('index', '>', df.index[3])]) + 'df', [Term('B>0'), Term('index>df.index[3]')]) df_new = df.reindex(index=df.index[4:]) expected = df_new[df_new.B > 0] tm.assert_frame_equal(result, expected) @@ -828,7 +828,7 @@ def test_append_with_data_columns(self): df_new['string'][5:6] = 'bar' _maybe_remove(store, 'df') store.append('df', df_new, data_columns=['string']) - result = store.select('df', [Term('string', '=', 'foo')]) + result = store.select('df', [Term('string=foo')]) expected = df_new[df_new.string == 'foo'] tm.assert_frame_equal(result, expected) @@ -874,14 +874,14 @@ def check_col(key,name,size): _maybe_remove(store, 'df') store.append( 'df', df_new, data_columns=['A', 'B', 'string', 'string2']) - result = store.select('df', [Term('string', '=', 'foo'), Term( + result = store.select('df', [Term('string=foo'), Term( 'string2=foo'), Term('A>0'), Term('B<0')]) expected = df_new[(df_new.string == 'foo') & ( df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] tm.assert_frame_equal(result, expected) # yield an 
empty frame - result = store.select('df', [Term('string', '=', 'foo'), Term( + result = store.select('df', [Term('string=foo'), Term( 'string2=cool')]) expected = df_new[(df_new.string == 'foo') & ( df_new.string2 == 'cool')] @@ -2318,10 +2318,11 @@ def test_frame_select(self): store.put('frame', df, table=True) date = df.index[len(df) // 2] - crit1 = ('index', '>=', date) - crit2 = ('columns', ['A', 'D']) - crit3 = ('columns', 'A') + crit1 = ('index>=date') + crit2 = ("columns=['A', 'D']") + crit3 = ('columns=A') + import pdb; pdb.set_trace() result = store.select('frame', [crit1, crit2]) expected = df.ix[date:, ['A', 'D']] tm.assert_frame_equal(result, expected) @@ -2668,7 +2669,7 @@ def test_legacy_table_read(self): # old version warning warnings.filterwarnings('ignore', category=IncompatibilityWarning) self.assertRaises( - Exception, store.select, 'wp1', Term('minor_axis', '=', 'B')) + Exception, store.select, 'wp1', Term('minor_axis=B')) df2 = store.select('df2') store.select('df2', Term('index', '>', df2.index[2])) From 22b4a93b209fa7bdf9bb7f2e0f8ca6df35bfb376 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 7 Jul 2013 10:39:19 -0400 Subject: [PATCH 41/48] TST: more test changes --- pandas/computation/expr.py | 29 +++++++- pandas/computation/pytables.py | 52 ++++++-------- pandas/io/pytables.py | 5 +- pandas/io/tests/test_pytables.py | 116 +++++++++++-------------------- 4 files changed, 95 insertions(+), 107 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 2104a437b1ba3..b8acab7cf9edb 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -11,7 +11,7 @@ from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms -from pandas.computation.ops import Term, Constant +from pandas.computation.ops import Term, Constant, Value class Scope(object): __slots__ = 'globals', 'locals' @@ -96,7 +96,10 @@ def visit(self, node, **kwargs): node = ast.fix_missing_locations(ast.parse(preparse(node))) method = 'visit_' + node.__class__.__name__ - visitor = getattr(self, method, self.generic_visit) + visitor = getattr(self, method, None) + if visitor is None: + visitor = self.generic_visit + print method return visitor(node, **kwargs) def visit_Module(self, node, **kwargs): @@ -124,12 +127,34 @@ def visit_UnaryOp(self, node, **kwargs): op = self.visit(node.op) return op(self.visit(node.operand)) + def visit_List(self, node, **kwargs): + return Value([ self.visit(e) for e in node.elts ], self.env) + def visit_Name(self, node, **kwargs): return Term(node.id, self.env) def visit_Num(self, node, **kwargs): return Constant(node.n, self.env) + def visit_Subscript(self, node, **kwargs): + value = self.visit(node.value) + slobj = self.visit(node.slice) + + return Value(value[slobj],self.env) + + def visit_Slice(self, node, **kwargs): + lower = node.lower + if lower is not None: + lower = self.visit(lower).value + upper = node.upper + if upper is not None: + upper = self.visit(upper).value + step = node.step + if step is not None: + step = self.visit(step).value + + return slice(lower,upper,step) + def visit_Compare(self, node, **kwargs): ops = node.ops comps = node.comparators diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index f3cb4f45874c3..2b04c6fb9e12a 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -166,6 +166,10 @@ class 
FilterBinOp(BinOp): def __unicode__(self): return com.pprint_thing("[Filter : [{0}] -> [{1}]".format(self.filter[0],self.filter[1])) + def format(self): + """ return the actual filter format """ + return [ self.filter ] + def evaluate(self): if not isinstance(self.lhs,basestring): @@ -204,6 +208,9 @@ def evaluate(self): class JointFilterBinOp(FilterBinOp): + def format(self): + raise NotImplementedError("unable to collapse Joint Filters") + def evaluate(self): return self @@ -298,16 +305,6 @@ def visit_Module(self, node, **kwargs): body = node.body[0] return self.visit(body) - def visit_Compare(self, node, **kwargs): - ops = node.ops - comps = node.comparators - for op, comp in zip(ops, comps): - node = self.visit(op)(self.visit(node.left,side='left'), self.visit(comp,side='right')) - return node - - def visit_Name(self, node, side=None, **kwargs): - return Term(node.id, self.env, side=side) - def visit_Attribute(self, node, **kwargs): attr = node.attr value = node.value @@ -315,28 +312,24 @@ def visit_Attribute(self, node, **kwargs): # resolve the value return getattr(self.visit(value).value,attr) - def visit_Subscript(self, node, **kwargs): - value = self.visit(node.value) - slobj = self.visit(node.slice) - - return Value(value[slobj],self.env) + def visit_Call(self, node, **kwargs): + if not isinstance(node.func, ast.Name): + raise TypeError("Only named functions are supported") - def visit_Slice(self, node, **kwargs): - lower = node.lower - if lower is not None: - lower = self.visit(lower).value - upper = node.upper - if upper is not None: - upper = self.visit(upper).value - step = node.step - if step is not None: - step = self.visit(step).value + res = self.visit(node.func) + if res is None: + raise ValueError("Invalid function call {0}".format(node.func.id)) + return res - return slice(lower,upper,step) + def visit_Compare(self, node, **kwargs): + ops = node.ops + comps = node.comparators + for op, comp in zip(ops, comps): + node = self.visit(op)(self.visit(node.left,side='left'), self.visit(comp,side='right')) + return node - def visit_BoolOp(self, node, **kwargs): - import pdb; pdb.set_trace() - raise NotImplementedError("boolean operators are not yet supported") + def visit_Name(self, node, side=None, **kwargs): + return Term(node.id, self.env, side=side) class Expr(expr.Expr): @@ -395,6 +388,7 @@ def evaluate(self): self.condition = self.terms.prune(ConditionBinOp) self.filter = self.terms.prune(FilterBinOp) + return self.condition, self.filter class TermValue(object): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f072d484f223e..68f2b8698d960 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2913,8 +2913,8 @@ def process_axes(self, obj, columns=None): obj = obj.reindex_axis(labels, axis=axis, copy=False) # apply the selection filters (but keep in the same order) - if self.selection.filter: - for field, op, filt in self.selection.filter: + if self.selection.filter is not None: + for field, op, filt in self.selection.filter.format(): def process_filter(field, filt): @@ -3719,6 +3719,7 @@ def generate(self, where): for w in where: if isinstance(w, Term): lcls.update(w.env.locals) + where = ' & ' .join([ "(%s)" % w for w in where]) queryables = self.table.queryables() diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 8ca0ffee50d40..76ad477a43ccb 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1358,7 +1358,7 @@ def compare(a,b): assert_frame_equal(result,df) # select with 
tz aware - compare(store.select('df_tz',where=Term('A','>=',df.A[3])),df[df.A>=df.A[3]]) + compare(store.select('df_tz',where=Term('A>=df.A[3]')),df[df.A>=df.A[3]]) _maybe_remove(store, 'df_tz') df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130103',tz='US/Eastern')),index=range(5)) @@ -1440,14 +1440,14 @@ def test_remove_where(self): with ensure_clean(self.path) as store: # non-existance - crit1 = Term('index', '>', 'foo') + crit1 = Term('index>foo') self.assertRaises(KeyError, store.remove, 'a', [crit1]) # try to remove non-table (with crit) # non-table ok (where = None) wp = tm.makePanel() store.put('wp', wp, table=True) - store.remove('wp', [('minor_axis', ['A', 'D'])]) + store.remove('wp', [("minor_axis=['A', 'D']")]) rs = store.select('wp') expected = wp.reindex(minor_axis=['B', 'C']) tm.assert_panel_equal(rs, expected) @@ -1479,7 +1479,7 @@ def test_remove_crit(self): # group row removal date4 = wp.major_axis.take([0, 1, 2, 4, 5, 6, 8, 9, 10]) - crit4 = Term('major_axis', date4) + crit4 = Term('major_axis=date4') store.put('wp3', wp, table=True) n = store.remove('wp3', where=[crit4]) assert(n == 36) @@ -1491,8 +1491,8 @@ def test_remove_crit(self): store.put('wp', wp, table=True) date = wp.major_axis[len(wp.major_axis) // 2] - crit1 = Term('major_axis', '>', date) - crit2 = Term('minor_axis', ['A', 'D']) + crit1 = Term('major_axis>date') + crit2 = Term("minor_axis=['A', 'D']") n = store.remove('wp', where=[crit1]) assert(n == 56) @@ -1548,9 +1548,9 @@ def test_terms(self): # some invalid terms terms = [ - ['minor', ['A', 'B']], - ['index', ['20121114']], - ['index', ['20121114', '20121114']], + ["minor=['A', 'B']"], + ["index=['20121114']"], + ["index=['20121114', '20121114']"], ] for t in terms: self.assertRaises(Exception, store.select, 'wp', t) @@ -1558,75 +1558,43 @@ def test_terms(self): self.assertRaises(Exception, Term.__init__) self.assertRaises(Exception, Term.__init__, 'blah') self.assertRaises(Exception, Term.__init__, 'index') - self.assertRaises(Exception, Term.__init__, 'index', '==') - self.assertRaises(Exception, Term.__init__, 'index', '>', 5) + self.assertRaises(TypeError, Term.__init__, 'index', '==') + self.assertRaises(TypeError, Term.__init__, 'index', '>', 5) # panel result = store.select('wp', [Term( - 'major_axis<20000108'), Term('minor_axis', '=', ['A', 'B'])]) + 'major_axis<20000108'), Term("minor_axis=['A', 'B']")]) expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) tm.assert_panel_equal(result, expected) # p4d result = store.select('p4d', [Term('major_axis<20000108'), - Term('minor_axis', '=', ['A', 'B']), - Term('items', '=', ['ItemA', 'ItemB'])]) + Term("minor_axis=['A', 'B']"), + Term("items=['ItemA', 'ItemB']")]) expected = p4d.truncate(after='20000108').reindex( minor=['A', 'B'], items=['ItemA', 'ItemB']) tm.assert_panel4d_equal(result, expected) - # valid terms - terms = [ - dict(field='major_axis', op='>', value='20121114'), - ('major_axis', '20121114'), - ('major_axis', '>', '20121114'), - (('major_axis', ['20121114', '20121114']),), - ('major_axis', datetime.datetime(2012, 11, 14)), - 'major_axis> 20121114', - 'major_axis >20121114', - 'major_axis > 20121114', - (('minor_axis', ['A', 'B']),), - (('minor_axis', ['A', 'B']),), - ((('minor_axis', ['A', 'B']),),), - (('items', ['ItemA', 'ItemB']),), - ('items=ItemA'), - ] - - for t in terms: - store.select('wp', t) - store.select('p4d', t) - - # valid for p4d only + # back compat invalid terms terms = [ - (('labels', '=', ['l1', 'l2']),), - 
Term('labels', '=', ['l1', 'l2']), + dict(field='major_axis', op='>', value='20121114') ] - for t in terms: - store.select('p4d', t) - - def test_eval(self): - """ test evaluation using new terms """ - - with ensure_clean(self.path) as store: - - wp = tm.makePanel() - p4d = tm.makePanel4D() + self.assertRaises(TypeError, Term.__init__, t) # valid terms terms = [ - dict(field='major_axis', op='>', value='20121114'), - ('major_axis', '20121114'), - ('major_axis', '>', '20121114'), - (('major_axis', ['20121114', '20121114']),), + ('major_axis=20121114'), + ('major_axis>20121114'), + (("major_axis=['20121114', '20121114']"),), ('major_axis', datetime.datetime(2012, 11, 14)), 'major_axis> 20121114', 'major_axis >20121114', 'major_axis > 20121114', - (('minor_axis', ['A', 'B']),), - (('minor_axis', ['A', 'B']),), - ((('minor_axis', ['A', 'B']),),), - (('items', ['ItemA', 'ItemB']),), + (("minor_axis=['A', 'B']"),), + (("minor_axis=['A', 'B']"),), + ((("minor_axis==['A', 'B']"),),), + (("items=['ItemA', 'ItemB']"),), ('items=ItemA'), ] @@ -1636,8 +1604,8 @@ def test_eval(self): # valid for p4d only terms = [ - (('labels', '=', ['l1', 'l2']),), - Term('labels', '=', ['l1', 'l2']), + (("labels=['l1', 'l2']"),), + Term("labels=['l1', 'l2']"), ] for t in terms: @@ -2017,7 +1985,7 @@ def test_select(self): _maybe_remove(store, 'wp') store.append('wp', wp) items = ['Item%03d' % i for i in xrange(80)] - result = store.select('wp', Term('items', items)) + result = store.select('wp', Term('items=items')) expected = wp.reindex(items=items) tm.assert_panel_equal(expected, result) @@ -2034,7 +2002,7 @@ def test_select(self): tm.assert_frame_equal(expected, result) # equivalentsly - result = store.select('df', [('columns', ['A', 'B'])]) + result = store.select('df', [("columns=['A', 'B']")]) expected = df.reindex(columns=['A', 'B']) tm.assert_frame_equal(expected, result) @@ -2067,7 +2035,7 @@ def test_select_dtypes(self): df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), A=np.random.randn(300))) _maybe_remove(store, 'df') store.append('df', df, data_columns=['ts', 'A']) - result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01'))]) + result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")]) expected = df[df.ts >= Timestamp('2012-02-01')] tm.assert_frame_equal(expected, result) @@ -2120,30 +2088,30 @@ def test_select_with_many_inputs(self): store.append('df', df, data_columns=['ts', 'A', 'B', 'users']) # regular select - result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01'))]) + result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")]) expected = df[df.ts >= Timestamp('2012-02-01')] tm.assert_frame_equal(expected, result) # small selector - result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01')),Term('users',['a','b','c'])]) + result = store.select('df', [Term("ts>=Timestamp('2012-02-01') & users=['a','b','c']")]) expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(['a','b','c']) ] tm.assert_frame_equal(expected, result) # big selector along the columns selector = [ 'a','b','c' ] + [ 'a%03d' % i for i in xrange(60) ] - result = store.select('df', [Term('ts', '>=', Timestamp('2012-02-01')),Term('users',selector)]) + result = store.select('df', [Term("ts>=Timestamp('2012-02-01')"),Term('users=selector')]) expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(selector) ] tm.assert_frame_equal(expected, result) selector = range(100,200) - result = store.select('df', [Term('B', selector)]) + result = 
store.select('df', [Term('B=selector')]) expected = df[ df.B.isin(selector) ] tm.assert_frame_equal(expected, result) self.assert_(len(result) == 100) # big selector along the index selector = Index(df.ts[0:100].values) - result = store.select('df', [Term('ts', selector)]) + result = store.select('df', [Term('ts=selector')]) expected = df[ df.ts.isin(selector.values) ] tm.assert_frame_equal(expected, result) self.assert_(len(result) == 100) @@ -2298,15 +2266,15 @@ def test_panel_select(self): store.put('wp', wp, table=True) date = wp.major_axis[len(wp.major_axis) // 2] - crit1 = ('major_axis', '>=', date) - crit2 = ('minor_axis', '=', ['A', 'D']) + crit1 = ('major_axis>=date') + crit2 = ("minor_axis=['A', 'D']") result = store.select('wp', [crit1, crit2]) expected = wp.truncate(before=date).reindex(minor=['A', 'D']) tm.assert_panel_equal(result, expected) result = store.select( - 'wp', ['major_axis>=20000124', ('minor_axis', '=', ['A', 'B'])]) + 'wp', ['major_axis>=20000124', ("minor_axis=['A', 'B']")]) expected = wp.truncate(before='20000124').reindex(minor=['A', 'B']) tm.assert_panel_equal(result, expected) @@ -2318,7 +2286,7 @@ def test_frame_select(self): store.put('frame', df, table=True) date = df.index[len(df) // 2] - crit1 = ('index>=date') + crit1 = Term('index>=date') crit2 = ("columns=['A', 'D']") crit3 = ('columns=A') - import pdb; pdb.set_trace() result = store.select('frame', [crit1, crit2]) expected = df.ix[date:, ['A', 'D']] tm.assert_frame_equal(result, expected) @@ -2569,13 +2537,13 @@ def test_start_stop(self): store.append('df', df) result = store.select( - 'df', [Term("columns", "=", ["A"])], start=0, stop=5) + 'df', [Term("columns=['A']")], start=0, stop=5) expected = df.ix[0:4, ['A']] tm.assert_frame_equal(result, expected) # out of range result = store.select( - 'df', [Term("columns", "=", ["A"])], start=30, stop=40) + 'df', [Term("columns=['A']")], start=30, stop=40) assert(len(result) == 0) assert(type(result) == DataFrame) @@ -2588,7 +2556,7 @@ def test_select_filter_corner(self): with ensure_clean(self.path) as store: store.put('frame', df, table=True) - crit = Term('columns', df.columns[:75]) + crit = Term('columns=df.columns[:75]') result = store.select('frame', [crit]) tm.assert_frame_equal(result, df.ix[:, df.columns[:75]]) @@ -2672,7 +2640,7 @@ def test_legacy_table_read(self): Exception, store.select, 'wp1', Term('minor_axis=B')) df2 = store.select('df2') - store.select('df2', Term('index', '>', df2.index[2])) + store.select('df2', Term('index>df2.index[2]')) warnings.filterwarnings('always', category=IncompatibilityWarning) finally: From ca292c20f92f36483d1567946da7b2fce7d7176d Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 7 Jul 2013 10:45:22 -0400 Subject: [PATCH 42/48] BUG: added HDFStore to inherit from StringMixin --- pandas/io/pytables.py | 2 +- pandas/io/tests/test_pytables.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 68f2b8698d960..14d67b9313ff3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -229,7 +229,7 @@ def read_hdf(path_or_buf, key, **kwargs): f(path_or_buf, False) -class HDFStore(object): +class HDFStore(StringMixin): """ dict-like IO interface for storing pandas objects in PyTables diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 76ad477a43ccb..4b5ca28e702a7 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -797,6 +797,7 @@ def check_col(key,name,size): def test_append_with_data_columns(self): + import pdb; pdb.set_trace() with ensure_clean(self.path) as store: df = tm.makeTimeDataFrame() df.loc[:,'B'].iloc[0] = 1.
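The visit_Index handler added in the next patch completes the subscript support from PATCH 41, all riding on ExprVisitor's getattr-based dispatch, where visit() routes each ast node to a visit_<ClassName> method. Below is a minimal standalone sketch of that pattern, illustrative only and not part of the patch series: MiniVisitor and env are hypothetical names, and it assumes the stdlib ast module of this Python 2 era, where a plain subscript such as df.index[4] arrives wrapped in an ast.Index node.

import ast

class MiniVisitor(object):
    """ toy getattr-based dispatch, resolving expressions like df.index[4] """

    def __init__(self, env):
        # env stands in for Scope: a plain dict of name -> object
        self.env = env

    def visit(self, node):
        # route on the node's class name, e.g. Subscript -> visit_Subscript
        method = 'visit_' + node.__class__.__name__
        visitor = getattr(self, method, None)
        if visitor is None:
            raise NotImplementedError("{0} not yet supported".format(method))
        return visitor(node)

    def visit_Module(self, node):
        return self.visit(node.body[0])

    def visit_Expr(self, node):
        return self.visit(node.value)

    def visit_Name(self, node):
        return self.env[node.id]

    def visit_Num(self, node):
        return node.n

    def visit_Attribute(self, node):
        # df.index -> getattr(<resolved df>, 'index')
        return getattr(self.visit(node.value), node.attr)

    def visit_Index(self, node):
        # the [4] of df.index[4]
        return self.visit(node.value)

    def visit_Subscript(self, node):
        return self.visit(node.value)[self.visit(node.slice)]

So MiniVisitor({'df': df}).visit(ast.parse('df.index[4]')) walks Module -> Expr -> Subscript -> (Attribute, Index) and returns df.index[4], which is what the Term/Value resolution in these patches does with the real Scope.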
From dfef6175140e7cfede80260c1aa5779733a57bda Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 7 Jul 2013 10:53:14 -0400 Subject: [PATCH 43/48] BUG: process visit_Index --- pandas/computation/expr.py | 6 ++++++ pandas/io/tests/test_pytables.py | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index b8acab7cf9edb..22210cb1fe9e7 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -136,13 +136,19 @@ def visit_Name(self, node, **kwargs): def visit_Num(self, node, **kwargs): return Constant(node.n, self.env) + def visit_Index(self, node, **kwargs): + """ df.index[4] """ + return self.visit(node.value).value + def visit_Subscript(self, node, **kwargs): + """ df.index[4:6] """ value = self.visit(node.value) slobj = self.visit(node.slice) return Value(value[slobj],self.env) def visit_Slice(self, node, **kwargs): + """ df.index[slice(4,6)] """ lower = node.lower if lower is not None: lower = self.visit(lower).value diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 4b5ca28e702a7..76ad477a43ccb 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -797,7 +797,6 @@ def check_col(key,name,size): def test_append_with_data_columns(self): - import pdb; pdb.set_trace() with ensure_clean(self.path) as store: df = tm.makeTimeDataFrame() df.loc[:,'B'].iloc[0] = 1. From b168fb3df7d6924a2d6b60b898a66cc1edadab5a Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 7 Jul 2013 16:30:12 -0400 Subject: [PATCH 44/48] ENH: use not_implemented function call in ExprVisitor ENH: support Load context in Attribute only --- pandas/computation/expr.py | 25 ++++++++++--------------- pandas/computation/pytables.py | 7 +++++-- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 22210cb1fe9e7..bb028d6eec1c2 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -78,15 +78,8 @@ def __init__(self, env): lambda node, unary_op=unary_op: partial(UnaryOp, unary_op)) self.env = env - def generic_visit(self, node, **kwargs): - """Called if no explicit visitor function exists for a node.""" - for field, value in ast.iter_fields(node): - if isinstance(value, list): - for item in value: - if isinstance(item, ast.AST): - self.visit(item, **kwargs) - elif isinstance(value, ast.AST): - self.visit(value, **kwargs) + def not_implemented(self, s): + raise NotImplementedError("{0} not yet supported".format(s)) def visit(self, node, **kwargs): if not (isinstance(node, ast.AST) or isinstance(node, basestring)): @@ -98,8 +91,7 @@ def visit(self, node, **kwargs): method = 'visit_' + node.__class__.__name__ visitor = getattr(self, method, None) if visitor is None: - visitor = self.generic_visit - print method + self.not_implemented("ast visitor [{0}]".format(method)) return visitor(node, **kwargs) def visit_Module(self, node, **kwargs): @@ -123,7 +115,7 @@ def visit_BinOp(self, node, **kwargs): def visit_UnaryOp(self, node, **kwargs): if isinstance(node.op, ast.Not): - raise NotImplementedError("not operator not yet supported") + self.not_implemented('not operator') op = self.visit(node.op) return op(self.visit(node.operand)) @@ -136,6 +128,9 @@ def visit_Name(self, node, **kwargs): def visit_Num(self, node, **kwargs): return Constant(node.n, self.env) + def visit_Str(self, node, **kwargs): + return Value(node.s, self.env) + def visit_Index(self, node, **kwargs): """ df.index[4] """ return
self.visit(node.value).value @@ -183,13 +178,13 @@ def visit_Call(self, node, **kwargs): if node.func.id not in valid_ops: raise ValueError("Only {0} are supported".format(valid_ops)) - raise NotImplementedError("function calls not yet supported") + self.not_implemented('function calls') def visit_Attribute(self, node, **kwargs): - raise NotImplementedError("attribute access is not yet supported") + self.not_implemented('attribute access') def visit_BoolOp(self, node, **kwargs): - raise NotImplementedError("boolean operators are not yet supported") + self.not_implemented('boolean operators') class Expr(StringMixin): diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 2b04c6fb9e12a..6be9c67f443cf 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -309,8 +309,11 @@ def visit_Attribute(self, node, **kwargs): attr = node.attr value = node.value - # resolve the value - return getattr(self.visit(value).value,attr) + ctx = node.ctx.__class__ + if ctx == ast.Load: + # resolve the value + return getattr(self.visit(value).value,attr) + raise ValueError("Invalid Attribute context {0}".format(ctx.__name__)) def visit_Call(self, node, **kwargs): if not isinstance(node.func, ast.Name): raise TypeError("Only named functions are supported") From 5fac7495bd123d60ec7fc323f73f482251ddf6f9 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 7 Jul 2013 17:29:00 -0400 Subject: [PATCH 45/48] BUG: fixed scoping issues by _ensure_term at the top-level --- pandas/computation/expr.py | 25 +++++++++++++++++++-- pandas/computation/ops.py | 3 +++ pandas/computation/pytables.py | 38 ++++++++++++++++++++------------ pandas/io/pytables.py | 32 +++++++++++++++++---------- pandas/io/tests/test_pytables.py | 14 ++++++------ 5 files changed, 77 insertions(+), 35 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index bb028d6eec1c2..b7a65c96a5d6a 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -1,5 +1,5 @@ import ast -import sys +import sys, inspect import itertools import tokenize import re @@ -25,6 +25,27 @@ def __init__(self, gbls=None, lcls=None, frame_level=1): finally: del frame + + def update(self, scope_level=None): + + # we are always 2 levels below the caller + # plus the caller may be below the env level + # in which case we need additional levels + sl = 2 + if scope_level is not None: + sl += scope_level + + # add current locals scope + frame = inspect.currentframe() + try: + while(sl>0): + frame = frame.f_back + sl -= 1 + self.locals.update(frame.f_locals) + finally: + del frame + + class ExprParserError(Exception): pass diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 926b9bf9bc509..3efe4bf743a0a 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -91,6 +91,9 @@ def __init__(self, value, env, name=None): self.value = value self.type = type(self.value) + def __unicode__(self): + return com.pprint_thing(self.value) + def _print_operand(opr): return opr.name if is_term(opr) else unicode(opr) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 6be9c67f443cf..853f5897e39f7 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py
@@ -134,6 +134,9 @@ def stringify(value): kind = _ensure_decoded(self.kind) if kind == u'datetime64' or kind == u'datetime': + + if isinstance(v, (int, float)): + raise ValueError("cannot index datelike with an integer/float value") v = lib.Timestamp(v) if v.tz is not None: v = v.tz_convert('UTC') @@ -340,7 +343,7 @@ class Expr(expr.Expr): Parameters ---------- - field : dict, string term expression, or the field to operate (must be a valid index/column type of DataFrame/Panel) + where : string term expression, Expr, or list-like of Exprs queryables : a kinds map (dict of column name -> kind), or None if column is non-indexable encoding : an encoding that will encode the query terms @@ -352,24 +355,31 @@ class Expr(expr.Expr): -------- """ - def __init__(self, expression, queryables=None, encoding=None, lcls=None): - if isinstance(expression, Expr): - expression = str(expression) - self.expr = expression + def __init__(self, where, queryables=None, encoding=None, scope_level=None): + self.encoding = encoding self.condition = None self.filter = None self.terms = None self._visitor = None - # add current locals scope - frame = inspect.currentframe() - try: - if lcls is None: - lcls = dict() - lcls.update(frame.f_back.f_locals) - self.env = Scope(lcls = lcls) - finally: - del frame + # capture the environment if needed + lcls = dict() + if isinstance(where, Expr): + + lcls.update(where.env.locals) + where = str(where) + + elif isinstance(where, (list, tuple)): + + for w in where: + if isinstance(w, Expr): + lcls.update(w.env.locals) + + where = ' & ' .join([ "(%s)" % w for w in where]) + + self.expr = where + self.env = Scope(lcls = lcls) + self.env.update(scope_level) if queryables is not None: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 14d67b9313ff3..2acb61baa0195 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -58,6 +58,21 @@ def _ensure_encoding(encoding): encoding = _default_encoding return encoding +Term = Expr + +def _ensure_term(where): + """ ensure that the where is a Term or a list of Terms + this makes sure that we are capturing the scope of variables + that are passed """ + + # create the terms here with a frame_level=2 (we are 2 levels down) + if isinstance(where, (list, tuple)): + where = [ w if isinstance(w, Term) else Term(w, scope_level=2) for w in where if w is not None ] + elif where is None or isinstance(where, Coordinates): + pass + elif not isinstance(where, Term): + where = Term(where, scope_level=2) + return where class IncompatibilityWarning(Warning): pass @@ -461,6 +476,7 @@ def select(self, key, where=None, start=None, stop=None, columns=None, raise KeyError('No object named %s in the file' % key) # create the storer and axes + where = _ensure_term(where) s = self._create_storer(group) s.infer_axes() @@ -492,6 +508,7 @@ def select_as_coordinates( start : integer (defaults to None), row number to start selection stop : integer (defaults to None), row number to stop selection """ + where = _ensure_term(where) return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs) def unique(self, key, column, **kwargs): @@ -537,6 +554,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, """ # default to single select + where = _ensure_term(where) if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, basestring): @@ -640,6 +658,7 @@ def remove(self, key, where=None, start=None, stop=None): raises KeyError if key is not a valid store """ + where =
_ensure_term(where) try: s = self.get_storer(key) except: @@ -3653,8 +3672,6 @@ def _need_convert(kind): return True return False -Term = Expr - class Coordinates(object): """ holds a returned coordinates list, useful to select the same rows from different tables @@ -3714,16 +3731,7 @@ def generate(self, where): if where is None: return None - lcls = dict() - if isinstance(where, (list, tuple)): - for w in where: - if isinstance(w, Term): - lcls.update(w.env.locals) - - where = ' & ' .join([ "(%s)" % w for w in where]) - - queryables = self.table.queryables() - return Expr(where, queryables=queryables, encoding=self.table.encoding, lcls=lcls) + return Expr(where, queryables=self.table.queryables(), encoding=self.table.encoding) def select(self): """ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 76ad477a43ccb..0a8cac5e05b44 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1551,19 +1551,20 @@ def test_terms(self): ["minor=['A', 'B']"], ["index=['20121114']"], ["index=['20121114', '20121114']"], + ['major=20121114'], # passing an integer as the value ] for t in terms: - self.assertRaises(Exception, store.select, 'wp', t) + self.assertRaises(ValueError, store.select, 'wp', t) - self.assertRaises(Exception, Term.__init__) - self.assertRaises(Exception, Term.__init__, 'blah') - self.assertRaises(Exception, Term.__init__, 'index') + self.assertRaises(TypeError, Term.__init__) + self.assertRaises(TypeError, Term.__init__, 'blah') + self.assertRaises(TypeError, Term.__init__, 'index') self.assertRaises(TypeError, Term.__init__, 'index', '==') self.assertRaises(TypeError, Term.__init__, 'index', '>', 5) # panel result = store.select('wp', [Term( - 'major_axis<20000108'), Term("minor_axis=['A', 'B']")]) + 'major_axis<"20000108"'), Term("minor_axis=['A', 'B']")]) expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) tm.assert_panel_equal(result, expected) @@ -2274,7 +2275,7 @@ def test_panel_select(self): tm.assert_panel_equal(result, expected) result = store.select( - 'wp', ['major_axis>=20000124', ("minor_axis=['A', 'B']")]) + 'wp', ['major_axis>="20000124"', ("minor_axis=['A', 'B']")]) expected = wp.truncate(before='20000124').reindex(minor=['A', 'B']) tm.assert_panel_equal(result, expected) @@ -2290,7 +2291,6 @@ def test_frame_select(self): crit2 = ("columns=['A', 'D']") crit3 = ('columns=A') - import pdb; pdb.set_trace() result = store.select('frame', [crit1, crit2]) expected = df.ix[date:, ['A', 'D']] tm.assert_frame_equal(result, expected) From c5a3c9fdb96e78d039a3d56870fd3c804ba54d45 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 7 Jul 2013 18:19:29 -0400 Subject: [PATCH 46/48] TST: fixed remaining tests BUG: fixed Attribute ast node in a Call expression BUG: condition with > max_selectors wasn't being handled (in filter) --- pandas/computation/expr.py | 7 +++- pandas/computation/pytables.py | 72 ++++++++++++++++++++++++-------- pandas/io/tests/test_pytables.py | 45 ++++++++------------ 3 files changed, 78 insertions(+), 46 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index b7a65c96a5d6a..5c19f3f1859f9 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -6,13 +6,15 @@ from cStringIO import StringIO from functools import partial - from pandas.core.base import StringMixin from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms from pandas.computation.ops 
import _arith_ops_syms, _unary_ops_syms from pandas.computation.ops import Term, Constant, Value +from pandas import Timestamp +import datetime + class Scope(object): __slots__ = 'globals', 'locals' @@ -25,6 +27,9 @@ def __init__(self, gbls=None, lcls=None, frame_level=1): finally: del frame + # add some useful defaults + self.globals['Timestamp'] = Timestamp + self.globals['datetime'] = datetime def update(self, scope_level=None): diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 853f5897e39f7..31853113d7e34 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -46,6 +46,8 @@ class BinOp(ops.BinOp): + _max_selectors = 31 + def __init__(self, op, lhs, rhs, queryables, encoding): super(BinOp, self).__init__(op, lhs, rhs) self.queryables = queryables @@ -136,7 +138,7 @@ def stringify(value): if kind == u'datetime64' or kind == u'datetime': if isinstance(v, (int, float)): - raise ValueError("cannot index datelike with an integer/float value") + v = stringify(v) v = lib.Timestamp(v) if v.tz is not None: v = v.tz_convert('UTC') @@ -181,12 +183,29 @@ def evaluate(self): if not self.is_valid: raise ValueError("query term is not valid [%s]" % self) - if self.is_in_table: - return None - rhs = self.conform(self.rhs) values = [TermValue(v, v, self.kind) for v in rhs] + if self.is_in_table: + + # if too many values to create the expression, use a filter instead + if self.op in ['==', '!='] and len(values) > self._max_selectors: + + # our filter op expression + if self.op == '!=': + filter_op = lambda axis, vals: not axis.isin(vals) + else: + filter_op = lambda axis, vals: axis.isin(vals) + + self.filter = ( + self.lhs, + filter_op, + Index([v.value for v in values])) + + return self + + return None + # equality conditions if self.op in ['==', '!=']: @@ -219,8 +238,6 @@ class ConditionBinOp(BinOp): - _max_selectors = 31 - def __unicode__(self): return com.pprint_thing("[Condition : [{0}]]".format(self.condition)) @@ -246,12 +263,6 @@ def evaluate(self): # equality conditions if self.op in ['==', '!=']: - # our filter op expression - if self.op == '!=': - filter_op = lambda axis, vals: not axis.isin(vals) - else: - filter_op = lambda axis, vals: axis.isin(vals) - # too many values to create the expression?
if len(values) <= self._max_selectors: vs = [self.generate(v) for v in values] @@ -259,6 +270,7 @@ def evaluate(self): # use a filter after reading else: + return None else: @@ -319,13 +331,33 @@ def visit_Attribute(self, node, **kwargs): raise ValueError("Invalid Attribute context {0}".format(ctx.__name__)) def visit_Call(self, node, **kwargs): - if not isinstance(node.func, ast.Name): + + # this can happen with: datetime.datetime + if isinstance(node.func, ast.Attribute): + res = self.visit_Attribute(node.func) + elif not isinstance(node.func, ast.Name): raise TypeError("Only named functions are supported") + else: + res = self.visit(node.func) - res = self.visit(node.func) if res is None: raise ValueError("Invalid function call {0}".format(node.func.id)) - return res + if hasattr(res,'value'): + res = res.value + + args = [self.visit(targ).value for targ in node.args] + if node.starargs is not None: + args = args + self.visit(node.starargs).value + + keywords = {} + for key in node.keywords: + if not isinstance(key, ast.keyword): + raise ValueError("keyword error in function call '{0}'".format(node.func.id)) + keywords[key.arg] = self.visit(key.value).value + if node.kwargs is not None: + keywords.update(self.visit(node.kwargs).value) + + return Value(res(*args,**keywords),self.env) def visit_Compare(self, node, **kwargs): ops = node.ops @@ -399,8 +431,14 @@ def __unicode__(self): def evaluate(self): """ create and return the numexpr condition and filter """ - self.condition = self.terms.prune(ConditionBinOp) - self.filter = self.terms.prune(FilterBinOp) + try: + self.condition = self.terms.prune(ConditionBinOp) + except AttributeError: + raise ValueError("cannot process node for the condition [{0}]".format(self)) + try: + self.filter = self.terms.prune(FilterBinOp) + except AttributeError: + raise ValueError("cannot process node for the filter [{0}]".format(self)) return self.condition, self.filter diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 0a8cac5e05b44..f02b247826653 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1508,14 +1508,14 @@ def test_remove_crit(self): store.put('wp2', wp, table=True) date1 = wp.major_axis[1:3] - crit1 = Term('major_axis', date1) + crit1 = Term('major_axis=date1') store.remove('wp2', where=[crit1]) result = store.select('wp2') expected = wp.reindex(major_axis=wp.major_axis - date1) tm.assert_panel_equal(result, expected) date2 = wp.major_axis[5] - crit2 = Term('major_axis', date2) + crit2 = Term('major_axis=date2') store.remove('wp2', where=[crit2]) result = store['wp2'] expected = wp.reindex( @@ -1523,7 +1523,7 @@ def test_remove_crit(self): tm.assert_panel_equal(result, expected) date3 = [wp.major_axis[7], wp.major_axis[9]] - crit3 = Term('major_axis', date3) + crit3 = Term('major_axis=date3') store.remove('wp2', where=[crit3]) result = store['wp2'] expected = wp.reindex( @@ -1533,7 +1533,7 @@ def test_remove_crit(self): # corners store.put('wp4', wp, table=True) n = store.remove( - 'wp4', where=[Term('major_axis', '>', wp.major_axis[-1])]) + 'wp4', where=[Term('major_axis>wp.major_axis[-1]')]) result = store.select('wp4') tm.assert_panel_equal(result, wp) @@ -1547,14 +1547,9 @@ def test_terms(self): store.put('p4d', p4d, table=True) # some invalid terms - terms = [ - ["minor=['A', 'B']"], - ["index=['20121114']"], - ["index=['20121114', '20121114']"], - ['major=20121114'], # passing an integer as the value - ] - for t in terms: - self.assertRaises(ValueError, 
store.select, 'wp', t) + self.assertRaises(NameError, store.select, 'wp', "minor=['A', 'B']") + self.assertRaises(NameError, store.select, 'wp', ["index=['20121114']"]) + self.assertRaises(NameError, store.select, 'wp', ["index=['20121114', '20121114']"]) self.assertRaises(TypeError, Term.__init__) self.assertRaises(TypeError, Term.__init__, 'blah') @@ -1569,7 +1564,7 @@ def test_terms(self): tm.assert_panel_equal(result, expected) # p4d - result = store.select('p4d', [Term('major_axis<20000108'), + result = store.select('p4d', [Term('major_axis<"20000108"'), Term("minor_axis=['A', 'B']"), Term("items=['ItemA', 'ItemB']")]) expected = p4d.truncate(after='20000108').reindex( @@ -1588,7 +1583,7 @@ def test_terms(self): ('major_axis=20121114'), ('major_axis>20121114'), (("major_axis=['20121114', '20121114']"),), - ('major_axis', datetime.datetime(2012, 11, 14)), + ('major_axis=datetime.datetime(2012, 11, 14)'), 'major_axis> 20121114', 'major_axis >20121114', 'major_axis > 20121114', @@ -2036,6 +2031,7 @@ def test_select_dtypes(self): df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), A=np.random.randn(300))) _maybe_remove(store, 'df') store.append('df', df, data_columns=['ts', 'A']) + result = store.select('df', [Term("ts>=Timestamp('2012-02-01')")]) expected = df[df.ts >= Timestamp('2012-02-01')] tm.assert_frame_equal(expected, result) @@ -2063,7 +2059,7 @@ def test_select_dtypes(self): _maybe_remove(store, 'df_int') store.append('df_int', df) result = store.select( - 'df_int', [Term("index<10"), Term("columns", "=", ["A"])]) + 'df_int', [Term("index<10"), Term("columns=['A']")]) expected = df.reindex(index=list(df.index)[0:10],columns=['A']) tm.assert_frame_equal(expected, result) @@ -2073,7 +2069,7 @@ def test_select_dtypes(self): _maybe_remove(store, 'df_float') store.append('df_float', df) result = store.select( - 'df_float', [Term("index<10.0"), Term("columns", "=", ["A"])]) + 'df_float', [Term("index<10.0"), Term("columns=['A']")]) expected = df.reindex(index=list(df.index)[0:10],columns=['A']) tm.assert_frame_equal(expected, result) @@ -2511,18 +2507,11 @@ def test_select_as_multiple(self): tm.assert_frame_equal(result, expected) # multiple (diff selector) - try: - result = store.select_as_multiple(['df1', 'df2'], where=[Term( - 'index', '>', df2.index[4])], selector='df2') - expected = concat([df1, df2], axis=1) - expected = expected[5:] - tm.assert_frame_equal(result, expected) - except (Exception), detail: - print ("error in select_as_multiple %s" % str(detail)) - print ("store: %s" % store) - print ("df1: %s" % df1) - print ("df2: %s" % df2) - + result = store.select_as_multiple(['df1', 'df2'], where=[Term( + 'index>df2.index[4]')], selector='df2') + expected = concat([df1, df2], axis=1) + expected = expected[5:] + tm.assert_frame_equal(result, expected) # test excpection for diff rows store.append('df3', tm.makeTimeDataFrame(nper=50)) From 71a23a8baa494979ab83b695eab6468e451ccbb4 Mon Sep 17 00:00:00 2001 From: jreback Date: Sun, 7 Jul 2013 20:58:52 -0400 Subject: [PATCH 47/48] BUG: py3 fixes; revise scoping rules to be more broad record variable states from the furthermost part of the stack to the most recent, overwriting if shadow variables occur --- pandas/computation/expr.py | 12 ++++++++---- pandas/computation/pytables.py | 17 ++++++++++------- pandas/io/tests/test_pytables.py | 8 +++----- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 5c19f3f1859f9..1db6a809773f8 100644 --- 
a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -40,17 +40,21 @@ def update(self, scope_level=None): if scope_level is not None: sl += scope_level - # add current locals scope + # add sl frames to the scope starting with the + # most distant and overwriting with more current + # makes sure that we can capture variable scope frame = inspect.currentframe() try: - while(sl>0): + frames = [] + while(sl>=0): frame = frame.f_back sl -= 1 - self.locals.update(frame.f_locals) + frames.append(frame) + for f in frames[::-1]: + self.locals.update(f.f_locals) finally: del frame - class ExprParserError(Exception): pass diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 31853113d7e34..521bf60284107 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -17,7 +17,7 @@ def _ensure_decoded(s): """ if we have bytes, decode them to unicde """ - if isinstance(s, np.bytes_): + if isinstance(s, (np.bytes_, bytes)): s = s.decode('UTF-8') return s @@ -139,6 +139,7 @@ def stringify(value): if isinstance(v, (int, float)): v = stringify(v) + v = _ensure_decoded(v) v = lib.Timestamp(v) if v.tz is not None: v = v.tz_convert('UTC') @@ -295,12 +296,7 @@ class ExprVisitor(expr.ExprVisitor): bin_ops = '>', '<', '>=', '<=', '==', '!=', '&', '|' - bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', 'BitOr') - bin_op_nodes_map = dict(zip(bin_ops, bin_op_nodes)) - - unary_ops = ['~'] - unary_op_nodes = 'Invert' - unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + unary_ops = ['-','~'] def __init__(self, env, **kwargs): for bin_op in self.bin_ops: @@ -369,6 +365,13 @@ def visit_Compare(self, node, **kwargs): def visit_Name(self, node, side=None, **kwargs): return Term(node.id, self.env, side=side) + def visit_UnaryOp(self, node, **kwargs): + if isinstance(node.op, ast.Not): + return UnaryOp(node.op,self.visit(node.operand)) + elif isinstance(node.op, ast.USub): + return Value(-self.visit(node.operand).value,self.env) + self.not_implemented("{0} unary operations".format(node.op)) class Expr(expr.Expr): """ hold a pytables like expression, comprised of possibly multiple 'terms' diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index f02b247826653..4e5b518f187d3 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1551,11 +1551,9 @@ def test_terms(self): self.assertRaises(NameError, store.select, 'wp', ["index=['20121114']"]) self.assertRaises(NameError, store.select, 'wp', ["index=['20121114', '20121114']"]) - self.assertRaises(TypeError, Term.__init__) - self.assertRaises(TypeError, Term.__init__, 'blah') - self.assertRaises(TypeError, Term.__init__, 'index') - self.assertRaises(TypeError, Term.__init__, 'index', '==') - self.assertRaises(TypeError, Term.__init__, 'index', '>', 5) + self.assertRaises(TypeError, Term) + self.assertRaises(TypeError, Term, 'index', '==') + self.assertRaises(TypeError, Term, 'index', '>', 5) From e71276230f13efea46db6b225024ebb0f3e57530 Mon Sep 17 00:00:00 2001 From: jreback Date: Mon, 8 Jul 2013 11:35:55 -0400 Subject: [PATCH 48/48] COMPAT: allow prior 0.12 query syntax for terms, e.g.
Term('index','>',5) (and show deprecation warning) --- pandas/computation/expr.py | 1 + pandas/computation/pytables.py | 23 ++++++++++++++++++----- pandas/io/tests/test_pytables.py | 17 ++++++++++++++--- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 1db6a809773f8..63a9776bb027c 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -54,6 +54,7 @@ def update(self, scope_level=None): self.locals.update(f.f_locals) finally: del frame + del frames class ExprParserError(Exception): pass diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 521bf60284107..e24445bbd71e9 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -390,7 +390,24 @@ class Expr(expr.Expr): -------- """ - def __init__(self, where, queryables=None, encoding=None, scope_level=None): + def __init__(self, where, op=None, value=None, queryables=None, encoding=None, scope_level=None): + + # try to be back compat + if op is not None: + if not isinstance(where, basestring): + raise TypeError("where must be passed as a string if op/value are passed") + if isinstance(op, Expr): + raise TypeError("invalid op passed, must be a string") + where = "{0}{1}".format(where,op) + if value is not None: + if isinstance(value, Expr): + raise TypeError("invalid value passed, must be a string") + where = "{0}{1}".format(where,value) + + import warnings + warnings.warn("passing multiple values to Expr is deprecated; " + "pass the where as a single string", DeprecationWarning) + self.encoding = encoding self.condition = None self.filter = None @@ -418,10 +435,6 @@ def __init__(self, where, queryables=None, encoding=None, scope_level=None): if queryables is not None: - # if using the old format, this will raise - if not isinstance(queryables, dict): - raise TypeError("Expr must be called with a single-string expression") - self.env.queryables.update(queryables) self._visitor = ExprVisitor(self.env, queryables=queryables, encoding=encoding) self.terms = self.parse() diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 4e5b518f187d3..1ebcae4457bef 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -20,7 +20,6 @@ from pandas import concat, Timestamp from pandas.util import py3compat - try: import tables except ImportError: @@ -1551,9 +1550,14 @@ def test_terms(self): self.assertRaises(NameError, store.select, 'wp', ["index=['20121114']"]) self.assertRaises(NameError, store.select, 'wp', ["index=['20121114', '20121114']"]) + # deprecations + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + Term('index','==') + + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + Term('index', '>', 5) + self.assertRaises(TypeError, Term) - self.assertRaises(TypeError, Term, 'index', '==') - self.assertRaises(TypeError, Term, 'index', '>', 5) # panel result = store.select('wp', [Term( 'major_axis<"20000108"'), Term("minor_axis=['A', 'B']")]) expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) tm.assert_panel_equal(result, expected) + # with deprecation + with tm.assert_produces_warning(expected_warning=DeprecationWarning): + result = store.select('wp', [Term( + 'major_axis','<',"20000108"), Term("minor_axis=['A', 'B']")]) + expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) + tm.assert_panel_equal(result, expected) + # p4d result = store.select('p4d', [Term('major_axis<"20000108"'),
Term("minor_axis=['A', 'B']"),