From 89a03bea1e3846e0af520d8760a6be7f2516bfa3 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 15 Jun 2013 21:34:56 -0400 Subject: [PATCH 01/37] ENH: add new computation module and toplevel eval function --- pandas/__init__.py | 1 + pandas/computation/__init__.py | 0 pandas/computation/api.py | 1 + pandas/computation/common.py | 11 + pandas/computation/engines.py | 290 ++++++++++ pandas/computation/eval.py | 75 +++ pandas/computation/expr.py | 135 +++++ pandas/{core => computation}/expressions.py | 72 +-- pandas/computation/ops.py | 188 +++++++ pandas/computation/tests/__init__.py | 0 pandas/computation/tests/test_eval.py | 552 +++++++++++++++++++ pandas/computation/tests/test_expressions.py | 157 ++++++ pandas/core/frame.py | 4 +- pandas/core/internals.py | 2 +- pandas/tests/test_expressions.py | 203 ------- setup.py | 3 +- vb_suite/binary_ops.py | 12 +- vb_suite/indexing.py | 4 +- 18 files changed, 1465 insertions(+), 245 deletions(-) create mode 100644 pandas/computation/__init__.py create mode 100644 pandas/computation/api.py create mode 100644 pandas/computation/common.py create mode 100644 pandas/computation/engines.py create mode 100644 pandas/computation/eval.py create mode 100644 pandas/computation/expr.py rename pandas/{core => computation}/expressions.py (75%) create mode 100644 pandas/computation/ops.py create mode 100644 pandas/computation/tests/__init__.py create mode 100644 pandas/computation/tests/test_eval.py create mode 100644 pandas/computation/tests/test_expressions.py delete mode 100644 pandas/tests/test_expressions.py diff --git a/pandas/__init__.py b/pandas/__init__.py index a0edb397c28c1..bec0877b13bb8 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -29,6 +29,7 @@ from pandas.stats.api import * from pandas.tseries.api import * from pandas.io.api import * +from pandas.computation.api import eval from pandas.util.testing import debug diff --git a/pandas/computation/__init__.py b/pandas/computation/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/computation/api.py b/pandas/computation/api.py new file mode 100644 index 0000000000000..86f72902a52c8 --- /dev/null +++ b/pandas/computation/api.py @@ -0,0 +1 @@ +from pandas.computation.eval import eval diff --git a/pandas/computation/common.py b/pandas/computation/common.py new file mode 100644 index 0000000000000..4061984dd5e08 --- /dev/null +++ b/pandas/computation/common.py @@ -0,0 +1,11 @@ +import collections +from pandas.core.common import is_string + + +def flatten(l): + for el in l: + if isinstance(el, collections.Iterable) and not is_string(el): + for s in flatten(el): + yield s + else: + yield el diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py new file mode 100644 index 0000000000000..0eb9875b85549 --- /dev/null +++ b/pandas/computation/engines.py @@ -0,0 +1,290 @@ +import abc +from functools import partial +from itertools import izip + +import numpy as np + +import pandas as pd +import pandas.core.common as com +from pandas.computation.ops import _resolve_name, _update_names +from pandas.computation.common import flatten + + +def _align_core_single_unary_op(term): + if isinstance(term, np.ndarray) and not com.is_series(term): + typ = np.asanyarray + else: + typ = type(term) + ret = typ, [term] + + if not hasattr(term, 'axes'): + ret += None, + else: + ret += _zip_axes_from_type(typ, term.axes), + return ret + + +def _zip_axes_from_type(typ, new_axes): + axes = {} + for ax_ind, ax_name in typ._AXIS_NAMES.iteritems(): + 
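+        # _AXIS_NAMES maps axis number to axis name (e.g. 0 -> 'index',
+        # 1 -> 'columns' on a DataFrame), so each joined axis is keyed by
+        # name and can be passed as keyword arguments when the result is
+        # rebuilt in _reconstruct_object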
axes[ax_name] = new_axes[ax_ind] + return axes + + +def _maybe_promote_shape(values, naxes): + # test to see if we have an array else leave since must be a number + if not isinstance(values, np.ndarray): + return values + + ndims = values.ndim + if ndims > naxes: + raise AssertionError('cannot have more dims than axes, ' + '{0} > {1}'.format(ndims, naxes)) + if ndims == naxes: + return values + + ndim = set(xrange(ndims)) + nax = set(xrange(naxes)) + + axes_slice = [slice(None)] * naxes + + # symmetric difference + slices = nax - ndim + + if ndims == naxes: + if slices: + raise AssertionError('slices should be empty if ndims == naxes ' + '{0}'.format(slices)) + else: + if not slices: + raise AssertionError('slices should NOT be empty if ndim != naxes ' + '{0}'.format(slices)) + + for sl in slices: + axes_slice[sl] = np.newaxis + + return values[tuple(axes_slice)] + + +def _align_core(terms): + # need to ensure that terms is not an iterator + terms = list(terms) + + ## special cases + + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) + # only scalars + elif all(np.isscalar(term) for term in terms): + return np.result_type(*terms), terms, None + + # single dim ndarrays + all_has_size = all(hasattr(term, 'size') for term in terms) + if (all_has_size and all(term.size == 1 for term in terms)): + return np.result_type(*terms), terms, None + + # made it past the special cases + term_index = [i for i, term in enumerate(terms) if hasattr(term, 'axes')] + term_dims = [terms[i].ndim for i in term_index] + ndims = pd.Series(dict(zip(term_index, term_dims))) + + # initial axes are the axes of the largest-axis'd term + biggest = terms[ndims.idxmax()] + typ = biggest._constructor + axes = biggest.axes + naxes = len(axes) + + for i in term_index: + for axis, items in enumerate(terms[i].axes): + if com.is_series(terms[i]) and naxes > 1: + axes[naxes - 1] = axes[naxes - 1].join(terms[i].index, + how='outer') + else: + axes[axis] = axes[axis].join(items, how='outer') + + for i, ndim in ndims.iteritems(): + for axis, items in izip(xrange(ndim), axes): + ti = terms[i] # needed here because we modify it in the inner loop + + if hasattr(ti, 'reindex_axis'): + transpose = com.is_series(ti) and naxes > 1 + + if transpose: + f = partial(ti.reindex, index=axes[naxes - 1], copy=False) + else: + f = partial(ti.reindex_axis, items, axis=axis, copy=False) + + if pd.lib.is_bool_array(ti.values): + r = f(fill_value=True) + else: + r = f() + + terms[i] = r + + res = _maybe_promote_shape(terms[i].T if transpose else terms[i], + naxes) + res = res.T if transpose else res + + try: + terms[i] = res.values + except AttributeError: + terms[i] = res + + return typ, terms, _zip_axes_from_type(typ, axes) + + +def _filter_terms(flat): + # numeric literals + literals = filter(lambda string: not com.is_string(string), flat) + literals_set = set(literals) + + # these are strings which are variable names + names = filter(com.is_string, flat) + names_set = set(names) + + # literals are not names and names are not literals, by definition + if literals_set & names_set: + raise AssertionError('literals cannot be names and names cannot be ' + 'literals') + return names, literals + + +def _align(terms, env): + # flatten the parse tree (a nested list) + flat = list(flatten(terms)) + + names, literals = _filter_terms(flat) + + # given an expression consisting of literals + if not names: + return np.result_type(*literals).type, None + + # get the variables out + resolve_in_env = partial(_resolve_name, 
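+                             # bind the Scope as the first argument of
+                             # _resolve_name: names resolve against locals
+                             # first, then globals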
env) + resolved = map(resolve_in_env, names) + + # if all resolved variables are numeric scalars + if all(map(np.isscalar, resolved)): + return np.result_type(*resolved).type, None + + # perform the main alignment + typ, resolved, axes = _align_core(resolved) + + # put them back in the symbol table + _update_names(env, dict(izip(names, resolved))) + + # we need this to reconstruct things after evaluation since we CANNOT + # depend on the array interface + return typ, axes + + +def _reconstruct_object(typ, obj, axes): + """Reconstruct an object given its type, raw value, and possibly empty + (None) axes. + + Parameters + ---------- + typ : object + A type + obj : object + The value to use in the type constructor + axes : dict + The axes to use to construct the resulting pandas object + + Returns + ------- + reconst : typ + An object of type ``typ`` with the value `obj` and possible axes + `axes`. + """ + try: + # handle numpy dtypes + typ = typ.type + except AttributeError: + pass + + if typ != np.asanyarray and issubclass(typ, pd.core.generic.PandasObject): + return typ(obj, **axes) + + ret_value = typ(obj) + + try: + return ret_value.item() + except (AttributeError, ValueError): + return ret_value + + +class AbstractEngine(object): + """""" + __metaclass__ = abc.ABCMeta + + has_neg_frac = False + + def __init__(self, expr): + self.expr = expr + self.aligned_axes = None + self.result_type = None + + @abc.abstractmethod + def convert(self): + """Convert an expression for evaluation.""" + pass + + def evaluate(self, env): + if not self._is_aligned: + self.result_type, self.aligned_axes = _align(self.expr.terms, env) + + res = self._evaluate(env) + return _reconstruct_object(self.result_type, res, self.aligned_axes) + + @property + def _is_aligned(self): + return self.aligned_axes is not None and self.result_type is not None + + @abc.abstractmethod + def _evaluate(self, env): + """Return an evaluated expression.""" + pass + + +class NumExprEngine(AbstractEngine): + """NumExpr engine class""" + has_neg_frac = True + + def __init__(self, expr): + super(NumExprEngine, self).__init__(expr) + + def convert(self): + """Return a string""" + return str(self.expr) + + def _evaluate(self, env): + import numexpr as ne + + try: + return ne.evaluate(self.convert(), local_dict=env.locals, + global_dict=env.globals, + truediv=self.expr.truediv) + except KeyError as e: + raise NameError('{0!r} is not defined'.format(e.message)) + + +class PythonEngine(AbstractEngine): + """Use NumPy even if numexpr is installed""" + has_neg_frac = False + + def __init__(self, expr): + super(PythonEngine, self).__init__(expr) + + def convert(self): + pass + + def evaluate(self, env): + return self.expr(env) + + def _evaluate(self, env): + pass + + +_engines = {'numexpr': NumExprEngine, 'python': PythonEngine} diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py new file mode 100644 index 0000000000000..21348f221bc99 --- /dev/null +++ b/pandas/computation/eval.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python + +import sys +import numbers +import collections +import itertools + +import numpy as np + +Scope = collections.namedtuple('Scope', 'globals locals') + +import pandas.core.common as com +from pandas.computation.expr import Expr +from pandas.computation.engines import _engines + + +def _scope_has_series_and_frame_datetime_index(env): + from pandas import DatetimeIndex + series_index = frame_index = 0 + + for v in itertools.chain(env.locals.itervalues(), + env.globals.itervalues()): + series_index += 
com.is_series(v) and isinstance(v.index, DatetimeIndex)
+        frame_index += com.is_frame(v) and isinstance(v.index, DatetimeIndex)
+    return series_index, frame_index
+
+
+def _maybe_convert_engine(env, engine):
+    assert isinstance(env, Scope), 'environment must be an instance of Scope'
+    assert isinstance(engine, basestring), 'engine name must be a string'
+
+    ret = engine
+
+    if all(_scope_has_series_and_frame_datetime_index(env)):
+        ret = 'python'
+    return ret
+
+
+def eval(expr, engine='numexpr', truediv=True, local_dict=None,
+         global_dict=None):
+    # make sure we're passed a valid engine
+    if engine not in _engines:
+        raise KeyError('Invalid engine {0} passed, valid engines are'
+                       ' {1}'.format(engine, _engines.keys()))
+
+    # 1 up in the call stack for locals/globals; see the documentation for the
+    # inspect module for why you must decrease the refcount of frame
+    frame = sys._getframe(1)
+
+    try:
+        # get the globals and locals
+        gbl, lcl = global_dict or frame.f_globals, local_dict or frame.f_locals
+
+        # shallow copy the scope so we don't overwrite everything
+        env = Scope(gbl.copy(), lcl.copy())
+
+        engine = _maybe_convert_engine(env, engine)
+
+        # parse the expression
+        parsed_expr = Expr(expr, engine, truediv)
+
+        # choose the engine
+        eng = _engines[engine]
+
+        # construct the engine and evaluate
+        ret = eng(parsed_expr).evaluate(env)
+    finally:
+        del frame
+
+    # sanity check for a number
+    if np.isscalar(ret):
+        if not isinstance(ret, (np.number, numbers.Number, np.bool_, bool)):
+            raise TypeError('scalar result must be numeric or bool, type is '
+                            '{0!r}'.format(ret.__class__.__name__))
+    return ret
diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py
new file mode 100644
index 0000000000000..105c0a020a2ad
--- /dev/null
+++ b/pandas/computation/expr.py
@@ -0,0 +1,135 @@
+import ast
+from functools import partial
+
+from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops
+from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms
+from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms
+
+
+class ExprParserError(Exception):
+    pass
+
+
+class ExprVisitor(ast.NodeVisitor):
+    """Custom ast walker
+    """
+    bin_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms
+    bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', 'BitOr',
+                    'Add', 'Sub', 'Mult', 'Div', 'Pow', 'FloorDiv')
+    bin_op_nodes_map = dict(zip(bin_ops, bin_op_nodes))
+
+    unary_ops = _unary_ops_syms
+    unary_op_nodes = 'UAdd', 'USub', 'Invert'
+    unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))
+
+    def __init__(self):
+        for bin_op in self.bin_ops:
+            setattr(self, 'visit_{0}'.format(self.bin_op_nodes_map[bin_op]),
+                    lambda node, bin_op=bin_op: partial(BinOp, bin_op))
+
+        for unary_op in self.unary_ops:
+            setattr(self,
+                    'visit_{0}'.format(self.unary_op_nodes_map[unary_op]),
+                    lambda node, unary_op=unary_op: partial(UnaryOp, unary_op))
+
+    def visit(self, node):
+        if not (isinstance(node, ast.AST) or isinstance(node, basestring)):
+            raise AssertionError('"node" must be an AST node or a string, you'
+                                 ' passed a(n) {0}'.format(node.__class__))
+        if isinstance(node, basestring):
+            node = ast.fix_missing_locations(ast.parse(node))
+        return super(ExprVisitor, self).visit(node)
+
+    def visit_Module(self, node):
+        if len(node.body) != 1:
+            raise ExprParserError('only a single expression is allowed')
+
+        expr = node.body[0]
+        if not isinstance(expr, ast.Expr):
+            raise SyntaxError('only expressions are allowed')
+
+        return self.visit(expr)
+
+    def visit_Expr(self, node):
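+        # an ast.Expr node is just a statement wrapper around the actual
+        # expression; unwrap it and dispatch on the inner node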
+ return self.visit(node.value) + + def visit_BinOp(self, node): + op = self.visit(node.op) + left = self.visit(node.left) + right = self.visit(node.right) + return op(left, right) + + def visit_UnaryOp(self, node): + op = self.visit(node.op) + return op(self.visit(node.operand)) + + def visit_Name(self, node): + return node.id + + def visit_Num(self, node): + return node.n + + def visit_Compare(self, node): + ops = node.ops + comps = node.comparators + if len(ops) != 1: + raise ExprParserError('chained comparisons not supported') + return self.visit(ops[0])(self.visit(node.left), self.visit(comps[0])) + + def visit_Call(self, node): + if not isinstance(node.func, ast.Name): + raise TypeError("Only named functions are supported") + + valid_ops = _reductions + _mathops + + if node.func.id not in valid_ops: + raise ValueError("Only {0} are supported".format(valid_ops)) + + raise NotImplementedError("function calls not yet supported") + + def visit_Attribute(self, node): + raise NotImplementedError("attribute access is not yet supported") + + def visit_Mod(self, node): + raise NotImplementedError("modulo operator not yet supported") + + +class Expr(object): + """Expr object for pandas + """ + def __init__(self, expr, engine, truediv): + self.expr = expr + self._visitor = ExprVisitor() + self.terms = self.parse() + self.engine = engine + self.truediv = truediv + + def __call__(self, env): + env.locals['truediv'] = self.truediv + return self.terms(env) + + def __repr__(self): + return '{0} -> {1}'.format(self.expr, self.terms) + + def __str__(self): + return self.expr + + def parse(self): + """return a Termset""" + try: + visited = self._visitor.visit(self.expr) + except SyntaxError as e: + raise e + return visited + + def align(self, env): + """align a set of Terms""" + return self.terms.align(env) + + +def isexpr(s): + try: + Expr(s, engine=None) + except SyntaxError: + return False + return True diff --git a/pandas/core/expressions.py b/pandas/computation/expressions.py similarity index 75% rename from pandas/core/expressions.py rename to pandas/computation/expressions.py index abe891b82410c..e1551f9b0548e 100644 --- a/pandas/core/expressions.py +++ b/pandas/computation/expressions.py @@ -5,6 +5,7 @@ Offer fast expression evaluation thru numexpr """ + import numpy as np try: @@ -14,17 +15,19 @@ _NUMEXPR_INSTALLED = False _USE_NUMEXPR = _NUMEXPR_INSTALLED -_evaluate = None -_where = None +_evaluate = None +_where = None # the set of dtypes that we will allow pass to numexpr -_ALLOWED_DTYPES = dict(evaluate = set(['int64','int32','float64','float32','bool']), - where = set(['int64','float64','bool'])) +_ALLOWED_DTYPES = dict( + evaluate=set(['int64', 'int32', 'float64', 'float32', 'bool']), + where=set(['int64', 'float64', 'bool'])) # the minimum prod shape that we will use numexpr -_MIN_ELEMENTS = 10000 +_MIN_ELEMENTS = 10000 + -def set_use_numexpr(v = True): +def set_use_numexpr(v=True): # set/unset to use numexpr global _USE_NUMEXPR if _NUMEXPR_INSTALLED: @@ -34,12 +37,13 @@ def set_use_numexpr(v = True): global _evaluate, _where if not _USE_NUMEXPR: _evaluate = _evaluate_standard - _where = _where_standard + _where = _where_standard else: _evaluate = _evaluate_numexpr - _where = _where_numexpr + _where = _where_numexpr + -def set_numexpr_threads(n = None): +def set_numexpr_threads(n=None): # if we are using numexpr, set the threads to n # otherwise reset try: @@ -53,24 +57,25 @@ def set_numexpr_threads(n = None): def _evaluate_standard(op, op_str, a, b, raise_on_error=True, **eval_kwargs): 
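+    # fallback path: evaluate with the plain Python operator when numexpr
+    # is disabled, not installed, or not worth the call overhead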
""" standard evaluation """ - return op(a,b) + return op(a, b) + def _can_use_numexpr(op, op_str, a, b, dtype_check): """ return a boolean if we WILL be using numexpr """ if op_str is not None: - + # required min elements (otherwise we are adding overhead) if np.prod(a.shape) > _MIN_ELEMENTS: # check for dtype compatiblity dtypes = set() - for o in [ a, b ]: - if hasattr(o,'get_dtype_counts'): + for o in [a, b]: + if hasattr(o, 'get_dtype_counts'): s = o.get_dtype_counts() if len(s) > 1: return False dtypes |= set(s.index) - elif isinstance(o,np.ndarray): + elif isinstance(o, np.ndarray): dtypes |= set([o.dtype.name]) # allowed are a superset @@ -85,9 +90,9 @@ def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False, **eval_kwargs): if _can_use_numexpr(op, op_str, a, b, 'evaluate'): try: a_value, b_value = a, b - if hasattr(a_value,'values'): + if hasattr(a_value, 'values'): a_value = a_value.values - if hasattr(b_value,'values'): + if hasattr(b_value, 'values'): b_value = b_value.values result = ne.evaluate('a_value %s b_value' % op_str, local_dict={ 'a_value' : a_value, @@ -98,33 +103,35 @@ def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False, **eval_kwargs): pass except (Exception), detail: if raise_on_error: - raise TypeError(str(detail)) + raise if result is None: - result = _evaluate_standard(op,op_str,a,b,raise_on_error) + result = _evaluate_standard(op, op_str, a, b, raise_on_error) return result -def _where_standard(cond, a, b, raise_on_error=True): + +def _where_standard(cond, a, b, raise_on_error=True): return np.where(cond, a, b) -def _where_numexpr(cond, a, b, raise_on_error = False): + +def _where_numexpr(cond, a, b, raise_on_error=False): result = None if _can_use_numexpr(None, 'where', a, b, 'where'): try: cond_value, a_value, b_value = cond, a, b - if hasattr(cond_value,'values'): + if hasattr(cond_value, 'values'): cond_value = cond_value.values - if hasattr(a_value,'values'): + if hasattr(a_value, 'values'): a_value = a_value.values - if hasattr(b_value,'values'): + if hasattr(b_value, 'values'): b_value = b_value.values result = ne.evaluate('where(cond_value,a_value,b_value)', - local_dict={ 'cond_value' : cond_value, - 'a_value' : a_value, - 'b_value' : b_value }, + local_dict={'cond_value': cond_value, + 'a_value': a_value, + 'b_value': b_value}, casting='safe') except (ValueError), detail: if 'unknown type object' in str(detail): @@ -134,7 +141,7 @@ def _where_numexpr(cond, a, b, raise_on_error = False): raise TypeError(str(detail)) if result is None: - result = _where_standard(cond,a,b,raise_on_error) + result = _where_standard(cond, a, b, raise_on_error) return result @@ -152,8 +159,9 @@ def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kw op_str: the string version of the op a : left operand b : right operand - raise_on_error : pass the error to the higher level if indicated (default is False), - otherwise evaluate the op with and return the results + raise_on_error : pass the error to the higher level if indicated + (default is False), otherwise evaluate the op with and + return the results use_numexpr : whether to try to use numexpr (default True) """ @@ -161,6 +169,7 @@ def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kw return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, **eval_kwargs) return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error) + def where(cond, a, b, raise_on_error=False, use_numexpr=True): """ evaluate the where condition cond on a and b @@ 
-170,8 +179,9 @@ def where(cond, a, b, raise_on_error=False, use_numexpr=True): cond : a boolean array a : return if cond is True b : return if cond is False - raise_on_error : pass the error to the higher level if indicated (default is False), - otherwise evaluate the op with and return the results + raise_on_error : pass the error to the higher level if indicated + (default is False), otherwise evaluate the op with and + return the results use_numexpr : whether to try to use numexpr (default True) """ diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py new file mode 100644 index 0000000000000..fb1965f45c52b --- /dev/null +++ b/pandas/computation/ops.py @@ -0,0 +1,188 @@ +import operator as op +from functools import partial + +from pandas.util.py3compat import PY3 + + +_reductions = 'sum', 'prod' +_mathops = 'sin', 'cos', 'tan' + + +class OperatorError(Exception): + pass + + +class UnaryOperatorError(OperatorError): + pass + + +class BinaryOperatorError(OperatorError): + pass + + +def _resolve_name(env, key): + res = env.locals.get(key, env.globals.get(key)) + + if res is None: + if not isinstance(key, basestring): + return key + + raise NameError('{0!r} is undefined'.format(key)) + + return res + + +def _update_name(env, key, value): + if isinstance(key, basestring): + try: + del env.locals[key] + env.locals[key] = value + except KeyError: + try: + del env.globals[key] + env.globals[key] = value + except KeyError: + raise NameError('{0!r} is undefined'.format(key)) + + +def _update_names(env, mapping): + updater = partial(_update_name, env) + for key, value in mapping.iteritems(): + updater(key, value) + + +class Op(object): + """Hold an operator of unknown arity + """ + def __init__(self, op, operands): + self.op = op + self.operands = operands + + def __iter__(self): + return iter(self.operands) + + @property + def name(self): + return self.__class__.__name__ + + +_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=' +_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne +_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) + +_bool_ops_syms = '&', '|' +_bool_ops_funcs = op.and_, op.or_ +_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) + +_arith_ops_syms = '+', '-', '*', '/', '**', '//' +_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv if PY3 else op.div, + op.pow, op.floordiv) +_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) + +_binary_ops_dict = {} + +for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict): + _binary_ops_dict.update(d) + + +class BinOp(Op): + """Hold a binary operator and its operands + + Parameters + ---------- + op : str or Op + left : str or Op + right : str or Op + """ + def __init__(self, op, lhs, rhs): + super(BinOp, self).__init__(op, (lhs, rhs)) + self.lhs = lhs + self.rhs = rhs + + try: + self.func = _binary_ops_dict[op] + except KeyError: + keys = _binary_ops_dict.keys() + raise BinaryOperatorError('Invalid binary operator {0}, valid' + ' operators are {1}'.format(op, keys)) + + def __repr__(self): + return '{0}(op={1!r}, lhs={2!r}, rhs={3!r})'.format(self.name, self.op, + self.lhs, self.rhs) + + __str__ = __repr__ + + def __call__(self, env): + # handle truediv + if self.op == '/' and env.locals['truediv']: + self.func = op.truediv + + # recurse over the left nodes + try: + left = self.lhs(env) + except TypeError: + left = self.lhs + + # recursve over the right nodes + try: + right = self.rhs(env) + except TypeError: + right = self.rhs + + # base cases + if not (isinstance(left, basestring) or 
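+                # neither operand is an unresolved name, so both sides
+                # are already concrete values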
isinstance(right, basestring)): + res = self.func(left, right) + elif isinstance(left, basestring) and not isinstance(right, + basestring): + res = self.func(_resolve_name(env, left), right) + elif not isinstance(left, basestring) and isinstance(right, + basestring): + res = self.func(left, _resolve_name(env, right)) + elif isinstance(left, basestring) and isinstance(right, basestring): + res = self.func(_resolve_name(env, left), _resolve_name(env, + right)) + + return res + + +_unary_ops_syms = '+', '-', '~' +_unary_ops_funcs = op.pos, op.neg, op.invert +_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) + + +class UnaryOp(Op): + """Hold a unary operator and its operands + """ + def __init__(self, op, operand): + super(UnaryOp, self).__init__(op, (operand,)) + self.operand = operand + + try: + self.func = _unary_ops_dict[op] + except KeyError: + raise UnaryOperatorError('Invalid unary operator {0}, valid ' + 'operators are ' + '{1}'.format(op, _unary_ops_syms)) + + def __call__(self, env): + operand = self.operand + try: + operand = self.operand(env) + except TypeError: + operand = self.operand + + if isinstance(operand, basestring): + v = _resolve_name(env, operand) + else: + v = operand + + try: + res = self.func(v) + except TypeError: + res = self.func(v.values) + + return res + + def __repr__(self): + return '{0}(op={1!r}, operand={2!r})'.format(self.name, self.op, + self.operand) diff --git a/pandas/computation/tests/__init__.py b/pandas/computation/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py new file mode 100644 index 0000000000000..2d7bf4392cfea --- /dev/null +++ b/pandas/computation/tests/test_eval.py @@ -0,0 +1,552 @@ +#!/usr/bin/env python + +import itertools +from itertools import product + +import nose +from nose.tools import assert_raises, assert_tuple_equal, assert_equal +from nose.tools import assert_true + +from numpy.random import randn +import numpy as np +from numpy.testing import assert_array_equal +from numpy.testing.decorators import slow + +import pandas as pd +from pandas import DataFrame, Series +from pandas.util.testing import makeCustomDataframe as mkdf +from pandas.computation.engines import (_engines, _align_core, + _reconstruct_object) +from pandas.computation.ops import _binary_ops_dict, _unary_ops_dict +import pandas.computation.expr as expr +from pandas.computation.expressions import _USE_NUMEXPR +from pandas.computation.eval import Scope +from pandas.computation.eval import _scope_has_series_and_frame_datetime_index +from pandas.computation.eval import _maybe_convert_engine +from pandas.util.testing import assert_frame_equal, randbool + + +def skip_numexpr_engine(engine): + if not _USE_NUMEXPR and engine == 'numexpr': + raise nose.SkipTest + + +def engine_has_neg_frac(engine): + return _engines[engine].has_neg_frac + + +def fractional(x): + frac, _ = np.modf(np.asanyarray(x)) + return frac + + +def hasfractional(x): + return np.any(fractional(x) != 0.0) + + +def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2): + f1 = _binary_ops_dict[cmp1] + f2 = _binary_ops_dict[cmp2] + bf = _binary_ops_dict[binop] + typ, (lhs, rhs), axes = _align_core((lhs, rhs)) + return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes) + + +def _eval_single_bin(lhs, cmp1, rhs, has_neg_frac): + c = _binary_ops_dict[cmp1] + if has_neg_frac: + try: + result = c(lhs, rhs) + except ValueError: + result = np.nan + else: + result = c(lhs, rhs) + 
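+    # when the engine mimics numexpr (has_neg_frac), a ValueError from
+    # Python's ** (negative base, fractional exponent) is mapped to NaN
+    # above so the expected value matches the engine's output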
return result + + +def isframe(x): + return isinstance(x, pd.DataFrame) + + +def isseries(x): + return isinstance(x, pd.Series) + + +def are_compatible_types(op, lhs, rhs): + if op in ('&', '|'): + if isframe(lhs) and isseries(rhs) or isframe(rhs) and isseries(lhs): + return False + return True + + +def _eval_bin_and_unary(unary, lhs, arith1, rhs): + binop = _binary_ops_dict[arith1] + unop = expr._unary_ops_dict[unary] + return unop(binop(lhs, rhs)) + + +# Smoke testing +class TestBasicEval(object): + + @classmethod + def setUpClass(self): + self.cmp_ops = expr._cmp_ops_syms + self.cmp2_ops = self.cmp_ops[::-1] + self.bin_ops = expr._bool_ops_syms + self.arith_ops = tuple(o for o in expr._arith_ops_syms if o != '//') + self.unary_ops = '+', '-' + + def set_current_engine(self): + self.engine = 'numexpr' + + def setup_data(self): + self.lhses = (DataFrame(randn(10, 5)), Series(randn(5)), randn(), + np.float64(randn())) + self.rhses = (DataFrame(randn(10, 5)), Series(randn(5)), randn(), + np.float64(randn())) + + def setUp(self): + try: + import numexpr as ne + self.ne = ne + except ImportError: + raise nose.SkipTest + self.set_current_engine() + self.setup_data() + self.current_engines = filter(lambda x: x != self.engine, + _engines.iterkeys()) + + @slow + def test_complex_cmp_ops(self): + self.setUp() + lhses, rhses = self.lhses, self.rhses + args = itertools.product(lhses, self.cmp_ops, rhses, self.bin_ops, + self.cmp2_ops) + for lhs, cmp1, rhs, binop, cmp2 in args: + self._create_cmp_op_t(lhs, cmp1, rhs, binop, cmp2) + + def test_simple_cmp_ops(self): + bool_lhses = (DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), randbool()) + bool_rhses = (DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), randbool()) + args = itertools.product(bool_lhses, bool_rhses, self.cmp_ops) + for lhs, rhs, cmp_op in args: + self._create_simple_cmp_op_t(lhs, rhs, cmp_op) + + def test_binary_arith_ops(self): + self.setUp() + lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + args = itertools.product(lhses, self.arith_ops, rhses) + for lhs, op, rhs in args: + self._create_arith_op_t(lhs, op, rhs) + + def test_unary_arith_ops(self): + self.setUp() + lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + aops = tuple(aop for aop in self.arith_ops if aop not in '+-') + args = itertools.product(self.unary_ops, lhses, aops, rhses) + for unary_op, lhs, arith_op, rhs in args: + self._create_unary_arith_op_t(unary_op, lhs, arith_op, rhs) + + def test_invert(self): + self.setUp() + lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + args = itertools.product(lhses, self.cmp_ops, rhses) + for lhs, op, rhs in args: + self._create_invert_op_t(lhs, op, rhs) + + def _create_cmp_op_t(self, lhs, cmp1, rhs, binop, cmp2): + ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, + binop=binop, + cmp2=cmp2) + expected = _eval_from_expr(lhs, cmp1, rhs, binop, cmp2) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + + def _create_simple_cmp_op_t(self, lhs, rhs, cmp1): + ex = 'lhs {0} rhs'.format(cmp1) + + if are_compatible_types(cmp1, lhs, rhs): + expected = _eval_single_bin(lhs, cmp1, rhs, + engine_has_neg_frac(self.engine)) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + else: + assert_raises(TypeError, _eval_single_bin, lhs, 
cmp1, rhs, + engine_has_neg_frac(self.engine)) + + def _create_arith_op_t(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + nan_frac_neg = (arith1 == '**' and np.any(lhs < 0) and + hasfractional(rhs) and np.isscalar(lhs) and + np.isscalar(rhs) and + not (isinstance(lhs, tuple(np.typeDict.values())) + or isinstance(rhs, tuple(np.typeDict.values())))) + if nan_frac_neg and not engine_has_neg_frac(self.engine): + assert_raises(ValueError, pd.eval, ex, engine=self.engine, + local_dict=locals(), global_dict=globals()) + else: + result = pd.eval(ex, engine=self.engine) + + if arith1 != '//': + expected = _eval_single_bin(lhs, arith1, rhs, + engine_has_neg_frac(self.engine)) + assert_array_equal(result, expected) + + # sanity check on recursive parsing + try: + ghs = rhs.copy() + except AttributeError: + ghs = rhs + + if nan_frac_neg and not engine_has_neg_frac(self.engine): + assert_raises(ValueError, pd.eval, ex, engine=self.engine, + local_dict=locals(), global_dict=globals()) + else: + if arith1 == '**': + ex = '(lhs {0} rhs) {0} ghs'.format(arith1) + else: + ex = 'lhs {0} rhs {0} ghs'.format(arith1) + result = pd.eval(ex, engine=self.engine) + + try: + nlhs = _eval_single_bin(lhs, arith1, rhs, + engine_has_neg_frac(self.engine)) + except ValueError: + assert_raises(ValueError, _eval_single_bin, lhs, arith1, rhs, + engine_has_neg_frac(self.engine)) + else: + try: + nlhs, ghs = nlhs.align(ghs) + except: + pass + if arith1 != '//': + expected = self.ne.evaluate('nlhs {0} ghs'.format(arith1)) + assert_array_equal(result, expected) + + def _create_invert_op_t(self, lhs, cmp1, rhs): + # simple + for el in (lhs, rhs): + try: + elb = el.astype(bool) + except AttributeError: + elb = np.array([bool(el)]) + expected = ~elb + result = pd.eval('~elb', engine=self.engine) + assert_array_equal(expected, result) + + for engine in self.current_engines: + assert_array_equal(result, pd.eval('~elb', engine=engine)) + + # compound + ex = '~(lhs {0} rhs)'.format(cmp1) + if np.isscalar(lhs) and np.isscalar(rhs): + lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) + expected = ~_eval_single_bin(lhs, cmp1, rhs, + engine_has_neg_frac(self.engine)) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(expected, result) + + # make sure the other engines work + for engine in self.current_engines: + ev = pd.eval(ex, engine=self.engine) + assert_array_equal(ev, result) + + def _create_unary_arith_op_t(self, unary_op, lhs, arith1, rhs): + # simple + ex = '{0}lhs'.format(unary_op, arith1) + f = _unary_ops_dict[unary_op] + bad_types = tuple(np.typeDict.values()) + + nan_frac_neg = (arith1 == '**' and + np.any(lhs < 0) and + hasfractional(rhs) and + np.isscalar(lhs) and np.isscalar(rhs) and + not (isinstance(lhs, bad_types) or + isinstance(rhs, bad_types)) + and not engine_has_neg_frac(self.engine)) + try: + expected = f(lhs.values) + except AttributeError: + expected = f(lhs) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + + for engine in self.current_engines: + assert_array_equal(result, pd.eval(ex, engine=engine)) + + ex = '{0}(lhs {1} rhs)'.format(unary_op, arith1) + + if nan_frac_neg: + assert_raises(ValueError, pd.eval, ex, engine=self.engine, + local_dict=locals(), global_dict=globals()) + else: + # compound + result = pd.eval(ex, engine=self.engine) + + #(lhs, rhs), _ = _align((lhs, rhs)) + #if arith1 != '//': + #expected = self.ne.evaluate(ex) + #assert_array_equal(result, expected) + #else: + #assert_raises(TypeError, self.ne.evaluate, ex) + + #for engine 
in self.current_engines: + #if arith1 != '//': + #if engine_has_neg_frac(engine): + #assert_array_equal(result, pd.eval(ex, engine=engine)) + #else: + #assert_raises(TypeError, pd.eval, ex, engine=engine, + #local_dict=locals(), global_dict=globals()) + + +class TestBasicEvalPython(TestBasicEval): + + @classmethod + def setUpClass(cls): + cls.cmp_ops = expr._cmp_ops_syms + cls.cmp2_ops = cls.cmp_ops[::-1] + cls.bin_ops = expr._bool_ops_syms + cls.arith_ops = expr._arith_ops_syms + cls.unary_ops = '+', '-' + + def set_current_engine(self): + self.engine = 'python' + + +def test_syntax_error_exprs(): + for engine in _engines: + e = 's +' + assert_raises(SyntaxError, pd.eval, e, engine=engine) + + +def test_name_error_exprs(): + for engine in _engines: + e = 's + t' + assert_raises(NameError, pd.eval, e, engine=engine) + + +def test_align_nested_unary_op(): + for engine in _engines: + yield check_align_nested_unary_op, engine + + +f = lambda *args, **kwargs: np.random.randn() + + +def check_align_nested_unary_op(engine): + skip_numexpr_engine(engine) + s = 'df * ~2' + df = mkdf(10, 10, data_gen_f=f) + res = pd.eval(s, engine) + assert_frame_equal(res, df * ~2) + + +def check_basic_frame_alignment(engine): + df = mkdf(10, 10, data_gen_f=f) + df2 = mkdf(20, 10, data_gen_f=f) + res = pd.eval('df + df2', engine=engine) + assert_frame_equal(res, df + df2) + + +def test_basic_frame_alignment(): + for engine in _engines: + yield check_basic_frame_alignment, engine + + +def check_medium_complex_frame_alignment(engine, r1, r2, c1, c2): + skip_numexpr_engine(engine) + df = mkdf(5, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(10, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + df3 = mkdf(15, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + res = pd.eval('df + df2 + df3', engine=engine) + assert_frame_equal(res, df + df2 + df3) + + +@slow +def test_medium_complex_frame_alignment(): + args = product(_engines, *([INDEX_TYPES[:4]] * 4)) + for engine, r1, r2, c1, c2 in args: + check_medium_complex_frame_alignment(engine, r1, r2, c1, c2) + + +def check_basic_frame_series_alignment(engine, r_idx_type, c_idx_type, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 'df + s', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('df + s', engine=engine) + expected = df + s + assert_frame_equal(res, expected) + + +def check_not_both_period_fails_otherwise_succeeds(lhs, rhs, r_idx_type, + c_idx_type, index_name, s, + df, *terms): + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, lhs, local_dict=locals()) + assert_raises(ValueError, pd.eval, rhs, local_dict=locals()) + else: + a, b = pd.eval(lhs), pd.eval(rhs) + assert_frame_equal(a, b) + + +def check_basic_series_frame_alignment(engine, r_idx_type, c_idx_type, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 's + df', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('s + df', engine=engine) 
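+        # eval's result should match the same expression computed with the
+        # native + operator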
+ expected = s + df + assert_frame_equal(res, expected) + + +@slow +def check_basic_series_frame_alignment_datetime(engine, r_idx_type, c_idx_type, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 's + df', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('s + df', engine=engine) + expected = s + df + assert_frame_equal(res, expected) + + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 'df + s', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('df + s', engine=engine) + expected = df + s + assert_frame_equal(res, expected) + + +def check_series_frame_commutativity(engine, r_idx_type, c_idx_type, op, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + lhs = 's {0} df'.format(op) + rhs = 'df {0} s'.format(op) + check_not_both_period_fails_otherwise_succeeds(lhs, rhs, r_idx_type, + c_idx_type, index_name, s, + df) + + +INDEX_TYPES = 'i', 'f', 's', 'u', 'dt', # 'p' + + +@slow +def test_series_frame_commutativity(): + args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('+', '*'), ('index', + 'columns')) + for engine, r_idx_type, c_idx_type, op, index_name in args: + check_series_frame_commutativity(engine, r_idx_type, c_idx_type, op, + index_name) + + +def test_basic_frame_series_alignment(): + args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + check_basic_frame_series_alignment(engine, r_idx_type, c_idx_type, + index_name) + + +@slow +def test_basic_series_frame_alignment_datetime(): + idx_types = INDEX_TYPES + args = product(_engines, idx_types, idx_types, ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + check_basic_series_frame_alignment_datetime(engine, r_idx_type, + c_idx_type, index_name) + + +def test_basic_series_frame_alignment(): + args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + check_basic_series_frame_alignment(engine, r_idx_type, c_idx_type, + index_name) + + +def check_complex_series_frame_alignment(engine, index_name, obj, r1, r2, c1, + c2): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + index = getattr(locals()[obj], index_name) + s = Series(np.random.randn(5), index[:5]) + if engine != 'python': + expected = df2.add(s, axis=1).add(df) + else: + expected = df2 + s + df + res = pd.eval('df2 + s + df', engine=engine) + expected = df2 + s + df + assert_tuple_equal(res.shape, expected.shape) + assert_frame_equal(res, expected) + + +@slow +def test_complex_series_frame_alignment(): + args = product(_engines, ('index', 'columns'), ('df', 'df2'), + *([INDEX_TYPES[:4]] * 4)) + for engine, index_name, obj, r1, r2, c1, c2 in args: + check_complex_series_frame_alignment(engine, index_name, obj, r1, r2, + c1, c2) + + +def check_datetime_index_rows_punts_to_python(engine): + df = mkdf(10, 10, data_gen_f=f, r_idx_type='dt', 
c_idx_type='dt') + index = getattr(df, 'index') + s = Series(np.random.randn(5), index[:5]) + env = Scope(globals(), locals()) + assert_true(_scope_has_series_and_frame_datetime_index(env)) + assert_equal(_maybe_convert_engine(env, engine), 'python') + + +def test_datetime_index_rows_punts_to_python(): + for engine in _engines: + check_datetime_index_rows_punts_to_python(engine) + + +__var_s = randn(10) + + +def check_global_scope(engine): + e = '__var_s * 2' + assert_array_equal(__var_s * 2, pd.eval(e, engine=engine)) + + +def test_global_scope(): + for engine in _engines: + yield check_global_scope, engine + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/computation/tests/test_expressions.py b/pandas/computation/tests/test_expressions.py new file mode 100644 index 0000000000000..f197b8ef7a0ac --- /dev/null +++ b/pandas/computation/tests/test_expressions.py @@ -0,0 +1,157 @@ +# pylint: disable-msg=W0612,E1101 + +import unittest +import operator + +import nose + + +import numpy as np +from numpy.testing import assert_array_equal + +from pandas.core.api import DataFrame +from pandas.computation import expressions as expr + +if not expr._USE_NUMEXPR: + raise nose.SkipTest + +import numexpr as ne + + +_frame = DataFrame(np.random.randn(10000, 4), columns=list('ABCD'), + dtype='float64') +_frame2 = DataFrame(np.random.randn(100, 4), columns=list('ABCD'), + dtype='float64') +_mixed = DataFrame({'A': _frame['A'].copy(), + 'B': _frame['B'].astype('float32'), + 'C': _frame['C'].astype('int64'), + 'D': _frame['D'].astype('int32')}) +_mixed2 = DataFrame({'A': _frame2['A'].copy(), + 'B': _frame2['B'].astype('float32'), + 'C': _frame2['C'].astype('int64'), + 'D': _frame2['D'].astype('int32')}) + + +class TestExpressions(unittest.TestCase): + + _multiprocess_can_split_ = False + + def setUp(self): + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.mixed = _mixed.copy() + self.mixed2 = _mixed2.copy() + + def test_invalid(self): + # no op + result = expr._can_use_numexpr(operator.add, None, self.frame, + self.frame, 'evaluate') + self.assertFalse(result) + + # mixed + result = expr._can_use_numexpr( + operator.add, '+', self.mixed, self.frame, 'evaluate') + self.assertFalse(result) + + # min elements + result = expr._can_use_numexpr( + operator.add, '+', self.frame2, self.frame2, 'evaluate') + self.assertFalse(result) + + # ok, we only check on first part of expression + result = expr._can_use_numexpr( + operator.add, '+', self.frame, self.frame2, 'evaluate') + self.assert_(result) + + def test_binary_ops(self): + def testit(): + + for f, f2 in [(self.frame, self.frame2), + (self.mixed, self.mixed2)]: + + for op, op_str in [('add', '+'), ('sub', '-'), ('mul', '*'), + ('div', '/'), ('pow', '**')]: + + op = getattr(operator, op, None) + if op is not None: + result = expr._can_use_numexpr( + op, op_str, f, f, 'evaluate') + self.assert_(result == (not f._is_mixed_type)) + + result = expr.evaluate( + op, op_str, f, f, use_numexpr=True) + expected = expr.evaluate( + op, op_str, f, f, use_numexpr=False) + assert_array_equal(result, expected.values) + + result = expr._can_use_numexpr( + op, op_str, f2, f2, 'evaluate') + self.assertFalse(result) + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() + + def test_boolean_ops(self): + def testit(): + for f, f2 in [(self.frame, self.frame2), + (self.mixed, 
self.mixed2)]: + + f11 = f + f12 = f + 1 + + f21 = f2 + f22 = f2 + 1 + + for op, op_str in [('gt', '>'), ('lt', '<'), ('ge', '>='), + ('le', '<='), ('eq', '=='), ('ne', '!=')]: + + op = getattr(operator, op) + + result = expr._can_use_numexpr( + op, op_str, f11, f12, 'evaluate') + self.assert_(result == (not f11._is_mixed_type)) + + result = expr.evaluate( + op, op_str, f11, f12, use_numexpr=True) + expected = expr.evaluate( + op, op_str, f11, f12, use_numexpr=False) + assert_array_equal(result, expected.values) + + result = expr._can_use_numexpr( + op, op_str, f21, f22, 'evaluate') + self.assertFalse(result) + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() + + def test_where(self): + def testit(): + for f in [self.frame, self.frame2, self.mixed, self.mixed2]: + + for cond in [True, False]: + + c = np.empty(f.shape, dtype=np.bool_) + c.fill(cond) + result = expr.where(c, f.values, f.values + 1) + expected = np.where(c, f.values, f.values + 1) + assert_array_equal(result, expected) + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() + + diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 401a7746953cb..a8bb74f86a43e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -35,7 +35,7 @@ create_block_manager_from_arrays, create_block_manager_from_blocks) from pandas.core.series import Series, _radd_compat -import pandas.core.expressions as expressions +import pandas.computation.expressions as expressions from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.util.compat import OrderedDict from pandas.util import py3compat @@ -2652,6 +2652,8 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, passed MultiIndex level limit : int, default None Maximum size gap to forward or backward fill + fill_value : object, default NA + The value to use to fill in missing data. 
Examples -------- diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f23a89635aaf2..ab29a38760a51 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -14,7 +14,7 @@ import pandas.core.common as com import pandas.lib as lib import pandas.tslib as tslib -import pandas.core.expressions as expressions +import pandas.computation.expressions as expressions from pandas.tslib import Timestamp from pandas.util import py3compat diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py deleted file mode 100644 index ba0a9926dfa78..0000000000000 --- a/pandas/tests/test_expressions.py +++ /dev/null @@ -1,203 +0,0 @@ -# pylint: disable-msg=W0612,E1101 - -import unittest -import nose - -import operator -from numpy import random, nan -from numpy.random import randn -import numpy as np -from numpy.testing import assert_array_equal - -import pandas as pan -from pandas.core.api import DataFrame, Series, notnull, isnull -from pandas.core import expressions as expr - -from pandas.util.testing import (assert_almost_equal, - assert_series_equal, - assert_frame_equal) -from pandas.util import py3compat - -import pandas.util.testing as tm -import pandas.lib as lib - -from numpy.testing.decorators import slow - -if not expr._USE_NUMEXPR: - raise nose.SkipTest - -_frame = DataFrame(np.random.randn(10000, 4), columns = list('ABCD'), dtype='float64') -_frame2 = DataFrame(np.random.randn(100, 4), columns = list('ABCD'), dtype='float64') -_mixed = DataFrame({ 'A' : _frame['A'].copy(), 'B' : _frame['B'].astype('float32'), 'C' : _frame['C'].astype('int64'), 'D' : _frame['D'].astype('int32') }) -_mixed2 = DataFrame({ 'A' : _frame2['A'].copy(), 'B' : _frame2['B'].astype('float32'), 'C' : _frame2['C'].astype('int64'), 'D' : _frame2['D'].astype('int32') }) -_integer = DataFrame(np.random.randint(1, 100, size=(10001, 4)), columns = list('ABCD'), dtype='int64') - -class TestExpressions(unittest.TestCase): - - _multiprocess_can_split_ = False - - def setUp(self): - - self.frame = _frame.copy() - self.frame2 = _frame2.copy() - self.mixed = _mixed.copy() - self.mixed2 = _mixed2.copy() - self.integer = _integer.copy() - self._MIN_ELEMENTS = expr._MIN_ELEMENTS - - def tearDown(self): - expr._MIN_ELEMENTS = self._MIN_ELEMENTS - - #TODO: add test for Panel - #TODO: add tests for binary operations - @nose.tools.nottest - def run_arithmetic_test(self, df, assert_func, check_dtype=False): - expr._MIN_ELEMENTS = 0 - operations = ['add', 'sub', 'mul','mod','truediv','floordiv','pow'] - if not py3compat.PY3: - operations.append('div') - for arith in operations: - op = getattr(operator, arith) - expr.set_use_numexpr(False) - expected = op(df, df) - expr.set_use_numexpr(True) - result = op(df, df) - try: - if check_dtype: - if arith == 'div': - assert expected.dtype.kind == df.dtype.kind - if arith == 'truediv': - assert expected.dtype.kind == 'f' - assert_func(expected, result) - except Exception: - print("Failed test with operator %r" % op.__name__) - raise - - def test_integer_arithmetic(self): - self.run_arithmetic_test(self.integer, assert_frame_equal) - self.run_arithmetic_test(self.integer.icol(0), assert_series_equal, - check_dtype=True) - - def test_float_arithemtic(self): - self.run_arithmetic_test(self.frame, assert_frame_equal) - self.run_arithmetic_test(self.frame.icol(0), assert_series_equal, - check_dtype=True) - - def test_mixed_arithmetic(self): - self.run_arithmetic_test(self.mixed, assert_frame_equal) - for col in self.mixed.columns: - 
self.run_arithmetic_test(self.mixed[col], assert_series_equal) - - def test_integer_with_zeros(self): - self.integer *= np.random.randint(0, 2, size=np.shape(self.integer)) - self.run_arithmetic_test(self.integer, assert_frame_equal) - self.run_arithmetic_test(self.integer.icol(0), assert_series_equal) - - def test_invalid(self): - - # no op - result = expr._can_use_numexpr(operator.add, None, self.frame, self.frame, 'evaluate') - self.assert_(result == False) - - # mixed - result = expr._can_use_numexpr(operator.add, '+', self.mixed, self.frame, 'evaluate') - self.assert_(result == False) - - # min elements - result = expr._can_use_numexpr(operator.add, '+', self.frame2, self.frame2, 'evaluate') - self.assert_(result == False) - - # ok, we only check on first part of expression - result = expr._can_use_numexpr(operator.add, '+', self.frame, self.frame2, 'evaluate') - self.assert_(result == True) - - def test_binary_ops(self): - - def testit(): - - for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]: - - for op, op_str in [('add','+'),('sub','-'),('mul','*'),('div','/'),('pow','**')]: - - op = getattr(operator,op,None) - if op is not None: - result = expr._can_use_numexpr(op, op_str, f, f, 'evaluate') - self.assert_(result == (not f._is_mixed_type)) - - result = expr.evaluate(op, op_str, f, f, use_numexpr=True) - expected = expr.evaluate(op, op_str, f, f, use_numexpr=False) - assert_array_equal(result,expected.values) - - result = expr._can_use_numexpr(op, op_str, f2, f2, 'evaluate') - self.assert_(result == False) - - - expr.set_use_numexpr(False) - testit() - expr.set_use_numexpr(True) - expr.set_numexpr_threads(1) - testit() - expr.set_numexpr_threads() - testit() - - def test_boolean_ops(self): - - - def testit(): - for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]: - - f11 = f - f12 = f + 1 - - f21 = f2 - f22 = f2 + 1 - - for op, op_str in [('gt','>'),('lt','<'),('ge','>='),('le','<='),('eq','=='),('ne','!=')]: - - op = getattr(operator,op) - - result = expr._can_use_numexpr(op, op_str, f11, f12, 'evaluate') - self.assert_(result == (not f11._is_mixed_type)) - - result = expr.evaluate(op, op_str, f11, f12, use_numexpr=True) - expected = expr.evaluate(op, op_str, f11, f12, use_numexpr=False) - assert_array_equal(result,expected.values) - - result = expr._can_use_numexpr(op, op_str, f21, f22, 'evaluate') - self.assert_(result == False) - - expr.set_use_numexpr(False) - testit() - expr.set_use_numexpr(True) - expr.set_numexpr_threads(1) - testit() - expr.set_numexpr_threads() - testit() - - def test_where(self): - - def testit(): - for f in [ self.frame, self.frame2, self.mixed, self.mixed2 ]: - - - for cond in [ True, False ]: - - c = np.empty(f.shape,dtype=np.bool_) - c.fill(cond) - result = expr.where(c, f.values, f.values+1) - expected = np.where(c, f.values, f.values+1) - assert_array_equal(result,expected) - - expr.set_use_numexpr(False) - testit() - expr.set_use_numexpr(True) - expr.set_numexpr_threads(1) - testit() - expr.set_numexpr_threads() - testit() - -if __name__ == '__main__': - # unittest.main() - import nose - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/setup.py b/setup.py index 7d59e0f95f0e8..3984dc075d4f7 100755 --- a/setup.py +++ b/setup.py @@ -85,7 +85,7 @@ except ImportError: cython = False -from os.path import splitext, basename, join as pjoin +from os.path import join as pjoin class build_ext(_build_ext): @@ -502,6 +502,7 @@ def pxd(name): maintainer=AUTHOR, 
packages=['pandas', 'pandas.compat', + 'pandas.computation', 'pandas.core', 'pandas.io', 'pandas.rpy', diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py index 54774344520c9..3f076f9f922a3 100644 --- a/vb_suite/binary_ops.py +++ b/vb_suite/binary_ops.py @@ -21,7 +21,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_numexpr_threads(1) @@ -32,7 +32,7 @@ start_date=datetime(2013, 2, 26)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_use_numexpr(False) @@ -53,7 +53,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_numexpr_threads(1) @@ -63,7 +63,7 @@ start_date=datetime(2013, 2, 26)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_use_numexpr(False) @@ -84,7 +84,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_numexpr_threads(1) @@ -94,7 +94,7 @@ start_date=datetime(2013, 2, 26)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(20000, 100)) df2 = DataFrame(np.random.randn(20000, 100)) expr.set_use_numexpr(False) diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py index 9f07cc6ed15c3..2edb7548ebeef 100644 --- a/vb_suite/indexing.py +++ b/vb_suite/indexing.py @@ -103,7 +103,7 @@ start_date=datetime(2012, 1, 1)) setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(50000, 100)) df2 = DataFrame(np.random.randn(50000, 100)) expr.set_numexpr_threads(1) @@ -115,7 +115,7 @@ setup = common_setup + """ -import pandas.core.expressions as expr +import pandas.computation.expressions as expr df = DataFrame(np.random.randn(50000, 100)) df2 = DataFrame(np.random.randn(50000, 100)) expr.set_use_numexpr(False) From bcd17b090a32afd43de0a21f3829f281635a8b51 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 15 Jun 2013 21:35:22 -0400 Subject: [PATCH 02/37] ENH/TST: add new instance testing functions and their tests --- pandas/core/common.py | 24 ++++++++++++++ pandas/tests/test_common.py | 65 +++++++++++++++++++++++++++++++++++-- pandas/util/testing.py | 5 ++- 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index ddacb98a2ddf3..4615571c5d86c 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -21,6 +21,7 @@ from pandas.core.config import get_option from pandas.core import array as pa +import pandas as pd # XXX: HACK for NumPy 1.5.1 to suppress warnings try: @@ -1509,6 +1510,29 @@ def is_bool(obj): return isinstance(obj, (bool, np.bool_)) +def is_string(obj): + return isinstance(obj, (basestring, np.str_, np.unicode_)) + + 
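+# Example behavior of these helpers (illustrative only, mirroring the tests
+# added in this patch):
+#   >>> is_string(np.str_('a'))
+#   True
+#   >>> is_string(np.array(['a']))
+#   False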
+def is_series(obj): + return isinstance(obj, pd.Series) + + +def is_frame(obj): + return isinstance(obj, pd.DataFrame) + + +def is_panel(obj): + return isinstance(obj, pd.Panel) + + +def is_pd_obj(obj): + return isinstance(obj, pd.core.generic.PandasObject) + + +def is_ndframe(obj): + return isinstance(obj, pd.core.generic.NDFrame) + def is_integer(obj): return isinstance(obj, (int, long, np.integer)) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index db01545fb3c9d..974e301c5d303 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,20 +1,19 @@ from datetime import datetime -import sys import re import nose import unittest -from pandas import Series, DataFrame, date_range, DatetimeIndex +from pandas import Series, DataFrame, date_range, DatetimeIndex, Panel from pandas.core.common import notnull, isnull import pandas.core.common as com import pandas.util.testing as tm import pandas.core.config as cf import numpy as np +from numpy.random import randn from pandas.tslib import iNaT -from pandas.util import py3compat _multiprocess_can_split_ = True @@ -33,6 +32,7 @@ def __getitem__(self): assert(not is_seq(A())) + def test_notnull(): assert notnull(1.) assert not notnull(None) @@ -98,6 +98,61 @@ def test_isnull_lists(): assert(not result.any()) +def test_is_string(): + class MyString(str): + pass + + class MyUnicode(unicode): + pass + + strings = ('s', np.str_('a'), np.unicode_('unicode_string'), + MyString('a _string blah'), u'asdf', MyUnicode(u'asdf')) + not_strings = [], 1, {}, set(), np.array(['1']), np.array([u'1']) + + for string in strings: + assert com.is_string(string), '{0} is not a string'.format(string) + + for not_string in not_strings: + assert not com.is_string(not_string), ('{0} is a ' + 'string'.format(not_string)) + + +def test_is_frame(): + df = DataFrame(randn(2, 1)) + assert com.is_frame(df) + assert not com.is_frame('s') + + +def test_is_series(): + s = Series(randn(2)) + assert com.is_series(s) + assert not com.is_series(s.values) + + +def test_is_panel(): + p = Panel(randn(2, 3, 4)) + assert com.is_panel(p) + assert not com.is_panel(2) + + +def test_is_pd_obj(): + df = DataFrame(randn(2, 1)) + s = Series(randn(2)) + p = Panel(randn(2, 3, 4)) + for obj in (df, s, p): + assert com.is_pd_obj(obj) + assert not com.is_pd_obj(obj.values) + + +def test_is_ndframe(): + df = DataFrame(randn(2, 1)) + p = Panel(randn(2, 3, 4)) + # should add series after @jreback's ndframe to series pr + for obj in (df, p): + assert com.is_ndframe(obj) + assert not com.is_ndframe(obj.values) + + def test_isnull_datetime(): assert (not isnull(datetime.now())) assert notnull(datetime.now()) @@ -112,11 +167,13 @@ def test_isnull_datetime(): assert(mask[0]) assert(not mask[1:].any()) + def test_datetimeindex_from_empty_datetime64_array(): for unit in [ 'ms', 'us', 'ns' ]: idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) assert(len(idx) == 0) + def test_any_none(): assert(com._any_none(1, 2, 3, None)) assert(not com._any_none(1, 2, 3, 4)) @@ -266,6 +323,7 @@ def test_ensure_int32(): result = com._ensure_int32(values) assert(result.dtype == np.int32) + def test_ensure_platform_int(): # verify that when we create certain types of indices @@ -748,6 +806,7 @@ def test_2d_datetime64(self): expected[:, [2, 4]] = datetime(2007, 1, 1) tm.assert_almost_equal(result, expected) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/util/testing.py 
b/pandas/util/testing.py index 47bde4ecb32a7..e1b2950b5c8d3 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -17,7 +17,7 @@ from urllib2 import urlopen from distutils.version import LooseVersion -from numpy.random import randn +from numpy.random import randn, rand import numpy as np from pandas.core.common import isnull, _is_sequence @@ -45,6 +45,9 @@ _RAISE_NETWORK_ERROR_DEFAULT = False +def randbool(size=(), p=0.5): + return rand(*size) <= p + def rands(n): choices = string.ascii_letters + string.digits return ''.join(random.choice(choices) for _ in xrange(n)) From 81bacd1d9a8dbec90cbdf3d92d45b3180d0eeee2 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 15 Jun 2013 21:58:28 -0400 Subject: [PATCH 03/37] BUG: prevent certain index types from joining with DatetimeIndex --- pandas/tseries/index.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 7fdb6d9d2603d..4c75ef66feb08 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -912,7 +912,8 @@ def join(self, other, how='left', level=None, return_indexers=False): See Index.join """ if (not isinstance(other, DatetimeIndex) and len(other) > 0 and - other.inferred_type != 'mixed-integer'): + other.inferred_type not in ('floating', 'mixed-integer', + 'mixed-integer-float', 'mixed')): try: other = DatetimeIndex(other) except TypeError: From e380271278cba82d669cd07312d4f37106a4c47d Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 15 Jun 2013 23:26:01 -0400 Subject: [PATCH 04/37] TST/ENH: add 2d bare numpy array and nan support --- pandas/computation/engines.py | 60 +++++++++++++++++---------- pandas/computation/ops.py | 4 +- pandas/computation/tests/test_eval.py | 35 ++++++++++++---- 3 files changed, 70 insertions(+), 29 deletions(-) diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 0eb9875b85549..5bb43efec3e15 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -1,4 +1,5 @@ import abc +import functools from functools import partial from itertools import izip @@ -66,25 +67,42 @@ def _maybe_promote_shape(values, naxes): return values[tuple(axes_slice)] -def _align_core(terms): - # need to ensure that terms is not an iterator - terms = list(terms) +def _any_pandas_objects(terms): + """Check a sequence of terms for instances of PandasObject.""" + return any(com.is_pd_obj(term) for term in terms) + + +def _filter_special_cases(f): + @functools.wraps(f) + def wrapper(terms): + # need to ensure that terms is not an iterator + terms = list(terms) + + ## special cases - ## special cases + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) - # single unary operand - if len(terms) == 1: - return _align_core_single_unary_op(terms[0]) - # only scalars - elif all(np.isscalar(term) for term in terms): - return np.result_type(*terms), terms, None + # only scalars + elif all(np.isscalar(term) for term in terms): + return np.result_type(*terms), terms, None - # single dim ndarrays - all_has_size = all(hasattr(term, 'size') for term in terms) - if (all_has_size and all(term.size == 1 for term in terms)): - return np.result_type(*terms), terms, None + # single element ndarrays + all_has_size = all(hasattr(term, 'size') for term in terms) + if (all_has_size and all(term.size == 1 for term in terms)): + return np.result_type(*terms), terms, None - # made it past the special cases + # no pandas so just punt to the evaluator + if not 
_any_pandas_objects(terms): + return np.result_type(*terms), terms, None + + return f(terms) + return wrapper + + +@_filter_special_cases +def _align_core(terms): term_index = [i for i, term in enumerate(terms) if hasattr(term, 'axes')] term_dims = [terms[i].ndim for i in term_index] ndims = pd.Series(dict(zip(term_index, term_dims))) @@ -145,8 +163,8 @@ def _filter_terms(flat): # literals are not names and names are not literals, by definition if literals_set & names_set: - raise AssertionError('literals cannot be names and names cannot be ' - 'literals') + raise ValueError('literals cannot be names and names cannot be ' + 'literals') return names, literals @@ -154,10 +172,10 @@ def _align(terms, env): # flatten the parse tree (a nested list) flat = list(flatten(terms)) + # separate names and literals names, literals = _filter_terms(flat) - # given an expression consisting of literals - if not names: + if not names: # only literals so just promote to a common type return np.result_type(*literals).type, None # get the variables out @@ -165,13 +183,13 @@ def _align(terms, env): resolved = map(resolve_in_env, names) # if all resolved variables are numeric scalars - if all(map(np.isscalar, resolved)): + if all(np.isscalar(rsv) for rsv in resolved): return np.result_type(*resolved).type, None # perform the main alignment typ, resolved, axes = _align_core(resolved) - # put them back in the symbol table + # put the aligned arrays back in the table _update_names(env, dict(izip(names, resolved))) # we need this to reconstruct things after evaluation since we CANNOT diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index fb1965f45c52b..f79acc412023a 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -123,7 +123,7 @@ def __call__(self, env): except TypeError: left = self.lhs - # recursve over the right nodes + # recurse over the right nodes try: right = self.rhs(env) except TypeError: @@ -166,6 +166,8 @@ def __init__(self, op, operand): def __call__(self, env): operand = self.operand + + # recurse if operand is an Op try: operand = self.operand(env) except TypeError: diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 2d7bf4392cfea..cb52025e45df1 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -1,5 +1,6 @@ #!/usr/bin/env python +import unittest import itertools from itertools import product @@ -7,12 +8,13 @@ from nose.tools import assert_raises, assert_tuple_equal, assert_equal from nose.tools import assert_true -from numpy.random import randn +from numpy.random import randn, rand import numpy as np from numpy.testing import assert_array_equal from numpy.testing.decorators import slow import pandas as pd +from pandas.core import common as com from pandas import DataFrame, Series from pandas.util.testing import makeCustomDataframe as mkdf from pandas.computation.engines import (_engines, _align_core, @@ -85,8 +87,14 @@ def _eval_bin_and_unary(unary, lhs, arith1, rhs): return unop(binop(lhs, rhs)) +def _series_and_2d_ndarray(lhs, rhs): + return (com.is_series(lhs) and isinstance(rhs, np.ndarray) and rhs.ndim > 1 + or com.is_series(rhs) and isinstance(lhs, np.ndarray) and lhs.ndim + > 1) + + # Smoke testing -class TestBasicEval(object): +class TestBasicEval(unittest.TestCase): @classmethod def setUpClass(self): @@ -100,10 +108,14 @@ def set_current_engine(self): self.engine = 'numexpr' def setup_data(self): + nan_df = DataFrame(rand(10, 5)) + nan_df[nan_df > 0.5] = 
np.nan self.lhses = (DataFrame(randn(10, 5)), Series(randn(5)), randn(), - np.float64(randn())) + np.float64(randn()), randn(10, 5), randn(5), np.nan, + Series([1, 2, np.nan, np.nan, 5]), nan_df) self.rhses = (DataFrame(randn(10, 5)), Series(randn(5)), randn(), - np.float64(randn())) + np.float64(randn()), randn(10, 5), randn(5), np.nan, + Series([1, 2, np.nan, np.nan, 5]), nan_df) def setUp(self): try: @@ -163,9 +175,14 @@ def _create_cmp_op_t(self, lhs, cmp1, rhs, binop, cmp2): ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, binop=binop, cmp2=cmp2) - expected = _eval_from_expr(lhs, cmp1, rhs, binop, cmp2) - result = pd.eval(ex, engine=self.engine) - assert_array_equal(result, expected) + if _series_and_2d_ndarray(lhs, rhs): + self.assertRaises(Exception, _eval_from_expr, lhs, cmp1, rhs, + binop, cmp2) + self.assertRaises(Exception, pd.eval, ex, engine=self.engine) + else: + expected = _eval_from_expr(lhs, cmp1, rhs, binop, cmp2) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) def _create_simple_cmp_op_t(self, lhs, rhs, cmp1): ex = 'lhs {0} rhs'.format(cmp1) @@ -534,6 +551,10 @@ def test_datetime_index_rows_punts_to_python(): check_datetime_index_rows_punts_to_python(engine) +def check_truediv(engine): + s = randn(10) + + __var_s = randn(10) From 99a3d280d86d6b6141086eef2fde29d979b9dc4f Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 17 Jun 2013 07:37:46 -0400 Subject: [PATCH 05/37] ENH: add modulus support --- pandas/computation/eval.py | 2 +- pandas/computation/expr.py | 13 +++++++------ pandas/computation/ops.py | 22 ++++++++++++++++++++-- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 21348f221bc99..64345e8d3a143 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -57,7 +57,7 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, engine = _maybe_convert_engine(env, engine) # parse the expression - parsed_expr = Expr(expr, engine, truediv) + parsed_expr = Expr(expr, engine, env, truediv) # choose the engine eng = _engines[engine] diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 105c0a020a2ad..f6d4ca39788ab 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -1,7 +1,7 @@ import ast from functools import partial -from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops +from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops, Mod from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms @@ -15,14 +15,14 @@ class ExprVisitor(ast.NodeVisitor): """ bin_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', 'BitOr', - 'Add', 'Sub', 'Mult', 'Div', 'Pow', 'FloorDiv') + 'Add', 'Sub', 'Mult', 'Div', 'Pow', 'FloorDiv', 'Mod') bin_op_nodes_map = dict(zip(bin_ops, bin_op_nodes)) unary_ops = _unary_ops_syms unary_op_nodes = 'UAdd', 'USub', 'Invert' unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) - def __init__(self): + def __init__(self, env): for bin_op in self.bin_ops: setattr(self, 'visit_{0}'.format(self.bin_op_nodes_map[bin_op]), lambda node, bin_op=bin_op: partial(BinOp, bin_op)) @@ -31,6 +31,7 @@ def __init__(self): setattr(self, 'visit_{0}'.format(self.unary_op_nodes_map[unary_op]), lambda node, unary_op=unary_op: partial(UnaryOp, unary_op)) + self.env = env def visit(self, node): if not 
(isinstance(node, ast.AST) or isinstance(node, basestring)): @@ -91,15 +92,15 @@ def visit_Attribute(self, node): raise NotImplementedError("attribute access is not yet supported") def visit_Mod(self, node): - raise NotImplementedError("modulo operator not yet supported") + return partial(Mod, env=self.env) class Expr(object): """Expr object for pandas """ - def __init__(self, expr, engine, truediv): + def __init__(self, expr, engine, env, truediv): self.expr = expr - self._visitor = ExprVisitor() + self._visitor = ExprVisitor(env) self.terms = self.parse() self.engine = engine self.truediv = truediv diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index f79acc412023a..f81844d787a5a 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -1,6 +1,7 @@ import operator as op from functools import partial +import numpy as np from pandas.util.py3compat import PY3 @@ -74,9 +75,9 @@ def name(self): _bool_ops_funcs = op.and_, op.or_ _bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) -_arith_ops_syms = '+', '-', '*', '/', '**', '//' +_arith_ops_syms = '+', '-', '*', '/', '**', '//', '%' _arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv if PY3 else op.div, - op.pow, op.floordiv) + op.pow, op.floordiv, op.mod) _arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) _binary_ops_dict = {} @@ -85,6 +86,17 @@ def name(self): _binary_ops_dict.update(d) +def _cast(terms, env, dtype): + resolver = partial(_resolve_name, env) + updater = partial(_update_name, env) + for term in terms: + t = resolver(term) + try: + new_value = t.astype(dtype) + except AttributeError: + new_value = dtype.type(t) + updater(term, t) + class BinOp(Op): """Hold a binary operator and its operands @@ -145,6 +157,12 @@ def __call__(self, env): return res +class Mod(BinOp): + def __init__(self, lhs, rhs, env=None): + super(Mod, self).__init__('%', lhs, rhs) + _cast(env, (lhs, rhs), np.float_) + + _unary_ops_syms = '+', '-', '~' _unary_ops_funcs = op.pos, op.neg, op.invert _unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) From 4db95fe90b529e2f25294acfad0408cdfe60f8ec Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 17 Jun 2013 08:02:44 -0400 Subject: [PATCH 06/37] TST: add failing modulus tests --- pandas/computation/tests/test_eval.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index cb52025e45df1..4e062d6a4e99b 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -10,7 +10,7 @@ from numpy.random import randn, rand import numpy as np -from numpy.testing import assert_array_equal +from numpy.testing import assert_array_equal, assert_allclose from numpy.testing.decorators import slow import pandas as pd @@ -212,7 +212,11 @@ def _create_arith_op_t(self, lhs, arith1, rhs): if arith1 != '//': expected = _eval_single_bin(lhs, arith1, rhs, engine_has_neg_frac(self.engine)) - assert_array_equal(result, expected) + # roundoff error with modulus + if arith1 == '%': + assert_allclose(result, expected) + else: + assert_array_equal(result, expected) # sanity check on recursive parsing try: @@ -243,7 +247,12 @@ def _create_arith_op_t(self, lhs, arith1, rhs): pass if arith1 != '//': expected = self.ne.evaluate('nlhs {0} ghs'.format(arith1)) - assert_array_equal(result, expected) + + # roundoff error with modulus + if arith1 == '%': + assert_allclose(result, expected) + else: + assert_array_equal(result, 
expected) def _create_invert_op_t(self, lhs, cmp1, rhs): # simple @@ -551,6 +560,11 @@ def test_datetime_index_rows_punts_to_python(): check_datetime_index_rows_punts_to_python(engine) +def test_truediv(): + for engine in _engines: + check_truediv(engine) + + def check_truediv(engine): s = randn(10) From 6000c89fe9af20c974ef9b5ff19ea13c4f49178a Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 17 Jun 2013 20:34:24 -0400 Subject: [PATCH 07/37] CLN: use format string for unicode --- pandas/computation/engines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 5bb43efec3e15..11843ffef1705 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -275,7 +275,7 @@ def __init__(self, expr): def convert(self): """Return a string""" - return str(self.expr) + return '%s' % self.expr def _evaluate(self, env): import numexpr as ne From c25a1d4b0853578183e75d341aaab051941bdce7 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 17 Jun 2013 20:35:45 -0400 Subject: [PATCH 08/37] CLN: remove engine detection and manip for datetimes --- pandas/computation/eval.py | 34 ++----------------- pandas/computation/expressions.py | 48 +++++++++++++-------------- pandas/computation/ops.py | 35 ++++++++++++++++--- pandas/computation/tests/test_eval.py | 23 +++++++++---- 4 files changed, 74 insertions(+), 66 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 64345e8d3a143..298554005d6ed 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -2,40 +2,13 @@ import sys import numbers -import collections -import itertools import numpy as np -Scope = collections.namedtuple('Scope', 'globals locals') - -import pandas.core.common as com -from pandas.computation.expr import Expr +from pandas.computation.expr import Expr, Scope from pandas.computation.engines import _engines -def _scope_has_series_and_frame_datetime_index(env): - from pandas import DatetimeIndex - series_index = frame_index = 0 - - for v in itertools.chain(env.locals.itervalues(), - env.globals.itervalues()): - series_index += com.is_series(v) and isinstance(v.index, DatetimeIndex) - frame_index += com.is_frame(v) and isinstance(v.index, DatetimeIndex) - return series_index, frame_index - - -def _maybe_convert_engine(env, engine): - assert isinstance(env, Scope), 'environment must be an instance of Scope' - assert isinstance(engine, basestring), 'engine name must be a string' - - ret = engine - - if all(_scope_has_series_and_frame_datetime_index(env)): - ret = 'python' - return ret - - def eval(expr, engine='numexpr', truediv=True, local_dict=None, global_dict=None): # make sure we're passed a valid engine @@ -44,7 +17,8 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, ' {1}'.format(_engines.keys())) # 1 up in the call stack for locals/globals; see the documentation for the - # inspect module for why you must decrease the refcount of frame + # inspect module for why you must decrease the refcount of frame at all + # costs frame = sys._getframe(1) try: @@ -54,8 +28,6 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, # shallow copy the scope so we don't overwrite everything env = Scope(gbl.copy(), lcl.copy()) - engine = _maybe_convert_engine(env, engine) - # parse the expression parsed_expr = Expr(expr, engine, env, truediv) diff --git a/pandas/computation/expressions.py b/pandas/computation/expressions.py index e1551f9b0548e..0c13a50d15618 100644 --- 
a/pandas/computation/expressions.py +++ b/pandas/computation/expressions.py @@ -7,6 +7,7 @@ """ import numpy as np +import pandas.core.common as com try: import numexpr as ne @@ -46,13 +47,10 @@ def set_use_numexpr(v=True): def set_numexpr_threads(n=None): # if we are using numexpr, set the threads to n # otherwise reset - try: - if _NUMEXPR_INSTALLED and _USE_NUMEXPR: - if n is None: - n = ne.detect_number_of_cores() - ne.set_num_threads(n) - except: - pass + if _NUMEXPR_INSTALLED and _USE_NUMEXPR: + if n is None: + n = ne.detect_number_of_cores() + ne.set_num_threads(n) def _evaluate_standard(op, op_str, a, b, raise_on_error=True, **eval_kwargs): @@ -84,7 +82,8 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): return False -def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False, **eval_kwargs): + +def _evaluate_numexpr(op, op_str, a, b, raise_on_error=False, **eval_kwargs): result = None if _can_use_numexpr(op, op_str, a, b, 'evaluate'): @@ -94,15 +93,13 @@ def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False, **eval_kwargs): a_value = a_value.values if hasattr(b_value, 'values'): b_value = b_value.values - result = ne.evaluate('a_value %s b_value' % op_str, - local_dict={ 'a_value' : a_value, - 'b_value' : b_value }, + result = ne.evaluate('a_value %s b_value' % op_str, + local_dict={'a_value': a_value, + 'b_value': b_value}, casting='safe', **eval_kwargs) - except (ValueError), detail: - if 'unknown type object' in str(detail): - pass - except (Exception), detail: - if raise_on_error: + except Exception as detail: + if ('unknown type object' not in com.pprint_thing(detail) and + raise_on_error): raise if result is None: @@ -128,17 +125,15 @@ def _where_numexpr(cond, a, b, raise_on_error=False): a_value = a_value.values if hasattr(b_value, 'values'): b_value = b_value.values - result = ne.evaluate('where(cond_value,a_value,b_value)', + result = ne.evaluate('where(cond_value, a_value, b_value)', local_dict={'cond_value': cond_value, 'a_value': a_value, 'b_value': b_value}, casting='safe') - except (ValueError), detail: - if 'unknown type object' in str(detail): - pass - except (Exception), detail: - if raise_on_error: - raise TypeError(str(detail)) + except Exception as detail: + if ('unknown type object' not in com.pprint_thing(detail) and + raise_on_error): + raise if result is None: result = _where_standard(cond, a, b, raise_on_error) @@ -149,7 +144,9 @@ def _where_numexpr(cond, a, b, raise_on_error=False): # turn myself on set_use_numexpr(True) -def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kwargs): + +def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, + **eval_kwargs): """ evaluate and return the expression of the op on a and b Parameters @@ -166,7 +163,8 @@ def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kw """ if use_numexpr: - return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, **eval_kwargs) + return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, + **eval_kwargs) return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error) diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index f81844d787a5a..1a6d3fd1672ba 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -52,6 +52,32 @@ def _update_names(env, mapping): updater(key, value) +class Term(object): + def __init__(self, value, name, env): + self.value = value + self.name = name + self.env = env + self.type = type(value) + + def __iter__(self): + yield 
self.value + raise StopIteration + + def __str__(self): + return '{0}({1!r})'.format(self.__class__.__name__, self.name) + + __repr__ = __str__ + + def update(self, env, value): + _update_name(self.env, self.name, value) + self.value = value + + +class Constant(Term): + def __init__(self, value, env): + super(Constant, self).__init__(value, value, env) + + class Op(object): """Hold an operator of unknown arity """ @@ -89,13 +115,14 @@ def name(self): def _cast(terms, env, dtype): resolver = partial(_resolve_name, env) updater = partial(_update_name, env) + dt = np.dtype(dtype) for term in terms: t = resolver(term) try: - new_value = t.astype(dtype) + new_value = t.astype(dt) except AttributeError: - new_value = dtype.type(t) - updater(term, t) + new_value = dt.type(t) + updater(term, new_value) class BinOp(Op): """Hold a binary operator and its operands @@ -160,7 +187,7 @@ def __call__(self, env): class Mod(BinOp): def __init__(self, lhs, rhs, env=None): super(Mod, self).__init__('%', lhs, rhs) - _cast(env, (lhs, rhs), np.float_) + _cast((lhs, rhs), env, np.float_) _unary_ops_syms = '+', '-', '~' diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 4e062d6a4e99b..417fb106f90fa 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -5,8 +5,8 @@ from itertools import product import nose -from nose.tools import assert_raises, assert_tuple_equal, assert_equal -from nose.tools import assert_true +from nose.tools import assert_raises, assert_tuple_equal +from nose.tools import assert_true, assert_false from numpy.random import randn, rand import numpy as np @@ -23,8 +23,6 @@ import pandas.computation.expr as expr from pandas.computation.expressions import _USE_NUMEXPR from pandas.computation.eval import Scope -from pandas.computation.eval import _scope_has_series_and_frame_datetime_index -from pandas.computation.eval import _maybe_convert_engine from pandas.util.testing import assert_frame_equal, randbool @@ -551,8 +549,6 @@ def check_datetime_index_rows_punts_to_python(engine): index = getattr(df, 'index') s = Series(np.random.randn(5), index[:5]) env = Scope(globals(), locals()) - assert_true(_scope_has_series_and_frame_datetime_index(env)) - assert_equal(_maybe_convert_engine(env, engine), 'python') def test_datetime_index_rows_punts_to_python(): @@ -582,6 +578,21 @@ def test_global_scope(): yield check_global_scope, engine +def check_is_expr(engine): + s = 1 + valid = 's + 1' + invalid = 's +' + assert_true(expr.isexpr(valid, check_names=True)) + assert_false(expr.isexpr(valid, check_names=False)) + assert_false(expr.isexpr(invalid, check_names=False)) + assert_false(expr.isexpr(invalid, check_names=True)) + + +def test_is_expr(): + for engine in _engines: + check_is_expr(engine) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 1132bc40843626fd2eb1afbb6755490a79924337 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 19 Jun 2013 21:40:11 -0400 Subject: [PATCH 09/37] CLN/ENH: add new interface to encapsulate Terms and Constants --- pandas/computation/engines.py | 100 +++++++++++++++------------------- pandas/computation/expr.py | 38 ++++++++++--- pandas/computation/ops.py | 72 +++++++++++++----------- 3 files changed, 112 insertions(+), 98 deletions(-) diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 11843ffef1705..4ebb4a15fdee4 100644 --- a/pandas/computation/engines.py +++ 
b/pandas/computation/engines.py @@ -1,27 +1,26 @@ import abc -import functools -from functools import partial +from functools import partial, wraps from itertools import izip import numpy as np import pandas as pd import pandas.core.common as com -from pandas.computation.ops import _resolve_name, _update_names +from pandas.computation.ops import is_const from pandas.computation.common import flatten def _align_core_single_unary_op(term): - if isinstance(term, np.ndarray) and not com.is_series(term): - typ = np.asanyarray + if isinstance(term.value, np.ndarray) and not com.is_series(term.value): + typ = partial(np.asanyarray, dtype=term.value.dtype) else: - typ = type(term) - ret = typ, [term] + typ = type(term.value) + ret = typ, - if not hasattr(term, 'axes'): + if not hasattr(term.value, 'axes'): ret += None, else: - ret += _zip_axes_from_type(typ, term.axes), + ret += _zip_axes_from_type(typ, term.value.axes), return ret @@ -69,33 +68,28 @@ def _maybe_promote_shape(values, naxes): def _any_pandas_objects(terms): """Check a sequence of terms for instances of PandasObject.""" - return any(com.is_pd_obj(term) for term in terms) + return any(com.is_pd_obj(term.value) for term in terms) def _filter_special_cases(f): - @functools.wraps(f) + @wraps(f) def wrapper(terms): - # need to ensure that terms is not an iterator - terms = list(terms) - - ## special cases - # single unary operand if len(terms) == 1: return _align_core_single_unary_op(terms[0]) # only scalars - elif all(np.isscalar(term) for term in terms): - return np.result_type(*terms), terms, None + elif all(term.isscalar for term in terms): + return np.result_type(*(term.value for term in terms)), None # single element ndarrays - all_has_size = all(hasattr(term, 'size') for term in terms) - if (all_has_size and all(term.size == 1 for term in terms)): - return np.result_type(*terms), terms, None + all_has_size = all(hasattr(term.value, 'size') for term in terms) + if (all_has_size and all(term.value.size == 1 for term in terms)): + return np.result_type(*(term.value for term in terms)), None # no pandas so just punt to the evaluator if not _any_pandas_objects(terms): - return np.result_type(*terms), terms, None + return np.result_type(*(term.value for term in terms)), None return f(terms) return wrapper @@ -103,27 +97,28 @@ def wrapper(terms): @_filter_special_cases def _align_core(terms): - term_index = [i for i, term in enumerate(terms) if hasattr(term, 'axes')] - term_dims = [terms[i].ndim for i in term_index] + term_index = [i for i, term in enumerate(terms) if hasattr(term.value, + 'axes')] + term_dims = [terms[i].value.ndim for i in term_index] ndims = pd.Series(dict(zip(term_index, term_dims))) # initial axes are the axes of the largest-axis'd term - biggest = terms[ndims.idxmax()] + biggest = terms[ndims.idxmax()].value typ = biggest._constructor axes = biggest.axes naxes = len(axes) for i in term_index: - for axis, items in enumerate(terms[i].axes): - if com.is_series(terms[i]) and naxes > 1: - axes[naxes - 1] = axes[naxes - 1].join(terms[i].index, + for axis, items in enumerate(terms[i].value.axes): + if com.is_series(terms[i].value) and naxes > 1: + axes[naxes - 1] = axes[naxes - 1].join(terms[i].value.index, how='outer') else: axes[axis] = axes[axis].join(items, how='outer') for i, ndim in ndims.iteritems(): for axis, items in izip(xrange(ndim), axes): - ti = terms[i] # needed here because we modify it in the inner loop + ti = terms[i].value # needed here because we modify it in the inner loop if hasattr(ti, 
'reindex_axis'): transpose = com.is_series(ti) and naxes > 1 @@ -138,31 +133,31 @@ def _align_core(terms): else: r = f() - terms[i] = r + terms[i].update(r) - res = _maybe_promote_shape(terms[i].T if transpose else terms[i], - naxes) + res = _maybe_promote_shape(terms[i].value.T if transpose else + terms[i].value, naxes) res = res.T if transpose else res try: - terms[i] = res.values + v = res.values except AttributeError: - terms[i] = res + v = res + terms[i].update(v) - return typ, terms, _zip_axes_from_type(typ, axes) + return typ, _zip_axes_from_type(typ, axes) def _filter_terms(flat): # numeric literals - literals = filter(lambda string: not com.is_string(string), flat) - literals_set = set(literals) + literals = set(filter(is_const, flat)) # these are strings which are variable names - names = filter(com.is_string, flat) - names_set = set(names) + names = set(flat) - literals - # literals are not names and names are not literals, by definition - if literals_set & names_set: + # literals are not names and names are not literals, so intersection should + # be empty + if literals & names: raise ValueError('literals cannot be names and names cannot be ' 'literals') return names, literals @@ -170,30 +165,20 @@ def _filter_terms(flat): def _align(terms, env): # flatten the parse tree (a nested list) - flat = list(flatten(terms)) + terms = list(flatten(terms)) # separate names and literals - names, literals = _filter_terms(flat) + names, literals = _filter_terms(terms) if not names: # only literals so just promote to a common type return np.result_type(*literals).type, None - # get the variables out - resolve_in_env = partial(_resolve_name, env) - resolved = map(resolve_in_env, names) - # if all resolved variables are numeric scalars - if all(np.isscalar(rsv) for rsv in resolved): - return np.result_type(*resolved).type, None + if all(term.isscalar for term in terms): + return np.result_type(*(term.value for term in terms)).type, None # perform the main alignment - typ, resolved, axes = _align_core(resolved) - - # put the aligned arrays back in the table - _update_names(env, dict(izip(names, resolved))) - - # we need this to reconstruct things after evaluation since we CANNOT - # depend on the array interface + typ, axes = _align_core(terms) return typ, axes @@ -222,7 +207,8 @@ def _reconstruct_object(typ, obj, axes): except AttributeError: pass - if typ != np.asanyarray and issubclass(typ, pd.core.generic.PandasObject): + if (not isinstance(typ, partial) and + issubclass(typ, pd.core.generic.PandasObject)): return typ(obj, **axes) ret_value = typ(obj) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index f6d4ca39788ab..f0ed6b5de9ed6 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -1,9 +1,16 @@ import ast +import sys from functools import partial +import collections + from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops, Mod from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms +from pandas.computation.ops import _resolve_name, Term, Constant + + +Scope = collections.namedtuple('Scope', 'globals locals') class ExprParserError(Exception): @@ -65,10 +72,11 @@ def visit_UnaryOp(self, node): return op(self.visit(node.operand)) def visit_Name(self, node): - return node.id + name = node.id + return Term(_resolve_name(self.env, name), name, self.env) def visit_Num(self, node): - return node.n + return Constant(node.n, self.env) def visit_Compare(self, 
node): ops = node.ops @@ -92,19 +100,29 @@ def visit_Attribute(self, node): raise NotImplementedError("attribute access is not yet supported") def visit_Mod(self, node): - return partial(Mod, env=self.env) + return Mod class Expr(object): """Expr object for pandas """ - def __init__(self, expr, engine, env, truediv): + def __init__(self, expr, engine='numexpr', env=None, truediv=True): self.expr = expr - self._visitor = ExprVisitor(env) + self.env = env or self._get_calling_scope() + self._visitor = ExprVisitor(self.env) self.terms = self.parse() self.engine = engine self.truediv = truediv + def _get_calling_scope(self): + frame = sys._getframe(1) + gbl, lcl = frame.f_globals, frame.f_locals + + try: + return Scope(gbl, lcl) + finally: + del frame + def __call__(self, env): env.locals['truediv'] = self.truediv return self.terms(env) @@ -123,14 +141,16 @@ def parse(self): raise e return visited - def align(self, env): + def align(self): """align a set of Terms""" - return self.terms.align(env) + return self.terms.align(self.env) -def isexpr(s): +def isexpr(s, check_names=True): try: - Expr(s, engine=None) + Expr(s) except SyntaxError: return False + except NameError: + return not check_names return True diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 1a6d3fd1672ba..8c66fd0d122d5 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -3,6 +3,7 @@ import numpy as np from pandas.util.py3compat import PY3 +import pandas.core.common as com _reductions = 'sum', 'prod' @@ -59,19 +60,19 @@ def __init__(self, value, name, env): self.env = env self.type = type(value) - def __iter__(self): - yield self.value - raise StopIteration - def __str__(self): return '{0}({1!r})'.format(self.__class__.__name__, self.name) __repr__ = __str__ - def update(self, env, value): + def update(self, value): _update_name(self.env, self.name, value) self.value = value + @property + def isscalar(self): + return np.isscalar(self.value) + class Constant(Term): def __init__(self, value, env): @@ -112,17 +113,28 @@ def name(self): _binary_ops_dict.update(d) -def _cast(terms, env, dtype): - resolver = partial(_resolve_name, env) - updater = partial(_update_name, env) +def _cast(terms, dtype): dt = np.dtype(dtype) for term in terms: - t = resolver(term) + # cast all the way down the tree since operands must be try: - new_value = t.astype(dt) + _cast(term.operands, dtype) except AttributeError: - new_value = dt.type(t) - updater(term, new_value) + # we've bottomed out so cast + try: + new_value = term.value.astype(dt) + except AttributeError: + new_value = dt.type(term.value) + term.update(new_value) + + +def is_term(obj): + return isinstance(obj, Term) + + +def is_const(obj): + return isinstance(obj, Constant) + class BinOp(Op): """Hold a binary operator and its operands @@ -146,8 +158,9 @@ def __init__(self, op, lhs, rhs): ' operators are {1}'.format(op, keys)) def __repr__(self): - return '{0}(op={1!r}, lhs={2!r}, rhs={3!r})'.format(self.name, self.op, - self.lhs, self.rhs) + return com.pprint_thing('{0}(op={1!r}, lhs={2!r}, ' + 'rhs={3!r})'.format(self.name, self.op, + self.lhs, self.rhs)) __str__ = __repr__ @@ -169,25 +182,22 @@ def __call__(self, env): right = self.rhs # base cases - if not (isinstance(left, basestring) or isinstance(right, basestring)): + if is_term(left) and is_term(right): + res = self.func(left.value, right.value) + elif not is_term(left) and is_term(right): + res = self.func(left, right.value) + elif is_term(left) and not is_term(right): + res = 
self.func(left.value, right) + elif not (is_term(left) or is_term(right)): res = self.func(left, right) - elif isinstance(left, basestring) and not isinstance(right, - basestring): - res = self.func(_resolve_name(env, left), right) - elif not isinstance(left, basestring) and isinstance(right, - basestring): - res = self.func(left, _resolve_name(env, right)) - elif isinstance(left, basestring) and isinstance(right, basestring): - res = self.func(_resolve_name(env, left), _resolve_name(env, - right)) return res class Mod(BinOp): - def __init__(self, lhs, rhs, env=None): + def __init__(self, lhs, rhs): super(Mod, self).__init__('%', lhs, rhs) - _cast((lhs, rhs), env, np.float_) + _cast(self.operands, np.float_) _unary_ops_syms = '+', '-', '~' @@ -218,10 +228,7 @@ def __call__(self, env): except TypeError: operand = self.operand - if isinstance(operand, basestring): - v = _resolve_name(env, operand) - else: - v = operand + v = operand.value if is_term(operand) else operand try: res = self.func(v) @@ -231,5 +238,6 @@ def __call__(self, env): return res def __repr__(self): - return '{0}(op={1!r}, operand={2!r})'.format(self.name, self.op, - self.operand) + return com.pprint_thing('{0}(op={1!r}, ' + 'operand={2!r})'.format(self.name, self.op, + self.operand)) From 54f1897da4c119dec68b5f215981cd12787d5c77 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 19 Jun 2013 23:56:00 -0400 Subject: [PATCH 10/37] ENH: allow an already-parsed expression to be passed to eval --- pandas/computation/engines.py | 2 +- pandas/computation/eval.py | 22 ++++++++++++++-------- pandas/computation/expr.py | 5 +++-- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 4ebb4a15fdee4..342bde7b2beeb 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -48,7 +48,7 @@ def _maybe_promote_shape(values, naxes): axes_slice = [slice(None)] * naxes - # symmetric difference + # symmetric difference of numaxes and ndims slices = nax - ndim if ndims == naxes: diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 298554005d6ed..828ee334d71f9 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -22,14 +22,20 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, frame = sys._getframe(1) try: - # get the globals and locals - gbl, lcl = global_dict or frame.f_globals, local_dict or frame.f_locals - - # shallow copy the scope so we don't overwrite everything - env = Scope(gbl.copy(), lcl.copy()) - - # parse the expression - parsed_expr = Expr(expr, engine, env, truediv) + # parse the expression from a string + if isinstance(expr, basestring): + # get the globals and locals + gbl, lcl = (global_dict or frame.f_globals, + local_dict or frame.f_locals) + + # shallow copy the scope so we don't overwrite everything + env = Scope(gbl.copy(), lcl.copy()) + parsed_expr = Expr(expr, engine, env, truediv) + elif isinstance(expr, Expr): + parsed_expr = expr + else: + raise TypeError("eval only accepts strings and Expr objects, you " + "passed a {0!r}".format(expr.__class__.__name__)) # choose the engine eng = _engines[engine] diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index f0ed6b5de9ed6..63779da24394f 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -115,8 +115,9 @@ def __init__(self, expr, engine='numexpr', env=None, truediv=True): self.truediv = truediv def _get_calling_scope(self): - frame = sys._getframe(1) - gbl, lcl = 
frame.f_globals, frame.f_locals + # call this method **only** in the constructor + frame = sys._getframe(2) + gbl, lcl = frame.f_globals.copy(), frame.f_locals.copy() try: return Scope(gbl, lcl) From e20900ac61f7c22f5b78e6388f519a01eb53c12e Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Tue, 25 Jun 2013 20:30:09 -0400 Subject: [PATCH 11/37] CLN: add automatic scope creating object --- pandas/computation/expr.py | 31 +++++++++++++-------------- pandas/computation/tests/test_eval.py | 27 ++++++++++++++++++----- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 63779da24394f..987f694bf0904 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -1,7 +1,6 @@ import ast import sys from functools import partial -import collections from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops, Mod @@ -10,7 +9,17 @@ from pandas.computation.ops import _resolve_name, Term, Constant -Scope = collections.namedtuple('Scope', 'globals locals') +class Scope(object): + __slots__ = 'globals', 'locals' + + def __init__(self, gbls=None, lcls=None, frame_level=1): + frame = sys._getframe(frame_level) + + try: + self.globals = gbls or frame.f_globals.copy() + self.locals = lcls or frame.f_locals.copy() + finally: + del frame class ExprParserError(Exception): @@ -104,26 +113,15 @@ def visit_Mod(self, node): class Expr(object): - """Expr object for pandas - """ + """Expr object""" def __init__(self, expr, engine='numexpr', env=None, truediv=True): self.expr = expr - self.env = env or self._get_calling_scope() + self.env = env or Scope(frame_level=2) self._visitor = ExprVisitor(self.env) self.terms = self.parse() self.engine = engine self.truediv = truediv - def _get_calling_scope(self): - # call this method **only** in the constructor - frame = sys._getframe(2) - gbl, lcl = frame.f_globals.copy(), frame.f_locals.copy() - - try: - return Scope(gbl, lcl) - finally: - del frame - def __call__(self, env): env.locals['truediv'] = self.truediv return self.terms(env) @@ -154,4 +152,5 @@ def isexpr(s, check_names=True): return False except NameError: return not check_names - return True + else: + return True diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 417fb106f90fa..18fe641db5ed2 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -19,11 +19,12 @@ from pandas.util.testing import makeCustomDataframe as mkdf from pandas.computation.engines import (_engines, _align_core, _reconstruct_object) -from pandas.computation.ops import _binary_ops_dict, _unary_ops_dict +from pandas.computation.ops import _binary_ops_dict, _unary_ops_dict, Term import pandas.computation.expr as expr from pandas.computation.expressions import _USE_NUMEXPR from pandas.computation.eval import Scope from pandas.util.testing import assert_frame_equal, randbool +from pandas.util.py3compat import PY3 def skip_numexpr_engine(engine): @@ -48,7 +49,9 @@ def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2): f1 = _binary_ops_dict[cmp1] f2 = _binary_ops_dict[cmp2] bf = _binary_ops_dict[binop] - typ, (lhs, rhs), axes = _align_core((lhs, rhs)) + env = Scope() + typ, axes = _align_core((Term(lhs, 'lhs', env), Term(rhs, 'rhs', env))) + lhs, rhs = env.locals['lhs'], env.locals['rhs'] return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes) @@ -483,7 +486,7 @@ def check_series_frame_commutativity(engine, r_idx_type, c_idx_type, op, df) -INDEX_TYPES = 
'i', 'f', 's', 'u', 'dt', # 'p' +INDEX_TYPES = 'i', 'f', 's', 'u', # 'dt', # 'p' @slow @@ -562,7 +565,21 @@ def test_truediv(): def check_truediv(engine): - s = randn(10) + s = np.array([1]) + ex = 's / 1' + + if PY3: + res = pd.eval(ex, truediv=False) + assert_array_equal(res, np.array([1.0])) + + res = pd.eval(ex, truediv=True) + assert_array_equal(res, np.array([1.0])) + else: + res = pd.eval(ex, truediv=False) + assert_array_equal(res, np.array([1])) + + res = pd.eval(ex, truediv=True) + assert_array_equal(res, np.array([1.0])) __var_s = randn(10) @@ -583,7 +600,7 @@ def check_is_expr(engine): valid = 's + 1' invalid = 's +' assert_true(expr.isexpr(valid, check_names=True)) - assert_false(expr.isexpr(valid, check_names=False)) + assert_true(expr.isexpr(valid, check_names=False)) assert_false(expr.isexpr(invalid, check_names=False)) assert_false(expr.isexpr(invalid, check_names=True)) From 51d80f6ca4febdcef4c11c65fa77b6861dae10bf Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 28 Jun 2013 16:15:37 -0400 Subject: [PATCH 12/37] CLN: make the environment an implementation detail --- pandas/computation/engines.py | 11 ++++--- pandas/computation/eval.py | 61 +++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 342bde7b2beeb..39155ad112847 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -235,11 +235,12 @@ def convert(self): """Convert an expression for evaluation.""" pass - def evaluate(self, env): + def evaluate(self): if not self._is_aligned: - self.result_type, self.aligned_axes = _align(self.expr.terms, env) + self.result_type, self.aligned_axes = _align(self.expr.terms, + self.expr.env) - res = self._evaluate(env) + res = self._evaluate(self.expr.env) return _reconstruct_object(self.result_type, res, self.aligned_axes) @property @@ -284,8 +285,8 @@ def __init__(self, expr): def convert(self): pass - def evaluate(self, env): - return self.expr(env) + def evaluate(self): + return self.expr(self.expr.env) def _evaluate(self, env): pass diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 828ee334d71f9..7788eddf96f87 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -11,39 +11,50 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, global_dict=None): + """Evaluate a Python expression as a string. 
+ + Parameters + ---------- + expr : string or Expr object + engine : string, optional, default 'numexpr' + The engine to use to evaluate the passed expression + truediv : bool, optional, default True + local_dict : dict or None, optional, default None + global_dict : dict or None, optional, default None + + Returns + ------- + obj : ndarray, scalar, DataFrame, Series, or Panel + """ # make sure we're passed a valid engine if not engine in _engines: raise KeyError('Invalid engine {0} passed, valid engines are' ' {1}'.format(_engines.keys())) - # 1 up in the call stack for locals/globals; see the documentation for the - # inspect module for why you must decrease the refcount of frame at all - # costs - frame = sys._getframe(1) + eng = _engines[engine] + + if isinstance(expr, basestring): + frame = sys._getframe(1) - try: - # parse the expression from a string - if isinstance(expr, basestring): - # get the globals and locals - gbl, lcl = (global_dict or frame.f_globals, - local_dict or frame.f_locals) + # get the globals and locals + gbl, lcl = (global_dict or frame.f_globals, + local_dict or frame.f_locals) - # shallow copy the scope so we don't overwrite everything + try: + # shallow copy the scope so we don't overwrite anything env = Scope(gbl.copy(), lcl.copy()) - parsed_expr = Expr(expr, engine, env, truediv) - elif isinstance(expr, Expr): - parsed_expr = expr - else: - raise TypeError("eval only accepts strings and Expr objects, you " - "passed a {0!r}".format(expr.__class__.__name__)) - - # choose the engine - eng = _engines[engine] - - # construct the engine and evaluate - ret = eng(parsed_expr).evaluate(env) - finally: - del frame + finally: + del frame + parsed_expr = Expr(expr, engine, env, truediv) + elif isinstance(expr, Expr): + parsed_expr = expr + else: + raise TypeError("eval only accepts strings and Expr objects, you " + "passed a {0!r}".format(expr.__class__.__name__)) + + + # construct the engine and evaluate + ret = eng(parsed_expr).evaluate() # sanity check for a number if np.isscalar(ret): From 038d79c25cf2c8968a176e37e8b6f2d14e44414a Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 28 Jun 2013 16:26:58 -0400 Subject: [PATCH 13/37] DOC: add docstring to eval --- pandas/computation/eval.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 7788eddf96f87..38248c26f88e3 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -11,20 +11,44 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, global_dict=None): - """Evaluate a Python expression as a string. + """Evaluate a Python expression as a string using various backends. + + The following arithmetic operations are supported: +, -, *, /, **, %, // + (python engine only) along with the following boolean operations: | (or), & + (and), and ~ (not). All Pandas objects are supported and behave as they + would with in-Python evaluation. Parameters ---------- expr : string or Expr object - engine : string, optional, default 'numexpr' - The engine to use to evaluate the passed expression + The expression to evaluate. This can be either a string or an ``Expr`` + object. + engine : string, optional, default 'numexpr', {'python', 'numexpr', 'pytables'} + The engine used to evaluate the expression. Supported engines are + + - 'numexpr': This default engine evaluates pandas objects using numexpr + for large speed ups in complex expressions with large + frames. 
+ - 'python': Performs operations as if you had eval'd in top level + python + - 'pytables': Engine used for evaluating expressions for selection of + objects from PyTables HDF5 tables. + truediv : bool, optional, default True + Whether to use true division, like in Python >= 3 local_dict : dict or None, optional, default None + A dictionary of local variables, taken from locals() by default. global_dict : dict or None, optional, default None + A dictionary of global variables, taken from globals() by default. Returns ------- obj : ndarray, scalar, DataFrame, Series, or Panel + + Notes + ----- + The benefits of using ``eval`` are that very large frames that are terms in + long expressions are sped up, sometimes by as much as 10x. """ # make sure we're passed a valid engine if not engine in _engines: From 599cf32bdaaaf65e26478a6a9ae2e669f6ab7014 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 28 Jun 2013 17:09:42 -0400 Subject: [PATCH 14/37] CLN: cleanup pytables.py a bit --- pandas/io/pytables.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index fdb86c43b7160..a53907c518aab 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -219,7 +219,7 @@ def read_hdf(path_or_buf, key, **kwargs): # a passed store; user controls open/close f(path_or_buf, False) -class HDFStore(StringMixin): +class HDFStore(object): """ dict-like IO interface for storing pandas objects in PyTables format. @@ -421,7 +421,8 @@ def get(self, key): raise KeyError('No object named %s in the file' % key) return self._read_group(group) - def select(self, key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, auto_close=False, **kwargs): + def select(self, key, where=None, start=None, stop=None, columns=None, + iterator=False, chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas object stored in file, optionally based on where criteria @@ -448,14 +449,18 @@ def select(self, key, where=None, start=None, stop=None, columns=None, iterator= # what we are actually going to do for a chunk def func(_start, _stop): - return s.read(where=where, start=_start, stop=_stop, columns=columns, **kwargs) + return s.read(where=where, start=_start, stop=_stop, + columns=columns, **kwargs) if iterator or chunksize is not None: if not s.is_table: raise TypeError("can only use an iterator or chunksize on a table") - return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close) + return TableIterator(self, func, nrows=s.nrows, start=start, + stop=stop, chunksize=chunksize, + auto_close=auto_close) - return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, auto_close=auto_close).get_values() + return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, + auto_close=auto_close).get_values() def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): """ @@ -1620,6 +1625,9 @@ def __unicode__(self): return "%-12.12s (shape->%s)" % (self.pandas_type,s) return self.pandas_type + def __str__(self): + return self.__repr__() + def set_object_info(self): """ set my pandas type & version """ self.attrs.pandas_type = self.pandas_kind From ea769e664d32e413bd44fedd5849ab077e8812f3 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Sat, 29 Jun 2013 11:08:00 -0400 Subject: [PATCH 15/37] CLN: clean up engines --- pandas/computation/engines.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff 
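To make the ``eval`` docstring added above concrete, a short usage sketch; this is illustrative only and assumes numexpr is installed so the default engine is available:

import numpy as np
import pandas as pd
from pandas import DataFrame

df = DataFrame(np.random.randn(10, 5))
df2 = DataFrame(np.random.randn(10, 5))

# the default numexpr engine and the pure-python engine
# should agree on ordinary arithmetic
res = pd.eval('df + df2 ** 2')
expected = pd.eval('df + df2 ** 2', engine='python')

# names are resolved from the calling scope by default, or can be
# supplied explicitly via local_dict/global_dict
three = pd.eval('a + b', local_dict={'a': 1, 'b': 2})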
--git a/pandas/computation/engines.py b/pandas/computation/engines.py index 39155ad112847..64582192a9874 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -108,17 +108,17 @@ def _align_core(terms): axes = biggest.axes naxes = len(axes) - for i in term_index: - for axis, items in enumerate(terms[i].value.axes): - if com.is_series(terms[i].value) and naxes > 1: - axes[naxes - 1] = axes[naxes - 1].join(terms[i].value.index, - how='outer') + for term in (terms[i] for i in term_index): + for axis, items in enumerate(term.value.axes): + if com.is_series(term.value) and naxes > 1: + ax, itm = naxes - 1, term.value.index else: - axes[axis] = axes[axis].join(items, how='outer') + ax, itm = axis, items + axes[ax] = axes[ax].join(itm, how='outer') for i, ndim in ndims.iteritems(): for axis, items in izip(xrange(ndim), axes): - ti = terms[i].value # needed here because we modify it in the inner loop + ti = terms[i].value if hasattr(ti, 'reindex_axis'): transpose = com.is_series(ti) and naxes > 1 From ff78c08139f2c5f7e632827f313ccbf88aba9100 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 3 Jul 2013 21:21:49 -0400 Subject: [PATCH 16/37] CLN: clean up eval and have the Scope instance auto create the scope if none exists --- pandas/computation/eval.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 38248c26f88e3..591993bc4f228 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -58,17 +58,9 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, eng = _engines[engine] if isinstance(expr, basestring): - frame = sys._getframe(1) - - # get the globals and locals - gbl, lcl = (global_dict or frame.f_globals, - local_dict or frame.f_locals) - - try: - # shallow copy the scope so we don't overwrite anything - env = Scope(gbl.copy(), lcl.copy()) - finally: - del frame + # need to go 2 up in the call stack from the constructor since we want + # the calling scope's variables + env = Scope(global_dict, local_dict, frame_level=2) parsed_expr = Expr(expr, engine, env, truediv) elif isinstance(expr, Expr): parsed_expr = expr @@ -80,7 +72,7 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, # construct the engine and evaluate ret = eng(parsed_expr).evaluate() - # sanity check for a number + # sanity check for a number TODO: eventually take out if np.isscalar(ret): if not isinstance(ret, (np.number, numbers.Number, np.bool_, bool)): raise TypeError('scalar result must be numeric or bool, type is ' From f9f7fd7b6f841eae34ac1795f02320646b15708c Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 3 Jul 2013 21:29:19 -0400 Subject: [PATCH 17/37] CLN: add six.string_types checking instead of basestring --- pandas/computation/eval.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index 591993bc4f228..b7d15d1d009bc 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -5,6 +5,8 @@ import numpy as np +import six + from pandas.computation.expr import Expr, Scope from pandas.computation.engines import _engines @@ -57,7 +59,7 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, eng = _engines[engine] - if isinstance(expr, basestring): + if isinstance(expr, six.string_types): # need to go 2 up in the call stack from the constructor since we want # the calling scope's variables env = Scope(global_dict, local_dict, 
frame_level=2) @@ -72,9 +74,11 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, # construct the engine and evaluate ret = eng(parsed_expr).evaluate() - # sanity check for a number TODO: eventually take out + # sanity check for a number + # TODO: eventually take out + # TODO: pytables engine will probably need a string check if np.isscalar(ret): - if not isinstance(ret, (np.number, numbers.Number, np.bool_, bool)): - raise TypeError('scalar result must be numeric or bool, type is ' - '{0!r}'.format(ret.__class__.__name__)) + if not isinstance(ret, (np.number, np.bool_, numbers.Number)): + raise TypeError('scalar result must be numeric or bool, passed ' + 'type is {0!r}'.format(ret.__class__.__name__)) return ret From 48eff13c0418c146dbca43f4893d027d0624ffe2 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Wed, 3 Jul 2013 22:13:08 -0400 Subject: [PATCH 18/37] TST: clean up some tests, add minor assertions where none existed --- pandas/io/tests/test_pytables.py | 33 +++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 00d8089ad2ee7..6737408081f3d 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -1,8 +1,9 @@ import nose import unittest -import os import sys +import os import warnings +from contextlib import contextmanager import datetime import numpy as np @@ -19,7 +20,6 @@ from pandas import concat, Timestamp from pandas.util import py3compat -from numpy.testing.decorators import slow try: import tables @@ -36,12 +36,12 @@ # contextmanager to ensure the file cleanup def safe_remove(path): if path is not None: - import os try: os.remove(path) except: pass + def safe_close(store): try: if store is not None: @@ -49,7 +49,6 @@ def safe_close(store): except: pass -from contextlib import contextmanager @contextmanager def ensure_clean(path, mode='a', complevel=None, complib=None, @@ -620,7 +619,6 @@ def test_append_with_different_block_ordering(self): store.append('df',df) - def test_ndim_indexables(self): """ test using ndim tables in new ways""" @@ -1011,6 +1009,7 @@ def test_big_table_frame(self): store.append('df', df) rows = store.root.df.table.nrows recons = store.select('df') + assert isinstance(recons, DataFrame) print ("\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x)) @@ -1064,7 +1063,7 @@ def test_big_put_frame(self): with ensure_clean(self.path, mode='w') as store: start_time = time.time() - store = HDFStore(fn, mode='w') + store = HDFStore(self.path, mode='w') store.put('df', df) print (df.get_dtype_counts()) @@ -1092,6 +1091,7 @@ def test_big_table_panel(self): store.append('wp', wp) rows = store.root.wp.table.nrows recons = store.select('wp') + assert isinstance(recons, Panel) print ("\nbig_table panel [%s] -> %5.2f" % (rows, time.time() - x)) @@ -1254,7 +1254,6 @@ def test_table_values_dtypes_roundtrip(self): expected.sort() tm.assert_series_equal(result,expected) - def test_table_mixed_dtypes(self): # frame @@ -2352,7 +2351,6 @@ def test_string_select(self): expected = df[df.int!=2] assert_frame_equal(result,expected) - def test_read_column(self): df = tm.makeTimeDataFrame() @@ -2580,7 +2578,6 @@ def _check_double_roundtrip(self, obj, comparator, compression=False, again = store['obj'] comparator(again, obj, **kwargs) - def _check_roundtrip_table(self, obj, comparator, compression=False): options = {} if compression: @@ -2597,6 +2594,7 @@ def test_pytables_native_read(self): try: store = 
HDFStore(tm.get_data_path('legacy_hdf/pytables_native.h5'), 'r') d2 = store['detector/readout'] + assert isinstance(d2, DataFrame) finally: safe_close(store) @@ -2604,6 +2602,7 @@ def test_pytables_native_read(self): store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native2.h5'), 'r') str(store) d1 = store['detector'] + assert isinstance(d1, DataFrame) finally: safe_close(store) @@ -2653,11 +2652,18 @@ def test_legacy_0_10_read(self): def test_legacy_0_11_read(self): # legacy from 0.11 try: - store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table_0.11.h5'), 'r') + path = os.path.join('legacy_hdf', 'legacy_table_0.11.h5') + store = HDFStore(tm.get_data_path(path), 'r') str(store) + assert 'df' in store + assert 'df1' in store + assert 'mi' in store df = store.select('df') df1 = store.select('df1') mi = store.select('mi') + assert isinstance(df, DataFrame) + assert isinstance(df1, DataFrame) + assert isinstance(mi, DataFrame) finally: safe_close(store) @@ -2665,10 +2671,9 @@ def test_copy(self): def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): try: - import os - if f is None: - f = tm.get_data_path('legacy_hdf/legacy_0.10.h5') + f = tm.get_data_path(os.path.join('legacy_hdf', + 'legacy_0.10.h5')) store = HDFStore(f, 'r') @@ -2738,6 +2743,7 @@ def test_legacy_table_write(self): df = DataFrame(dict(A = 'foo', B = 'bar'),index=range(10)) store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 }) + store.append('wp', wp) store.close() @@ -2824,6 +2830,7 @@ def _test_sort(obj): else: raise ValueError('type not supported here') + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From d87f0271669824091ec3822956011bc0e2b55900 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 08:24:45 -0400 Subject: [PATCH 19/37] CLN: clean up frame.py a bit --- pandas/core/frame.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a8bb74f86a43e..7f0a8492a4403 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5681,6 +5681,7 @@ def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): return create_block_manager_from_arrays(arrays, arr_names, axes) + def extract_index(data): from pandas.core.index import _union_indexes @@ -5941,6 +5942,7 @@ def _homogenize(data, index, dtype=None): return homogenized + def _from_nested_dict(data): # TODO: this should be seriously cythonized new_data = OrderedDict() From 5b58a08d77141ef7ef1faab4d33089bd8f71c64c Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 08:25:17 -0400 Subject: [PATCH 20/37] CLN: clean up pytables arguments a bit --- pandas/core/base.py | 1 + pandas/io/pytables.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 6122e78fa8bce..2caaf00723824 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -39,6 +39,7 @@ def __repr__(self): """ return str(self) + class PandasObject(StringMixin): """baseclass for various pandas objects""" diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a53907c518aab..4a538b22bf939 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -182,7 +182,8 @@ def get_store(path, mode='a', complevel=None, complib=None, ### interface to/from ### -def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, append=None, **kwargs): +def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, + 
append=None, **kwargs): """ store this object, close it if we opened it """ if append: f = lambda store: store.append(key, value, **kwargs) @@ -190,7 +191,8 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, app f = lambda store: store.put(key, value, **kwargs) if isinstance(path_or_buf, basestring): - with get_store(path_or_buf, mode=mode, complevel=complevel, complib=complib) as store: + with get_store(path_or_buf, mode=mode, complevel=complevel, + complib=complib) as store: f(store) else: f(path_or_buf) From 7482a277a8c0309faec6481d0a4885670deb7369 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 12:45:54 -0400 Subject: [PATCH 21/37] CLN: use shiny new string mixin to refactor repring --- pandas/computation/ops.py | 60 ++++++++++++++------------- pandas/computation/tests/test_eval.py | 2 +- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 8c66fd0d122d5..26774c17959fb 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -1,13 +1,15 @@ import operator as op -from functools import partial import numpy as np from pandas.util.py3compat import PY3 import pandas.core.common as com +from pandas.core.base import StringMixin _reductions = 'sum', 'prod' -_mathops = 'sin', 'cos', 'tan' +_mathops = ('sin', 'cos', 'exp', 'log', 'expm1', 'log1p', 'pow', 'div', 'sqrt', + 'inv', 'sinh', 'cosh', 'tanh', 'arcsin', 'arccos', 'arctan', + 'arccosh', 'arcsinh', 'arctanh', 'arctan2', 'abs') class OperatorError(Exception): @@ -47,23 +49,21 @@ def _update_name(env, key, value): raise NameError('{0!r} is undefined'.format(key)) -def _update_names(env, mapping): - updater = partial(_update_name, env) - for key, value in mapping.iteritems(): - updater(key, value) +class NamedObjectMixin(object): + @property + def typename(self): + return com.pprint_thing(self.__class__.__name__) -class Term(object): - def __init__(self, value, name, env): - self.value = value +class Term(StringMixin, NamedObjectMixin): + def __init__(self, name, env): self.name = name + self.value = _resolve_name(env, name) self.env = env - self.type = type(value) - - def __str__(self): - return '{0}({1!r})'.format(self.__class__.__name__, self.name) + self.type = type(self.value) - __repr__ = __str__ + def __unicode__(self): + return com.pprint_thing('{0}({1!r})'.format(self.typename, self.name)) def update(self, value): _update_name(self.env, self.name, value) @@ -76,10 +76,10 @@ def isscalar(self): class Constant(Term): def __init__(self, value, env): - super(Constant, self).__init__(value, value, env) + super(Constant, self).__init__(value, env) -class Op(object): +class Op(NamedObjectMixin, StringMixin): """Hold an operator of unknown arity """ def __init__(self, op, operands): @@ -89,9 +89,13 @@ def __init__(self, op, operands): def __iter__(self): return iter(self.operands) - @property - def name(self): - return self.__class__.__name__ + def __unicode__(self): + op = 'op={1!r}'.format(self.op) + operands = ', '.join('opr_{i}={opr}'.format(i=i, opr=opr) + for i, opr in enumerate(self.operands)) + return com.pprint_thing('{0}({op}, ' + '{operands})'.format(self.name, op=op, + operands=operands)) _cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=' @@ -113,14 +117,14 @@ def name(self): _binary_ops_dict.update(d) -def _cast(terms, dtype): +def _cast_inplace(terms, dtype): dt = np.dtype(dtype) for term in terms: # cast all the way down the tree since operands must be try: - _cast(term.operands, dtype) + 
_cast_inplace(term.operands, dtype) except AttributeError: - # we've bottomed out so cast + # we've bottomed out so actually do the cast try: new_value = term.value.astype(dt) except AttributeError: @@ -157,13 +161,11 @@ def __init__(self, op, lhs, rhs): raise BinaryOperatorError('Invalid binary operator {0}, valid' ' operators are {1}'.format(op, keys)) - def __repr__(self): + def __unicode__(self): return com.pprint_thing('{0}(op={1!r}, lhs={2!r}, ' - 'rhs={3!r})'.format(self.name, self.op, + 'rhs={3!r})'.format(self.typename, self.op, self.lhs, self.rhs)) - __str__ = __repr__ - def __call__(self, env): # handle truediv if self.op == '/' and env.locals['truediv']: @@ -197,7 +199,7 @@ def __call__(self, env): class Mod(BinOp): def __init__(self, lhs, rhs): super(Mod, self).__init__('%', lhs, rhs) - _cast(self.operands, np.float_) + _cast_inplace(self.operands, np.float_) _unary_ops_syms = '+', '-', '~' @@ -237,7 +239,7 @@ def __call__(self, env): return res - def __repr__(self): + def __unicode__(self): return com.pprint_thing('{0}(op={1!r}, ' - 'operand={2!r})'.format(self.name, self.op, + 'operand={2!r})'.format(self.typename, self.op, self.operand)) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 18fe641db5ed2..15509e2e489df 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -50,7 +50,7 @@ def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2): f2 = _binary_ops_dict[cmp2] bf = _binary_ops_dict[binop] env = Scope() - typ, axes = _align_core((Term(lhs, 'lhs', env), Term(rhs, 'rhs', env))) + typ, axes = _align_core((Term('lhs', env), Term('rhs', env))) lhs, rhs = env.locals['lhs'], env.locals['rhs'] return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes) From 0d40fe182af58cceda1d4fcc99a6556ac83293d3 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:02:16 -0400 Subject: [PATCH 22/37] CLN: move align to its own file --- pandas/computation/align.py | 219 ++++++++++++++++++++++++++++++++++ pandas/computation/engines.py | 218 +-------------------------------- 2 files changed, 220 insertions(+), 217 deletions(-) create mode 100644 pandas/computation/align.py diff --git a/pandas/computation/align.py b/pandas/computation/align.py new file mode 100644 index 0000000000000..f2bf11d41e185 --- /dev/null +++ b/pandas/computation/align.py @@ -0,0 +1,219 @@ +from functools import partial, wraps +from itertools import izip + +import numpy as np + +import pandas as pd +import pandas.core.common as com +from pandas.computation.ops import is_const +from pandas.computation.common import flatten + + +def _align_core_single_unary_op(term): + if isinstance(term.value, np.ndarray) and not com.is_series(term.value): + typ = partial(np.asanyarray, dtype=term.value.dtype) + else: + typ = type(term.value) + ret = typ, + + if not hasattr(term.value, 'axes'): + ret += None, + else: + ret += _zip_axes_from_type(typ, term.value.axes), + return ret + + +def _zip_axes_from_type(typ, new_axes): + axes = {} + for ax_ind, ax_name in typ._AXIS_NAMES.iteritems(): + axes[ax_name] = new_axes[ax_ind] + return axes + + +def _maybe_promote_shape(values, naxes): + # test to see if we have an array else leave since must be a number + if not isinstance(values, np.ndarray): + return values + + ndims = values.ndim + if ndims > naxes: + raise AssertionError('cannot have more dims than axes, ' + '{0} > {1}'.format(ndims, naxes)) + if ndims == naxes: + return values + + ndim = set(xrange(ndims)) + nax = 
set(xrange(naxes)) + + axes_slice = [slice(None)] * naxes + + # symmetric difference of numaxes and ndims + slices = nax - ndim + + if ndims == naxes: + if slices: + raise AssertionError('slices should be empty if ndims == naxes ' + '{0}'.format(slices)) + else: + if not slices: + raise AssertionError('slices should NOT be empty if ndim != naxes ' + '{0}'.format(slices)) + + for sl in slices: + axes_slice[sl] = np.newaxis + + return values[tuple(axes_slice)] + + +def _any_pandas_objects(terms): + """Check a sequence of terms for instances of PandasObject.""" + return any(com.is_pd_obj(term.value) for term in terms) + + +def _filter_special_cases(f): + @wraps(f) + def wrapper(terms): + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) + + # only scalars + elif all(term.isscalar for term in terms): + return np.result_type(*(term.value for term in terms)), None + + # single element ndarrays + all_has_size = all(hasattr(term.value, 'size') for term in terms) + if (all_has_size and all(term.value.size == 1 for term in terms)): + return np.result_type(*(term.value for term in terms)), None + + # no pandas so just punt to the evaluator + if not _any_pandas_objects(terms): + return np.result_type(*(term.value for term in terms)), None + + return f(terms) + return wrapper + + +@_filter_special_cases +def _align_core(terms): + term_index = [i for i, term in enumerate(terms) if hasattr(term.value, + 'axes')] + term_dims = [terms[i].value.ndim for i in term_index] + ndims = pd.Series(dict(zip(term_index, term_dims))) + + # initial axes are the axes of the largest-axis'd term + biggest = terms[ndims.idxmax()].value + typ = biggest._constructor + axes = biggest.axes + naxes = len(axes) + + for term in (terms[i] for i in term_index): + for axis, items in enumerate(term.value.axes): + if com.is_series(term.value) and naxes > 1: + ax, itm = naxes - 1, term.value.index + else: + ax, itm = axis, items + axes[ax] = axes[ax].join(itm, how='outer') + + for i, ndim in ndims.iteritems(): + for axis, items in izip(xrange(ndim), axes): + ti = terms[i].value + + if hasattr(ti, 'reindex_axis'): + transpose = com.is_series(ti) and naxes > 1 + + if transpose: + f = partial(ti.reindex, index=axes[naxes - 1], copy=False) + else: + f = partial(ti.reindex_axis, items, axis=axis, copy=False) + + if pd.lib.is_bool_array(ti.values): + r = f(fill_value=True) + else: + r = f() + + terms[i].update(r) + + res = _maybe_promote_shape(terms[i].value.T if transpose else + terms[i].value, naxes) + res = res.T if transpose else res + + try: + v = res.values + except AttributeError: + v = res + terms[i].update(v) + + return typ, _zip_axes_from_type(typ, axes) + + +def _filter_terms(flat): + # numeric literals + literals = set(filter(is_const, flat)) + + # these are strings which are variable names + names = set(flat) - literals + + # literals are not names and names are not literals, so intersection should + # be empty + if literals & names: + raise ValueError('literals cannot be names and names cannot be ' + 'literals') + return names, literals + + +def _align(terms, env): + # flatten the parse tree (a nested list) + terms = list(flatten(terms)) + + # separate names and literals + names, literals = _filter_terms(terms) + + if not names: # only literals so just promote to a common type + return np.result_type(*literals).type, None + + # if all resolved variables are numeric scalars + if all(term.isscalar for term in terms): + return np.result_type(*(term.value for term in terms)).type, None + + 
# perform the main alignment + typ, axes = _align_core(terms) + return typ, axes + + +def _reconstruct_object(typ, obj, axes): + """Reconstruct an object given its type, raw value, and possibly empty + (None) axes. + + Parameters + ---------- + typ : object + A type + obj : object + The value to use in the type constructor + axes : dict + The axes to use to construct the resulting pandas object + + Returns + ------- + reconst : typ + An object of type ``typ`` with the value `obj` and possible axes + `axes`. + """ + try: + # handle numpy dtypes + typ = typ.type + except AttributeError: + pass + + if (not isinstance(typ, partial) and + issubclass(typ, pd.core.generic.PandasObject)): + return typ(obj, **axes) + + ret_value = typ(obj) + + try: + return ret_value.item() + except (AttributeError, ValueError): + return ret_value + diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 64582192a9874..db6beb87da3a5 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -1,222 +1,6 @@ import abc -from functools import partial, wraps -from itertools import izip -import numpy as np - -import pandas as pd -import pandas.core.common as com -from pandas.computation.ops import is_const -from pandas.computation.common import flatten - - -def _align_core_single_unary_op(term): - if isinstance(term.value, np.ndarray) and not com.is_series(term.value): - typ = partial(np.asanyarray, dtype=term.value.dtype) - else: - typ = type(term.value) - ret = typ, - - if not hasattr(term.value, 'axes'): - ret += None, - else: - ret += _zip_axes_from_type(typ, term.value.axes), - return ret - - -def _zip_axes_from_type(typ, new_axes): - axes = {} - for ax_ind, ax_name in typ._AXIS_NAMES.iteritems(): - axes[ax_name] = new_axes[ax_ind] - return axes - - -def _maybe_promote_shape(values, naxes): - # test to see if we have an array else leave since must be a number - if not isinstance(values, np.ndarray): - return values - - ndims = values.ndim - if ndims > naxes: - raise AssertionError('cannot have more dims than axes, ' - '{0} > {1}'.format(ndims, naxes)) - if ndims == naxes: - return values - - ndim = set(xrange(ndims)) - nax = set(xrange(naxes)) - - axes_slice = [slice(None)] * naxes - - # symmetric difference of numaxes and ndims - slices = nax - ndim - - if ndims == naxes: - if slices: - raise AssertionError('slices should be empty if ndims == naxes ' - '{0}'.format(slices)) - else: - if not slices: - raise AssertionError('slices should NOT be empty if ndim != naxes ' - '{0}'.format(slices)) - - for sl in slices: - axes_slice[sl] = np.newaxis - - return values[tuple(axes_slice)] - - -def _any_pandas_objects(terms): - """Check a sequence of terms for instances of PandasObject.""" - return any(com.is_pd_obj(term.value) for term in terms) - - -def _filter_special_cases(f): - @wraps(f) - def wrapper(terms): - # single unary operand - if len(terms) == 1: - return _align_core_single_unary_op(terms[0]) - - # only scalars - elif all(term.isscalar for term in terms): - return np.result_type(*(term.value for term in terms)), None - - # single element ndarrays - all_has_size = all(hasattr(term.value, 'size') for term in terms) - if (all_has_size and all(term.value.size == 1 for term in terms)): - return np.result_type(*(term.value for term in terms)), None - - # no pandas so just punt to the evaluator - if not _any_pandas_objects(terms): - return np.result_type(*(term.value for term in terms)), None - - return f(terms) - return wrapper - - -@_filter_special_cases -def 
_align_core(terms): - term_index = [i for i, term in enumerate(terms) if hasattr(term.value, - 'axes')] - term_dims = [terms[i].value.ndim for i in term_index] - ndims = pd.Series(dict(zip(term_index, term_dims))) - - # initial axes are the axes of the largest-axis'd term - biggest = terms[ndims.idxmax()].value - typ = biggest._constructor - axes = biggest.axes - naxes = len(axes) - - for term in (terms[i] for i in term_index): - for axis, items in enumerate(term.value.axes): - if com.is_series(term.value) and naxes > 1: - ax, itm = naxes - 1, term.value.index - else: - ax, itm = axis, items - axes[ax] = axes[ax].join(itm, how='outer') - - for i, ndim in ndims.iteritems(): - for axis, items in izip(xrange(ndim), axes): - ti = terms[i].value - - if hasattr(ti, 'reindex_axis'): - transpose = com.is_series(ti) and naxes > 1 - - if transpose: - f = partial(ti.reindex, index=axes[naxes - 1], copy=False) - else: - f = partial(ti.reindex_axis, items, axis=axis, copy=False) - - if pd.lib.is_bool_array(ti.values): - r = f(fill_value=True) - else: - r = f() - - terms[i].update(r) - - res = _maybe_promote_shape(terms[i].value.T if transpose else - terms[i].value, naxes) - res = res.T if transpose else res - - try: - v = res.values - except AttributeError: - v = res - terms[i].update(v) - - return typ, _zip_axes_from_type(typ, axes) - - -def _filter_terms(flat): - # numeric literals - literals = set(filter(is_const, flat)) - - # these are strings which are variable names - names = set(flat) - literals - - # literals are not names and names are not literals, so intersection should - # be empty - if literals & names: - raise ValueError('literals cannot be names and names cannot be ' - 'literals') - return names, literals - - -def _align(terms, env): - # flatten the parse tree (a nested list) - terms = list(flatten(terms)) - - # separate names and literals - names, literals = _filter_terms(terms) - - if not names: # only literals so just promote to a common type - return np.result_type(*literals).type, None - - # if all resolved variables are numeric scalars - if all(term.isscalar for term in terms): - return np.result_type(*(term.value for term in terms)).type, None - - # perform the main alignment - typ, axes = _align_core(terms) - return typ, axes - - -def _reconstruct_object(typ, obj, axes): - """Reconstruct an object given its type, raw value, and possibly empty - (None) axes. - - Parameters - ---------- - typ : object - A type - obj : object - The value to use in the type constructor - axes : dict - The axes to use to construct the resulting pandas object - - Returns - ------- - reconst : typ - An object of type ``typ`` with the value `obj` and possible axes - `axes`. 
- """ - try: - # handle numpy dtypes - typ = typ.type - except AttributeError: - pass - - if (not isinstance(typ, partial) and - issubclass(typ, pd.core.generic.PandasObject)): - return typ(obj, **axes) - - ret_value = typ(obj) - - try: - return ret_value.item() - except (AttributeError, ValueError): - return ret_value +from pandas.computation.align import _align, _reconstruct_object class AbstractEngine(object): From 87957d24f08f09f2f4a8574e435d4a9dad75ec55 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:06:20 -0400 Subject: [PATCH 23/37] CLN: clean up and use new stringmixin for Expr --- pandas/computation/expr.py | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 987f694bf0904..777ac2a03beea 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -2,11 +2,11 @@ import sys from functools import partial - -from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops, Mod +from pandas.core.base import StringMixin +from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms -from pandas.computation.ops import _resolve_name, Term, Constant +from pandas.computation.ops import Term, Constant class Scope(object): @@ -51,8 +51,8 @@ def __init__(self, env): def visit(self, node): if not (isinstance(node, ast.AST) or isinstance(node, basestring)): - raise AssertionError('"node" must be an AST node or a string, you' - ' passed a(n) {0}'.format(node.__class__)) + raise TypeError('"node" must be an AST node or a string, you' + ' passed a(n) {0}'.format(node.__class__)) if isinstance(node, basestring): node = ast.fix_missing_locations(ast.parse(node)) return super(ExprVisitor, self).visit(node) @@ -81,8 +81,7 @@ def visit_UnaryOp(self, node): return op(self.visit(node.operand)) def visit_Name(self, node): - name = node.id - return Term(_resolve_name(self.env, name), name, self.env) + return Term(node.id, self.env) def visit_Num(self, node): return Constant(node.n, self.env) @@ -108,16 +107,14 @@ def visit_Call(self, node): def visit_Attribute(self, node): raise NotImplementedError("attribute access is not yet supported") - def visit_Mod(self, node): - return Mod - -class Expr(object): +class Expr(StringMixin): """Expr object""" - def __init__(self, expr, engine='numexpr', env=None, truediv=True): + def __init__(self, expr, engine='numexpr', env=None, truediv=True, + parsing='strict'): self.expr = expr self.env = env or Scope(frame_level=2) - self._visitor = ExprVisitor(self.env) + self._visitor = ExprVisitor(self.env, parsing) self.terms = self.parse() self.engine = engine self.truediv = truediv @@ -126,19 +123,12 @@ def __call__(self, env): env.locals['truediv'] = self.truediv return self.terms(env) - def __repr__(self): - return '{0} -> {1}'.format(self.expr, self.terms) - - def __str__(self): - return self.expr + def __unicode__(self): + return unicode(self.terms) def parse(self): """return a Termset""" - try: - visited = self._visitor.visit(self.expr) - except SyntaxError as e: - raise e - return visited + return self._visitor.visit(self.expr) def align(self): """align a set of Terms""" From e35cb5cf07b38390bdd66e583d0c98a0ae154193 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:06:36 -0400 Subject: [PATCH 24/37] ENH/CLN: be more careful about unicode --- 
pandas/computation/eval.py | 1 - pandas/computation/expr.py | 5 ++--- pandas/computation/ops.py | 12 +++++------- pandas/computation/tests/test_eval.py | 4 ++-- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index b7d15d1d009bc..e08e0f28d7877 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -1,6 +1,5 @@ #!/usr/bin/env python -import sys import numbers import numpy as np diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 777ac2a03beea..60fea6e935070 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -110,11 +110,10 @@ def visit_Attribute(self, node): class Expr(StringMixin): """Expr object""" - def __init__(self, expr, engine='numexpr', env=None, truediv=True, - parsing='strict'): + def __init__(self, expr, engine='numexpr', env=None, truediv=True): self.expr = expr self.env = env or Scope(frame_level=2) - self._visitor = ExprVisitor(self.env, parsing) + self._visitor = ExprVisitor(self.env) self.terms = self.parse() self.engine = engine self.truediv = truediv diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 26774c17959fb..24000b27a033a 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -63,7 +63,7 @@ def __init__(self, name, env): self.type = type(self.value) def __unicode__(self): - return com.pprint_thing('{0}({1!r})'.format(self.typename, self.name)) + return com.pprint_thing(self.name) def update(self, value): _update_name(self.env, self.name, value) @@ -162,9 +162,8 @@ def __init__(self, op, lhs, rhs): ' operators are {1}'.format(op, keys)) def __unicode__(self): - return com.pprint_thing('{0}(op={1!r}, lhs={2!r}, ' - 'rhs={3!r})'.format(self.typename, self.op, - self.lhs, self.rhs)) + return com.pprint_thing('({0}) {1} ({2})'.format(self.lhs, self.op, + self.rhs)) def __call__(self, env): # handle truediv @@ -240,6 +239,5 @@ def __call__(self, env): return res def __unicode__(self): - return com.pprint_thing('{0}(op={1!r}, ' - 'operand={2!r})'.format(self.typename, self.op, - self.operand)) + return com.pprint_thing('{0}({1})'.format(self.op, self.operand)) + diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 15509e2e489df..0a1356915523a 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -17,8 +17,8 @@ from pandas.core import common as com from pandas import DataFrame, Series from pandas.util.testing import makeCustomDataframe as mkdf -from pandas.computation.engines import (_engines, _align_core, - _reconstruct_object) +from pandas.computation.engines import _engines, _reconstruct_object +from pandas.computation.align import _align_core from pandas.computation.ops import _binary_ops_dict, _unary_ops_dict, Term import pandas.computation.expr as expr from pandas.computation.expressions import _USE_NUMEXPR From 1ceec39bf7e983d0deec9a5dec2fe8583e411a5e Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:30:29 -0400 Subject: [PATCH 25/37] CLN: run autopep8 on pandas/io/pytables.py --- pandas/io/pytables.py | 811 +++++++++++++++++++++++++++--------------- 1 file changed, 520 insertions(+), 291 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4a538b22bf939..013e596320250 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -87,40 +87,40 @@ class AttributeConflictWarning(Warning): # map object types _TYPE_MAP = { - Series : u'series', - SparseSeries 
: u'sparse_series', - TimeSeries : u'series', - DataFrame : u'frame', - SparseDataFrame : u'sparse_frame', - Panel : u'wide', - Panel4D : u'ndim', - SparsePanel : u'sparse_panel' + Series: u'series', + SparseSeries: u'sparse_series', + TimeSeries: u'series', + DataFrame: u'frame', + SparseDataFrame: u'sparse_frame', + Panel: u'wide', + Panel4D: u'ndim', + SparsePanel: u'sparse_panel' } # storer class map _STORER_MAP = { - u'TimeSeries' : 'LegacySeriesStorer', - u'Series' : 'LegacySeriesStorer', - u'DataFrame' : 'LegacyFrameStorer', - u'DataMatrix' : 'LegacyFrameStorer', - u'series' : 'SeriesStorer', - u'sparse_series' : 'SparseSeriesStorer', - u'frame' : 'FrameStorer', - u'sparse_frame' : 'SparseFrameStorer', - u'wide' : 'PanelStorer', - u'sparse_panel' : 'SparsePanelStorer', + u'TimeSeries': 'LegacySeriesStorer', + u'Series': 'LegacySeriesStorer', + u'DataFrame': 'LegacyFrameStorer', + u'DataMatrix': 'LegacyFrameStorer', + u'series': 'SeriesStorer', + u'sparse_series': 'SparseSeriesStorer', + u'frame': 'FrameStorer', + u'sparse_frame': 'SparseFrameStorer', + u'wide': 'PanelStorer', + u'sparse_panel': 'SparsePanelStorer', } # table class map _TABLE_MAP = { - u'generic_table' : 'GenericTable', - u'appendable_frame' : 'AppendableFrameTable', - u'appendable_multiframe' : 'AppendableMultiFrameTable', - u'appendable_panel' : 'AppendablePanelTable', - u'appendable_ndim' : 'AppendableNDimTable', - u'worm' : 'WORMTable', - u'legacy_frame' : 'LegacyFrameTable', - u'legacy_panel' : 'LegacyPanelTable', + u'generic_table': 'GenericTable', + u'appendable_frame': 'AppendableFrameTable', + u'appendable_multiframe': 'AppendableMultiFrameTable', + u'appendable_panel': 'AppendablePanelTable', + u'appendable_ndim': 'AppendableNDimTable', + u'worm': 'WORMTable', + u'legacy_frame': 'LegacyFrameTable', + u'legacy_panel': 'LegacyPanelTable', } # axes map @@ -149,6 +149,7 @@ def _tables(): return _table_mod + def h5_open(path, mode): tables = _tables() return tables.openFile(path, mode) @@ -180,7 +181,7 @@ def get_store(path, mode='a', complevel=None, complib=None, store.close() -### interface to/from ### +# interface to/from ### def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, append=None, **kwargs): @@ -197,9 +198,11 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, else: f(path_or_buf) + def read_hdf(path_or_buf, key, **kwargs): """ read from the store, closeit if we opened it """ - f = lambda store, auto_close: store.select(key, auto_close=auto_close, **kwargs) + f = lambda store, auto_close: store.select( + key, auto_close=auto_close, **kwargs) if isinstance(path_or_buf, basestring): @@ -221,7 +224,9 @@ def read_hdf(path_or_buf, key, **kwargs): # a passed store; user controls open/close f(path_or_buf, False) + class HDFStore(object): + """ dict-like IO interface for storing pandas objects in PyTables format. 
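As an aside on the interface these cleanups keep touching, a rough usage sketch (file name, key, and frame contents are made up): ``select`` reads a stored object back whole, while passing ``iterator=True`` or a ``chunksize`` routes the read through ``TableIterator``, which is only valid for table-format (``append``-ed) storage and raises ``TypeError`` otherwise.

    import pandas as pd
    from pandas.io.pytables import HDFStore

    store = HDFStore('store.h5', mode='w')
    store.append('df', pd.DataFrame({'A': range(100)}))  # append -> table format

    df = store.select('df')              # read the whole object back

    # chunked read via TableIterator
    for chunk in store.select('df', chunksize=25):
        print(len(chunk))                # 25, four times

    store.close()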
@@ -322,7 +327,7 @@ def __unicode__(self): output = '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path)) if len(self.keys()): - keys = [] + keys = [] values = [] for k in self.keys(): @@ -330,10 +335,13 @@ def __unicode__(self): s = self.get_storer(k) if s is not None: keys.append(pprint_thing(s.pathname or k)) - values.append(pprint_thing(s or 'invalid_HDFStore node')) + values.append( + pprint_thing(s or 'invalid_HDFStore node')) except Exception as detail: keys.append(k) - values.append("[invalid_HDFStore node: %s]" % pprint_thing(detail)) + values.append( + "[invalid_HDFStore node: %s]" % + pprint_thing(detail)) output += adjoin(12, keys, values) else: @@ -387,7 +395,7 @@ def open(self, mode='a', warn=True): try: self._handle = h5_open(self._path, self._mode) - except IOError, e: # pragma: no cover + except IOError as e: # pragma: no cover if 'can not be written' in str(e): print ('Opening %s in read-only mode' % self._path) self._handle = h5_open(self._path, 'r') @@ -456,7 +464,8 @@ def func(_start, _stop): if iterator or chunksize is not None: if not s.is_table: - raise TypeError("can only use an iterator or chunksize on a table") + raise TypeError( + "can only use an iterator or chunksize on a table") return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close) @@ -464,7 +473,8 @@ def func(_start, _stop): return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, auto_close=auto_close).get_values() - def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): + def select_as_coordinates( + self, key, where=None, start=None, stop=None, **kwargs): """ return the selection as a Coordinates. @@ -480,7 +490,7 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs def unique(self, key, column, **kwargs): warnings.warn("unique(key,column) is deprecated\n" "use select_column(key,column).unique() instead") - return self.get_storer(key).read_column(column = column, **kwargs).unique() + return self.get_storer(key).read_column(column=column, **kwargs).unique() def select_column(self, key, column, **kwargs): """ @@ -497,9 +507,10 @@ def select_column(self, key, column, **kwargs): raises ValueError if the column can not be extracted indivually (it is part of a data block) """ - return self.get_storer(key).read_column(column = column, **kwargs) + return self.get_storer(key).read_column(column=column, **kwargs) - def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=None, stop=None, iterator=False, chunksize=None, auto_close=False, **kwargs): + def select_as_multiple(self, keys, where=None, selector=None, columns=None, + start=None, stop=None, iterator=False, chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas objects from multiple tables Parameters @@ -533,7 +544,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star selector = keys[0] # collect the tables - tbls = [ self.get_storer(k) for k in keys ] + tbls = [self.get_storer(k) for k in keys] # validate rows nrows = None @@ -541,24 +552,32 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star if t is None: raise TypeError("Invalid table [%s]" % k) if not t.is_table: - raise TypeError("object [%s] is not a table, and cannot be used in all select as multiple" % t.pathname) + raise TypeError( + "object [%s] is not a table, and cannot be used in all select as multiple" % + t.pathname) if nrows is None: nrows = t.nrows 
elif t.nrows != nrows: - raise ValueError("all tables must have exactly the same nrows!") + raise ValueError( + "all tables must have exactly the same nrows!") # select coordinates from the selector table try: - c = self.select_as_coordinates(selector, where, start=start, stop=stop) + c = self.select_as_coordinates( + selector, + where, + start=start, + stop=stop) nrows = len(c) - except (Exception), detail: + except (Exception) as detail: raise ValueError("invalid selector [%s]" % selector) def func(_start, _stop): # collect the returns objs - objs = [t.read(where=c[_start:_stop], columns=columns) for t in tbls] + objs = [t.read(where=c[_start:_stop], columns=columns) + for t in tbls] # axis is the concentation axes axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0] @@ -571,7 +590,6 @@ def func(_start, _stop): return TableIterator(self, func, nrows=nrows, start=start, stop=stop, auto_close=auto_close).get_values() - def put(self, key, value, table=None, append=False, **kwargs): """ Store object in HDFStore @@ -617,7 +635,8 @@ def remove(self, key, where=None, start=None, stop=None): except: if where is not None: - raise ValueError("trying to remove a node with a non-None where clause!") + raise ValueError( + "trying to remove a node with a non-None where clause!") # we are actually trying to remove a node (with children) s = self.get_node(key) @@ -635,8 +654,9 @@ def remove(self, key, where=None, start=None, stop=None): # delete from the table else: if not s.is_table: - raise ValueError('can only remove with where on objects written as tables') - return s.delete(where = where, start=start, stop=stop) + raise ValueError( + 'can only remove with where on objects written as tables') + return s.delete(where=where, start=start, stop=stop) def append(self, key, value, columns=None, **kwargs): """ @@ -660,11 +680,13 @@ def append(self, key, value, columns=None, **kwargs): data in the table, so be careful """ if columns is not None: - raise Exception("columns is not a supported keyword in append, try data_columns") + raise Exception( + "columns is not a supported keyword in append, try data_columns") self._write_to_group(key, value, table=True, append=True, **kwargs) - def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs): + def append_to_multiple( + self, d, value, selector, data_columns=None, axes=None, **kwargs): """ Append to multiple tables @@ -683,13 +705,16 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, * """ if axes is not None: - raise Exception("axes is currently not accepted as a paremter to append_to_multiple; you can create the tables indepdently instead") + raise Exception( + "axes is currently not accepted as a paremter to append_to_multiple; you can create the tables indepdently instead") if not isinstance(d, dict): - raise ValueError("append_to_multiple must have a dictionary specified as the way to split the value") + raise ValueError( + "append_to_multiple must have a dictionary specified as the way to split the value") if selector not in d: - raise ValueError("append_to_multiple requires a selector that is in passed dict") + raise ValueError( + "append_to_multiple requires a selector that is in passed dict") # figure out the splitting axis (the non_index_axis) axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0] @@ -700,7 +725,8 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, * for k, v in d.items(): if v is None: if remain_key is not None: - raise 
ValueError("append_to_multiple can only have one value in d that is None") + raise ValueError( + "append_to_multiple can only have one value in d that is None") remain_key = k else: remain_values.extend(v) @@ -741,7 +767,8 @@ def create_table_index(self, key, **kwargs): raise Exception("PyTables >= 2.3 is required for table indexing") s = self.get_storer(key) - if s is None: return + if s is None: + return if not s.is_table: raise TypeError("cannot create table index on a non-table") @@ -750,8 +777,8 @@ def create_table_index(self, key, **kwargs): def groups(self): """ return a list of all the top-level nodes (that are not themselves a pandas storage object) """ _tables() - return [ g for g in self._handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr( - g,'table',None) or (isinstance(g,_table_mod.table.Table) and g._v_name != u'table') ] + return [g for g in self._handle.walkNodes() if getattr(g._v_attrs, 'pandas_type', None) or getattr( + g, 'table', None) or (isinstance(g, _table_mod.table.Table) and g._v_name != u'table')] def get_node(self, key): """ return the node with the key or None if it does not exist """ @@ -771,8 +798,9 @@ def get_storer(self, key): s.infer_axes() return s - def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None, complevel = None, - fletcher32 = False, overwrite = True): + def copy( + self, file, mode='w', propindexes=True, keys=None, complib=None, complevel=None, + fletcher32=False, overwrite=True): """ copy the existing store to a new file, upgrading in place Parameters @@ -787,13 +815,18 @@ def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None open file handle of the new store """ - new_store = HDFStore(file, mode = mode, complib = complib, complevel = complevel, fletcher32 = fletcher32) + new_store = HDFStore( + file, + mode=mode, + complib=complib, + complevel=complevel, + fletcher32=fletcher32) if keys is None: keys = self.keys() - if not isinstance(keys, (tuple,list)): - keys = [ keys ] + if not isinstance(keys, (tuple, list)): + keys = [keys] for k in keys: - s = self.get_storer(k) + s = self.get_storer(k) if s is not None: if k in new_store: @@ -805,35 +838,45 @@ def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None index = False if propindexes: - index = [ a.name for a in s.axes if a.is_indexed ] - new_store.append(k, data, index=index, data_columns=getattr(s,'data_columns',None), encoding=s.encoding) + index = [a.name for a in s.axes if a.is_indexed] + new_store.append( + k, + data, + index=index, + data_columns=getattr( + s, + 'data_columns', + None), + encoding=s.encoding) else: new_store.put(k, data, encoding=s.encoding) return new_store - ###### private methods ###### + # private methods ###### - def _create_storer(self, group, value = None, table = False, append = False, **kwargs): + def _create_storer( + self, group, value=None, table=False, append=False, **kwargs): """ return a suitable Storer class to operate """ def error(t): raise TypeError("cannot properly create the storer for: [%s] [group->%s,value->%s,table->%s,append->%s,kwargs->%s]" % - (t,group,type(value),table,append,kwargs)) + (t, group, type(value), table, append, kwargs)) - pt = _ensure_decoded(getattr(group._v_attrs,'pandas_type',None)) - tt = _ensure_decoded(getattr(group._v_attrs,'table_type',None)) + pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None)) + tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None)) # infer the pt from the passed value if pt 
is None: if value is None: _tables() - if getattr(group,'table',None) or isinstance(group,_table_mod.table.Table): + if getattr(group, 'table', None) or isinstance(group, _table_mod.table.Table): pt = u'frame_table' tt = u'generic_table' else: - raise TypeError("cannot create a storer if the object is not existing nor a value are passed") + raise TypeError( + "cannot create a storer if the object is not existing nor a value are passed") else: try: @@ -859,14 +902,14 @@ def error(t): if value is not None: if pt == u'frame_table': - index = getattr(value,'index',None) + index = getattr(value, 'index', None) if index is not None: if index.nlevels == 1: tt = u'appendable_frame' elif index.nlevels > 1: tt = u'appendable_multiframe' elif pt == u'wide_table': - tt = u'appendable_panel' + tt = u'appendable_panel' elif pt == u'ndim_table': tt = u'appendable_ndim' @@ -886,8 +929,9 @@ def error(t): except: error('_TABLE_MAP') - def _write_to_group(self, key, value, index=True, table=False, append=False, - complib=None, encoding=None, **kwargs): + def _write_to_group( + self, key, value, index=True, table=False, append=False, + complib=None, encoding=None, **kwargs): group = self.get_node(key) # remove the node if we are not appending @@ -927,16 +971,18 @@ def _write_to_group(self, key, value, index=True, table=False, append=False, if not s.is_table and complib: raise ValueError('Compression not supported on non-table') - s.write(obj = value, append=append, complib=complib, **kwargs) + s.write(obj=value, append=append, complib=complib, **kwargs) if s.is_table and index: - s.create_index(columns = index) + s.create_index(columns=index) def _read_group(self, group, **kwargs): s = self._create_storer(group) s.infer_axes() return s.read(**kwargs) + class TableIterator(object): + """ define the iteration interface on a table Parameters @@ -953,15 +999,16 @@ class TableIterator(object): kwargs : the passed kwargs """ - def __init__(self, store, func, nrows, start=None, stop=None, chunksize=None, auto_close=False): + def __init__(self, store, func, nrows, start=None, + stop=None, chunksize=None, auto_close=False): self.store = store - self.func = func + self.func = func self.nrows = nrows or 0 self.start = start or 0 if stop is None: stop = self.nrows - self.stop = min(self.nrows,stop) + self.stop = min(self.nrows, stop) if chunksize is None: chunksize = 100000 @@ -992,7 +1039,9 @@ def get_values(self): self.close() return results + class IndexCol(StringMixin): + """ an index column description class Parameters @@ -1008,11 +1057,12 @@ class IndexCol(StringMixin): is_an_indexable = True is_data_indexable = True is_searchable = False - _info_fields = ['freq','tz','index_name'] + _info_fields = ['freq', 'tz', 'index_name'] - def __init__(self, values=None, kind=None, typ=None, cname=None, itemsize=None, - name=None, axis=None, kind_attr=None, pos=None, freq=None, tz=None, - index_name=None, **kwargs): + def __init__( + self, values=None, kind=None, typ=None, cname=None, itemsize=None, + name=None, axis=None, kind_attr=None, pos=None, freq=None, tz=None, + index_name=None, **kwargs): self.values = values self.kind = kind self.typ = typ @@ -1059,7 +1109,13 @@ def set_table(self, table): return self def __unicode__(self): - temp = tuple(map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))) + temp = tuple( + map(pprint_thing, + (self.name, + self.cname, + self.axis, + self.pos, + self.kind))) return "name->%s,cname->%s,axis->%s,pos->%s,kind->%s" % temp def __eq__(self, other): @@ -1073,7 
+1129,7 @@ def __ne__(self, other): def is_indexed(self): """ return whether I am an indexed column """ try: - return getattr(self.table.cols,self.cname).is_indexed + return getattr(self.table.cols, self.cname).is_indexed except: False @@ -1095,7 +1151,7 @@ def convert(self, values, nan_rep, encoding): except: pass - values =_maybe_convert(values, self.kind, encoding) + values = _maybe_convert(values, self.kind, encoding) kwargs = dict() if self.freq is not None: @@ -1106,15 +1162,22 @@ def convert(self, values, nan_rep, encoding): self.values = Index(values, **kwargs) except: - # if the output freq is different that what we recorded, then infer it + # if the output freq is different that what we recorded, then infer + # it if 'freq' in kwargs: kwargs['freq'] = 'infer' - self.values = Index(_maybe_convert(values, self.kind, encoding), **kwargs) + self.values = Index( + _maybe_convert( + values, + self.kind, + encoding), + **kwargs) # set the timezone if indicated # we stored in utc, so reverse to local timezone if self.tz is not None: - self.values = self.values.tz_localize('UTC').tz_convert(_ensure_decoded(self.tz)) + self.values = self.values.tz_localize( + 'UTC').tz_convert(_ensure_decoded(self.tz)) return self @@ -1177,7 +1240,7 @@ def validate_col(self, itemsize=None): raise ValueError("Trying to store a string with len [%s] in [%s] column but\n" "this column has a limit of [%s]!\n" "Consider using min_itemsize to preset the sizes on these columns" - % (itemsize,self.cname, c.itemsize)) + % (itemsize, self.cname, c.itemsize)) return c.itemsize return None @@ -1196,7 +1259,7 @@ def update_info(self, info): for key in self._info_fields: - value = getattr(self,key,None) + value = getattr(self, key, None) try: idx = info[self.name] @@ -1207,18 +1270,18 @@ def update_info(self, info): if key in idx and value is not None and existing_value != value: # frequency/name just warn - if key in ['freq','index_name']: - ws = attribute_conflict_doc % (key,existing_value,value) + if key in ['freq', 'index_name']: + ws = attribute_conflict_doc % (key, existing_value, value) warnings.warn(ws, AttributeConflictWarning) # reset idx[key] = None - setattr(self,key,None) + setattr(self, key, None) else: raise ValueError("invalid info for [%s] for [%s]""" ", existing_value [%s] conflicts with new value [%s]" % (self.name, - key,existing_value,value)) + key, existing_value, value)) else: if value is not None or existing_value is not None: idx[key] = value @@ -1239,7 +1302,9 @@ def set_attr(self): """ set the kind for this colummn """ setattr(self.attrs, self.kind_attr, self.kind) + class GenericIndexCol(IndexCol): + """ an index which is not represented in the data of the table """ @property @@ -1258,7 +1323,9 @@ def get_attr(self): def set_attr(self): pass + class DataCol(IndexCol): + """ a data holding column, by definition this is not indexable Parameters @@ -1273,7 +1340,8 @@ class DataCol(IndexCol): _info_fields = ['tz'] @classmethod - def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs): + def create_for_block( + cls, i=None, name=None, cname=None, version=None, **kwargs): """ return a new datacol with the block i """ if cname is None: @@ -1293,7 +1361,8 @@ def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs) return cls(name=name, cname=cname, **kwargs) - def __init__(self, values=None, kind=None, typ=None, cname=None, data=None, block=None, **kwargs): + def __init__(self, values=None, kind=None, typ=None, + cname=None, data=None, block=None, 
**kwargs): super(DataCol, self).__init__( values=values, kind=kind, typ=typ, cname=cname, **kwargs) self.dtype = None @@ -1337,13 +1406,16 @@ def set_kind(self): elif dtype.startswith(u'bool'): self.kind = 'bool' else: - raise AssertionError("cannot interpret dtype of [%s] in [%s]" % (dtype,self)) + raise AssertionError( + "cannot interpret dtype of [%s] in [%s]" % + (dtype, self)) # set my typ if we need if self.typ is None: - self.typ = getattr(self.description,self.cname,None) + self.typ = getattr(self.description, self.cname, None) - def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=None, **kwargs): + def set_atom(self, block, existing_col, min_itemsize, + nan_rep, info, encoding=None, **kwargs): """ create and setup my atom from the block b """ self.values = list(block.items) @@ -1357,7 +1429,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No raise TypeError( "[date] is not implemented as a table column") elif inferred_type == 'datetime': - if getattr(rvalues[0],'tzinfo',None) is not None: + if getattr(rvalues[0], 'tzinfo', None) is not None: # if this block has more than one timezone, raise if len(set([r.tzinfo for r in rvalues])) != 1: @@ -1366,7 +1438,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No # convert this column to datetime64[ns] utc, and save the tz index = DatetimeIndex(rvalues) - tz = getattr(index,'tz',None) + tz = getattr(index, 'tz', None) if tz is None: raise TypeError( "invalid timezone specification") @@ -1380,7 +1452,9 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No self.tz = zone self.update_info(info) - self.set_atom_datetime64(block, values.reshape(block.values.shape)) + self.set_atom_datetime64( + block, + values.reshape(block.values.shape)) else: raise TypeError( @@ -1392,7 +1466,12 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No # this is basically a catchall; if say a datetime64 has nans then will # end up here ### elif inferred_type == 'string' or dtype == 'object': - self.set_atom_string(block, existing_col, min_itemsize, nan_rep, encoding) + self.set_atom_string( + block, + existing_col, + min_itemsize, + nan_rep, + encoding) else: self.set_atom_data(block) @@ -1401,16 +1480,18 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) - def set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding): + def set_atom_string( + self, block, existing_col, min_itemsize, nan_rep, encoding): # fill nan items with myself block = block.fillna(nan_rep) - data = block.values + data = block.values # see if we have a valid string type inferred_type = lib.infer_dtype(data.ravel()) if inferred_type != 'string': - # we cannot serialize this data, so report an exception on a column by column basis + # we cannot serialize this data, so report an exception on a column + # by column basis for item in block.items: col = block.get(item) @@ -1418,8 +1499,7 @@ def set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding): if inferred_type != 'string': raise TypeError("Cannot serialize the column [%s] because\n" "its data contents are [%s] object dtype" % - (item,inferred_type)) - + (item, inferred_type)) # itemsize is the maximum length of a string (along any dimension) itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) 
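One behavior worth spelling out, since ``set_atom_string`` and ``validate_col`` implement it: the width of a string atom is fixed by the longest string seen when the column is first written, so a later append containing longer strings raises unless the column was pre-sized. A rough sketch (file and key names are invented; ``min_itemsize={'values': ...}`` pre-sizes the string block as a whole):

    import pandas as pd
    from pandas.io.pytables import HDFStore

    store = HDFStore('strings.h5', mode='w')
    short = pd.DataFrame({'A': ['ab', 'cd']})
    longer = pd.DataFrame({'A': ['a considerably longer string']})

    store.append('no_presize', short)
    try:
        store.append('no_presize', longer)   # exceeds the inferred itemsize
    except ValueError as e:
        print('append failed: %s' % e)

    store.append('presized', short, min_itemsize={'values': 50})
    store.append('presized', longer)         # fits within the preset width
    store.close()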
@@ -1464,7 +1544,7 @@ def set_atom_data(self, block): def get_atom_datetime64(self, block): return _tables().Int64Col(shape=block.shape[0]) - def set_atom_datetime64(self, block, values = None): + def set_atom_datetime64(self, block, values=None): self.kind = 'datetime64' self.typ = self.get_atom_datetime64(block) if values is None: @@ -1487,13 +1567,13 @@ def validate_attr(self, append): if (existing_fields is not None and existing_fields != list(self.values)): raise ValueError("appended items do not match existing items" - " in table!") + " in table!") existing_dtype = getattr(self.attrs, self.dtype_attr, None) if (existing_dtype is not None and existing_dtype != self.dtype): raise ValueError("appended items dtype do not match existing items dtype" - " in table!") + " in table!") def convert(self, values, nan_rep, encoding): """ set the data from this selection (and convert to the correct dtype if we can) """ @@ -1515,8 +1595,12 @@ def convert(self, values, nan_rep, encoding): # data should be 2-dim here # we stored as utc, so just set the tz - index = DatetimeIndex(self.data.ravel(),tz='UTC').tz_convert(self.tz) - self.data = np.array(index.tolist(),dtype=object).reshape(self.data.shape) + index = DatetimeIndex( + self.data.ravel(), + tz='UTC').tz_convert(self.tz) + self.data = np.array( + index.tolist(), + dtype=object).reshape(self.data.shape) else: self.data = np.asarray(self.data, dtype='M8[ns]') @@ -1537,14 +1621,17 @@ def convert(self, values, nan_rep, encoding): # convert nans / decode if _ensure_decoded(self.kind) == u'string': - self.data = _unconvert_string_array(self.data, nan_rep=nan_rep, encoding=encoding) + self.data = _unconvert_string_array( + self.data, + nan_rep=nan_rep, + encoding=encoding) return self def get_attr(self): """ get the data for this colummn """ self.values = getattr(self.attrs, self.kind_attr, None) - self.dtype = getattr(self.attrs, self.dtype_attr, None) + self.dtype = getattr(self.attrs, self.dtype_attr, None) self.set_kind() def set_attr(self): @@ -1555,6 +1642,7 @@ def set_attr(self): class DataIndexableCol(DataCol): + """ represent a data column that can be indexed """ is_data_indexable = True @@ -1571,13 +1659,17 @@ def get_atom_data(self, block): def get_atom_datetime64(self, block): return _tables().Int64Col() + class GenericDataIndexableCol(DataIndexableCol): + """ represent a generic pytables data column """ def get_attr(self): pass + class Storer(StringMixin): + """ represent an object in my store facilitate read/write of various types of objects this is an abstract base class @@ -1589,14 +1681,14 @@ class Storer(StringMixin): group : the group node where the table resides """ pandas_kind = None - obj_type = None - ndim = None - is_table = False + obj_type = None + ndim = None + is_table = False def __init__(self, parent, group, encoding=None, **kwargs): - self.parent = parent - self.group = group - self.encoding = _ensure_encoding(encoding) + self.parent = parent + self.group = group + self.encoding = _ensure_encoding(encoding) self.set_version() @property @@ -1605,7 +1697,11 @@ def is_old_version(self): def set_version(self): """ compute and set our version """ - version = _ensure_decoded(getattr(self.group._v_attrs,'pandas_version',None)) + version = _ensure_decoded( + getattr( + self.group._v_attrs, + 'pandas_version', + None)) try: self.version = tuple([int(x) for x in version.split('.')]) if len(self.version) == 2: @@ -1622,9 +1718,9 @@ def __unicode__(self): self.infer_axes() s = self.shape if s is not None: - if isinstance(s, 
(list,tuple)): + if isinstance(s, (list, tuple)): s = "[%s]" % ','.join([pprint_thing(x) for x in s]) - return "%-12.12s (shape->%s)" % (self.pandas_type,s) + return "%-12.12s (shape->%s)" % (self.pandas_type, s) return self.pandas_type def __str__(self): @@ -1695,14 +1791,15 @@ def is_exists(self): @property def nrows(self): - return getattr(self.storable,'nrows',None) + return getattr(self.storable, 'nrows', None) def validate(self, other): """ validate against an existing storable """ - if other is None: return + if other is None: + return return True - def validate_version(self, where = None): + def validate_version(self, where=None): """ are we trying to operate on an old version? """ return True @@ -1717,12 +1814,14 @@ def infer_axes(self): return True def read(self, **kwargs): - raise NotImplementedError("cannot read on an abstract storer: subclasses should implement") + raise NotImplementedError( + "cannot read on an abstract storer: subclasses should implement") def write(self, **kwargs): - raise NotImplementedError("cannot write on an abstract storer: sublcasses should implement") + raise NotImplementedError( + "cannot write on an abstract storer: sublcasses should implement") - def delete(self, where = None, **kwargs): + def delete(self, where=None, **kwargs): """ support fully deleting the node in its entirety (only) - where specification must be None """ if where is None: self._handle.removeNode(self.group, recursive=True) @@ -1730,11 +1829,14 @@ def delete(self, where = None, **kwargs): raise TypeError("cannot delete on an abstract storer") + class GenericStorer(Storer): + """ a generified storer version """ - _index_type_map = { DatetimeIndex: 'datetime', - PeriodIndex: 'period'} - _reverse_index_map = dict([ (v,k) for k, v in _index_type_map.iteritems() ]) + _index_type_map = {DatetimeIndex: 'datetime', + PeriodIndex: 'period'} + _reverse_index_map = dict([(v, k) + for k, v in _index_type_map.iteritems()]) attributes = [] # indexer helpders @@ -1756,9 +1858,11 @@ def f(values, freq=None, tz=None): def validate_read(self, kwargs): if kwargs.get('columns') is not None: - raise TypeError("cannot pass a column specification when reading a Storer") + raise TypeError( + "cannot pass a column specification when reading a Storer") if kwargs.get('where') is not None: - raise TypeError("cannot pass a where specification when reading a Storer") + raise TypeError( + "cannot pass a where specification when reading a Storer") @property def is_exists(self): @@ -1770,9 +1874,9 @@ def set_attrs(self): def get_attrs(self): """ retrieve our attributes """ - self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None)) + self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None)) for n in self.attributes: - setattr(self,n,_ensure_decoded(getattr(self.attrs, n, None))) + setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) def write(self, obj, **kwargs): self.set_attrs() @@ -1833,7 +1937,7 @@ def write_index(self, key, index): self.write_sparse_intindex(key, index) else: setattr(self.attrs, '%s_variety' % key, 'regular') - converted = _convert_index(index,self.encoding).set_name('index') + converted = _convert_index(index, self.encoding).set_name('index') self.write_array(key, converted.values) node = getattr(self.group, key) node._v_attrs.kind = converted.kind @@ -1851,7 +1955,6 @@ def write_index(self, key, index): zone = tslib.tot_seconds(index.tz.utcoffset()) node._v_attrs.tz = zone - def write_block_index(self, key, index): self.write_array('%s_blocs' % key, 
index.blocs) self.write_array('%s_blengths' % key, index.blengths) @@ -1931,10 +2034,15 @@ def read_index_node(self, node): kwargs['tz'] = node._v_attrs['tz'] if kind in (u'date', u'datetime'): - index = factory(_unconvert_index(data, kind, encoding=self.encoding), dtype=object, - **kwargs) + index = factory( + _unconvert_index(data, kind, encoding=self.encoding), dtype=object, + **kwargs) else: - index = factory(_unconvert_index(data, kind, encoding=self.encoding), **kwargs) + index = factory( + _unconvert_index(data, + kind, + encoding=self.encoding), + **kwargs) index.name = name @@ -1985,7 +2093,8 @@ def write_array(self, key, value, items=None): if value.dtype.type == np.object_: - # infer the type, warn if we have a non-string type here (for performance) + # infer the type, warn if we have a non-string type here (for + # performance) inferred_type = lib.infer_dtype(value.ravel()) if empty_array: pass @@ -1996,11 +2105,11 @@ def write_array(self, key, value, items=None): items = list(items) except: pass - ws = performance_doc % (inferred_type,key,items) + ws = performance_doc % (inferred_type, key, items) warnings.warn(ws, PerformanceWarning) vlarr = self._handle.createVLArray(self.group, key, - _tables().ObjectAtom()) + _tables().ObjectAtom()) vlarr.append(value) elif value.dtype.type == np.datetime64: self._handle.createArray(self.group, key, value.view('i8')) @@ -2013,14 +2122,16 @@ def write_array(self, key, value, items=None): getattr(self.group, key)._v_attrs.transposed = transposed + class LegacyStorer(GenericStorer): def read_index_legacy(self, key): - node = getattr(self.group,key) + node = getattr(self.group, key) data = node[:] kind = node._v_attrs.kind return _unconvert_index_legacy(data, kind, encoding=self.encoding) + class LegacySeriesStorer(LegacyStorer): def read(self, **kwargs): @@ -2029,6 +2140,7 @@ def read(self, **kwargs): values = self.read_array('values') return Series(values, index=index) + class LegacyFrameStorer(LegacyStorer): def read(self, **kwargs): @@ -2038,6 +2150,7 @@ def read(self, **kwargs): values = self.read_array('values') return DataFrame(values, index=index, columns=columns) + class SeriesStorer(GenericStorer): pandas_kind = u'series' attributes = ['name'] @@ -2045,7 +2158,7 @@ class SeriesStorer(GenericStorer): @property def shape(self): try: - return len(getattr(self.group,'values')), + return len(getattr(self.group, 'values')), except: return None @@ -2065,9 +2178,10 @@ def write(self, obj, **kwargs): self.write_array('values', obj.values) self.attrs.name = obj.name + class SparseSeriesStorer(GenericStorer): pandas_kind = u'sparse_series' - attributes = ['name','fill_value','kind'] + attributes = ['name', 'fill_value', 'kind'] def read(self, **kwargs): self.validate_read(kwargs) @@ -2087,9 +2201,10 @@ def write(self, obj, **kwargs): self.attrs.fill_value = obj.fill_value self.attrs.kind = obj.kind + class SparseFrameStorer(GenericStorer): pandas_kind = u'sparse_frame' - attributes = ['default_kind','default_fill_value'] + attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): self.validate_read(kwargs) @@ -2097,7 +2212,7 @@ def read(self, **kwargs): sdict = {} for c in columns: key = 'sparse_series_%s' % c - s = SparseSeriesStorer(self.parent, getattr(self.group,key)) + s = SparseSeriesStorer(self.parent, getattr(self.group, key)) s.infer_axes() sdict[c] = s.read() return SparseDataFrame(sdict, columns=columns, @@ -2116,12 +2231,13 @@ def write(self, obj, **kwargs): s = SparseSeriesStorer(self.parent, node) s.write(ss) 
self.attrs.default_fill_value = obj.default_fill_value - self.attrs.default_kind = obj.default_kind + self.attrs.default_kind = obj.default_kind self.write_index('columns', obj.columns) + class SparsePanelStorer(GenericStorer): pandas_kind = u'sparse_panel' - attributes = ['default_kind','default_fill_value'] + attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): self.validate_read(kwargs) @@ -2131,7 +2247,7 @@ def read(self, **kwargs): for name in items: key = 'sparse_frame_%s' % name node = getattr(self.group, key) - s = SparseFrameStorer(self.parent, getattr(self.group,key)) + s = SparseFrameStorer(self.parent, getattr(self.group, key)) s.infer_axes() sdict[name] = s.read() return SparsePanel(sdict, items=items, default_kind=self.default_kind, @@ -2140,7 +2256,7 @@ def read(self, **kwargs): def write(self, obj, **kwargs): super(SparsePanelStorer, self).write(obj, **kwargs) self.attrs.default_fill_value = obj.default_fill_value - self.attrs.default_kind = obj.default_kind + self.attrs.default_kind = obj.default_kind self.write_index('items', obj.items) for name, sdf in obj.iterkv(): @@ -2152,8 +2268,9 @@ def write(self, obj, **kwargs): s = SparseFrameStorer(self.parent, node) s.write(sdf) + class BlockManagerStorer(GenericStorer): - attributes = ['ndim','nblocks'] + attributes = ['ndim', 'nblocks'] is_shape_reversed = False @property @@ -2165,15 +2282,15 @@ def shape(self): items = 0 for i in range(self.nblocks): node = getattr(self.group, 'block%d_items' % i) - shape = getattr(node,'shape',None) + shape = getattr(node, 'shape', None) if shape is not None: items += shape[0] # data shape node = getattr(self.group, 'block0_values') - shape = getattr(node,'shape',None) + shape = getattr(node, 'shape', None) if shape is not None: - shape = list(shape[0:(ndim-1)]) + shape = list(shape[0:(ndim - 1)]) else: shape = [] @@ -2223,20 +2340,24 @@ def write(self, obj, **kwargs): self.write_array('block%d_values' % i, blk.values, items=blk.items) self.write_index('block%d_items' % i, blk.items) + class FrameStorer(BlockManagerStorer): pandas_kind = u'frame' - obj_type = DataFrame + obj_type = DataFrame + class PanelStorer(BlockManagerStorer): pandas_kind = u'wide' - obj_type = Panel + obj_type = Panel is_shape_reversed = True def write(self, obj, **kwargs): obj._consolidate_inplace() return super(PanelStorer, self).write(obj, **kwargs) + class Table(Storer): + """ represent a table: facilitate read/write of various types of tables @@ -2254,20 +2375,20 @@ class Table(Storer): """ pandas_kind = u'wide_table' - table_type = None - levels = 1 - is_table = True + table_type = None + levels = 1 + is_table = True is_shape_reversed = False def __init__(self, *args, **kwargs): super(Table, self).__init__(*args, **kwargs) - self.index_axes = [] + self.index_axes = [] self.non_index_axes = [] - self.values_axes = [] - self.data_columns = [] - self.info = dict() - self.nan_rep = None - self.selection = None + self.values_axes = [] + self.data_columns = [] + self.info = dict() + self.nan_rep = None + self.selection = None @property def table_type_short(self): @@ -2276,18 +2397,21 @@ def table_type_short(self): def __repr__(self): """ return a pretty representatgion of myself """ self.infer_axes() - dc = ",dc->[%s]" % ','.join(self.data_columns) if len(self.data_columns) else '' + dc = ",dc->[%s]" % ','.join( + self.data_columns) if len( + self.data_columns) else '' ver = '' if self.is_old_version: - ver = "[%s]" % '.'.join([ str(x) for x in self.version ]) + ver = "[%s]" % '.'.join([str(x) 
for x in self.version]) return "%-12.12s%s (typ->%s,nrows->%s,ncols->%s,indexers->[%s]%s)" % (self.pandas_type, ver, self.table_type_short, self.nrows, self.ncols, - ','.join([ a.name for a in self.index_axes ]), + ','.join( + [a.name for a in self.index_axes]), dc) def __getitem__(self, c): @@ -2299,30 +2423,35 @@ def __getitem__(self, c): def validate(self, other): """ validate against an existing table """ - if other is None: return + if other is None: + return if other.table_type != self.table_type: raise TypeError("incompatible table_type with existing [%s - %s]" % (other.table_type, self.table_type)) - for c in ['index_axes','non_index_axes','values_axes']: - sv = getattr(self,c,None) - ov = getattr(other,c,None) + for c in ['index_axes', 'non_index_axes', 'values_axes']: + sv = getattr(self, c, None) + ov = getattr(other, c, None) if sv != ov: # show the error for the specific axes for i, sax in enumerate(sv): oax = ov[i] if sax != oax: - raise ValueError("invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c,sax,oax)) + raise ValueError( + "invalid combinate of [%s] on appending data [%s] vs current table [%s]" % + (c, sax, oax)) # should never get here - raise Exception("invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c,sv,ov)) + raise Exception( + "invalid combinate of [%s] on appending data [%s] vs current table [%s]" % + (c, sv, ov)) @property def nrows_expected(self): """ based on our axes, compute the expected nrows """ - return np.prod([ i.cvalues.shape[0] for i in self.index_axes ]) + return np.prod([i.cvalues.shape[0] for i in self.index_axes]) @property def is_exists(self): @@ -2331,7 +2460,7 @@ def is_exists(self): @property def storable(self): - return getattr(self.group,'table',None) + return getattr(self.group, 'table', None) @property def table(self): @@ -2353,7 +2482,7 @@ def axes(self): @property def ncols(self): """ the number of total columns in the values axes """ - return sum([ len(a.values) for a in self.values_axes ]) + return sum([len(a.values) for a in self.values_axes]) @property def is_transposed(self): @@ -2370,7 +2499,8 @@ def queryables(self): # compute the values_axes queryables return dict([(a.cname, a.kind) for a in self.index_axes] + [(self.obj_type._AXIS_NAMES[axis], None) for axis, values in self.non_index_axes] + - [(v.cname, v.kind) for v in self.values_axes if v.name in set(self.data_columns)] + [(v.cname, v.kind) + for v in self.values_axes if v.name in set(self.data_columns)] ) def index_cols(self): @@ -2383,44 +2513,62 @@ def values_cols(self): def set_info(self): """ update our table index info """ - self.attrs.info = self.info + self.attrs.info = self.info def set_attrs(self): """ set our table type & indexables """ - self.attrs.table_type = self.table_type - self.attrs.index_cols = self.index_cols() - self.attrs.values_cols = self.values_cols() + self.attrs.table_type = self.table_type + self.attrs.index_cols = self.index_cols() + self.attrs.values_cols = self.values_cols() self.attrs.non_index_axes = self.non_index_axes self.attrs.data_columns = self.data_columns - self.attrs.nan_rep = self.nan_rep - self.attrs.encoding = self.encoding - self.attrs.levels = self.levels + self.attrs.nan_rep = self.nan_rep + self.attrs.encoding = self.encoding + self.attrs.levels = self.levels self.set_info() def get_attrs(self): """ retrieve our attributes """ - self.non_index_axes = getattr(self.attrs,'non_index_axes',None) or [] - self.data_columns = getattr(self.attrs,'data_columns',None) or [] - 
self.info = getattr(self.attrs,'info',None) or dict() - self.nan_rep = getattr(self.attrs,'nan_rep',None) - self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None)) - self.levels = getattr(self.attrs,'levels',None) or [] + self.non_index_axes = getattr( + self.attrs, + 'non_index_axes', + None) or [] + self.data_columns = getattr( + self.attrs, + 'data_columns', + None) or [] + self.info = getattr( + self.attrs, + 'info', + None) or dict() + self.nan_rep = getattr(self.attrs, 'nan_rep', None) + self.encoding = _ensure_encoding( + getattr(self.attrs, 'encoding', None)) + self.levels = getattr( + self.attrs, + 'levels', + None) or [] t = self.table - self.index_axes = [ a.infer(t) for a in self.indexables if a.is_an_indexable ] - self.values_axes = [ a.infer(t) for a in self.indexables if not a.is_an_indexable ] + self.index_axes = [a.infer(t) + for a in self.indexables if a.is_an_indexable] + self.values_axes = [a.infer(t) + for a in self.indexables if not a.is_an_indexable] - def validate_version(self, where = None): + def validate_version(self, where=None): """ are we trying to operate on an old version? """ if where is not None: if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: - ws = incompatibility_doc % '.'.join([ str(x) for x in self.version ]) + ws = incompatibility_doc % '.'.join( + [str(x) for x in self.version]) warnings.warn(ws, IncompatibilityWarning) def validate_min_itemsize(self, min_itemsize): """ validate the min_itemisze doesn't contain items that are not in the axes this needs data_columns to be defined """ - if min_itemsize is None: return - if not isinstance(min_itemsize, dict): return + if min_itemsize is None: + return + if not isinstance(min_itemsize, dict): + return q = self.queryables() for k, v in min_itemsize.items(): @@ -2429,7 +2577,9 @@ def validate_min_itemsize(self, min_itemsize): if k == 'values': continue if k not in q: - raise ValueError("min_itemsize has the key [%s] which is not an axis or data_column" % k) + raise ValueError( + "min_itemsize has the key [%s] which is not an axis or data_column" % + k) @property def indexables(self): @@ -2440,7 +2590,8 @@ def indexables(self): self._indexables = [] # index columns - self._indexables.extend([ IndexCol(name=name,axis=axis,pos=i) for i, (axis, name) in enumerate(self.attrs.index_cols)]) + self._indexables.extend([IndexCol(name=name, axis=axis, pos=i) + for i, (axis, name) in enumerate(self.attrs.index_cols)]) # values columns dc = set(self.data_columns) @@ -2558,15 +2709,17 @@ def validate_data_columns(self, data_columns, min_itemsize): data_columns = [] # if min_itemsize is a dict, add the keys (exclude 'values') - if isinstance(min_itemsize,dict): + if isinstance(min_itemsize, dict): existing_data_columns = set(data_columns) - data_columns.extend([ k for k in min_itemsize.keys() if k != 'values' and k not in existing_data_columns ]) + data_columns.extend( + [k for k in min_itemsize.keys() if k != 'values' and k not in existing_data_columns]) # return valid columns in the order of our axis return [c for c in data_columns if c in axis_labels] - def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): + def create_axes(self, axes, obj, validate=True, nan_rep=None, + data_columns=None, min_itemsize=None, **kwargs): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields @@ -2588,7 +2741,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, 
data_columns=None, axes = _AXES_MAP[type(obj)] except: raise TypeError("cannot properly create the storer for: [group->%s,value->%s]" % - (self.group._v_name,type(obj))) + (self.group._v_name, type(obj))) # map axes to numbers axes = [obj._get_axis_number(a) for a in axes] @@ -2597,17 +2750,18 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, if self.infer_axes(): existing_table = self.copy() existing_table.infer_axes() - axes = [ a.axis for a in existing_table.index_axes] - data_columns = existing_table.data_columns - nan_rep = existing_table.nan_rep + axes = [a.axis for a in existing_table.index_axes] + data_columns = existing_table.data_columns + nan_rep = existing_table.nan_rep self.encoding = existing_table.encoding - self.info = copy.copy(existing_table.info) + self.info = copy.copy(existing_table.info) else: existing_table = None # currently support on ndim-1 axes if len(axes) != self.ndim - 1: - raise ValueError("currently only support ndim-1 indexers in an AppendableTable") + raise ValueError( + "currently only support ndim-1 indexers in an AppendableTable") # create according to the new data self.non_index_axes = [] @@ -2644,8 +2798,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, self.non_index_axes.append((i, append_axis)) # set axis positions (based on the axes) - self.index_axes = [index_axes_map[a].set_pos(j).update_info(self.info) for j, - a in enumerate(axes)] + self.index_axes = [index_axes_map[a].set_pos( + j).update_info(self.info) for j, + a in enumerate(axes)] j = len(self.index_axes) # check for column conflicts @@ -2662,17 +2817,18 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, blocks = block_obj._data.blocks if len(self.non_index_axes): axis, axis_labels = self.non_index_axes[0] - data_columns = self.validate_data_columns(data_columns, min_itemsize) + data_columns = self.validate_data_columns( + data_columns, min_itemsize) if len(data_columns): blocks = block_obj.reindex_axis(Index(axis_labels) - Index( - data_columns), axis=axis, copy=False)._data.blocks + data_columns), axis=axis, copy=False)._data.blocks for c in data_columns: blocks.extend(block_obj.reindex_axis( - [c], axis=axis, copy=False)._data.blocks) + [c], axis=axis, copy=False)._data.blocks) # reorder the blocks in the same order as the existing_table if we can if existing_table is not None: - by_items = dict([ (tuple(b.items.tolist()),b) for b in blocks ]) + by_items = dict([(tuple(b.items.tolist()), b) for b in blocks]) new_blocks = [] for ea in existing_table.values_axes: items = tuple(ea.values) @@ -2680,7 +2836,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, b = by_items.pop(items) new_blocks.append(b) except: - raise ValueError("cannot match existing table structure for [%s] on appending data" % items) + raise ValueError( + "cannot match existing table structure for [%s] on appending data" % + items) blocks = new_blocks # add my values @@ -2704,7 +2862,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, existing_col = existing_table.values_axes[i] except: raise ValueError("Incompatible appended table [%s] with existing table [%s]" % - (blocks,existing_table.values_axes)) + (blocks, existing_table.values_axes)) else: existing_col = None @@ -2721,10 +2879,12 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, col.set_pos(j) self.values_axes.append(col) - except (NotImplementedError, 
ValueError, TypeError), e: + except (NotImplementedError, ValueError, TypeError) as e: raise e - except (Exception), detail: - raise Exception("cannot find the correct atom type -> [dtype->%s,items->%s] %s" % (b.dtype.name, b.items, str(detail))) + except (Exception) as detail: + raise Exception( + "cannot find the correct atom type -> [dtype->%s,items->%s] %s" % + (b.dtype.name, b.items, str(detail))) j += 1 # validate our min_itemsize @@ -2755,29 +2915,32 @@ def process_filter(field, filt): # see if the field is the name of an axis if field == axis_name: - takers = op(axis_values,filt) - return obj.ix._getitem_axis(takers,axis=axis_number) + takers = op(axis_values, filt) + return obj.ix._getitem_axis(takers, axis=axis_number) # this might be the name of a file IN an axis elif field in axis_values: # we need to filter on this dimension - values = _ensure_index(getattr(obj,field).values) - filt = _ensure_index(filt) + values = _ensure_index(getattr(obj, field).values) + filt = _ensure_index(filt) # hack until we support reversed dim flags - if isinstance(obj,DataFrame): - axis_number = 1-axis_number - takers = op(values,filt) - return obj.ix._getitem_axis(takers,axis=axis_number) + if isinstance(obj, DataFrame): + axis_number = 1 - axis_number + takers = op(values, filt) + return obj.ix._getitem_axis(takers, axis=axis_number) - raise ValueError("cannot find the field [%s] for filtering!" % field) + raise ValueError( + "cannot find the field [%s] for filtering!" % + field) obj = process_filter(field, filt) return obj - def create_description(self, complib=None, complevel=None, fletcher32=False, expectedrows=None): + def create_description( + self, complib=None, complevel=None, fletcher32=False, expectedrows=None): """ create the description of the table from the axes & values """ # expected rows estimate @@ -2811,10 +2974,15 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return False # create the selection - self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs) + self.selection = Selection( + self, + where=where, + start=start, + stop=stop, + **kwargs) return Coordinates(self.selection.select_coords(), group=self.group, where=where) - def read_column(self, column, where = None, **kwargs): + def read_column(self, column, where=None, **kwargs): """ return a single column from the table, generally only indexables are interesting """ # validate the version @@ -2825,14 +2993,17 @@ def read_column(self, column, where = None, **kwargs): return False if where is not None: - raise Exception("read_column does not currently accept a where clause") + raise Exception( + "read_column does not currently accept a where clause") # find the axes for a in self.axes: if column == a.name: if not a.is_data_indexable: - raise ValueError("column [%s] can not be extracted individually; it is not data indexable" % column) + raise ValueError( + "column [%s] can not be extracted individually; it is not data indexable" % + column) # column must be an indexable or a data column c = getattr(self.table.cols, column) @@ -2841,7 +3012,9 @@ def read_column(self, column, where = None, **kwargs): raise KeyError("column [%s] not found in the table" % column) + class WORMTable(Table): + """ a write-once read-many table: this format DOES NOT ALLOW appending to a table. 
writing is a one-time operation the data are stored in a format that allows for searching the data on disk @@ -2861,6 +3034,7 @@ def write(self, **kwargs): class LegacyTable(Table): + """ an appendable table: allow append/query/delete operations to a (possibily) already existing appendable table this table ALLOWS append (but doesn't require them), and stores the data in a format @@ -2960,6 +3134,7 @@ def read(self, where=None, columns=None, **kwargs): class LegacyFrameTable(LegacyTable): + """ support the legacy frame table """ pandas_kind = u'frame_table' table_type = u'legacy_frame' @@ -2970,12 +3145,14 @@ def read(self, *args, **kwargs): class LegacyPanelTable(LegacyTable): + """ support the legacy panel table """ table_type = u'legacy_panel' obj_type = Panel class AppendableTable(LegacyTable): + """ suppor the new appendable table formats """ _indexables = None table_type = u'appendable' @@ -3043,7 +3220,8 @@ def write_data(self, chunksize): values = [a.take_data() for a in self.values_axes] # transpose the values so first dimension is last - values = [ v.transpose(np.roll(np.arange(v.ndim),v.ndim-1)) for v in values ] + values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) + for v in values] # write the chunks if chunksize is None: @@ -3076,15 +3254,17 @@ def write_data_chunk(self, indexes, mask, search, values): args = list(indexes) args.extend([self.dtype, mask, search, values]) rows = func(*args) - except (Exception), detail: + except (Exception) as detail: raise Exception("cannot create row-data -> %s" % str(detail)) try: if len(rows): self.table.append(rows) self.table.flush() - except (Exception), detail: - raise Exception("tables cannot write this data -> %s" % str(detail)) + except (Exception) as detail: + raise Exception( + "tables cannot write this data -> %s" % + str(detail)) def delete(self, where=None, **kwargs): @@ -3140,6 +3320,7 @@ def delete(self, where=None, **kwargs): class AppendableFrameTable(AppendableTable): + """ suppor the new appendable table formats """ pandas_kind = u'frame_table' table_type = u'appendable_frame' @@ -3169,10 +3350,10 @@ def read(self, where=None, columns=None, **kwargs): if self.is_transposed: values = a.cvalues index_ = cols - cols_ = Index(index,name=getattr(index,'name',None)) + cols_ = Index(index, name=getattr(index, 'name', None)) else: values = a.cvalues.T - index_ = Index(index,name=getattr(index,'name',None)) + index_ = Index(index, name=getattr(index, 'name', None)) cols_ = cols # if we have a DataIndexableCol, its shape will only be 1 dim @@ -3195,6 +3376,7 @@ def read(self, where=None, columns=None, **kwargs): class GenericTable(AppendableFrameTable): + """ a table that read/writes the generic pytables table format """ pandas_kind = u'frame_table' table_type = u'generic_table' @@ -3207,17 +3389,19 @@ def pandas_type(self): @property def storable(self): - return getattr(self.group,'table',None) or self.group + return getattr(self.group, 'table', None) or self.group def get_attrs(self): """ retrieve our attributes """ - self.non_index_axes = [] - self.nan_rep = None - self.levels = [] + self.non_index_axes = [] + self.nan_rep = None + self.levels = [] t = self.table - self.index_axes = [ a.infer(t) for a in self.indexables if a.is_an_indexable ] - self.values_axes = [ a.infer(t) for a in self.indexables if not a.is_an_indexable ] - self.data_columns = [ a.name for a in self.values_axes ] + self.index_axes = [a.infer(t) + for a in self.indexables if a.is_an_indexable] + self.values_axes = [a.infer(t) + for a in 
self.indexables if not a.is_an_indexable] + self.data_columns = [a.name for a in self.values_axes] @property def indexables(self): @@ -3227,11 +3411,15 @@ def indexables(self): d = self.description # the index columns is just a simple index - self._indexables = [ GenericIndexCol(name='index',axis=0) ] + self._indexables = [GenericIndexCol(name='index', axis=0)] for i, n in enumerate(d._v_names): - dc = GenericDataIndexableCol(name = n, pos=i, values = [ n ], version = self.version) + dc = GenericDataIndexableCol( + name=n, + pos=i, + values=[n], + version=self.version) self._indexables.append(dc) return self._indexables @@ -3239,7 +3427,9 @@ def indexables(self): def write(self, **kwargs): raise NotImplementedError("cannot write on an generic table") + class AppendableMultiFrameTable(AppendableFrameTable): + """ a frame with a multi-index """ table_type = u'appendable_multiframe' obj_type = DataFrame @@ -3265,12 +3455,17 @@ def read(self, columns=None, **kwargs): for n in self.levels: if n not in columns: columns.insert(0, n) - df = super(AppendableMultiFrameTable, self).read(columns=columns, **kwargs) + df = super( + AppendableMultiFrameTable, + self).read( + columns=columns, + **kwargs) df.set_index(self.levels, inplace=True) return df class AppendablePanelTable(AppendableTable): + """ suppor the new appendable table formats """ table_type = u'appendable_panel' ndim = 3 @@ -3288,23 +3483,26 @@ def is_transposed(self): class AppendableNDimTable(AppendablePanelTable): + """ suppor the new appendable table formats """ table_type = u'appendable_ndim' ndim = 4 obj_type = Panel4D + def _convert_index(index, encoding=None): - index_name = getattr(index,'name',None) + index_name = getattr(index, 'name', None) if isinstance(index, DatetimeIndex): converted = index.asi8 return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index,'freq',None), tz=getattr(index,'tz',None), + freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None), index_name=index_name) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() - return IndexCol(index.values, 'integer', atom, freq=getattr(index,'freq',None), - index_name=index_name) + return IndexCol( + index.values, 'integer', atom, freq=getattr(index, 'freq', None), + index_name=index_name) if isinstance(index, MultiIndex): raise Exception('MultiIndex not supported here!') @@ -3316,7 +3514,7 @@ def _convert_index(index, encoding=None): if inferred_type == 'datetime64': converted = values.view('i8') return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index,'freq',None), tz=getattr(index,'tz',None), + freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None), index_name=index_name) elif inferred_type == 'datetime': converted = np.array([(time.mktime(v.timetuple()) + @@ -3335,8 +3533,9 @@ def _convert_index(index, encoding=None): converted = _convert_string_array(values, encoding) itemsize = converted.dtype.itemsize - return IndexCol(converted, 'string', _tables().StringCol(itemsize), itemsize=itemsize, - index_name=index_name) + return IndexCol( + converted, 'string', _tables().StringCol(itemsize), itemsize=itemsize, + index_name=index_name) elif inferred_type == 'unicode': atom = _tables().ObjectAtom() return IndexCol(np.asarray(values, dtype='O'), 'object', atom, @@ -3355,6 +3554,7 @@ def _convert_index(index, encoding=None): return IndexCol(np.asarray(values, dtype='O'), 'object', atom, index_name=index_name) + def _unconvert_index(data, kind, encoding=None): kind = 
_ensure_decoded(kind) if kind == u'datetime64': @@ -3374,6 +3574,7 @@ def _unconvert_index(data, kind, encoding=None): raise ValueError('unrecognized index type %s' % kind) return index + def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): kind = _ensure_decoded(kind) if kind == u'datetime': @@ -3386,6 +3587,7 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): raise ValueError('unrecognized index type %s' % kind) return index + def _convert_string_array(data, encoding, itemsize=None): # encode if needed @@ -3397,19 +3599,20 @@ def _convert_string_array(data, encoding, itemsize=None): if itemsize is None: itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) - data = np.array(data,dtype="S%d" % itemsize) + data = np.array(data, dtype="S%d" % itemsize) return data + def _unconvert_string_array(data, nan_rep=None, encoding=None): """ deserialize a string array, possibly decoding """ shape = data.shape - data = np.array(data.ravel(),dtype=object) + data = np.array(data.ravel(), dtype=object) # guard against a None encoding in PY3 (because of a legacy # where the passed encoding is actually None) encoding = _ensure_encoding(encoding) if encoding is not None and len(data): - f = np.vectorize(lambda x: x.decode(encoding),otypes=[np.object]) + f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object]) data = f(data) if nan_rep is None: @@ -3418,6 +3621,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): data = lib.string_array_replace_from_nan_rep(data, nan_rep) return data.reshape(shape) + def _maybe_convert(values, val_kind, encoding): if _need_convert(val_kind): conv = _get_converter(val_kind, encoding) @@ -3425,6 +3629,7 @@ def _maybe_convert(values, val_kind, encoding): values = conv(values) return values + def _get_converter(kind, encoding): kind = _ensure_decoded(kind) if kind == 'datetime64': @@ -3432,17 +3637,20 @@ def _get_converter(kind, encoding): elif kind == 'datetime': return lib.convert_timestamps elif kind == 'string': - return lambda x: _unconvert_string_array(x,encoding=encoding) + return lambda x: _unconvert_string_array(x, encoding=encoding) else: # pragma: no cover raise ValueError('invalid kind %s' % kind) + def _need_convert(kind): kind = _ensure_decoded(kind) if kind in (u'datetime', u'datetime64', u'string'): return True return False + class Term(StringMixin): + """create a term object that holds a field, op, and value Parameters @@ -3470,10 +3678,13 @@ class Term(StringMixin): """ _ops = ['<=', '<', '>=', '>', '!=', '==', '='] - _search = re.compile("^\s*(?P\w+)\s*(?P%s)\s*(?P.+)\s*$" % '|'.join(_ops)) + _search = re.compile( + "^\s*(?P\w+)\s*(?P%s)\s*(?P.+)\s*$" % + '|'.join(_ops)) _max_selectors = 31 - def __init__(self, field, op=None, value=None, queryables=None, encoding=None): + def __init__(self, field, op=None, + value=None, queryables=None, encoding=None): self.field = None self.op = None self.value = None @@ -3538,8 +3749,10 @@ def __init__(self, field, op=None, value=None, queryables=None, encoding=None): # we have valid conditions if self.op in ['>', '>=', '<', '<=']: - if hasattr(self.value, '__iter__') and len(self.value) > 1 and not isinstance(self.value,basestring): - raise ValueError("an inequality condition cannot have multiple values [%s]" % str(self)) + if hasattr(self.value, '__iter__') and len(self.value) > 1 and not isinstance(self.value, basestring): + raise ValueError( + "an inequality condition cannot have multiple values [%s]" % + str(self)) if not 
is_list_like(self.value): self.value = [self.value] @@ -3581,7 +3794,7 @@ def eval(self): if self.is_in_table: values = [self.convert_value(v) for v in self.value] else: - values = [TermValue(v,v,self.kind) for v in self.value] + values = [TermValue(v, v, self.kind) for v in self.value] # equality conditions if self.op in ['==', '!=']: @@ -3592,21 +3805,26 @@ def eval(self): else: filter_op = lambda axis, vals: axis.isin(vals) - if self.is_in_table: # too many values to create the expression? if len(values) <= self._max_selectors: - vs = [ self.generate(v) for v in values ] + vs = [self.generate(v) for v in values] self.condition = "(%s)" % ' | '.join(vs) # use a filter after reading else: - self.filter = (self.field, filter_op, Index([v.value for v in values])) + self.filter = ( + self.field, + filter_op, + Index([v.value for v in values])) else: - self.filter = (self.field, filter_op, Index([v.value for v in values])) + self.filter = ( + self.field, + filter_op, + Index([v.value for v in values])) else: @@ -3616,7 +3834,9 @@ def eval(self): else: - raise TypeError("passing a filterable condition to a non-table indexer [%s]" % str(self)) + raise TypeError( + "passing a filterable condition to a non-table indexer [%s]" % + str(self)) def convert_value(self, v): """ convert the expression that is in the term to something that is accepted by pytables """ @@ -3628,34 +3848,37 @@ def stringify(value): return value kind = _ensure_decoded(self.kind) - if kind == u'datetime64' or kind == u'datetime' : + if kind == u'datetime64' or kind == u'datetime': v = lib.Timestamp(v) if v.tz is not None: v = v.tz_convert('UTC') - return TermValue(v,v.value,kind) + return TermValue(v, v.value, kind) elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u'date': v = time.mktime(v.timetuple()) - return TermValue(v,Timestamp(v),kind) + return TermValue(v, Timestamp(v), kind) elif kind == u'integer': v = int(float(v)) - return TermValue(v,v,kind) + return TermValue(v, v, kind) elif kind == u'float': v = float(v) - return TermValue(v,v,kind) + return TermValue(v, v, kind) elif kind == u'bool': if isinstance(v, basestring): - v = not v.strip().lower() in [u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u''] + v = not v.strip().lower() in [ + u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u''] else: v = bool(v) - return TermValue(v,v,kind) + return TermValue(v, v, kind) elif not isinstance(v, basestring): v = stringify(v) - return TermValue(v,stringify(v),u'string') + return TermValue(v, stringify(v), u'string') # string quoting - return TermValue(v,stringify(v),u'string') + return TermValue(v, stringify(v), u'string') + class TermValue(object): + """ hold a term value the we use to construct a condition/filter """ def __init__(self, value, converted, kind): @@ -3672,7 +3895,9 @@ def tostring(self, encoding): return '"%s"' % self.converted return self.converted + class Coordinates(object): + """ holds a returned coordinates list, useful to select the same rows from different tables coordinates : holds the array of coordinates @@ -3692,7 +3917,9 @@ def __getitem__(self, key): """ return a new coordinates object, sliced by the key """ return Coordinates(self.values[key], self.group, self.where) + class Selection(object): + """ Carries out a selection operation on a tables.Table object. 
@@ -3703,6 +3930,7 @@ class Selection(object): start, stop: indicies to start and/or stop selection """ + def __init__(self, table, where=None, start=None, stop=None, **kwargs): self.table = table self.where = where @@ -3720,9 +3948,10 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): # create the numexpr & the filter if self.terms: - terms = [ t for t in self.terms if t.condition is not None ] + terms = [t for t in self.terms if t.condition is not None] if len(terms): - self.condition = "(%s)" % ' & '.join([ t.condition for t in terms ]) + self.condition = "(%s)" % ' & '.join( + [t.condition for t in terms]) self.filter = [] for t in self.terms: if t.filter is not None: @@ -3767,13 +3996,13 @@ def select_coords(self): return self.table.table.getWhereList(self.condition, start=self.start, stop=self.stop, sort=True) -### utilities ### +# utilities ### -def timeit(key,df,fn=None,remove=True,**kwargs): +def timeit(key, df, fn=None, remove=True, **kwargs): if fn is None: fn = 'timeit.h5' - store = HDFStore(fn,mode='w') - store.append(key,df,**kwargs) + store = HDFStore(fn, mode='w') + store.append(key, df, **kwargs) store.close() if remove: From c665a85b6f7422403acf684d086141d0d701f952 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:32:24 -0400 Subject: [PATCH 26/37] DOC: reference future enhancingperf.eval section --- pandas/computation/eval.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py index e08e0f28d7877..1a681e37d6130 100644 --- a/pandas/computation/eval.py +++ b/pandas/computation/eval.py @@ -48,8 +48,10 @@ def eval(expr, engine='numexpr', truediv=True, local_dict=None, Notes ----- - The benefits of using ``eval`` are that very large frames that are terms in - long expressions are sped up, sometimes by as much as 10x. + * The benefits of using ``eval`` are that very large frames that are terms in + long expressions are sped up, sometimes by as much as 10x. + + See :ref:`Enhancing performance ` for more details. """ # make sure we're passed a valid engine if not engine in _engines: From cb27934a41ebcd1085ac08b587f44202103c3413 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:33:45 -0400 Subject: [PATCH 27/37] CLN/DOC: clean up docstrings in pytables --- pandas/io/pytables.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 013e596320250..1cb465cbdf16a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -167,9 +167,12 @@ def get_store(path, mode='a', complevel=None, complib=None, Examples -------- + >>> from pandas import DataFrame + >>> from numpy.random import randn + >>> bar = DataFrame(randn(10, 4)) >>> with get_store('test.h5') as store: - >>> store['foo'] = bar # write to HDF5 - >>> bar = store['foo'] # retrieve + ... store['foo'] = bar # write to HDF5 + ... 
bar = store['foo'] # retrieve """ store = None try: @@ -262,6 +265,9 @@ class HDFStore(object): Examples -------- + >>> from pandas import DataFrame + >>> from numpy.random import randn + >>> bar = DataFrame(randn(10, 4)) >>> store = HDFStore('test.h5') >>> store['foo'] = bar # write to HDF5 >>> bar = store['foo'] # retrieve From 63ba37d0943607f679fd9d3b4715ba38e8ae9739 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:34:20 -0400 Subject: [PATCH 28/37] CLN: actually pass fletcher32 in get_store --- pandas/io/pytables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1cb465cbdf16a..21da0d58b67f7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -177,7 +177,7 @@ def get_store(path, mode='a', complevel=None, complib=None, store = None try: store = HDFStore(path, mode=mode, complevel=complevel, - complib=complib, fletcher32=False) + complib=complib, fletcher32=fletcher32) yield store finally: if store is not None: From dcde5901f7975c4aac046d4f3c0b7c6629bc4f15 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:35:32 -0400 Subject: [PATCH 29/37] CLN: remove unused variables --- pandas/io/pytables.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 21da0d58b67f7..2ac4e19a7eb7b 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -278,7 +278,7 @@ class HDFStore(object): def __init__(self, path, mode=None, complevel=None, complib=None, fletcher32=False): try: - import tables as _ + import tables except ImportError: # pragma: no cover raise Exception('HDFStore requires PyTables') @@ -576,7 +576,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=start, stop=stop) nrows = len(c) - except (Exception) as detail: + except Exception: raise ValueError("invalid selector [%s]" % selector) def func(_start, _stop): @@ -1235,7 +1235,6 @@ def validate_col(self, itemsize=None): """ validate this column: return the compared against itemsize """ # validate this column for string truncation (or reset to the max size) - dtype = getattr(self, 'dtype', None) if _ensure_decoded(self.kind) == u'string': c = self.col @@ -2252,7 +2251,6 @@ def read(self, **kwargs): sdict = {} for name in items: key = 'sparse_frame_%s' % name - node = getattr(self.group, key) s = SparseFrameStorer(self.parent, getattr(self.group, key)) s.infer_axes() sdict[name] = s.read() @@ -2592,7 +2590,6 @@ def indexables(self): """ create/cache the indexables if they don't exist """ if self._indexables is None: - d = self.description self._indexables = [] # index columns From 3c4e2b3fa40df21ba477693ce647542f156f1e92 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 14:41:02 -0400 Subject: [PATCH 30/37] CLN: more pep8 and get rid of most raise Exception clauses --- pandas/io/pytables.py | 83 ++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2ac4e19a7eb7b..0f84884d51340 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -280,7 +280,7 @@ def __init__(self, path, mode=None, complevel=None, complib=None, try: import tables except ImportError: # pragma: no cover - raise Exception('HDFStore requires PyTables') + raise ImportError('HDFStore requires PyTables') self._path = path if mode is None: @@ -516,7 +516,8 @@ def select_column(self, key, column, **kwargs): return 
self.get_storer(key).read_column(column=column, **kwargs) def select_as_multiple(self, keys, where=None, selector=None, columns=None, - start=None, stop=None, iterator=False, chunksize=None, auto_close=False, **kwargs): + start=None, stop=None, iterator=False, + chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas objects from multiple tables Parameters @@ -538,13 +539,15 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, basestring): - return self.select(key=keys, where=where, columns=columns, start=start, stop=stop, iterator=iterator, chunksize=chunksize, **kwargs) + return self.select(key=keys, where=where, columns=columns, + start=start, stop=stop, iterator=iterator, + chunksize=chunksize, **kwargs) if not isinstance(keys, (list, tuple)): - raise Exception("keys must be a list/tuple") + raise TypeError("keys must be a list/tuple") - if len(keys) == 0: - raise Exception("keys must have a non-zero length") + if not len(keys): + raise ValueError("keys must have a non-zero length") if selector is None: selector = keys[0] @@ -686,13 +689,13 @@ def append(self, key, value, columns=None, **kwargs): data in the table, so be careful """ if columns is not None: - raise Exception( - "columns is not a supported keyword in append, try data_columns") + raise TypeError("columns is not a supported keyword in append, " + "try data_columns") self._write_to_group(key, value, table=True, append=True, **kwargs) - def append_to_multiple( - self, d, value, selector, data_columns=None, axes=None, **kwargs): + def append_to_multiple(self, d, value, selector, data_columns=None, + axes=None, **kwargs): """ Append to multiple tables @@ -711,8 +714,9 @@ def append_to_multiple( """ if axes is not None: - raise Exception( - "axes is currently not accepted as a paremter to append_to_multiple; you can create the tables indepdently instead") + raise TypeError("axes is currently not accepted as a parameter to" + " append_to_multiple; you can create the " + "tables indepdently instead") if not isinstance(d, dict): raise ValueError( @@ -770,7 +774,7 @@ def create_table_index(self, key, **kwargs): # version requirements _tables() if not _table_supports_index: - raise Exception("PyTables >= 2.3 is required for table indexing") + raise ValueError("PyTables >= 2.3 is required for table indexing") s = self.get_storer(key) if s is None: @@ -1005,8 +1009,8 @@ class TableIterator(object): kwargs : the passed kwargs """ - def __init__(self, store, func, nrows, start=None, - stop=None, chunksize=None, auto_close=False): + def __init__(self, store, func, nrows, start=None, stop=None, + chunksize=None, auto_close=False): self.store = store self.func = func self.nrows = nrows or 0 @@ -1928,7 +1932,7 @@ def read_index(self, key): _, index = self.read_index_node(getattr(self.group, key)) return index else: # pragma: no cover - raise Exception('unrecognized index variety: %s' % variety) + raise TypeError('unrecognized index variety: %s' % variety) def write_index(self, key, index): if isinstance(index, MultiIndex): @@ -2448,7 +2452,7 @@ def validate(self, other): (c, sax, oax)) # should never get here - raise Exception( + raise ValueError( "invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c, sv, ov)) @@ -2884,10 +2888,11 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, self.values_axes.append(col) except (NotImplementedError, ValueError, TypeError) as e: raise e - 
except (Exception) as detail: - raise Exception( - "cannot find the correct atom type -> [dtype->%s,items->%s] %s" % - (b.dtype.name, b.items, str(detail))) + except Exception as detail: + raise TypeError("cannot find the correct atom type -> " + "[dtype->%s,items->%s] %s" % (b.dtype.name, + b.items, + str(detail))) j += 1 # validate our min_itemsize @@ -2996,8 +3001,8 @@ def read_column(self, column, where=None, **kwargs): return False if where is not None: - raise Exception( - "read_column does not currently accept a where clause") + raise TypeError("read_column does not currently accept a where " + "clause") # find the axes for a in self.axes: @@ -3052,7 +3057,7 @@ class LegacyTable(Table): ndim = 3 def write(self, **kwargs): - raise Exception("write operations are not allowed on legacy tables!") + raise TypeError("write operations are not allowed on legacy tables!") def read(self, where=None, columns=None, **kwargs): """ we have n indexable columns, with an arbitrary number of data axes """ @@ -3257,17 +3262,15 @@ def write_data_chunk(self, indexes, mask, search, values): args = list(indexes) args.extend([self.dtype, mask, search, values]) rows = func(*args) - except (Exception) as detail: - raise Exception("cannot create row-data -> %s" % str(detail)) + except Exception as detail: + raise Exception("cannot create row-data -> %s" % detail) try: if len(rows): self.table.append(rows) self.table.flush() - except (Exception) as detail: - raise Exception( - "tables cannot write this data -> %s" % - str(detail)) + except Exception as detail: + raise TypeError("tables cannot write this data -> %s" % detail) def delete(self, where=None, **kwargs): @@ -3499,16 +3502,15 @@ def _convert_index(index, encoding=None): if isinstance(index, DatetimeIndex): converted = index.asi8 return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None), - index_name=index_name) + freq=getattr(index, 'freq', None), + tz=getattr(index, 'tz', None), index_name=index_name) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() - return IndexCol( - index.values, 'integer', atom, freq=getattr(index, 'freq', None), - index_name=index_name) + return IndexCol(index.values, 'integer', atom, freq=getattr(index, + 'freq', None), index_name=index_name) if isinstance(index, MultiIndex): - raise Exception('MultiIndex not supported here!') + raise TypeError('MultiIndex not supported here!') inferred_type = lib.infer_dtype(index) @@ -3517,8 +3519,8 @@ def _convert_index(index, encoding=None): if inferred_type == 'datetime64': converted = values.view('i8') return IndexCol(converted, 'datetime64', _tables().Int64Col(), - freq=getattr(index, 'freq', None), tz=getattr(index, 'tz', None), - index_name=index_name) + freq=getattr(index, 'freq', None), + tz=getattr(index, 'tz', None), index_name=index_name) elif inferred_type == 'datetime': converted = np.array([(time.mktime(v.timetuple()) + v.microsecond / 1E6) for v in values], @@ -3536,9 +3538,8 @@ def _convert_index(index, encoding=None): converted = _convert_string_array(values, encoding) itemsize = converted.dtype.itemsize - return IndexCol( - converted, 'string', _tables().StringCol(itemsize), itemsize=itemsize, - index_name=index_name) + return IndexCol(converted, 'string', _tables().StringCol(itemsize), + itemsize=itemsize, index_name=index_name) elif inferred_type == 'unicode': atom = _tables().ObjectAtom() return IndexCol(np.asarray(values, dtype='O'), 'object', atom, From 
226c7869742582cf62af604dcab2237cfe1750c4 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Thu, 4 Jul 2013 18:52:36 -0400 Subject: [PATCH 31/37] CLN: change NameError to match python --- pandas/computation/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 24000b27a033a..a35d80568b482 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -31,7 +31,7 @@ def _resolve_name(env, key): if not isinstance(key, basestring): return key - raise NameError('{0!r} is undefined'.format(key)) + raise NameError('name {0!r} is not defined'.format(key)) return res From 79871d8b9c24d7b52d8ab86897b43c2a5481c89d Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 5 Jul 2013 11:26:31 -0400 Subject: [PATCH 32/37] API: expose the Expr object to top level pandas --- pandas/__init__.py | 2 +- pandas/computation/api.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index bec0877b13bb8..5315fd770e796 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -29,7 +29,7 @@ from pandas.stats.api import * from pandas.tseries.api import * from pandas.io.api import * -from pandas.computation.api import eval +from pandas.computation.api import * from pandas.util.testing import debug diff --git a/pandas/computation/api.py b/pandas/computation/api.py index 86f72902a52c8..db8269a497768 100644 --- a/pandas/computation/api.py +++ b/pandas/computation/api.py @@ -1 +1,2 @@ from pandas.computation.eval import eval +from pandas.computation.expr import Expr From 84fdb453fb497ec73ae70cd059840d3b087fa828 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 5 Jul 2013 11:27:02 -0400 Subject: [PATCH 33/37] CLN/TST: fail with a NotImplementedError on and or not --- pandas/computation/expr.py | 4 ++++ pandas/computation/tests/test_eval.py | 33 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 60fea6e935070..666eb891f9929 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -77,6 +77,8 @@ def visit_BinOp(self, node): return op(left, right) def visit_UnaryOp(self, node): + if isinstance(node.op, ast.Not): + raise NotImplementedError("not operator not yet supported") op = self.visit(node.op) return op(self.visit(node.operand)) @@ -107,6 +109,8 @@ def visit_Call(self, node): def visit_Attribute(self, node): raise NotImplementedError("attribute access is not yet supported") + def visit_BoolOp(self, node): + raise NotImplementedError("boolean operators are not yet supported") class Expr(StringMixin): """Expr object""" diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 0a1356915523a..8e185f5b9772b 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -610,6 +610,39 @@ def test_is_expr(): check_is_expr(engine) +def check_not_fails(engine): + x = True + assert_raises(NotImplementedError, pd.eval, 'not x', engine=engine, + local_dict={'x': x}) + + +def test_not_fails(): + for engine in _engines: + check_not_fails(engine) + + +def check_and_fails(engine): + x, y = False, True + assert_raises(NotImplementedError, pd.eval, 'x and y', engine=engine, + local_dict={'x': x, 'y': y}) + + +def test_and_fails(): + for engine in _engines: + check_and_fails(engine) + + +def check_or_fails(engine): + x, y = True, False + assert_raises(NotImplementedError, pd.eval, 'x or y', engine=engine, + 
local_dict={'x': x, 'y': y}) + + +def test_or_fails(): + for engine in _engines: + check_or_fails(engine) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) From 4d9f9a7805efa54f08af7719207703e7722bb59d Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 5 Jul 2013 11:27:17 -0400 Subject: [PATCH 34/37] CLN: generlize operator/expression printing --- pandas/computation/ops.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index a35d80568b482..0d67c56ba472a 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -49,13 +49,7 @@ def _update_name(env, key, value): raise NameError('{0!r} is undefined'.format(key)) -class NamedObjectMixin(object): - @property - def typename(self): - return com.pprint_thing(self.__class__.__name__) - - -class Term(StringMixin, NamedObjectMixin): +class Term(StringMixin): def __init__(self, name, env): self.name = name self.value = _resolve_name(env, name) @@ -79,7 +73,11 @@ def __init__(self, value, env): super(Constant, self).__init__(value, env) -class Op(NamedObjectMixin, StringMixin): +def _print_operand(opr): + return opr.name if is_term(opr) else unicode(opr) + + +class Op(StringMixin): """Hold an operator of unknown arity """ def __init__(self, op, operands): @@ -90,12 +88,11 @@ def __iter__(self): return iter(self.operands) def __unicode__(self): - op = 'op={1!r}'.format(self.op) - operands = ', '.join('opr_{i}={opr}'.format(i=i, opr=opr) - for i, opr in enumerate(self.operands)) - return com.pprint_thing('{0}({op}, ' - '{operands})'.format(self.name, op=op, - operands=operands)) + """Print a generic n-ary operator and its operands""" + # recurse over the operands + parened = ('({0})'.format(_print_operand(opr)) + for opr in self.operands) + return com.pprint_thing(' {0} '.format(self.op).join(parened)) _cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=' @@ -161,10 +158,6 @@ def __init__(self, op, lhs, rhs): raise BinaryOperatorError('Invalid binary operator {0}, valid' ' operators are {1}'.format(op, keys)) - def __unicode__(self): - return com.pprint_thing('({0}) {1} ({2})'.format(self.lhs, self.op, - self.rhs)) - def __call__(self, env): # handle truediv if self.op == '/' and env.locals['truediv']: From a0d2ce0f458f18d43e87e9971d2625457e7c1814 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Fri, 5 Jul 2013 11:34:20 -0400 Subject: [PATCH 35/37] CLN: clean up testing and expr --- pandas/computation/expr.py | 1 + pandas/computation/tests/test_eval.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 666eb891f9929..6d33f6ac50a0d 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -112,6 +112,7 @@ def visit_Attribute(self, node): def visit_BoolOp(self, node): raise NotImplementedError("boolean operators are not yet supported") + class Expr(StringMixin): """Expr object""" def __init__(self, expr, engine='numexpr', env=None, truediv=True): diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 8e185f5b9772b..fc1cccf320201 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -42,7 +42,7 @@ def fractional(x): def hasfractional(x): - return np.any(fractional(x) != 0.0) + return np.any(fractional(x)) def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2): From 
From 317a1530b1e46a61fb4c97388108fdd7e43ece77 Mon Sep 17 00:00:00 2001
From: Phillip Cloud
Date: Sat, 6 Jul 2013 16:25:20 -0400
Subject: [PATCH 36/37] ENH: add modest type inference

---
 pandas/computation/align.py   | 16 ++++++++--------
 pandas/computation/engines.py |  3 ++-
 pandas/computation/ops.py     | 31 +++++++++++++++++++++++++------
 3 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/pandas/computation/align.py b/pandas/computation/align.py
index f2bf11d41e185..529fe84fd06a7 100644
--- a/pandas/computation/align.py
+++ b/pandas/computation/align.py
@@ -163,6 +163,7 @@ def _filter_terms(flat):
 
 
 def _align(terms, env):
+    # flatten the parse tree (a nested list)
     terms = list(flatten(terms))
 
@@ -181,7 +182,7 @@
     return typ, axes
 
 
-def _reconstruct_object(typ, obj, axes):
+def _reconstruct_object(typ, obj, axes, dtype):
     """Reconstruct an object given its type, raw value, and possibly empty
     (None) axes.
 
@@ -200,20 +201,19 @@
     An object of type ``typ`` with the value `obj` and possible axes
     `axes`.
     """
     try:
-        # handle numpy dtypes
         typ = typ.type
     except AttributeError:
         pass
 
     if (not isinstance(typ, partial) and
             issubclass(typ, pd.core.generic.PandasObject)):
-        return typ(obj, **axes)
+        return typ(obj, dtype=dtype, **axes)
 
-    ret_value = typ(obj)
+    ret_value = typ(obj).astype(dtype)
 
     try:
-        return ret_value.item()
-    except (AttributeError, ValueError):
-        return ret_value
-
+        ret = ret_value.item()
+    except ValueError:
+        ret = ret_value
+    return ret
diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py
index db6beb87da3a5..7f500dccb825b 100644
--- a/pandas/computation/engines.py
+++ b/pandas/computation/engines.py
@@ -25,7 +25,8 @@ def evaluate(self):
                 self.expr.env)
 
         res = self._evaluate(self.expr.env)
-        return _reconstruct_object(self.result_type, res, self.aligned_axes)
+        return _reconstruct_object(self.result_type, res, self.aligned_axes,
+                                   self.expr.terms.return_type)
 
     @property
     def _is_aligned(self):
diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py
index 0d67c56ba472a..ca5f6d4872a72 100644
--- a/pandas/computation/ops.py
+++ b/pandas/computation/ops.py
@@ -4,6 +4,7 @@
 from pandas.util.py3compat import PY3
 import pandas.core.common as com
 from pandas.core.base import StringMixin
+from pandas.computation.common import flatten
 
 
 _reductions = 'sum', 'prod'
@@ -46,15 +47,25 @@ def _update_name(env, key, value):
             del env.globals[key]
             env.globals[key] = value
         except KeyError:
-            raise NameError('{0!r} is undefined'.format(key))
+            raise NameError('name {0!r} is not defined'.format(key))
 
 
 class Term(StringMixin):
     def __init__(self, name, env):
         self.name = name
-        self.value = _resolve_name(env, name)
         self.env = env
-        self.type = type(self.value)
+        self.value = _resolve_name(self.env, self.name)
+
+        try:
+            # ndframe potentially very slow for large, mixed dtype frames
+            self.type = self.value.values.dtype
+        except AttributeError:
+            try:
+                # ndarray
+                self.type = self.value.dtype
+            except AttributeError:
+                # scalar
+                self.type = type(self.value)
 
     def __unicode__(self):
         return com.pprint_thing(self.name)
@@ -88,15 +99,23 @@ def __iter__(self):
         return iter(self.operands)
 
     def __unicode__(self):
-        """Print a generic n-ary operator and its operands"""
+        """Print a generic n-ary operator and its operands using infix
+        notation"""
         # recurse over the operands
         parened = ('({0})'.format(_print_operand(opr))
                    for opr in self.operands)
         return com.pprint_thing(' {0} '.format(self.op).join(parened))
 
+    @property
+    def return_type(self):
+        # clobber types to bool if the op is a boolean operator
+        if self.op in (_cmp_ops_syms + _bool_ops_syms):
+            return np.bool_
+        return np.result_type(*(term.type for term in flatten(self)))
+
 
-_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!='
-_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne
+_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', '='
+_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, op.eq
 _cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs))
 
 _bool_ops_syms = '&', '|'
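The "modest type inference" in patch 36 is a chain of duck-typed dtype lookups plus numpy's promotion rules. A sketch of the idea, assuming numpy and pandas are importable; infer_type is a hypothetical helper, not pandas API:

    import numpy as np
    import pandas as pd

    def infer_type(value):
        """Hypothetical helper mirroring Term.__init__'s fallback chain."""
        try:
            return value.values.dtype   # Series/DataFrame: underlying ndarray
        except AttributeError:
            try:
                return value.dtype      # plain ndarray
            except AttributeError:
                return type(value)      # scalar fallback

    print(infer_type(pd.Series([1.0, 2.0])))  # float64
    print(infer_type(np.arange(3)))           # platform int dtype, e.g. int64
    print(infer_type(2.5))                    # the Python float type

    # an Op's return type is then the numpy promotion of its operands' types,
    # clobbered to bool for comparison and &/| operators
    print(np.result_type(np.dtype('float64'), np.dtype('int64')))  # float64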
From 401bc288cc7e145a8a4076376204d59793e94b02 Mon Sep 17 00:00:00 2001
From: Phillip Cloud
Date: Sat, 6 Jul 2013 16:25:58 -0400
Subject: [PATCH 37/37] ENH: rewrite assignment as an equality comparison

---
 pandas/computation/expr.py            | 47 ++++++++++++++++++++++++---
 pandas/computation/tests/test_eval.py |  4 ++-
 2 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py
index 6d33f6ac50a0d..9a9cd226278bc 100644
--- a/pandas/computation/expr.py
+++ b/pandas/computation/expr.py
@@ -1,7 +1,12 @@
 import ast
 import sys
+import itertools
+import tokenize
+import re
+from cStringIO import StringIO
 from functools import partial
+
 from pandas.core.base import StringMixin
 from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops
 from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms
@@ -26,12 +31,38 @@ class ExprParserError(Exception):
     pass
 
 
+def _rewrite_assign(source):
+    res = []
+    g = tokenize.generate_tokens(StringIO(source).readline)
+    for toknum, tokval, _, _, _ in g:
+        res.append((toknum, '==' if tokval == '=' else tokval))
+    return tokenize.untokenize(res)
+
+
+def _parenthesize_booleans(source, ops='|&'):
+    res = source
+    for op in ops:
+        terms = res.split(op)
+
+        t = []
+        for term in terms:
+            t.append('({0})'.format(term))
+
+        res = op.join(t)
+    return res
+
+
+def preparse(source):
+    return _parenthesize_booleans(_rewrite_assign(source))
+
+
 class ExprVisitor(ast.NodeVisitor):
     """Custom ast walker
     """
     bin_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms
-    bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'BitAnd', 'BitOr',
-                    'Add', 'Sub', 'Mult', 'Div', 'Pow', 'FloorDiv', 'Mod')
+    bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', None,
+                    'BitAnd', 'BitOr', 'Add', 'Sub', 'Mult', 'Div', 'Pow',
+                    'FloorDiv', 'Mod')
     bin_op_nodes_map = dict(zip(bin_ops, bin_op_nodes))
 
     unary_ops = _unary_ops_syms
@@ -39,7 +70,7 @@ class ExprVisitor(ast.NodeVisitor):
     unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))
 
     def __init__(self, env):
-        for bin_op in self.bin_ops:
+        for bin_op in itertools.ifilter(lambda x: x is not None, self.bin_ops):
             setattr(self,
                     'visit_{0}'.format(self.bin_op_nodes_map[bin_op]),
                     lambda node, bin_op=bin_op: partial(BinOp, bin_op))
@@ -54,7 +85,7 @@ def visit(self, node):
             raise TypeError('"node" must be an AST node or a string, you'
                             ' passed a(n) {0}'.format(node.__class__))
         if isinstance(node, basestring):
-            node = ast.fix_missing_locations(ast.parse(node))
+            node = ast.fix_missing_locations(ast.parse(preparse(node)))
 
         return super(ExprVisitor, self).visit(node)
 
@@ -62,7 +93,7 @@ def visit_Module(self, node):
             raise ExprParserError('only a single expression is allowed')
 
         expr = node.body[0]
-        if not isinstance(expr, ast.Expr):
+        if not isinstance(expr, (ast.Expr, ast.Assign)):
             raise SyntaxError('only expressions are allowed')
 
         return self.visit(expr)
@@ -95,6 +126,12 @@ def visit_Compare(self, node):
             raise ExprParserError('chained comparisons not supported')
         return self.visit(ops[0])(self.visit(node.left), self.visit(comps[0]))
 
+    def visit_Assign(self, node):
+        cmpr = ast.copy_location(ast.Compare(ops=[ast.Eq()],
+                                             left=node.targets[0],
+                                             comparators=[node.value]), node)
+        return self.visit(cmpr)
+
     def visit_Call(self, node):
         if not isinstance(node.func, ast.Name):
             raise TypeError("Only named functions are supported")
diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py
index fc1cccf320201..6ec630b80614d 100644
--- a/pandas/computation/tests/test_eval.py
+++ b/pandas/computation/tests/test_eval.py
@@ -52,7 +52,9 @@ def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2):
     env = Scope()
     typ, axes = _align_core((Term('lhs', env), Term('rhs', env)))
     lhs, rhs = env.locals['lhs'], env.locals['rhs']
-    return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes)
+    ret_type = np.result_type(lhs, rhs)
+    return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes,
+                               ret_type)
 
 
 def _eval_single_bin(lhs, cmp1, rhs, has_neg_frac):
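The source-level half of patch 37 can be exercised on its own. The sketch below mirrors _rewrite_assign and _parenthesize_booleans but is written against Python 3's io.StringIO rather than the cStringIO used above; tokenize.untokenize only guarantees a round-trip, so output spacing may differ from the input:

    import tokenize
    from io import StringIO  # the patch itself targets Python 2 / cStringIO

    def rewrite_assign(source):
        # re-emit every token, swapping a bare '=' for '=='
        g = tokenize.generate_tokens(StringIO(source).readline)
        pairs = [(toknum, '==' if tokval == '=' else tokval)
                 for toknum, tokval, _, _, _ in g]
        return tokenize.untokenize(pairs)

    def parenthesize_booleans(source, ops='|&'):
        # wrap the operands of | and & in parentheses, one operator at a time
        for op in ops:
            source = op.join('({0})'.format(t) for t in source.split(op))
        return source

    print(rewrite_assign('a = 1'))                 # a ==1  ('=' became '==')
    print(parenthesize_booleans('a > 1 | b < 2'))  # ((a > 1 )|( b < 2))

Note that when an operator is absent, parenthesize_booleans simply wraps the whole expression in one more set of parentheses, which is harmless for parsing.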
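The AST-level half converts an Assign node into the equivalent Eq comparison before visiting it, which is what visit_Assign does above. A standalone sketch using only the stdlib (assign_to_compare is a hypothetical name for that logic):

    import ast

    def assign_to_compare(node):
        # node is an ast.Assign with one target, e.g. parsed from 'a = b'
        return ast.copy_location(
            ast.Compare(ops=[ast.Eq()], left=node.targets[0],
                        comparators=[node.value]),
            node)

    assign = ast.parse('a = b').body[0]
    print(ast.dump(assign_to_compare(assign)))
    # -> Compare(left=Name(id='a', ...), ops=[Eq()],
    #            comparators=[Name(id='b', ...)])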