diff --git a/pandas/__init__.py b/pandas/__init__.py index a0edb397c28c1..5315fd770e796 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -29,6 +29,7 @@ from pandas.stats.api import * from pandas.tseries.api import * from pandas.io.api import * +from pandas.computation.api import * from pandas.util.testing import debug diff --git a/pandas/computation/__init__.py b/pandas/computation/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/computation/align.py b/pandas/computation/align.py new file mode 100644 index 0000000000000..529fe84fd06a7 --- /dev/null +++ b/pandas/computation/align.py @@ -0,0 +1,220 @@ +from functools import partial, wraps +from itertools import izip + +import numpy as np + +import pandas as pd +import pandas.core.common as com +from pandas.computation.ops import is_const +from pandas.computation.common import flatten + + +def _align_core_single_unary_op(term): + if isinstance(term.value, np.ndarray) and not com.is_series(term.value): + typ = partial(np.asanyarray, dtype=term.value.dtype) + else: + typ = type(term.value) + ret = typ, + + if not hasattr(term.value, 'axes'): + ret += None, + else: + ret += _zip_axes_from_type(typ, term.value.axes), + return ret + + +def _zip_axes_from_type(typ, new_axes): + axes = {} + for ax_ind, ax_name in typ._AXIS_NAMES.iteritems(): + axes[ax_name] = new_axes[ax_ind] + return axes + + +def _maybe_promote_shape(values, naxes): + # test to see if we have an array else leave since must be a number + if not isinstance(values, np.ndarray): + return values + + ndims = values.ndim + if ndims > naxes: + raise AssertionError('cannot have more dims than axes, ' + '{0} > {1}'.format(ndims, naxes)) + if ndims == naxes: + return values + + ndim = set(xrange(ndims)) + nax = set(xrange(naxes)) + + axes_slice = [slice(None)] * naxes + + # symmetric difference of numaxes and ndims + slices = nax - ndim + + if ndims == naxes: + if slices: + raise AssertionError('slices should be empty if ndims == naxes ' + '{0}'.format(slices)) + else: + if not slices: + raise AssertionError('slices should NOT be empty if ndim != naxes ' + '{0}'.format(slices)) + + for sl in slices: + axes_slice[sl] = np.newaxis + + return values[tuple(axes_slice)] + + +def _any_pandas_objects(terms): + """Check a sequence of terms for instances of PandasObject.""" + return any(com.is_pd_obj(term.value) for term in terms) + + +def _filter_special_cases(f): + @wraps(f) + def wrapper(terms): + # single unary operand + if len(terms) == 1: + return _align_core_single_unary_op(terms[0]) + + # only scalars + elif all(term.isscalar for term in terms): + return np.result_type(*(term.value for term in terms)), None + + # single element ndarrays + all_has_size = all(hasattr(term.value, 'size') for term in terms) + if (all_has_size and all(term.value.size == 1 for term in terms)): + return np.result_type(*(term.value for term in terms)), None + + # no pandas so just punt to the evaluator + if not _any_pandas_objects(terms): + return np.result_type(*(term.value for term in terms)), None + + return f(terms) + return wrapper + + +@_filter_special_cases +def _align_core(terms): + term_index = [i for i, term in enumerate(terms) if hasattr(term.value, + 'axes')] + term_dims = [terms[i].value.ndim for i in term_index] + ndims = pd.Series(dict(zip(term_index, term_dims))) + + # initial axes are the axes of the largest-axis'd term + biggest = terms[ndims.idxmax()].value + typ = biggest._constructor + axes = biggest.axes + naxes = len(axes) + + for term in 
(terms[i] for i in term_index): + for axis, items in enumerate(term.value.axes): + if com.is_series(term.value) and naxes > 1: + ax, itm = naxes - 1, term.value.index + else: + ax, itm = axis, items + axes[ax] = axes[ax].join(itm, how='outer') + + for i, ndim in ndims.iteritems(): + for axis, items in izip(xrange(ndim), axes): + ti = terms[i].value + + if hasattr(ti, 'reindex_axis'): + transpose = com.is_series(ti) and naxes > 1 + + if transpose: + f = partial(ti.reindex, index=axes[naxes - 1], copy=False) + else: + f = partial(ti.reindex_axis, items, axis=axis, copy=False) + + if pd.lib.is_bool_array(ti.values): + r = f(fill_value=True) + else: + r = f() + + terms[i].update(r) + + res = _maybe_promote_shape(terms[i].value.T if transpose else + terms[i].value, naxes) + res = res.T if transpose else res + + try: + v = res.values + except AttributeError: + v = res + terms[i].update(v) + + return typ, _zip_axes_from_type(typ, axes) + + +def _filter_terms(flat): + # numeric literals + literals = set(filter(is_const, flat)) + + # these are strings which are variable names + names = set(flat) - literals + + # literals are not names and names are not literals, so intersection should + # be empty + if literals & names: + raise ValueError('literals cannot be names and names cannot be ' + 'literals') + return names, literals + + +def _align(terms, env): + + # flatten the parse tree (a nested list) + terms = list(flatten(terms)) + + # separate names and literals + names, literals = _filter_terms(terms) + + if not names: # only literals so just promote to a common type + return np.result_type(*literals).type, None + + # if all resolved variables are numeric scalars + if all(term.isscalar for term in terms): + return np.result_type(*(term.value for term in terms)).type, None + + # perform the main alignment + typ, axes = _align_core(terms) + return typ, axes + + +def _reconstruct_object(typ, obj, axes, dtype): + """Reconstruct an object given its type, raw value, and possibly empty + (None) axes. + + Parameters + ---------- + typ : object + A type + obj : object + The value to use in the type constructor + axes : dict + The axes to use to construct the resulting pandas object + + Returns + ------- + reconst : typ + An object of type ``typ`` with the value `obj` and possible axes + `axes`. 
+ """ + #import ipdb; ipdb.set_trace() + try: + typ = typ.type + except AttributeError: + pass + + if (not isinstance(typ, partial) and + issubclass(typ, pd.core.generic.PandasObject)): + return typ(obj, dtype=dtype, **axes) + + ret_value = typ(obj).astype(dtype) + + try: + ret = ret_value.item() + except ValueError: + ret = ret_value + return ret diff --git a/pandas/computation/api.py b/pandas/computation/api.py new file mode 100644 index 0000000000000..db8269a497768 --- /dev/null +++ b/pandas/computation/api.py @@ -0,0 +1,2 @@ +from pandas.computation.eval import eval +from pandas.computation.expr import Expr diff --git a/pandas/computation/common.py b/pandas/computation/common.py new file mode 100644 index 0000000000000..4061984dd5e08 --- /dev/null +++ b/pandas/computation/common.py @@ -0,0 +1,11 @@ +import collections +from pandas.core.common import is_string + + +def flatten(l): + for el in l: + if isinstance(el, collections.Iterable) and not is_string(el): + for s in flatten(el): + yield s + else: + yield el diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py new file mode 100644 index 0000000000000..7f500dccb825b --- /dev/null +++ b/pandas/computation/engines.py @@ -0,0 +1,80 @@ +import abc + +from pandas.computation.align import _align, _reconstruct_object + + +class AbstractEngine(object): + """""" + __metaclass__ = abc.ABCMeta + + has_neg_frac = False + + def __init__(self, expr): + self.expr = expr + self.aligned_axes = None + self.result_type = None + + @abc.abstractmethod + def convert(self): + """Convert an expression for evaluation.""" + pass + + def evaluate(self): + if not self._is_aligned: + self.result_type, self.aligned_axes = _align(self.expr.terms, + self.expr.env) + + res = self._evaluate(self.expr.env) + return _reconstruct_object(self.result_type, res, self.aligned_axes, + self.expr.terms.return_type) + + @property + def _is_aligned(self): + return self.aligned_axes is not None and self.result_type is not None + + @abc.abstractmethod + def _evaluate(self, env): + """Return an evaluated expression.""" + pass + + +class NumExprEngine(AbstractEngine): + """NumExpr engine class""" + has_neg_frac = True + + def __init__(self, expr): + super(NumExprEngine, self).__init__(expr) + + def convert(self): + """Return a string""" + return '%s' % self.expr + + def _evaluate(self, env): + import numexpr as ne + + try: + return ne.evaluate(self.convert(), local_dict=env.locals, + global_dict=env.globals, + truediv=self.expr.truediv) + except KeyError as e: + raise NameError('{0!r} is not defined'.format(e.message)) + + +class PythonEngine(AbstractEngine): + """Use NumPy even if numexpr is installed""" + has_neg_frac = False + + def __init__(self, expr): + super(PythonEngine, self).__init__(expr) + + def convert(self): + pass + + def evaluate(self): + return self.expr(self.expr.env) + + def _evaluate(self, env): + pass + + +_engines = {'numexpr': NumExprEngine, 'python': PythonEngine} diff --git a/pandas/computation/eval.py b/pandas/computation/eval.py new file mode 100644 index 0000000000000..1a681e37d6130 --- /dev/null +++ b/pandas/computation/eval.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python + +import numbers + +import numpy as np + +import six + +from pandas.computation.expr import Expr, Scope +from pandas.computation.engines import _engines + + +def eval(expr, engine='numexpr', truediv=True, local_dict=None, + global_dict=None): + """Evaluate a Python expression as a string using various backends. 
+ + The following arithmetic operations are supported: +, -, *, /, **, %, // + (python engine only) along with the following boolean operations: | (or), & + (and), and ~ (not). All Pandas objects are supported and behave as they + would with in-Python evaluation. + + Parameters + ---------- + expr : string or Expr object + The expression to evaluate. This can be either a string or an ``Expr`` + object. + engine : string, optional, default 'numexpr', {'python', 'numexpr', 'pytables'} + The engine used to evaluate the expression. Supported engines are + + - 'numexpr': This default engine evaluates pandas objects using numexpr + for large speed ups in complex expressions with large + frames. + - 'python': Performs operations as if you had eval'd in top level + python + - 'pytables': Engine used for evaluating expressions for selection of + objects from PyTables HDF5 tables. + + truediv : bool, optional, default True + Whether to use true division, like in Python >= 3 + local_dict : dict or None, optional, default None + A dictionary of local variables, taken from locals() by default. + global_dict : dict or None, optional, default None + A dictionary of global variables, taken from globals() by default. + + Returns + ------- + obj : ndarray, scalar, DataFrame, Series, or Panel + + Notes + ----- + * The benefits of using ``eval`` are that very large frames that are terms in + long expressions are sped up, sometimes by as much as 10x. + + See :ref:`Enhancing performance ` for more details. + """ + # make sure we're passed a valid engine + if not engine in _engines: + raise KeyError('Invalid engine {0} passed, valid engines are' + ' {1}'.format(_engines.keys())) + + eng = _engines[engine] + + if isinstance(expr, six.string_types): + # need to go 2 up in the call stack from the constructor since we want + # the calling scope's variables + env = Scope(global_dict, local_dict, frame_level=2) + parsed_expr = Expr(expr, engine, env, truediv) + elif isinstance(expr, Expr): + parsed_expr = expr + else: + raise TypeError("eval only accepts strings and Expr objects, you " + "passed a {0!r}".format(expr.__class__.__name__)) + + + # construct the engine and evaluate + ret = eng(parsed_expr).evaluate() + + # sanity check for a number + # TODO: eventually take out + # TODO: pytables engine will probably need a string check + if np.isscalar(ret): + if not isinstance(ret, (np.number, np.bool_, numbers.Number)): + raise TypeError('scalar result must be numeric or bool, passed ' + 'type is {0!r}'.format(ret.__class__.__name__)) + return ret diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py new file mode 100644 index 0000000000000..9a9cd226278bc --- /dev/null +++ b/pandas/computation/expr.py @@ -0,0 +1,187 @@ +import ast +import sys +import itertools +import tokenize +import re +from cStringIO import StringIO +from functools import partial + + +from pandas.core.base import StringMixin +from pandas.computation.ops import BinOp, UnaryOp, _reductions, _mathops +from pandas.computation.ops import _cmp_ops_syms, _bool_ops_syms +from pandas.computation.ops import _arith_ops_syms, _unary_ops_syms +from pandas.computation.ops import Term, Constant + + +class Scope(object): + __slots__ = 'globals', 'locals' + + def __init__(self, gbls=None, lcls=None, frame_level=1): + frame = sys._getframe(frame_level) + + try: + self.globals = gbls or frame.f_globals.copy() + self.locals = lcls or frame.f_locals.copy() + finally: + del frame + + +class ExprParserError(Exception): + pass + + +def 
_rewrite_assign(source): + res = [] + g = tokenize.generate_tokens(StringIO(source).readline) + for toknum, tokval, _, _, _ in g: + res.append((toknum, '==' if tokval == '=' else tokval)) + return tokenize.untokenize(res) + + +def _parenthesize_booleans(source, ops='|&'): + res = source + for op in ops: + terms = res.split(op) + + t = [] + for term in terms: + t.append('({0})'.format(term)) + + res = op.join(t) + return res + + +def preparse(source): + return _parenthesize_booleans(_rewrite_assign(source)) + + +class ExprVisitor(ast.NodeVisitor): + """Custom ast walker + """ + bin_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms + bin_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', None, + 'BitAnd', 'BitOr', 'Add', 'Sub', 'Mult', 'Div', 'Pow', + 'FloorDiv', 'Mod') + bin_op_nodes_map = dict(zip(bin_ops, bin_op_nodes)) + + unary_ops = _unary_ops_syms + unary_op_nodes = 'UAdd', 'USub', 'Invert' + unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + + def __init__(self, env): + for bin_op in itertools.ifilter(lambda x: x is not None, self.bin_ops): + setattr(self, 'visit_{0}'.format(self.bin_op_nodes_map[bin_op]), + lambda node, bin_op=bin_op: partial(BinOp, bin_op)) + + for unary_op in self.unary_ops: + setattr(self, + 'visit_{0}'.format(self.unary_op_nodes_map[unary_op]), + lambda node, unary_op=unary_op: partial(UnaryOp, unary_op)) + self.env = env + + def visit(self, node): + if not (isinstance(node, ast.AST) or isinstance(node, basestring)): + raise TypeError('"node" must be an AST node or a string, you' + ' passed a(n) {0}'.format(node.__class__)) + if isinstance(node, basestring): + node = ast.fix_missing_locations(ast.parse(preparse(node))) + return super(ExprVisitor, self).visit(node) + + def visit_Module(self, node): + if len(node.body) != 1: + raise ExprParserError('only a single expression is allowed') + + expr = node.body[0] + if not isinstance(expr, (ast.Expr, ast.Assign)): + raise SyntaxError('only expressions are allowed') + + return self.visit(expr) + + def visit_Expr(self, node): + return self.visit(node.value) + + def visit_BinOp(self, node): + op = self.visit(node.op) + left = self.visit(node.left) + right = self.visit(node.right) + return op(left, right) + + def visit_UnaryOp(self, node): + if isinstance(node.op, ast.Not): + raise NotImplementedError("not operator not yet supported") + op = self.visit(node.op) + return op(self.visit(node.operand)) + + def visit_Name(self, node): + return Term(node.id, self.env) + + def visit_Num(self, node): + return Constant(node.n, self.env) + + def visit_Compare(self, node): + ops = node.ops + comps = node.comparators + if len(ops) != 1: + raise ExprParserError('chained comparisons not supported') + return self.visit(ops[0])(self.visit(node.left), self.visit(comps[0])) + + def visit_Assign(self, node): + cmpr = ast.copy_location(ast.Compare(ops=[ast.Eq()], + left=node.targets[0], + comparators=[node.value]), node) + return self.visit(cmpr) + + def visit_Call(self, node): + if not isinstance(node.func, ast.Name): + raise TypeError("Only named functions are supported") + + valid_ops = _reductions + _mathops + + if node.func.id not in valid_ops: + raise ValueError("Only {0} are supported".format(valid_ops)) + + raise NotImplementedError("function calls not yet supported") + + def visit_Attribute(self, node): + raise NotImplementedError("attribute access is not yet supported") + + def visit_BoolOp(self, node): + raise NotImplementedError("boolean operators are not yet supported") + + +class Expr(StringMixin): + """Expr 
object""" + def __init__(self, expr, engine='numexpr', env=None, truediv=True): + self.expr = expr + self.env = env or Scope(frame_level=2) + self._visitor = ExprVisitor(self.env) + self.terms = self.parse() + self.engine = engine + self.truediv = truediv + + def __call__(self, env): + env.locals['truediv'] = self.truediv + return self.terms(env) + + def __unicode__(self): + return unicode(self.terms) + + def parse(self): + """return a Termset""" + return self._visitor.visit(self.expr) + + def align(self): + """align a set of Terms""" + return self.terms.align(self.env) + + +def isexpr(s, check_names=True): + try: + Expr(s) + except SyntaxError: + return False + except NameError: + return not check_names + else: + return True diff --git a/pandas/core/expressions.py b/pandas/computation/expressions.py similarity index 58% rename from pandas/core/expressions.py rename to pandas/computation/expressions.py index abe891b82410c..0c13a50d15618 100644 --- a/pandas/core/expressions.py +++ b/pandas/computation/expressions.py @@ -5,7 +5,9 @@ Offer fast expression evaluation thru numexpr """ + import numpy as np +import pandas.core.common as com try: import numexpr as ne @@ -14,17 +16,19 @@ _NUMEXPR_INSTALLED = False _USE_NUMEXPR = _NUMEXPR_INSTALLED -_evaluate = None -_where = None +_evaluate = None +_where = None # the set of dtypes that we will allow pass to numexpr -_ALLOWED_DTYPES = dict(evaluate = set(['int64','int32','float64','float32','bool']), - where = set(['int64','float64','bool'])) +_ALLOWED_DTYPES = dict( + evaluate=set(['int64', 'int32', 'float64', 'float32', 'bool']), + where=set(['int64', 'float64', 'bool'])) # the minimum prod shape that we will use numexpr -_MIN_ELEMENTS = 10000 +_MIN_ELEMENTS = 10000 + -def set_use_numexpr(v = True): +def set_use_numexpr(v=True): # set/unset to use numexpr global _USE_NUMEXPR if _NUMEXPR_INSTALLED: @@ -34,43 +38,42 @@ def set_use_numexpr(v = True): global _evaluate, _where if not _USE_NUMEXPR: _evaluate = _evaluate_standard - _where = _where_standard + _where = _where_standard else: _evaluate = _evaluate_numexpr - _where = _where_numexpr + _where = _where_numexpr + -def set_numexpr_threads(n = None): +def set_numexpr_threads(n=None): # if we are using numexpr, set the threads to n # otherwise reset - try: - if _NUMEXPR_INSTALLED and _USE_NUMEXPR: - if n is None: - n = ne.detect_number_of_cores() - ne.set_num_threads(n) - except: - pass + if _NUMEXPR_INSTALLED and _USE_NUMEXPR: + if n is None: + n = ne.detect_number_of_cores() + ne.set_num_threads(n) def _evaluate_standard(op, op_str, a, b, raise_on_error=True, **eval_kwargs): """ standard evaluation """ - return op(a,b) + return op(a, b) + def _can_use_numexpr(op, op_str, a, b, dtype_check): """ return a boolean if we WILL be using numexpr """ if op_str is not None: - + # required min elements (otherwise we are adding overhead) if np.prod(a.shape) > _MIN_ELEMENTS: # check for dtype compatiblity dtypes = set() - for o in [ a, b ]: - if hasattr(o,'get_dtype_counts'): + for o in [a, b]: + if hasattr(o, 'get_dtype_counts'): s = o.get_dtype_counts() if len(s) > 1: return False dtypes |= set(s.index) - elif isinstance(o,np.ndarray): + elif isinstance(o, np.ndarray): dtypes |= set([o.dtype.name]) # allowed are a superset @@ -79,62 +82,61 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): return False -def _evaluate_numexpr(op, op_str, a, b, raise_on_error = False, **eval_kwargs): + +def _evaluate_numexpr(op, op_str, a, b, raise_on_error=False, **eval_kwargs): result = None if _can_use_numexpr(op, 
op_str, a, b, 'evaluate'): try: a_value, b_value = a, b - if hasattr(a_value,'values'): + if hasattr(a_value, 'values'): a_value = a_value.values - if hasattr(b_value,'values'): + if hasattr(b_value, 'values'): b_value = b_value.values - result = ne.evaluate('a_value %s b_value' % op_str, - local_dict={ 'a_value' : a_value, - 'b_value' : b_value }, + result = ne.evaluate('a_value %s b_value' % op_str, + local_dict={'a_value': a_value, + 'b_value': b_value}, casting='safe', **eval_kwargs) - except (ValueError), detail: - if 'unknown type object' in str(detail): - pass - except (Exception), detail: - if raise_on_error: - raise TypeError(str(detail)) + except Exception as detail: + if ('unknown type object' not in com.pprint_thing(detail) and + raise_on_error): + raise if result is None: - result = _evaluate_standard(op,op_str,a,b,raise_on_error) + result = _evaluate_standard(op, op_str, a, b, raise_on_error) return result -def _where_standard(cond, a, b, raise_on_error=True): + +def _where_standard(cond, a, b, raise_on_error=True): return np.where(cond, a, b) -def _where_numexpr(cond, a, b, raise_on_error = False): + +def _where_numexpr(cond, a, b, raise_on_error=False): result = None if _can_use_numexpr(None, 'where', a, b, 'where'): try: cond_value, a_value, b_value = cond, a, b - if hasattr(cond_value,'values'): + if hasattr(cond_value, 'values'): cond_value = cond_value.values - if hasattr(a_value,'values'): + if hasattr(a_value, 'values'): a_value = a_value.values - if hasattr(b_value,'values'): + if hasattr(b_value, 'values'): b_value = b_value.values - result = ne.evaluate('where(cond_value,a_value,b_value)', - local_dict={ 'cond_value' : cond_value, - 'a_value' : a_value, - 'b_value' : b_value }, + result = ne.evaluate('where(cond_value, a_value, b_value)', + local_dict={'cond_value': cond_value, + 'a_value': a_value, + 'b_value': b_value}, casting='safe') - except (ValueError), detail: - if 'unknown type object' in str(detail): - pass - except (Exception), detail: - if raise_on_error: - raise TypeError(str(detail)) + except Exception as detail: + if ('unknown type object' not in com.pprint_thing(detail) and + raise_on_error): + raise if result is None: - result = _where_standard(cond,a,b,raise_on_error) + result = _where_standard(cond, a, b, raise_on_error) return result @@ -142,7 +144,9 @@ def _where_numexpr(cond, a, b, raise_on_error = False): # turn myself on set_use_numexpr(True) -def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kwargs): + +def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, + **eval_kwargs): """ evaluate and return the expression of the op on a and b Parameters @@ -152,15 +156,18 @@ def evaluate(op, op_str, a, b, raise_on_error=False, use_numexpr=True, **eval_kw op_str: the string version of the op a : left operand b : right operand - raise_on_error : pass the error to the higher level if indicated (default is False), - otherwise evaluate the op with and return the results + raise_on_error : pass the error to the higher level if indicated + (default is False), otherwise evaluate the op with and + return the results use_numexpr : whether to try to use numexpr (default True) """ if use_numexpr: - return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, **eval_kwargs) + return _evaluate(op, op_str, a, b, raise_on_error=raise_on_error, + **eval_kwargs) return _evaluate_standard(op, op_str, a, b, raise_on_error=raise_on_error) + def where(cond, a, b, raise_on_error=False, use_numexpr=True): """ evaluate the 
where condition cond on a and b @@ -170,8 +177,9 @@ def where(cond, a, b, raise_on_error=False, use_numexpr=True): cond : a boolean array a : return if cond is True b : return if cond is False - raise_on_error : pass the error to the higher level if indicated (default is False), - otherwise evaluate the op with and return the results + raise_on_error : pass the error to the higher level if indicated + (default is False), otherwise evaluate the op with and + return the results use_numexpr : whether to try to use numexpr (default True) """ diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py new file mode 100644 index 0000000000000..ca5f6d4872a72 --- /dev/null +++ b/pandas/computation/ops.py @@ -0,0 +1,255 @@ +import operator as op + +import numpy as np +from pandas.util.py3compat import PY3 +import pandas.core.common as com +from pandas.core.base import StringMixin +from pandas.computation.common import flatten + + +_reductions = 'sum', 'prod' +_mathops = ('sin', 'cos', 'exp', 'log', 'expm1', 'log1p', 'pow', 'div', 'sqrt', + 'inv', 'sinh', 'cosh', 'tanh', 'arcsin', 'arccos', 'arctan', + 'arccosh', 'arcsinh', 'arctanh', 'arctan2', 'abs') + + +class OperatorError(Exception): + pass + + +class UnaryOperatorError(OperatorError): + pass + + +class BinaryOperatorError(OperatorError): + pass + + +def _resolve_name(env, key): + res = env.locals.get(key, env.globals.get(key)) + + if res is None: + if not isinstance(key, basestring): + return key + + raise NameError('name {0!r} is not defined'.format(key)) + + return res + + +def _update_name(env, key, value): + if isinstance(key, basestring): + try: + del env.locals[key] + env.locals[key] = value + except KeyError: + try: + del env.globals[key] + env.globals[key] = value + except KeyError: + raise NameError('name {0!r} is not defined'.format(key)) + + +class Term(StringMixin): + def __init__(self, name, env): + self.name = name + self.env = env + self.value = _resolve_name(self.env, self.name) + + try: + # ndframe potentially very slow for large, mixed dtype frames + self.type = self.value.values.dtype + except AttributeError: + try: + # ndarray + self.type = self.value.dtype + except AttributeError: + # scalar + self.type = type(self.value) + + def __unicode__(self): + return com.pprint_thing(self.name) + + def update(self, value): + _update_name(self.env, self.name, value) + self.value = value + + @property + def isscalar(self): + return np.isscalar(self.value) + + +class Constant(Term): + def __init__(self, value, env): + super(Constant, self).__init__(value, env) + + +def _print_operand(opr): + return opr.name if is_term(opr) else unicode(opr) + + +class Op(StringMixin): + """Hold an operator of unknown arity + """ + def __init__(self, op, operands): + self.op = op + self.operands = operands + + def __iter__(self): + return iter(self.operands) + + def __unicode__(self): + """Print a generic n-ary operator and its operands using infix + notation""" + # recurse over the operands + parened = ('({0})'.format(_print_operand(opr)) + for opr in self.operands) + return com.pprint_thing(' {0} '.format(self.op).join(parened)) + + @property + def return_type(self): + # clobber types to bool if the op is a boolean operator + if self.op in (_cmp_ops_syms + _bool_ops_syms): + return np.bool_ + return np.result_type(*(term.type for term in flatten(self))) + + +_cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', '=' +_cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, op.eq +_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) + 
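A subtlety in the comparison table just defined: the `=` symbol is deliberately bound to `operator.eq`, the same function as `==`, which is what makes the `_rewrite_assign` token rewrite in expr.py safe. A standalone sketch (not part of the patch) showing how the lookup behaves:

# Standalone sketch of the symbol -> function table above; both '==' and
# the rewritten '=' resolve to operator.eq.
import operator as op

cmp_ops_syms = '>', '<', '>=', '<=', '==', '!=', '='
cmp_ops_funcs = op.gt, op.lt, op.ge, op.le, op.eq, op.ne, op.eq
cmp_ops_dict = dict(zip(cmp_ops_syms, cmp_ops_funcs))

assert cmp_ops_dict['='] is cmp_ops_dict['==']  # '=' is an alias for '=='
print(cmp_ops_dict['>='](3, 2))  # True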
+_bool_ops_syms = '&', '|' +_bool_ops_funcs = op.and_, op.or_ +_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) + +_arith_ops_syms = '+', '-', '*', '/', '**', '//', '%' +_arith_ops_funcs = (op.add, op.sub, op.mul, op.truediv if PY3 else op.div, + op.pow, op.floordiv, op.mod) +_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) + +_binary_ops_dict = {} + +for d in (_cmp_ops_dict, _bool_ops_dict, _arith_ops_dict): + _binary_ops_dict.update(d) + + +def _cast_inplace(terms, dtype): + dt = np.dtype(dtype) + for term in terms: + # cast all the way down the tree since operands must be + try: + _cast_inplace(term.operands, dtype) + except AttributeError: + # we've bottomed out so actually do the cast + try: + new_value = term.value.astype(dt) + except AttributeError: + new_value = dt.type(term.value) + term.update(new_value) + + +def is_term(obj): + return isinstance(obj, Term) + + +def is_const(obj): + return isinstance(obj, Constant) + + +class BinOp(Op): + """Hold a binary operator and its operands + + Parameters + ---------- + op : str or Op + left : str or Op + right : str or Op + """ + def __init__(self, op, lhs, rhs): + super(BinOp, self).__init__(op, (lhs, rhs)) + self.lhs = lhs + self.rhs = rhs + + try: + self.func = _binary_ops_dict[op] + except KeyError: + keys = _binary_ops_dict.keys() + raise BinaryOperatorError('Invalid binary operator {0}, valid' + ' operators are {1}'.format(op, keys)) + + def __call__(self, env): + # handle truediv + if self.op == '/' and env.locals['truediv']: + self.func = op.truediv + + # recurse over the left nodes + try: + left = self.lhs(env) + except TypeError: + left = self.lhs + + # recurse over the right nodes + try: + right = self.rhs(env) + except TypeError: + right = self.rhs + + # base cases + if is_term(left) and is_term(right): + res = self.func(left.value, right.value) + elif not is_term(left) and is_term(right): + res = self.func(left, right.value) + elif is_term(left) and not is_term(right): + res = self.func(left.value, right) + elif not (is_term(left) or is_term(right)): + res = self.func(left, right) + + return res + + +class Mod(BinOp): + def __init__(self, lhs, rhs): + super(Mod, self).__init__('%', lhs, rhs) + _cast_inplace(self.operands, np.float_) + + +_unary_ops_syms = '+', '-', '~' +_unary_ops_funcs = op.pos, op.neg, op.invert +_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) + + +class UnaryOp(Op): + """Hold a unary operator and its operands + """ + def __init__(self, op, operand): + super(UnaryOp, self).__init__(op, (operand,)) + self.operand = operand + + try: + self.func = _unary_ops_dict[op] + except KeyError: + raise UnaryOperatorError('Invalid unary operator {0}, valid ' + 'operators are ' + '{1}'.format(op, _unary_ops_syms)) + + def __call__(self, env): + operand = self.operand + + # recurse if operand is an Op + try: + operand = self.operand(env) + except TypeError: + operand = self.operand + + v = operand.value if is_term(operand) else operand + + try: + res = self.func(v) + except TypeError: + res = self.func(v.values) + + return res + + def __unicode__(self): + return com.pprint_thing('{0}({1})'.format(self.op, self.operand)) + diff --git a/pandas/computation/tests/__init__.py b/pandas/computation/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py new file mode 100644 index 0000000000000..6ec630b80614d --- /dev/null +++ b/pandas/computation/tests/test_eval.py @@ -0,0 +1,650 @@ 
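Before the test module, a short usage sketch of the API these tests exercise (not part of the patch; it assumes numexpr is installed for the default engine, otherwise pass engine='python'; `df` and `df2` are illustrative frames, not fixtures from the suite below):

# Hedged usage sketch of pd.eval as added by this PR.
import numpy as np
import pandas as pd
from pandas import DataFrame

df = DataFrame(np.random.randn(10, 5))
df2 = DataFrame(np.random.randn(10, 5))

# arithmetic on whole frames, evaluated by the numexpr engine by default
res = pd.eval('df + df2 * 2')

# boolean logic uses &, | and ~ ('and', 'or' and 'not' raise
# NotImplementedError, as the tests at the end of this module check)
mask = pd.eval('(df > 0) & ~(df2 < 0)', engine='python')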
+#!/usr/bin/env python + +import unittest +import itertools +from itertools import product + +import nose +from nose.tools import assert_raises, assert_tuple_equal +from nose.tools import assert_true, assert_false + +from numpy.random import randn, rand +import numpy as np +from numpy.testing import assert_array_equal, assert_allclose +from numpy.testing.decorators import slow + +import pandas as pd +from pandas.core import common as com +from pandas import DataFrame, Series +from pandas.util.testing import makeCustomDataframe as mkdf +from pandas.computation.engines import _engines, _reconstruct_object +from pandas.computation.align import _align_core +from pandas.computation.ops import _binary_ops_dict, _unary_ops_dict, Term +import pandas.computation.expr as expr +from pandas.computation.expressions import _USE_NUMEXPR +from pandas.computation.eval import Scope +from pandas.util.testing import assert_frame_equal, randbool +from pandas.util.py3compat import PY3 + + +def skip_numexpr_engine(engine): + if not _USE_NUMEXPR and engine == 'numexpr': + raise nose.SkipTest + + +def engine_has_neg_frac(engine): + return _engines[engine].has_neg_frac + + +def fractional(x): + frac, _ = np.modf(np.asanyarray(x)) + return frac + + +def hasfractional(x): + return np.any(fractional(x)) + + +def _eval_from_expr(lhs, cmp1, rhs, binop, cmp2): + f1 = _binary_ops_dict[cmp1] + f2 = _binary_ops_dict[cmp2] + bf = _binary_ops_dict[binop] + env = Scope() + typ, axes = _align_core((Term('lhs', env), Term('rhs', env))) + lhs, rhs = env.locals['lhs'], env.locals['rhs'] + ret_type = np.result_type(lhs, rhs) + return _reconstruct_object(typ, bf(f1(lhs, rhs), f2(lhs, rhs)), axes, + ret_type) + + +def _eval_single_bin(lhs, cmp1, rhs, has_neg_frac): + c = _binary_ops_dict[cmp1] + if has_neg_frac: + try: + result = c(lhs, rhs) + except ValueError: + result = np.nan + else: + result = c(lhs, rhs) + return result + + +def isframe(x): + return isinstance(x, pd.DataFrame) + + +def isseries(x): + return isinstance(x, pd.Series) + + +def are_compatible_types(op, lhs, rhs): + if op in ('&', '|'): + if isframe(lhs) and isseries(rhs) or isframe(rhs) and isseries(lhs): + return False + return True + + +def _eval_bin_and_unary(unary, lhs, arith1, rhs): + binop = _binary_ops_dict[arith1] + unop = expr._unary_ops_dict[unary] + return unop(binop(lhs, rhs)) + + +def _series_and_2d_ndarray(lhs, rhs): + return (com.is_series(lhs) and isinstance(rhs, np.ndarray) and rhs.ndim > 1 + or com.is_series(rhs) and isinstance(lhs, np.ndarray) and lhs.ndim + > 1) + + +# Smoke testing +class TestBasicEval(unittest.TestCase): + + @classmethod + def setUpClass(self): + self.cmp_ops = expr._cmp_ops_syms + self.cmp2_ops = self.cmp_ops[::-1] + self.bin_ops = expr._bool_ops_syms + self.arith_ops = tuple(o for o in expr._arith_ops_syms if o != '//') + self.unary_ops = '+', '-' + + def set_current_engine(self): + self.engine = 'numexpr' + + def setup_data(self): + nan_df = DataFrame(rand(10, 5)) + nan_df[nan_df > 0.5] = np.nan + self.lhses = (DataFrame(randn(10, 5)), Series(randn(5)), randn(), + np.float64(randn()), randn(10, 5), randn(5), np.nan, + Series([1, 2, np.nan, np.nan, 5]), nan_df) + self.rhses = (DataFrame(randn(10, 5)), Series(randn(5)), randn(), + np.float64(randn()), randn(10, 5), randn(5), np.nan, + Series([1, 2, np.nan, np.nan, 5]), nan_df) + + def setUp(self): + try: + import numexpr as ne + self.ne = ne + except ImportError: + raise nose.SkipTest + self.set_current_engine() + self.setup_data() + self.current_engines = filter(lambda x: x 
!= self.engine, + _engines.iterkeys()) + + @slow + def test_complex_cmp_ops(self): + self.setUp() + lhses, rhses = self.lhses, self.rhses + args = itertools.product(lhses, self.cmp_ops, rhses, self.bin_ops, + self.cmp2_ops) + for lhs, cmp1, rhs, binop, cmp2 in args: + self._create_cmp_op_t(lhs, cmp1, rhs, binop, cmp2) + + def test_simple_cmp_ops(self): + bool_lhses = (DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), randbool()) + bool_rhses = (DataFrame(randbool(size=(10, 5))), + Series(randbool((5,))), randbool()) + args = itertools.product(bool_lhses, bool_rhses, self.cmp_ops) + for lhs, rhs, cmp_op in args: + self._create_simple_cmp_op_t(lhs, rhs, cmp_op) + + def test_binary_arith_ops(self): + self.setUp() + lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + args = itertools.product(lhses, self.arith_ops, rhses) + for lhs, op, rhs in args: + self._create_arith_op_t(lhs, op, rhs) + + def test_unary_arith_ops(self): + self.setUp() + lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + aops = tuple(aop for aop in self.arith_ops if aop not in '+-') + args = itertools.product(self.unary_ops, lhses, aops, rhses) + for unary_op, lhs, arith_op, rhs in args: + self._create_unary_arith_op_t(unary_op, lhs, arith_op, rhs) + + def test_invert(self): + self.setUp() + lhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + rhses = DataFrame(randn(10, 5)), Series(randn(5)), randn() + args = itertools.product(lhses, self.cmp_ops, rhses) + for lhs, op, rhs in args: + self._create_invert_op_t(lhs, op, rhs) + + def _create_cmp_op_t(self, lhs, cmp1, rhs, binop, cmp2): + ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, + binop=binop, + cmp2=cmp2) + if _series_and_2d_ndarray(lhs, rhs): + self.assertRaises(Exception, _eval_from_expr, lhs, cmp1, rhs, + binop, cmp2) + self.assertRaises(Exception, pd.eval, ex, engine=self.engine) + else: + expected = _eval_from_expr(lhs, cmp1, rhs, binop, cmp2) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + + def _create_simple_cmp_op_t(self, lhs, rhs, cmp1): + ex = 'lhs {0} rhs'.format(cmp1) + + if are_compatible_types(cmp1, lhs, rhs): + expected = _eval_single_bin(lhs, cmp1, rhs, + engine_has_neg_frac(self.engine)) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + else: + assert_raises(TypeError, _eval_single_bin, lhs, cmp1, rhs, + engine_has_neg_frac(self.engine)) + + def _create_arith_op_t(self, lhs, arith1, rhs): + ex = 'lhs {0} rhs'.format(arith1) + nan_frac_neg = (arith1 == '**' and np.any(lhs < 0) and + hasfractional(rhs) and np.isscalar(lhs) and + np.isscalar(rhs) and + not (isinstance(lhs, tuple(np.typeDict.values())) + or isinstance(rhs, tuple(np.typeDict.values())))) + if nan_frac_neg and not engine_has_neg_frac(self.engine): + assert_raises(ValueError, pd.eval, ex, engine=self.engine, + local_dict=locals(), global_dict=globals()) + else: + result = pd.eval(ex, engine=self.engine) + + if arith1 != '//': + expected = _eval_single_bin(lhs, arith1, rhs, + engine_has_neg_frac(self.engine)) + # roundoff error with modulus + if arith1 == '%': + assert_allclose(result, expected) + else: + assert_array_equal(result, expected) + + # sanity check on recursive parsing + try: + ghs = rhs.copy() + except AttributeError: + ghs = rhs + + if nan_frac_neg and not engine_has_neg_frac(self.engine): + assert_raises(ValueError, pd.eval, ex, 
engine=self.engine, + local_dict=locals(), global_dict=globals()) + else: + if arith1 == '**': + ex = '(lhs {0} rhs) {0} ghs'.format(arith1) + else: + ex = 'lhs {0} rhs {0} ghs'.format(arith1) + result = pd.eval(ex, engine=self.engine) + + try: + nlhs = _eval_single_bin(lhs, arith1, rhs, + engine_has_neg_frac(self.engine)) + except ValueError: + assert_raises(ValueError, _eval_single_bin, lhs, arith1, rhs, + engine_has_neg_frac(self.engine)) + else: + try: + nlhs, ghs = nlhs.align(ghs) + except: + pass + if arith1 != '//': + expected = self.ne.evaluate('nlhs {0} ghs'.format(arith1)) + + # roundoff error with modulus + if arith1 == '%': + assert_allclose(result, expected) + else: + assert_array_equal(result, expected) + + def _create_invert_op_t(self, lhs, cmp1, rhs): + # simple + for el in (lhs, rhs): + try: + elb = el.astype(bool) + except AttributeError: + elb = np.array([bool(el)]) + expected = ~elb + result = pd.eval('~elb', engine=self.engine) + assert_array_equal(expected, result) + + for engine in self.current_engines: + assert_array_equal(result, pd.eval('~elb', engine=engine)) + + # compound + ex = '~(lhs {0} rhs)'.format(cmp1) + if np.isscalar(lhs) and np.isscalar(rhs): + lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) + expected = ~_eval_single_bin(lhs, cmp1, rhs, + engine_has_neg_frac(self.engine)) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(expected, result) + + # make sure the other engines work + for engine in self.current_engines: + ev = pd.eval(ex, engine=self.engine) + assert_array_equal(ev, result) + + def _create_unary_arith_op_t(self, unary_op, lhs, arith1, rhs): + # simple + ex = '{0}lhs'.format(unary_op, arith1) + f = _unary_ops_dict[unary_op] + bad_types = tuple(np.typeDict.values()) + + nan_frac_neg = (arith1 == '**' and + np.any(lhs < 0) and + hasfractional(rhs) and + np.isscalar(lhs) and np.isscalar(rhs) and + not (isinstance(lhs, bad_types) or + isinstance(rhs, bad_types)) + and not engine_has_neg_frac(self.engine)) + try: + expected = f(lhs.values) + except AttributeError: + expected = f(lhs) + result = pd.eval(ex, engine=self.engine) + assert_array_equal(result, expected) + + for engine in self.current_engines: + assert_array_equal(result, pd.eval(ex, engine=engine)) + + ex = '{0}(lhs {1} rhs)'.format(unary_op, arith1) + + if nan_frac_neg: + assert_raises(ValueError, pd.eval, ex, engine=self.engine, + local_dict=locals(), global_dict=globals()) + else: + # compound + result = pd.eval(ex, engine=self.engine) + + #(lhs, rhs), _ = _align((lhs, rhs)) + #if arith1 != '//': + #expected = self.ne.evaluate(ex) + #assert_array_equal(result, expected) + #else: + #assert_raises(TypeError, self.ne.evaluate, ex) + + #for engine in self.current_engines: + #if arith1 != '//': + #if engine_has_neg_frac(engine): + #assert_array_equal(result, pd.eval(ex, engine=engine)) + #else: + #assert_raises(TypeError, pd.eval, ex, engine=engine, + #local_dict=locals(), global_dict=globals()) + + +class TestBasicEvalPython(TestBasicEval): + + @classmethod + def setUpClass(cls): + cls.cmp_ops = expr._cmp_ops_syms + cls.cmp2_ops = cls.cmp_ops[::-1] + cls.bin_ops = expr._bool_ops_syms + cls.arith_ops = expr._arith_ops_syms + cls.unary_ops = '+', '-' + + def set_current_engine(self): + self.engine = 'python' + + +def test_syntax_error_exprs(): + for engine in _engines: + e = 's +' + assert_raises(SyntaxError, pd.eval, e, engine=engine) + + +def test_name_error_exprs(): + for engine in _engines: + e = 's + t' + assert_raises(NameError, pd.eval, e, engine=engine) + + +def 
test_align_nested_unary_op(): + for engine in _engines: + yield check_align_nested_unary_op, engine + + +f = lambda *args, **kwargs: np.random.randn() + + +def check_align_nested_unary_op(engine): + skip_numexpr_engine(engine) + s = 'df * ~2' + df = mkdf(10, 10, data_gen_f=f) + res = pd.eval(s, engine) + assert_frame_equal(res, df * ~2) + + +def check_basic_frame_alignment(engine): + df = mkdf(10, 10, data_gen_f=f) + df2 = mkdf(20, 10, data_gen_f=f) + res = pd.eval('df + df2', engine=engine) + assert_frame_equal(res, df + df2) + + +def test_basic_frame_alignment(): + for engine in _engines: + yield check_basic_frame_alignment, engine + + +def check_medium_complex_frame_alignment(engine, r1, r2, c1, c2): + skip_numexpr_engine(engine) + df = mkdf(5, 2, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(10, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + df3 = mkdf(15, 2, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + res = pd.eval('df + df2 + df3', engine=engine) + assert_frame_equal(res, df + df2 + df3) + + +@slow +def test_medium_complex_frame_alignment(): + args = product(_engines, *([INDEX_TYPES[:4]] * 4)) + for engine, r1, r2, c1, c2 in args: + check_medium_complex_frame_alignment(engine, r1, r2, c1, c2) + + +def check_basic_frame_series_alignment(engine, r_idx_type, c_idx_type, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 'df + s', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('df + s', engine=engine) + expected = df + s + assert_frame_equal(res, expected) + + +def check_not_both_period_fails_otherwise_succeeds(lhs, rhs, r_idx_type, + c_idx_type, index_name, s, + df, *terms): + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, lhs, local_dict=locals()) + assert_raises(ValueError, pd.eval, rhs, local_dict=locals()) + else: + a, b = pd.eval(lhs), pd.eval(rhs) + assert_frame_equal(a, b) + + +def check_basic_series_frame_alignment(engine, r_idx_type, c_idx_type, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 's + df', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('s + df', engine=engine) + expected = s + df + assert_frame_equal(res, expected) + + +@slow +def check_basic_series_frame_alignment_datetime(engine, r_idx_type, c_idx_type, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 's + df', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + else: + res = pd.eval('s + df', engine=engine) + expected = s + df + assert_frame_equal(res, expected) + + if r_idx_type != 'p' and c_idx_type == 'p' and index_name == 'index': + assert_raises(ValueError, pd.eval, 'df + s', local_dict=locals()) + assert_raises(ValueError, df.add, s, axis=1) + 
else: + res = pd.eval('df + s', engine=engine) + expected = df + s + assert_frame_equal(res, expected) + + +def check_series_frame_commutativity(engine, r_idx_type, c_idx_type, op, + index_name): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, + c_idx_type=c_idx_type) + index = getattr(df, index_name) + s = Series(np.random.randn(5), index[:5]) + + lhs = 's {0} df'.format(op) + rhs = 'df {0} s'.format(op) + check_not_both_period_fails_otherwise_succeeds(lhs, rhs, r_idx_type, + c_idx_type, index_name, s, + df) + + +INDEX_TYPES = 'i', 'f', 's', 'u', # 'dt', # 'p' + + +@slow +def test_series_frame_commutativity(): + args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('+', '*'), ('index', + 'columns')) + for engine, r_idx_type, c_idx_type, op, index_name in args: + check_series_frame_commutativity(engine, r_idx_type, c_idx_type, op, + index_name) + + +def test_basic_frame_series_alignment(): + args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + check_basic_frame_series_alignment(engine, r_idx_type, c_idx_type, + index_name) + + +@slow +def test_basic_series_frame_alignment_datetime(): + idx_types = INDEX_TYPES + args = product(_engines, idx_types, idx_types, ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + check_basic_series_frame_alignment_datetime(engine, r_idx_type, + c_idx_type, index_name) + + +def test_basic_series_frame_alignment(): + args = product(_engines, INDEX_TYPES, INDEX_TYPES, ('index', 'columns')) + for engine, r_idx_type, c_idx_type, index_name in args: + check_basic_series_frame_alignment(engine, r_idx_type, c_idx_type, + index_name) + + +def check_complex_series_frame_alignment(engine, index_name, obj, r1, r2, c1, + c2): + skip_numexpr_engine(engine) + df = mkdf(10, 10, data_gen_f=f, r_idx_type=r1, c_idx_type=c1) + df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=r2, c_idx_type=c2) + index = getattr(locals()[obj], index_name) + s = Series(np.random.randn(5), index[:5]) + if engine != 'python': + expected = df2.add(s, axis=1).add(df) + else: + expected = df2 + s + df + res = pd.eval('df2 + s + df', engine=engine) + expected = df2 + s + df + assert_tuple_equal(res.shape, expected.shape) + assert_frame_equal(res, expected) + + +@slow +def test_complex_series_frame_alignment(): + args = product(_engines, ('index', 'columns'), ('df', 'df2'), + *([INDEX_TYPES[:4]] * 4)) + for engine, index_name, obj, r1, r2, c1, c2 in args: + check_complex_series_frame_alignment(engine, index_name, obj, r1, r2, + c1, c2) + + +def check_datetime_index_rows_punts_to_python(engine): + df = mkdf(10, 10, data_gen_f=f, r_idx_type='dt', c_idx_type='dt') + index = getattr(df, 'index') + s = Series(np.random.randn(5), index[:5]) + env = Scope(globals(), locals()) + + +def test_datetime_index_rows_punts_to_python(): + for engine in _engines: + check_datetime_index_rows_punts_to_python(engine) + + +def test_truediv(): + for engine in _engines: + check_truediv(engine) + + +def check_truediv(engine): + s = np.array([1]) + ex = 's / 1' + + if PY3: + res = pd.eval(ex, truediv=False) + assert_array_equal(res, np.array([1.0])) + + res = pd.eval(ex, truediv=True) + assert_array_equal(res, np.array([1.0])) + else: + res = pd.eval(ex, truediv=False) + assert_array_equal(res, np.array([1])) + + res = pd.eval(ex, truediv=True) + assert_array_equal(res, np.array([1.0])) + + +__var_s = randn(10) + + +def check_global_scope(engine): + e = '__var_s * 2' + 
assert_array_equal(__var_s * 2, pd.eval(e, engine=engine)) + + +def test_global_scope(): + for engine in _engines: + yield check_global_scope, engine + + +def check_is_expr(engine): + s = 1 + valid = 's + 1' + invalid = 's +' + assert_true(expr.isexpr(valid, check_names=True)) + assert_true(expr.isexpr(valid, check_names=False)) + assert_false(expr.isexpr(invalid, check_names=False)) + assert_false(expr.isexpr(invalid, check_names=True)) + + +def test_is_expr(): + for engine in _engines: + check_is_expr(engine) + + +def check_not_fails(engine): + x = True + assert_raises(NotImplementedError, pd.eval, 'not x', engine=engine, + local_dict={'x': x}) + + +def test_not_fails(): + for engine in _engines: + check_not_fails(engine) + + +def check_and_fails(engine): + x, y = False, True + assert_raises(NotImplementedError, pd.eval, 'x and y', engine=engine, + local_dict={'x': x, 'y': y}) + + +def test_and_fails(): + for engine in _engines: + check_and_fails(engine) + + +def check_or_fails(engine): + x, y = True, False + assert_raises(NotImplementedError, pd.eval, 'x or y', engine=engine, + local_dict={'x': x, 'y': y}) + + +def test_or_fails(): + for engine in _engines: + check_or_fails(engine) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/computation/tests/test_expressions.py b/pandas/computation/tests/test_expressions.py new file mode 100644 index 0000000000000..f197b8ef7a0ac --- /dev/null +++ b/pandas/computation/tests/test_expressions.py @@ -0,0 +1,157 @@ +# pylint: disable-msg=W0612,E1101 + +import unittest +import operator + +import nose + + +import numpy as np +from numpy.testing import assert_array_equal + +from pandas.core.api import DataFrame +from pandas.computation import expressions as expr + +if not expr._USE_NUMEXPR: + raise nose.SkipTest + +import numexpr as ne + + +_frame = DataFrame(np.random.randn(10000, 4), columns=list('ABCD'), + dtype='float64') +_frame2 = DataFrame(np.random.randn(100, 4), columns=list('ABCD'), + dtype='float64') +_mixed = DataFrame({'A': _frame['A'].copy(), + 'B': _frame['B'].astype('float32'), + 'C': _frame['C'].astype('int64'), + 'D': _frame['D'].astype('int32')}) +_mixed2 = DataFrame({'A': _frame2['A'].copy(), + 'B': _frame2['B'].astype('float32'), + 'C': _frame2['C'].astype('int64'), + 'D': _frame2['D'].astype('int32')}) + + +class TestExpressions(unittest.TestCase): + + _multiprocess_can_split_ = False + + def setUp(self): + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.mixed = _mixed.copy() + self.mixed2 = _mixed2.copy() + + def test_invalid(self): + # no op + result = expr._can_use_numexpr(operator.add, None, self.frame, + self.frame, 'evaluate') + self.assertFalse(result) + + # mixed + result = expr._can_use_numexpr( + operator.add, '+', self.mixed, self.frame, 'evaluate') + self.assertFalse(result) + + # min elements + result = expr._can_use_numexpr( + operator.add, '+', self.frame2, self.frame2, 'evaluate') + self.assertFalse(result) + + # ok, we only check on first part of expression + result = expr._can_use_numexpr( + operator.add, '+', self.frame, self.frame2, 'evaluate') + self.assert_(result) + + def test_binary_ops(self): + def testit(): + + for f, f2 in [(self.frame, self.frame2), + (self.mixed, self.mixed2)]: + + for op, op_str in [('add', '+'), ('sub', '-'), ('mul', '*'), + ('div', '/'), ('pow', '**')]: + + op = getattr(operator, op, None) + if op is not None: + result = expr._can_use_numexpr( + op, op_str, f, f, 'evaluate') + 
self.assert_(result == (not f._is_mixed_type)) + + result = expr.evaluate( + op, op_str, f, f, use_numexpr=True) + expected = expr.evaluate( + op, op_str, f, f, use_numexpr=False) + assert_array_equal(result, expected.values) + + result = expr._can_use_numexpr( + op, op_str, f2, f2, 'evaluate') + self.assertFalse(result) + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() + + def test_boolean_ops(self): + def testit(): + for f, f2 in [(self.frame, self.frame2), + (self.mixed, self.mixed2)]: + + f11 = f + f12 = f + 1 + + f21 = f2 + f22 = f2 + 1 + + for op, op_str in [('gt', '>'), ('lt', '<'), ('ge', '>='), + ('le', '<='), ('eq', '=='), ('ne', '!=')]: + + op = getattr(operator, op) + + result = expr._can_use_numexpr( + op, op_str, f11, f12, 'evaluate') + self.assert_(result == (not f11._is_mixed_type)) + + result = expr.evaluate( + op, op_str, f11, f12, use_numexpr=True) + expected = expr.evaluate( + op, op_str, f11, f12, use_numexpr=False) + assert_array_equal(result, expected.values) + + result = expr._can_use_numexpr( + op, op_str, f21, f22, 'evaluate') + self.assertFalse(result) + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() + + def test_where(self): + def testit(): + for f in [self.frame, self.frame2, self.mixed, self.mixed2]: + + for cond in [True, False]: + + c = np.empty(f.shape, dtype=np.bool_) + c.fill(cond) + result = expr.where(c, f.values, f.values + 1) + expected = np.where(c, f.values, f.values + 1) + assert_array_equal(result, expected) + + expr.set_use_numexpr(False) + testit() + expr.set_use_numexpr(True) + expr.set_numexpr_threads(1) + testit() + expr.set_numexpr_threads() + testit() + + diff --git a/pandas/core/base.py b/pandas/core/base.py index 6122e78fa8bce..2caaf00723824 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -39,6 +39,7 @@ def __repr__(self): """ return str(self) + class PandasObject(StringMixin): """baseclass for various pandas objects""" diff --git a/pandas/core/common.py b/pandas/core/common.py index ddacb98a2ddf3..4615571c5d86c 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -21,6 +21,7 @@ from pandas.core.config import get_option from pandas.core import array as pa +import pandas as pd # XXX: HACK for NumPy 1.5.1 to suppress warnings try: @@ -1509,6 +1510,29 @@ def is_bool(obj): return isinstance(obj, (bool, np.bool_)) +def is_string(obj): + return isinstance(obj, (basestring, np.str_, np.unicode_)) + + +def is_series(obj): + return isinstance(obj, pd.Series) + + +def is_frame(obj): + return isinstance(obj, pd.DataFrame) + + +def is_panel(obj): + return isinstance(obj, pd.Panel) + + +def is_pd_obj(obj): + return isinstance(obj, pd.core.generic.PandasObject) + + +def is_ndframe(obj): + return isinstance(obj, pd.core.generic.NDFrame) + def is_integer(obj): return isinstance(obj, (int, long, np.integer)) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 401a7746953cb..7f0a8492a4403 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -35,7 +35,7 @@ create_block_manager_from_arrays, create_block_manager_from_blocks) from pandas.core.series import Series, _radd_compat -import pandas.core.expressions as expressions +import pandas.computation.expressions as expressions from pandas.compat.scipy import scoreatpercentile as _quantile from pandas.util.compat import OrderedDict from pandas.util import 
py3compat @@ -2652,6 +2652,8 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, passed MultiIndex level limit : int, default None Maximum size gap to forward or backward fill + fill_value : object, default NA + The value to use to fill in missing data. Examples -------- @@ -5679,6 +5681,7 @@ def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): return create_block_manager_from_arrays(arrays, arr_names, axes) + def extract_index(data): from pandas.core.index import _union_indexes @@ -5939,6 +5942,7 @@ def _homogenize(data, index, dtype=None): return homogenized + def _from_nested_dict(data): # TODO: this should be seriously cythonized new_data = OrderedDict() diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f23a89635aaf2..ab29a38760a51 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -14,7 +14,7 @@ import pandas.core.common as com import pandas.lib as lib import pandas.tslib as tslib -import pandas.core.expressions as expressions +import pandas.computation.expressions as expressions from pandas.tslib import Timestamp from pandas.util import py3compat diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index fdb86c43b7160..0f84884d51340 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -87,40 +87,40 @@ class AttributeConflictWarning(Warning): # map object types _TYPE_MAP = { - Series : u'series', - SparseSeries : u'sparse_series', - TimeSeries : u'series', - DataFrame : u'frame', - SparseDataFrame : u'sparse_frame', - Panel : u'wide', - Panel4D : u'ndim', - SparsePanel : u'sparse_panel' + Series: u'series', + SparseSeries: u'sparse_series', + TimeSeries: u'series', + DataFrame: u'frame', + SparseDataFrame: u'sparse_frame', + Panel: u'wide', + Panel4D: u'ndim', + SparsePanel: u'sparse_panel' } # storer class map _STORER_MAP = { - u'TimeSeries' : 'LegacySeriesStorer', - u'Series' : 'LegacySeriesStorer', - u'DataFrame' : 'LegacyFrameStorer', - u'DataMatrix' : 'LegacyFrameStorer', - u'series' : 'SeriesStorer', - u'sparse_series' : 'SparseSeriesStorer', - u'frame' : 'FrameStorer', - u'sparse_frame' : 'SparseFrameStorer', - u'wide' : 'PanelStorer', - u'sparse_panel' : 'SparsePanelStorer', + u'TimeSeries': 'LegacySeriesStorer', + u'Series': 'LegacySeriesStorer', + u'DataFrame': 'LegacyFrameStorer', + u'DataMatrix': 'LegacyFrameStorer', + u'series': 'SeriesStorer', + u'sparse_series': 'SparseSeriesStorer', + u'frame': 'FrameStorer', + u'sparse_frame': 'SparseFrameStorer', + u'wide': 'PanelStorer', + u'sparse_panel': 'SparsePanelStorer', } # table class map _TABLE_MAP = { - u'generic_table' : 'GenericTable', - u'appendable_frame' : 'AppendableFrameTable', - u'appendable_multiframe' : 'AppendableMultiFrameTable', - u'appendable_panel' : 'AppendablePanelTable', - u'appendable_ndim' : 'AppendableNDimTable', - u'worm' : 'WORMTable', - u'legacy_frame' : 'LegacyFrameTable', - u'legacy_panel' : 'LegacyPanelTable', + u'generic_table': 'GenericTable', + u'appendable_frame': 'AppendableFrameTable', + u'appendable_multiframe': 'AppendableMultiFrameTable', + u'appendable_panel': 'AppendablePanelTable', + u'appendable_ndim': 'AppendableNDimTable', + u'worm': 'WORMTable', + u'legacy_frame': 'LegacyFrameTable', + u'legacy_panel': 'LegacyPanelTable', } # axes map @@ -149,6 +149,7 @@ def _tables(): return _table_mod + def h5_open(path, mode): tables = _tables() return tables.openFile(path, mode) @@ -166,23 +167,27 @@ def get_store(path, mode='a', complevel=None, complib=None, Examples -------- + >>> from pandas 
import DataFrame + >>> from numpy.random import randn + >>> bar = DataFrame(randn(10, 4)) >>> with get_store('test.h5') as store: - >>> store['foo'] = bar # write to HDF5 - >>> bar = store['foo'] # retrieve + ... store['foo'] = bar # write to HDF5 + ... bar = store['foo'] # retrieve """ store = None try: store = HDFStore(path, mode=mode, complevel=complevel, - complib=complib, fletcher32=False) + complib=complib, fletcher32=fletcher32) yield store finally: if store is not None: store.close() -### interface to/from ### +# interface to/from ### -def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, append=None, **kwargs): +def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, + append=None, **kwargs): """ store this object, close it if we opened it """ if append: f = lambda store: store.append(key, value, **kwargs) @@ -190,14 +195,17 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, app f = lambda store: store.put(key, value, **kwargs) if isinstance(path_or_buf, basestring): - with get_store(path_or_buf, mode=mode, complevel=complevel, complib=complib) as store: + with get_store(path_or_buf, mode=mode, complevel=complevel, + complib=complib) as store: f(store) else: f(path_or_buf) + def read_hdf(path_or_buf, key, **kwargs): """ read from the store, closeit if we opened it """ - f = lambda store, auto_close: store.select(key, auto_close=auto_close, **kwargs) + f = lambda store, auto_close: store.select( + key, auto_close=auto_close, **kwargs) if isinstance(path_or_buf, basestring): @@ -219,7 +227,9 @@ def read_hdf(path_or_buf, key, **kwargs): # a passed store; user controls open/close f(path_or_buf, False) -class HDFStore(StringMixin): + +class HDFStore(object): + """ dict-like IO interface for storing pandas objects in PyTables format. 
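A minimal round trip through the module-level helpers revised above, as an illustrative sketch only (it assumes this patch is applied and that 'df.h5' is a writable scratch path; neither is part of the diff):

>>> from pandas import DataFrame
>>> from pandas.io.pytables import to_hdf, read_hdf
>>> from numpy.random import randn
>>> df = DataFrame(randn(10, 4))
>>> to_hdf('df.h5', 'df', df, mode='w')  # append not passed, so this routes to HDFStore.put and closes the file
>>> out = read_hdf('df.h5', 'df')        # opens the store, selects 'df', and closes it again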
@@ -255,6 +265,9 @@ class HDFStore(StringMixin): Examples -------- + >>> from pandas import DataFrame + >>> from numpy.random import randn + >>> bar = DataFrame(randn(10, 4)) >>> store = HDFStore('test.h5') >>> store['foo'] = bar # write to HDF5 >>> bar = store['foo'] # retrieve @@ -265,9 +278,9 @@ class HDFStore(StringMixin): def __init__(self, path, mode=None, complevel=None, complib=None, fletcher32=False): try: - import tables as _ + import tables except ImportError: # pragma: no cover - raise Exception('HDFStore requires PyTables') + raise ImportError('HDFStore requires PyTables') self._path = path if mode is None: @@ -320,7 +333,7 @@ def __unicode__(self): output = '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path)) if len(self.keys()): - keys = [] + keys = [] values = [] for k in self.keys(): @@ -328,10 +341,13 @@ def __unicode__(self): s = self.get_storer(k) if s is not None: keys.append(pprint_thing(s.pathname or k)) - values.append(pprint_thing(s or 'invalid_HDFStore node')) + values.append( + pprint_thing(s or 'invalid_HDFStore node')) except Exception as detail: keys.append(k) - values.append("[invalid_HDFStore node: %s]" % pprint_thing(detail)) + values.append( + "[invalid_HDFStore node: %s]" % + pprint_thing(detail)) output += adjoin(12, keys, values) else: @@ -385,7 +401,7 @@ def open(self, mode='a', warn=True): try: self._handle = h5_open(self._path, self._mode) - except IOError, e: # pragma: no cover + except IOError as e: # pragma: no cover if 'can not be written' in str(e): print ('Opening %s in read-only mode' % self._path) self._handle = h5_open(self._path, 'r') @@ -421,7 +437,8 @@ def get(self, key): raise KeyError('No object named %s in the file' % key) return self._read_group(group) - def select(self, key, where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, auto_close=False, **kwargs): + def select(self, key, where=None, start=None, stop=None, columns=None, + iterator=False, chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas object stored in file, optionally based on where criteria @@ -448,16 +465,22 @@ def select(self, key, where=None, start=None, stop=None, columns=None, iterator= # what we are actually going to do for a chunk def func(_start, _stop): - return s.read(where=where, start=_start, stop=_stop, columns=columns, **kwargs) + return s.read(where=where, start=_start, stop=_stop, + columns=columns, **kwargs) if iterator or chunksize is not None: if not s.is_table: - raise TypeError("can only use an iterator or chunksize on a table") - return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, chunksize=chunksize, auto_close=auto_close) + raise TypeError( + "can only use an iterator or chunksize on a table") + return TableIterator(self, func, nrows=s.nrows, start=start, + stop=stop, chunksize=chunksize, + auto_close=auto_close) - return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, auto_close=auto_close).get_values() + return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop, + auto_close=auto_close).get_values() - def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs): + def select_as_coordinates( + self, key, where=None, start=None, stop=None, **kwargs): """ return the selection as a Coordinates. 
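A sketch of the chunked-read path exposed by the reworked select signature above, illustrative only (the store path and key are assumptions; a key stored in table format is required, since the iterator path raises TypeError on non-tables):

>>> from pandas import DataFrame, HDFStore
>>> from numpy.random import randn
>>> store = HDFStore('df.h5')
>>> store.append('dft', DataFrame(randn(9, 2)))            # table format, required for iteration
>>> chunks = [c for c in store.select('dft', chunksize=3)]  # DataFrame pieces of up to 3 rows each
>>> whole = store.select('dft')                             # one-shot read via TableIterator.get_values()
>>> store.close()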
@@ -473,7 +496,7 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs def unique(self, key, column, **kwargs): warnings.warn("unique(key,column) is deprecated\n" "use select_column(key,column).unique() instead") - return self.get_storer(key).read_column(column = column, **kwargs).unique() + return self.get_storer(key).read_column(column=column, **kwargs).unique() def select_column(self, key, column, **kwargs): """ @@ -490,9 +513,11 @@ def select_column(self, key, column, **kwargs): raises ValueError if the column can not be extracted indivually (it is part of a data block) """ - return self.get_storer(key).read_column(column = column, **kwargs) + return self.get_storer(key).read_column(column=column, **kwargs) - def select_as_multiple(self, keys, where=None, selector=None, columns=None, start=None, stop=None, iterator=False, chunksize=None, auto_close=False, **kwargs): + def select_as_multiple(self, keys, where=None, selector=None, columns=None, + start=None, stop=None, iterator=False, + chunksize=None, auto_close=False, **kwargs): """ Retrieve pandas objects from multiple tables Parameters @@ -514,19 +539,21 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star if isinstance(keys, (list, tuple)) and len(keys) == 1: keys = keys[0] if isinstance(keys, basestring): - return self.select(key=keys, where=where, columns=columns, start=start, stop=stop, iterator=iterator, chunksize=chunksize, **kwargs) + return self.select(key=keys, where=where, columns=columns, + start=start, stop=stop, iterator=iterator, + chunksize=chunksize, **kwargs) if not isinstance(keys, (list, tuple)): - raise Exception("keys must be a list/tuple") + raise TypeError("keys must be a list/tuple") - if len(keys) == 0: - raise Exception("keys must have a non-zero length") + if not len(keys): + raise ValueError("keys must have a non-zero length") if selector is None: selector = keys[0] # collect the tables - tbls = [ self.get_storer(k) for k in keys ] + tbls = [self.get_storer(k) for k in keys] # validate rows nrows = None @@ -534,24 +561,32 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, star if t is None: raise TypeError("Invalid table [%s]" % k) if not t.is_table: - raise TypeError("object [%s] is not a table, and cannot be used in all select as multiple" % t.pathname) + raise TypeError( + "object [%s] is not a table, and cannot be used in all select as multiple" % + t.pathname) if nrows is None: nrows = t.nrows elif t.nrows != nrows: - raise ValueError("all tables must have exactly the same nrows!") + raise ValueError( + "all tables must have exactly the same nrows!") # select coordinates from the selector table try: - c = self.select_as_coordinates(selector, where, start=start, stop=stop) + c = self.select_as_coordinates( + selector, + where, + start=start, + stop=stop) nrows = len(c) - except (Exception), detail: + except Exception: raise ValueError("invalid selector [%s]" % selector) def func(_start, _stop): # collect the returns objs - objs = [t.read(where=c[_start:_stop], columns=columns) for t in tbls] + objs = [t.read(where=c[_start:_stop], columns=columns) + for t in tbls] # axis is the concentation axes axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0] @@ -564,7 +599,6 @@ def func(_start, _stop): return TableIterator(self, func, nrows=nrows, start=start, stop=stop, auto_close=auto_close).get_values() - def put(self, key, value, table=None, append=False, **kwargs): """ Store object in HDFStore @@ -610,7 +644,8 @@ 
def remove(self, key, where=None, start=None, stop=None): except: if where is not None: - raise ValueError("trying to remove a node with a non-None where clause!") + raise ValueError( + "trying to remove a node with a non-None where clause!") # we are actually trying to remove a node (with children) s = self.get_node(key) @@ -628,8 +663,9 @@ def remove(self, key, where=None, start=None, stop=None): # delete from the table else: if not s.is_table: - raise ValueError('can only remove with where on objects written as tables') - return s.delete(where = where, start=start, stop=stop) + raise ValueError( + 'can only remove with where on objects written as tables') + return s.delete(where=where, start=start, stop=stop) def append(self, key, value, columns=None, **kwargs): """ @@ -653,11 +689,13 @@ def append(self, key, value, columns=None, **kwargs): data in the table, so be careful """ if columns is not None: - raise Exception("columns is not a supported keyword in append, try data_columns") + raise TypeError("columns is not a supported keyword in append, " + "try data_columns") self._write_to_group(key, value, table=True, append=True, **kwargs) - def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, **kwargs): + def append_to_multiple(self, d, value, selector, data_columns=None, + axes=None, **kwargs): """ Append to multiple tables @@ -676,13 +714,17 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, * """ if axes is not None: - raise Exception("axes is currently not accepted as a paremter to append_to_multiple; you can create the tables indepdently instead") + raise TypeError("axes is currently not accepted as a parameter to" + " append_to_multiple; you can create the " + "tables independently instead") if not isinstance(d, dict): - raise ValueError("append_to_multiple must have a dictionary specified as the way to split the value") + raise ValueError( + "append_to_multiple must have a dictionary specified as the way to split the value") if selector not in d: - raise ValueError("append_to_multiple requires a selector that is in passed dict") + raise ValueError( + "append_to_multiple requires a selector that is in passed dict") # figure out the splitting axis (the non_index_axis) axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0] @@ -693,7 +735,8 @@ def append_to_multiple(self, d, value, selector, data_columns=None, axes=None, * for k, v in d.items(): if v is None: if remain_key is not None: - raise ValueError("append_to_multiple can only have one value in d that is None") + raise ValueError( + "append_to_multiple can only have one value in d that is None") remain_key = k else: remain_values.extend(v) @@ -731,10 +774,11 @@ def create_table_index(self, key, **kwargs): # version requirements _tables() if not _table_supports_index: - raise Exception("PyTables >= 2.3 is required for table indexing") + raise ValueError("PyTables >= 2.3 is required for table indexing") s = self.get_storer(key) - if s is None: return + if s is None: + return if not s.is_table: raise TypeError("cannot create table index on a non-table") @@ -743,8 +787,8 @@ def create_table_index(self, key, **kwargs): def groups(self): """ return a list of all the top-level nodes (that are not themselves a pandas storage object) """ _tables() - return [ g for g in self._handle.walkNodes() if getattr(g._v_attrs,'pandas_type',None) or getattr( - g,'table',None) or (isinstance(g,_table_mod.table.Table) and g._v_name != u'table') ] + return [g for g in
self._handle.walkNodes() if getattr(g._v_attrs, 'pandas_type', None) or getattr( + g, 'table', None) or (isinstance(g, _table_mod.table.Table) and g._v_name != u'table')] def get_node(self, key): """ return the node with the key or None if it does not exist """ @@ -764,8 +808,9 @@ def get_storer(self, key): s.infer_axes() return s - def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None, complevel = None, - fletcher32 = False, overwrite = True): + def copy( + self, file, mode='w', propindexes=True, keys=None, complib=None, complevel=None, + fletcher32=False, overwrite=True): """ copy the existing store to a new file, upgrading in place Parameters @@ -780,13 +825,18 @@ def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None open file handle of the new store """ - new_store = HDFStore(file, mode = mode, complib = complib, complevel = complevel, fletcher32 = fletcher32) + new_store = HDFStore( + file, + mode=mode, + complib=complib, + complevel=complevel, + fletcher32=fletcher32) if keys is None: keys = self.keys() - if not isinstance(keys, (tuple,list)): - keys = [ keys ] + if not isinstance(keys, (tuple, list)): + keys = [keys] for k in keys: - s = self.get_storer(k) + s = self.get_storer(k) if s is not None: if k in new_store: @@ -798,35 +848,45 @@ def copy(self, file, mode = 'w', propindexes = True, keys = None, complib = None index = False if propindexes: - index = [ a.name for a in s.axes if a.is_indexed ] - new_store.append(k, data, index=index, data_columns=getattr(s,'data_columns',None), encoding=s.encoding) + index = [a.name for a in s.axes if a.is_indexed] + new_store.append( + k, + data, + index=index, + data_columns=getattr( + s, + 'data_columns', + None), + encoding=s.encoding) else: new_store.put(k, data, encoding=s.encoding) return new_store - ###### private methods ###### + # private methods ###### - def _create_storer(self, group, value = None, table = False, append = False, **kwargs): + def _create_storer( + self, group, value=None, table=False, append=False, **kwargs): """ return a suitable Storer class to operate """ def error(t): raise TypeError("cannot properly create the storer for: [%s] [group->%s,value->%s,table->%s,append->%s,kwargs->%s]" % - (t,group,type(value),table,append,kwargs)) + (t, group, type(value), table, append, kwargs)) - pt = _ensure_decoded(getattr(group._v_attrs,'pandas_type',None)) - tt = _ensure_decoded(getattr(group._v_attrs,'table_type',None)) + pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None)) + tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None)) # infer the pt from the passed value if pt is None: if value is None: _tables() - if getattr(group,'table',None) or isinstance(group,_table_mod.table.Table): + if getattr(group, 'table', None) or isinstance(group, _table_mod.table.Table): pt = u'frame_table' tt = u'generic_table' else: - raise TypeError("cannot create a storer if the object is not existing nor a value are passed") + raise TypeError( + "cannot create a storer if the object is not existing nor a value are passed") else: try: @@ -852,14 +912,14 @@ def error(t): if value is not None: if pt == u'frame_table': - index = getattr(value,'index',None) + index = getattr(value, 'index', None) if index is not None: if index.nlevels == 1: tt = u'appendable_frame' elif index.nlevels > 1: tt = u'appendable_multiframe' elif pt == u'wide_table': - tt = u'appendable_panel' + tt = u'appendable_panel' elif pt == u'ndim_table': tt = u'appendable_ndim' @@ -879,8 +939,9 @@ def 
error(t): except: error('_TABLE_MAP') - def _write_to_group(self, key, value, index=True, table=False, append=False, - complib=None, encoding=None, **kwargs): + def _write_to_group( + self, key, value, index=True, table=False, append=False, + complib=None, encoding=None, **kwargs): group = self.get_node(key) # remove the node if we are not appending @@ -920,16 +981,18 @@ def _write_to_group(self, key, value, index=True, table=False, append=False, if not s.is_table and complib: raise ValueError('Compression not supported on non-table') - s.write(obj = value, append=append, complib=complib, **kwargs) + s.write(obj=value, append=append, complib=complib, **kwargs) if s.is_table and index: - s.create_index(columns = index) + s.create_index(columns=index) def _read_group(self, group, **kwargs): s = self._create_storer(group) s.infer_axes() return s.read(**kwargs) + class TableIterator(object): + """ define the iteration interface on a table Parameters @@ -946,15 +1009,16 @@ class TableIterator(object): kwargs : the passed kwargs """ - def __init__(self, store, func, nrows, start=None, stop=None, chunksize=None, auto_close=False): + def __init__(self, store, func, nrows, start=None, stop=None, + chunksize=None, auto_close=False): self.store = store - self.func = func + self.func = func self.nrows = nrows or 0 self.start = start or 0 if stop is None: stop = self.nrows - self.stop = min(self.nrows,stop) + self.stop = min(self.nrows, stop) if chunksize is None: chunksize = 100000 @@ -985,7 +1049,9 @@ def get_values(self): self.close() return results + class IndexCol(StringMixin): + """ an index column description class Parameters @@ -1001,11 +1067,12 @@ class IndexCol(StringMixin): is_an_indexable = True is_data_indexable = True is_searchable = False - _info_fields = ['freq','tz','index_name'] + _info_fields = ['freq', 'tz', 'index_name'] - def __init__(self, values=None, kind=None, typ=None, cname=None, itemsize=None, - name=None, axis=None, kind_attr=None, pos=None, freq=None, tz=None, - index_name=None, **kwargs): + def __init__( + self, values=None, kind=None, typ=None, cname=None, itemsize=None, + name=None, axis=None, kind_attr=None, pos=None, freq=None, tz=None, + index_name=None, **kwargs): self.values = values self.kind = kind self.typ = typ @@ -1052,7 +1119,13 @@ def set_table(self, table): return self def __unicode__(self): - temp = tuple(map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))) + temp = tuple( + map(pprint_thing, + (self.name, + self.cname, + self.axis, + self.pos, + self.kind))) return "name->%s,cname->%s,axis->%s,pos->%s,kind->%s" % temp def __eq__(self, other): @@ -1066,7 +1139,7 @@ def __ne__(self, other): def is_indexed(self): """ return whether I am an indexed column """ try: - return getattr(self.table.cols,self.cname).is_indexed + return getattr(self.table.cols, self.cname).is_indexed except: False @@ -1088,7 +1161,7 @@ def convert(self, values, nan_rep, encoding): except: pass - values =_maybe_convert(values, self.kind, encoding) + values = _maybe_convert(values, self.kind, encoding) kwargs = dict() if self.freq is not None: @@ -1099,15 +1172,22 @@ def convert(self, values, nan_rep, encoding): self.values = Index(values, **kwargs) except: - # if the output freq is different that what we recorded, then infer it + # if the output freq is different that what we recorded, then infer + # it if 'freq' in kwargs: kwargs['freq'] = 'infer' - self.values = Index(_maybe_convert(values, self.kind, encoding), **kwargs) + self.values = Index( + 
_maybe_convert( + values, + self.kind, + encoding), + **kwargs) # set the timezone if indicated # we stored in utc, so reverse to local timezone if self.tz is not None: - self.values = self.values.tz_localize('UTC').tz_convert(_ensure_decoded(self.tz)) + self.values = self.values.tz_localize( + 'UTC').tz_convert(_ensure_decoded(self.tz)) return self @@ -1159,7 +1239,6 @@ def validate_col(self, itemsize=None): """ validate this column: return the compared against itemsize """ # validate this column for string truncation (or reset to the max size) - dtype = getattr(self, 'dtype', None) if _ensure_decoded(self.kind) == u'string': c = self.col @@ -1170,7 +1249,7 @@ def validate_col(self, itemsize=None): raise ValueError("Trying to store a string with len [%s] in [%s] column but\n" "this column has a limit of [%s]!\n" "Consider using min_itemsize to preset the sizes on these columns" - % (itemsize,self.cname, c.itemsize)) + % (itemsize, self.cname, c.itemsize)) return c.itemsize return None @@ -1189,7 +1268,7 @@ def update_info(self, info): for key in self._info_fields: - value = getattr(self,key,None) + value = getattr(self, key, None) try: idx = info[self.name] @@ -1200,18 +1279,18 @@ def update_info(self, info): if key in idx and value is not None and existing_value != value: # frequency/name just warn - if key in ['freq','index_name']: - ws = attribute_conflict_doc % (key,existing_value,value) + if key in ['freq', 'index_name']: + ws = attribute_conflict_doc % (key, existing_value, value) warnings.warn(ws, AttributeConflictWarning) # reset idx[key] = None - setattr(self,key,None) + setattr(self, key, None) else: raise ValueError("invalid info for [%s] for [%s]""" ", existing_value [%s] conflicts with new value [%s]" % (self.name, - key,existing_value,value)) + key, existing_value, value)) else: if value is not None or existing_value is not None: idx[key] = value @@ -1232,7 +1311,9 @@ def set_attr(self): """ set the kind for this colummn """ setattr(self.attrs, self.kind_attr, self.kind) + class GenericIndexCol(IndexCol): + """ an index which is not represented in the data of the table """ @property @@ -1251,7 +1332,9 @@ def get_attr(self): def set_attr(self): pass + class DataCol(IndexCol): + """ a data holding column, by definition this is not indexable Parameters @@ -1266,7 +1349,8 @@ class DataCol(IndexCol): _info_fields = ['tz'] @classmethod - def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs): + def create_for_block( + cls, i=None, name=None, cname=None, version=None, **kwargs): """ return a new datacol with the block i """ if cname is None: @@ -1286,7 +1370,8 @@ def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs) return cls(name=name, cname=cname, **kwargs) - def __init__(self, values=None, kind=None, typ=None, cname=None, data=None, block=None, **kwargs): + def __init__(self, values=None, kind=None, typ=None, + cname=None, data=None, block=None, **kwargs): super(DataCol, self).__init__( values=values, kind=kind, typ=typ, cname=cname, **kwargs) self.dtype = None @@ -1330,13 +1415,16 @@ def set_kind(self): elif dtype.startswith(u'bool'): self.kind = 'bool' else: - raise AssertionError("cannot interpret dtype of [%s] in [%s]" % (dtype,self)) + raise AssertionError( + "cannot interpret dtype of [%s] in [%s]" % + (dtype, self)) # set my typ if we need if self.typ is None: - self.typ = getattr(self.description,self.cname,None) + self.typ = getattr(self.description, self.cname, None) - def set_atom(self, block, existing_col, 
min_itemsize, nan_rep, info, encoding=None, **kwargs): + def set_atom(self, block, existing_col, min_itemsize, + nan_rep, info, encoding=None, **kwargs): """ create and setup my atom from the block b """ self.values = list(block.items) @@ -1350,7 +1438,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No raise TypeError( "[date] is not implemented as a table column") elif inferred_type == 'datetime': - if getattr(rvalues[0],'tzinfo',None) is not None: + if getattr(rvalues[0], 'tzinfo', None) is not None: # if this block has more than one timezone, raise if len(set([r.tzinfo for r in rvalues])) != 1: @@ -1359,7 +1447,7 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No # convert this column to datetime64[ns] utc, and save the tz index = DatetimeIndex(rvalues) - tz = getattr(index,'tz',None) + tz = getattr(index, 'tz', None) if tz is None: raise TypeError( "invalid timezone specification") @@ -1373,7 +1461,9 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No self.tz = zone self.update_info(info) - self.set_atom_datetime64(block, values.reshape(block.values.shape)) + self.set_atom_datetime64( + block, + values.reshape(block.values.shape)) else: raise TypeError( @@ -1385,7 +1475,12 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No # this is basically a catchall; if say a datetime64 has nans then will # end up here ### elif inferred_type == 'string' or dtype == 'object': - self.set_atom_string(block, existing_col, min_itemsize, nan_rep, encoding) + self.set_atom_string( + block, + existing_col, + min_itemsize, + nan_rep, + encoding) else: self.set_atom_data(block) @@ -1394,16 +1489,18 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No def get_atom_string(self, block, itemsize): return _tables().StringCol(itemsize=itemsize, shape=block.shape[0]) - def set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding): + def set_atom_string( + self, block, existing_col, min_itemsize, nan_rep, encoding): # fill nan items with myself block = block.fillna(nan_rep) - data = block.values + data = block.values # see if we have a valid string type inferred_type = lib.infer_dtype(data.ravel()) if inferred_type != 'string': - # we cannot serialize this data, so report an exception on a column by column basis + # we cannot serialize this data, so report an exception on a column + # by column basis for item in block.items: col = block.get(item) @@ -1411,8 +1508,7 @@ def set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding): if inferred_type != 'string': raise TypeError("Cannot serialize the column [%s] because\n" "its data contents are [%s] object dtype" % - (item,inferred_type)) - + (item, inferred_type)) # itemsize is the maximum length of a string (along any dimension) itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) @@ -1457,7 +1553,7 @@ def set_atom_data(self, block): def get_atom_datetime64(self, block): return _tables().Int64Col(shape=block.shape[0]) - def set_atom_datetime64(self, block, values = None): + def set_atom_datetime64(self, block, values=None): self.kind = 'datetime64' self.typ = self.get_atom_datetime64(block) if values is None: @@ -1480,13 +1576,13 @@ def validate_attr(self, append): if (existing_fields is not None and existing_fields != list(self.values)): raise ValueError("appended items do not match existing items" - " in table!") + " in table!") existing_dtype = 
getattr(self.attrs, self.dtype_attr, None) if (existing_dtype is not None and existing_dtype != self.dtype): raise ValueError("appended items dtype do not match existing items dtype" - " in table!") + " in table!") def convert(self, values, nan_rep, encoding): """ set the data from this selection (and convert to the correct dtype if we can) """ @@ -1508,8 +1604,12 @@ def convert(self, values, nan_rep, encoding): # data should be 2-dim here # we stored as utc, so just set the tz - index = DatetimeIndex(self.data.ravel(),tz='UTC').tz_convert(self.tz) - self.data = np.array(index.tolist(),dtype=object).reshape(self.data.shape) + index = DatetimeIndex( + self.data.ravel(), + tz='UTC').tz_convert(self.tz) + self.data = np.array( + index.tolist(), + dtype=object).reshape(self.data.shape) else: self.data = np.asarray(self.data, dtype='M8[ns]') @@ -1530,14 +1630,17 @@ def convert(self, values, nan_rep, encoding): # convert nans / decode if _ensure_decoded(self.kind) == u'string': - self.data = _unconvert_string_array(self.data, nan_rep=nan_rep, encoding=encoding) + self.data = _unconvert_string_array( + self.data, + nan_rep=nan_rep, + encoding=encoding) return self def get_attr(self): """ get the data for this colummn """ self.values = getattr(self.attrs, self.kind_attr, None) - self.dtype = getattr(self.attrs, self.dtype_attr, None) + self.dtype = getattr(self.attrs, self.dtype_attr, None) self.set_kind() def set_attr(self): @@ -1548,6 +1651,7 @@ def set_attr(self): class DataIndexableCol(DataCol): + """ represent a data column that can be indexed """ is_data_indexable = True @@ -1564,13 +1668,17 @@ def get_atom_data(self, block): def get_atom_datetime64(self, block): return _tables().Int64Col() + class GenericDataIndexableCol(DataIndexableCol): + """ represent a generic pytables data column """ def get_attr(self): pass + class Storer(StringMixin): + """ represent an object in my store facilitate read/write of various types of objects this is an abstract base class @@ -1582,14 +1690,14 @@ class Storer(StringMixin): group : the group node where the table resides """ pandas_kind = None - obj_type = None - ndim = None - is_table = False + obj_type = None + ndim = None + is_table = False def __init__(self, parent, group, encoding=None, **kwargs): - self.parent = parent - self.group = group - self.encoding = _ensure_encoding(encoding) + self.parent = parent + self.group = group + self.encoding = _ensure_encoding(encoding) self.set_version() @property @@ -1598,7 +1706,11 @@ def is_old_version(self): def set_version(self): """ compute and set our version """ - version = _ensure_decoded(getattr(self.group._v_attrs,'pandas_version',None)) + version = _ensure_decoded( + getattr( + self.group._v_attrs, + 'pandas_version', + None)) try: self.version = tuple([int(x) for x in version.split('.')]) if len(self.version) == 2: @@ -1615,11 +1727,14 @@ def __unicode__(self): self.infer_axes() s = self.shape if s is not None: - if isinstance(s, (list,tuple)): + if isinstance(s, (list, tuple)): s = "[%s]" % ','.join([pprint_thing(x) for x in s]) - return "%-12.12s (shape->%s)" % (self.pandas_type,s) + return "%-12.12s (shape->%s)" % (self.pandas_type, s) return self.pandas_type + def __str__(self): + return self.__repr__() + def set_object_info(self): """ set my pandas type & version """ self.attrs.pandas_type = self.pandas_kind @@ -1685,14 +1800,15 @@ def is_exists(self): @property def nrows(self): - return getattr(self.storable,'nrows',None) + return getattr(self.storable, 'nrows', None) def validate(self, other): 
""" validate against an existing storable """ - if other is None: return + if other is None: + return return True - def validate_version(self, where = None): + def validate_version(self, where=None): """ are we trying to operate on an old version? """ return True @@ -1707,12 +1823,14 @@ def infer_axes(self): return True def read(self, **kwargs): - raise NotImplementedError("cannot read on an abstract storer: subclasses should implement") + raise NotImplementedError( + "cannot read on an abstract storer: subclasses should implement") def write(self, **kwargs): - raise NotImplementedError("cannot write on an abstract storer: sublcasses should implement") + raise NotImplementedError( + "cannot write on an abstract storer: sublcasses should implement") - def delete(self, where = None, **kwargs): + def delete(self, where=None, **kwargs): """ support fully deleting the node in its entirety (only) - where specification must be None """ if where is None: self._handle.removeNode(self.group, recursive=True) @@ -1720,11 +1838,14 @@ def delete(self, where = None, **kwargs): raise TypeError("cannot delete on an abstract storer") + class GenericStorer(Storer): + """ a generified storer version """ - _index_type_map = { DatetimeIndex: 'datetime', - PeriodIndex: 'period'} - _reverse_index_map = dict([ (v,k) for k, v in _index_type_map.iteritems() ]) + _index_type_map = {DatetimeIndex: 'datetime', + PeriodIndex: 'period'} + _reverse_index_map = dict([(v, k) + for k, v in _index_type_map.iteritems()]) attributes = [] # indexer helpders @@ -1746,9 +1867,11 @@ def f(values, freq=None, tz=None): def validate_read(self, kwargs): if kwargs.get('columns') is not None: - raise TypeError("cannot pass a column specification when reading a Storer") + raise TypeError( + "cannot pass a column specification when reading a Storer") if kwargs.get('where') is not None: - raise TypeError("cannot pass a where specification when reading a Storer") + raise TypeError( + "cannot pass a where specification when reading a Storer") @property def is_exists(self): @@ -1760,9 +1883,9 @@ def set_attrs(self): def get_attrs(self): """ retrieve our attributes """ - self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None)) + self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None)) for n in self.attributes: - setattr(self,n,_ensure_decoded(getattr(self.attrs, n, None))) + setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None))) def write(self, obj, **kwargs): self.set_attrs() @@ -1809,7 +1932,7 @@ def read_index(self, key): _, index = self.read_index_node(getattr(self.group, key)) return index else: # pragma: no cover - raise Exception('unrecognized index variety: %s' % variety) + raise TypeError('unrecognized index variety: %s' % variety) def write_index(self, key, index): if isinstance(index, MultiIndex): @@ -1823,7 +1946,7 @@ def write_index(self, key, index): self.write_sparse_intindex(key, index) else: setattr(self.attrs, '%s_variety' % key, 'regular') - converted = _convert_index(index,self.encoding).set_name('index') + converted = _convert_index(index, self.encoding).set_name('index') self.write_array(key, converted.values) node = getattr(self.group, key) node._v_attrs.kind = converted.kind @@ -1841,7 +1964,6 @@ def write_index(self, key, index): zone = tslib.tot_seconds(index.tz.utcoffset()) node._v_attrs.tz = zone - def write_block_index(self, key, index): self.write_array('%s_blocs' % key, index.blocs) self.write_array('%s_blengths' % key, index.blengths) @@ -1921,10 +2043,15 @@ def 
read_index_node(self, node): kwargs['tz'] = node._v_attrs['tz'] if kind in (u'date', u'datetime'): - index = factory(_unconvert_index(data, kind, encoding=self.encoding), dtype=object, - **kwargs) + index = factory( + _unconvert_index(data, kind, encoding=self.encoding), dtype=object, + **kwargs) else: - index = factory(_unconvert_index(data, kind, encoding=self.encoding), **kwargs) + index = factory( + _unconvert_index(data, + kind, + encoding=self.encoding), + **kwargs) index.name = name @@ -1975,7 +2102,8 @@ def write_array(self, key, value, items=None): if value.dtype.type == np.object_: - # infer the type, warn if we have a non-string type here (for performance) + # infer the type, warn if we have a non-string type here (for + # performance) inferred_type = lib.infer_dtype(value.ravel()) if empty_array: pass @@ -1986,11 +2114,11 @@ def write_array(self, key, value, items=None): items = list(items) except: pass - ws = performance_doc % (inferred_type,key,items) + ws = performance_doc % (inferred_type, key, items) warnings.warn(ws, PerformanceWarning) vlarr = self._handle.createVLArray(self.group, key, - _tables().ObjectAtom()) + _tables().ObjectAtom()) vlarr.append(value) elif value.dtype.type == np.datetime64: self._handle.createArray(self.group, key, value.view('i8')) @@ -2003,14 +2131,16 @@ def write_array(self, key, value, items=None): getattr(self.group, key)._v_attrs.transposed = transposed + class LegacyStorer(GenericStorer): def read_index_legacy(self, key): - node = getattr(self.group,key) + node = getattr(self.group, key) data = node[:] kind = node._v_attrs.kind return _unconvert_index_legacy(data, kind, encoding=self.encoding) + class LegacySeriesStorer(LegacyStorer): def read(self, **kwargs): @@ -2019,6 +2149,7 @@ def read(self, **kwargs): values = self.read_array('values') return Series(values, index=index) + class LegacyFrameStorer(LegacyStorer): def read(self, **kwargs): @@ -2028,6 +2159,7 @@ def read(self, **kwargs): values = self.read_array('values') return DataFrame(values, index=index, columns=columns) + class SeriesStorer(GenericStorer): pandas_kind = u'series' attributes = ['name'] @@ -2035,7 +2167,7 @@ class SeriesStorer(GenericStorer): @property def shape(self): try: - return len(getattr(self.group,'values')), + return len(getattr(self.group, 'values')), except: return None @@ -2055,9 +2187,10 @@ def write(self, obj, **kwargs): self.write_array('values', obj.values) self.attrs.name = obj.name + class SparseSeriesStorer(GenericStorer): pandas_kind = u'sparse_series' - attributes = ['name','fill_value','kind'] + attributes = ['name', 'fill_value', 'kind'] def read(self, **kwargs): self.validate_read(kwargs) @@ -2077,9 +2210,10 @@ def write(self, obj, **kwargs): self.attrs.fill_value = obj.fill_value self.attrs.kind = obj.kind + class SparseFrameStorer(GenericStorer): pandas_kind = u'sparse_frame' - attributes = ['default_kind','default_fill_value'] + attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): self.validate_read(kwargs) @@ -2087,7 +2221,7 @@ def read(self, **kwargs): sdict = {} for c in columns: key = 'sparse_series_%s' % c - s = SparseSeriesStorer(self.parent, getattr(self.group,key)) + s = SparseSeriesStorer(self.parent, getattr(self.group, key)) s.infer_axes() sdict[c] = s.read() return SparseDataFrame(sdict, columns=columns, @@ -2106,12 +2240,13 @@ def write(self, obj, **kwargs): s = SparseSeriesStorer(self.parent, node) s.write(ss) self.attrs.default_fill_value = obj.default_fill_value - self.attrs.default_kind = 
obj.default_kind + self.attrs.default_kind = obj.default_kind self.write_index('columns', obj.columns) + class SparsePanelStorer(GenericStorer): pandas_kind = u'sparse_panel' - attributes = ['default_kind','default_fill_value'] + attributes = ['default_kind', 'default_fill_value'] def read(self, **kwargs): self.validate_read(kwargs) @@ -2120,8 +2255,7 @@ def read(self, **kwargs): sdict = {} for name in items: key = 'sparse_frame_%s' % name - node = getattr(self.group, key) - s = SparseFrameStorer(self.parent, getattr(self.group,key)) + s = SparseFrameStorer(self.parent, getattr(self.group, key)) s.infer_axes() sdict[name] = s.read() return SparsePanel(sdict, items=items, default_kind=self.default_kind, @@ -2130,7 +2264,7 @@ def read(self, **kwargs): def write(self, obj, **kwargs): super(SparsePanelStorer, self).write(obj, **kwargs) self.attrs.default_fill_value = obj.default_fill_value - self.attrs.default_kind = obj.default_kind + self.attrs.default_kind = obj.default_kind self.write_index('items', obj.items) for name, sdf in obj.iterkv(): @@ -2142,8 +2276,9 @@ def write(self, obj, **kwargs): s = SparseFrameStorer(self.parent, node) s.write(sdf) + class BlockManagerStorer(GenericStorer): - attributes = ['ndim','nblocks'] + attributes = ['ndim', 'nblocks'] is_shape_reversed = False @property @@ -2155,15 +2290,15 @@ def shape(self): items = 0 for i in range(self.nblocks): node = getattr(self.group, 'block%d_items' % i) - shape = getattr(node,'shape',None) + shape = getattr(node, 'shape', None) if shape is not None: items += shape[0] # data shape node = getattr(self.group, 'block0_values') - shape = getattr(node,'shape',None) + shape = getattr(node, 'shape', None) if shape is not None: - shape = list(shape[0:(ndim-1)]) + shape = list(shape[0:(ndim - 1)]) else: shape = [] @@ -2213,20 +2348,24 @@ def write(self, obj, **kwargs): self.write_array('block%d_values' % i, blk.values, items=blk.items) self.write_index('block%d_items' % i, blk.items) + class FrameStorer(BlockManagerStorer): pandas_kind = u'frame' - obj_type = DataFrame + obj_type = DataFrame + class PanelStorer(BlockManagerStorer): pandas_kind = u'wide' - obj_type = Panel + obj_type = Panel is_shape_reversed = True def write(self, obj, **kwargs): obj._consolidate_inplace() return super(PanelStorer, self).write(obj, **kwargs) + class Table(Storer): + """ represent a table: facilitate read/write of various types of tables @@ -2244,20 +2383,20 @@ class Table(Storer): """ pandas_kind = u'wide_table' - table_type = None - levels = 1 - is_table = True + table_type = None + levels = 1 + is_table = True is_shape_reversed = False def __init__(self, *args, **kwargs): super(Table, self).__init__(*args, **kwargs) - self.index_axes = [] + self.index_axes = [] self.non_index_axes = [] - self.values_axes = [] - self.data_columns = [] - self.info = dict() - self.nan_rep = None - self.selection = None + self.values_axes = [] + self.data_columns = [] + self.info = dict() + self.nan_rep = None + self.selection = None @property def table_type_short(self): @@ -2266,18 +2405,21 @@ def table_type_short(self): def __repr__(self): """ return a pretty representatgion of myself """ self.infer_axes() - dc = ",dc->[%s]" % ','.join(self.data_columns) if len(self.data_columns) else '' + dc = ",dc->[%s]" % ','.join( + self.data_columns) if len( + self.data_columns) else '' ver = '' if self.is_old_version: - ver = "[%s]" % '.'.join([ str(x) for x in self.version ]) + ver = "[%s]" % '.'.join([str(x) for x in self.version]) return "%-12.12s%s 
(typ->%s,nrows->%s,ncols->%s,indexers->[%s]%s)" % (self.pandas_type, ver, self.table_type_short, self.nrows, self.ncols, - ','.join([ a.name for a in self.index_axes ]), + ','.join( + [a.name for a in self.index_axes]), dc) def __getitem__(self, c): @@ -2289,30 +2431,35 @@ def __getitem__(self, c): def validate(self, other): """ validate against an existing table """ - if other is None: return + if other is None: + return if other.table_type != self.table_type: raise TypeError("incompatible table_type with existing [%s - %s]" % (other.table_type, self.table_type)) - for c in ['index_axes','non_index_axes','values_axes']: - sv = getattr(self,c,None) - ov = getattr(other,c,None) + for c in ['index_axes', 'non_index_axes', 'values_axes']: + sv = getattr(self, c, None) + ov = getattr(other, c, None) if sv != ov: # show the error for the specific axes for i, sax in enumerate(sv): oax = ov[i] if sax != oax: - raise ValueError("invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c,sax,oax)) + raise ValueError( + "invalid combination of [%s] on appending data [%s] vs current table [%s]" % + (c, sax, oax)) # should never get here - raise Exception("invalid combinate of [%s] on appending data [%s] vs current table [%s]" % (c,sv,ov)) + raise ValueError( + "invalid combination of [%s] on appending data [%s] vs current table [%s]" % + (c, sv, ov)) @property def nrows_expected(self): """ based on our axes, compute the expected nrows """ - return np.prod([ i.cvalues.shape[0] for i in self.index_axes ]) + return np.prod([i.cvalues.shape[0] for i in self.index_axes]) @property def is_exists(self): @@ -2321,7 +2468,7 @@ def is_exists(self): @property def storable(self): - return getattr(self.group,'table',None) + return getattr(self.group, 'table', None) @property def table(self): @@ -2343,7 +2490,7 @@ def axes(self): @property def ncols(self): """ the number of total columns in the values axes """ - return sum([ len(a.values) for a in self.values_axes ]) + return sum([len(a.values) for a in self.values_axes]) @property def is_transposed(self): @@ -2360,7 +2507,8 @@ def queryables(self): # compute the values_axes queryables return dict([(a.cname, a.kind) for a in self.index_axes] + [(self.obj_type._AXIS_NAMES[axis], None) for axis, values in self.non_index_axes] + - [(v.cname, v.kind) for v in self.values_axes if v.name in set(self.data_columns)] + [(v.cname, v.kind) + for v in self.values_axes if v.name in set(self.data_columns)] ) def index_cols(self): @@ -2373,44 +2521,62 @@ def values_cols(self): def set_info(self): """ update our table index info """ - self.attrs.info = self.info + self.attrs.info = self.info def set_attrs(self): """ set our table type & indexables """ - self.attrs.table_type = self.table_type - self.attrs.index_cols = self.index_cols() - self.attrs.values_cols = self.values_cols() + self.attrs.table_type = self.table_type + self.attrs.index_cols = self.index_cols() + self.attrs.values_cols = self.values_cols() self.attrs.non_index_axes = self.non_index_axes self.attrs.data_columns = self.data_columns - self.attrs.nan_rep = self.nan_rep - self.attrs.encoding = self.encoding - self.attrs.levels = self.levels + self.attrs.nan_rep = self.nan_rep + self.attrs.encoding = self.encoding + self.attrs.levels = self.levels self.set_info() def get_attrs(self): """ retrieve our attributes """ - self.non_index_axes = getattr(self.attrs,'non_index_axes',None) or [] - self.data_columns = getattr(self.attrs,'data_columns',None) or [] - self.info = getattr(self.attrs,'info',None) or
dict() - self.nan_rep = getattr(self.attrs,'nan_rep',None) - self.encoding = _ensure_encoding(getattr(self.attrs,'encoding',None)) - self.levels = getattr(self.attrs,'levels',None) or [] + self.non_index_axes = getattr( + self.attrs, + 'non_index_axes', + None) or [] + self.data_columns = getattr( + self.attrs, + 'data_columns', + None) or [] + self.info = getattr( + self.attrs, + 'info', + None) or dict() + self.nan_rep = getattr(self.attrs, 'nan_rep', None) + self.encoding = _ensure_encoding( + getattr(self.attrs, 'encoding', None)) + self.levels = getattr( + self.attrs, + 'levels', + None) or [] t = self.table - self.index_axes = [ a.infer(t) for a in self.indexables if a.is_an_indexable ] - self.values_axes = [ a.infer(t) for a in self.indexables if not a.is_an_indexable ] + self.index_axes = [a.infer(t) + for a in self.indexables if a.is_an_indexable] + self.values_axes = [a.infer(t) + for a in self.indexables if not a.is_an_indexable] - def validate_version(self, where = None): + def validate_version(self, where=None): """ are we trying to operate on an old version? """ if where is not None: if self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1: - ws = incompatibility_doc % '.'.join([ str(x) for x in self.version ]) + ws = incompatibility_doc % '.'.join( + [str(x) for x in self.version]) warnings.warn(ws, IncompatibilityWarning) def validate_min_itemsize(self, min_itemsize): """ validate the min_itemisze doesn't contain items that are not in the axes this needs data_columns to be defined """ - if min_itemsize is None: return - if not isinstance(min_itemsize, dict): return + if min_itemsize is None: + return + if not isinstance(min_itemsize, dict): + return q = self.queryables() for k, v in min_itemsize.items(): @@ -2419,18 +2585,20 @@ def validate_min_itemsize(self, min_itemsize): if k == 'values': continue if k not in q: - raise ValueError("min_itemsize has the key [%s] which is not an axis or data_column" % k) + raise ValueError( + "min_itemsize has the key [%s] which is not an axis or data_column" % + k) @property def indexables(self): """ create/cache the indexables if they don't exist """ if self._indexables is None: - d = self.description self._indexables = [] # index columns - self._indexables.extend([ IndexCol(name=name,axis=axis,pos=i) for i, (axis, name) in enumerate(self.attrs.index_cols)]) + self._indexables.extend([IndexCol(name=name, axis=axis, pos=i) + for i, (axis, name) in enumerate(self.attrs.index_cols)]) # values columns dc = set(self.data_columns) @@ -2548,15 +2716,17 @@ def validate_data_columns(self, data_columns, min_itemsize): data_columns = [] # if min_itemsize is a dict, add the keys (exclude 'values') - if isinstance(min_itemsize,dict): + if isinstance(min_itemsize, dict): existing_data_columns = set(data_columns) - data_columns.extend([ k for k in min_itemsize.keys() if k != 'values' and k not in existing_data_columns ]) + data_columns.extend( + [k for k in min_itemsize.keys() if k != 'values' and k not in existing_data_columns]) # return valid columns in the order of our axis return [c for c in data_columns if c in axis_labels] - def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, min_itemsize=None, **kwargs): + def create_axes(self, axes, obj, validate=True, nan_rep=None, + data_columns=None, min_itemsize=None, **kwargs): """ create and return the axes leagcy tables create an indexable column, indexable index, non-indexable fields @@ -2578,7 +2748,7 @@ def create_axes(self, axes, obj, validate=True, 
nan_rep=None, data_columns=None, axes = _AXES_MAP[type(obj)] except: raise TypeError("cannot properly create the storer for: [group->%s,value->%s]" % - (self.group._v_name,type(obj))) + (self.group._v_name, type(obj))) # map axes to numbers axes = [obj._get_axis_number(a) for a in axes] @@ -2587,17 +2757,18 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, if self.infer_axes(): existing_table = self.copy() existing_table.infer_axes() - axes = [ a.axis for a in existing_table.index_axes] - data_columns = existing_table.data_columns - nan_rep = existing_table.nan_rep + axes = [a.axis for a in existing_table.index_axes] + data_columns = existing_table.data_columns + nan_rep = existing_table.nan_rep self.encoding = existing_table.encoding - self.info = copy.copy(existing_table.info) + self.info = copy.copy(existing_table.info) else: existing_table = None # currently support on ndim-1 axes if len(axes) != self.ndim - 1: - raise ValueError("currently only support ndim-1 indexers in an AppendableTable") + raise ValueError( + "currently only support ndim-1 indexers in an AppendableTable") # create according to the new data self.non_index_axes = [] @@ -2634,8 +2805,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, self.non_index_axes.append((i, append_axis)) # set axis positions (based on the axes) - self.index_axes = [index_axes_map[a].set_pos(j).update_info(self.info) for j, - a in enumerate(axes)] + self.index_axes = [index_axes_map[a].set_pos( + j).update_info(self.info) for j, + a in enumerate(axes)] j = len(self.index_axes) # check for column conflicts @@ -2652,17 +2824,18 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, blocks = block_obj._data.blocks if len(self.non_index_axes): axis, axis_labels = self.non_index_axes[0] - data_columns = self.validate_data_columns(data_columns, min_itemsize) + data_columns = self.validate_data_columns( + data_columns, min_itemsize) if len(data_columns): blocks = block_obj.reindex_axis(Index(axis_labels) - Index( - data_columns), axis=axis, copy=False)._data.blocks + data_columns), axis=axis, copy=False)._data.blocks for c in data_columns: blocks.extend(block_obj.reindex_axis( - [c], axis=axis, copy=False)._data.blocks) + [c], axis=axis, copy=False)._data.blocks) # reorder the blocks in the same order as the existing_table if we can if existing_table is not None: - by_items = dict([ (tuple(b.items.tolist()),b) for b in blocks ]) + by_items = dict([(tuple(b.items.tolist()), b) for b in blocks]) new_blocks = [] for ea in existing_table.values_axes: items = tuple(ea.values) @@ -2670,7 +2843,9 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, b = by_items.pop(items) new_blocks.append(b) except: - raise ValueError("cannot match existing table structure for [%s] on appending data" % items) + raise ValueError( + "cannot match existing table structure for [%s] on appending data" % + items) blocks = new_blocks # add my values @@ -2694,7 +2869,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, existing_col = existing_table.values_axes[i] except: raise ValueError("Incompatible appended table [%s] with existing table [%s]" % - (blocks,existing_table.values_axes)) + (blocks, existing_table.values_axes)) else: existing_col = None @@ -2711,10 +2886,13 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, col.set_pos(j) self.values_axes.append(col) - except 
(NotImplementedError, ValueError, TypeError), e: + except (NotImplementedError, ValueError, TypeError) as e: raise e - except (Exception), detail: - raise Exception("cannot find the correct atom type -> [dtype->%s,items->%s] %s" % (b.dtype.name, b.items, str(detail))) + except Exception as detail: + raise TypeError("cannot find the correct atom type -> " + "[dtype->%s,items->%s] %s" % (b.dtype.name, + b.items, + str(detail))) j += 1 # validate our min_itemsize @@ -2745,29 +2923,32 @@ def process_filter(field, filt): # see if the field is the name of an axis if field == axis_name: - takers = op(axis_values,filt) - return obj.ix._getitem_axis(takers,axis=axis_number) + takers = op(axis_values, filt) + return obj.ix._getitem_axis(takers, axis=axis_number) # this might be the name of a file IN an axis elif field in axis_values: # we need to filter on this dimension - values = _ensure_index(getattr(obj,field).values) - filt = _ensure_index(filt) + values = _ensure_index(getattr(obj, field).values) + filt = _ensure_index(filt) # hack until we support reversed dim flags - if isinstance(obj,DataFrame): - axis_number = 1-axis_number - takers = op(values,filt) - return obj.ix._getitem_axis(takers,axis=axis_number) + if isinstance(obj, DataFrame): + axis_number = 1 - axis_number + takers = op(values, filt) + return obj.ix._getitem_axis(takers, axis=axis_number) - raise ValueError("cannot find the field [%s] for filtering!" % field) + raise ValueError( + "cannot find the field [%s] for filtering!" % + field) obj = process_filter(field, filt) return obj - def create_description(self, complib=None, complevel=None, fletcher32=False, expectedrows=None): + def create_description( + self, complib=None, complevel=None, fletcher32=False, expectedrows=None): """ create the description of the table from the axes & values """ # expected rows estimate @@ -2801,10 +2982,15 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): return False # create the selection - self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs) + self.selection = Selection( + self, + where=where, + start=start, + stop=stop, + **kwargs) return Coordinates(self.selection.select_coords(), group=self.group, where=where) - def read_column(self, column, where = None, **kwargs): + def read_column(self, column, where=None, **kwargs): """ return a single column from the table, generally only indexables are interesting """ # validate the version @@ -2815,14 +3001,17 @@ def read_column(self, column, where = None, **kwargs): return False if where is not None: - raise Exception("read_column does not currently accept a where clause") + raise TypeError("read_column does not currently accept a where " + "clause") # find the axes for a in self.axes: if column == a.name: if not a.is_data_indexable: - raise ValueError("column [%s] can not be extracted individually; it is not data indexable" % column) + raise ValueError( + "column [%s] can not be extracted individually; it is not data indexable" % + column) # column must be an indexable or a data column c = getattr(self.table.cols, column) @@ -2831,7 +3020,9 @@ def read_column(self, column, where = None, **kwargs): raise KeyError("column [%s] not found in the table" % column) + class WORMTable(Table): + """ a write-once read-many table: this format DOES NOT ALLOW appending to a table. 
writing is a one-time operation the data are stored in a format that allows for searching the data on disk @@ -2851,6 +3042,7 @@ def write(self, **kwargs): class LegacyTable(Table): + """ an appendable table: allow append/query/delete operations to a (possibily) already existing appendable table this table ALLOWS append (but doesn't require them), and stores the data in a format @@ -2865,7 +3057,7 @@ class LegacyTable(Table): ndim = 3 def write(self, **kwargs): - raise Exception("write operations are not allowed on legacy tables!") + raise TypeError("write operations are not allowed on legacy tables!") def read(self, where=None, columns=None, **kwargs): """ we have n indexable columns, with an arbitrary number of data axes """ @@ -2950,6 +3142,7 @@ def read(self, where=None, columns=None, **kwargs): class LegacyFrameTable(LegacyTable): + """ support the legacy frame table """ pandas_kind = u'frame_table' table_type = u'legacy_frame' @@ -2960,12 +3153,14 @@ def read(self, *args, **kwargs): class LegacyPanelTable(LegacyTable): + """ support the legacy panel table """ table_type = u'legacy_panel' obj_type = Panel class AppendableTable(LegacyTable): + """ suppor the new appendable table formats """ _indexables = None table_type = u'appendable' @@ -3033,7 +3228,8 @@ def write_data(self, chunksize): values = [a.take_data() for a in self.values_axes] # transpose the values so first dimension is last - values = [ v.transpose(np.roll(np.arange(v.ndim),v.ndim-1)) for v in values ] + values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) + for v in values] # write the chunks if chunksize is None: @@ -3066,15 +3262,15 @@ def write_data_chunk(self, indexes, mask, search, values): args = list(indexes) args.extend([self.dtype, mask, search, values]) rows = func(*args) - except (Exception), detail: - raise Exception("cannot create row-data -> %s" % str(detail)) + except Exception as detail: + raise Exception("cannot create row-data -> %s" % detail) try: if len(rows): self.table.append(rows) self.table.flush() - except (Exception), detail: - raise Exception("tables cannot write this data -> %s" % str(detail)) + except Exception as detail: + raise TypeError("tables cannot write this data -> %s" % detail) def delete(self, where=None, **kwargs): @@ -3130,6 +3326,7 @@ def delete(self, where=None, **kwargs): class AppendableFrameTable(AppendableTable): + """ suppor the new appendable table formats """ pandas_kind = u'frame_table' table_type = u'appendable_frame' @@ -3159,10 +3356,10 @@ def read(self, where=None, columns=None, **kwargs): if self.is_transposed: values = a.cvalues index_ = cols - cols_ = Index(index,name=getattr(index,'name',None)) + cols_ = Index(index, name=getattr(index, 'name', None)) else: values = a.cvalues.T - index_ = Index(index,name=getattr(index,'name',None)) + index_ = Index(index, name=getattr(index, 'name', None)) cols_ = cols # if we have a DataIndexableCol, its shape will only be 1 dim @@ -3185,6 +3382,7 @@ def read(self, where=None, columns=None, **kwargs): class GenericTable(AppendableFrameTable): + """ a table that read/writes the generic pytables table format """ pandas_kind = u'frame_table' table_type = u'generic_table' @@ -3197,17 +3395,19 @@ def pandas_type(self): @property def storable(self): - return getattr(self.group,'table',None) or self.group + return getattr(self.group, 'table', None) or self.group def get_attrs(self): """ retrieve our attributes """ - self.non_index_axes = [] - self.nan_rep = None - self.levels = [] + self.non_index_axes = [] + self.nan_rep = 
@@ -3066,15 +3262,15 @@ def write_data_chunk(self, indexes, mask, search, values):
             args = list(indexes)
             args.extend([self.dtype, mask, search, values])
             rows = func(*args)
-        except (Exception), detail:
-            raise Exception("cannot create row-data -> %s" % str(detail))
+        except Exception as detail:
+            raise Exception("cannot create row-data -> %s" % detail)

         try:
             if len(rows):
                 self.table.append(rows)
                 self.table.flush()
-        except (Exception), detail:
-            raise Exception("tables cannot write this data -> %s" % str(detail))
+        except Exception as detail:
+            raise TypeError("tables cannot write this data -> %s" % detail)

     def delete(self, where=None, **kwargs):
@@ -3130,6 +3326,7 @@ def delete(self, where=None, **kwargs):


 class AppendableFrameTable(AppendableTable):
+
     """ support the new appendable table formats """
     pandas_kind = u'frame_table'
     table_type = u'appendable_frame'
@@ -3159,10 +3356,10 @@ def read(self, where=None, columns=None, **kwargs):
             if self.is_transposed:
                 values = a.cvalues
                 index_ = cols
-                cols_ = Index(index,name=getattr(index,'name',None))
+                cols_ = Index(index, name=getattr(index, 'name', None))
             else:
                 values = a.cvalues.T
-                index_ = Index(index,name=getattr(index,'name',None))
+                index_ = Index(index, name=getattr(index, 'name', None))
                 cols_ = cols

             # if we have a DataIndexableCol, its shape will only be 1 dim
@@ -3185,6 +3382,7 @@ def read(self, where=None, columns=None, **kwargs):


 class GenericTable(AppendableFrameTable):
+
     """ a table that reads/writes the generic pytables table format """
     pandas_kind = u'frame_table'
     table_type = u'generic_table'
@@ -3197,17 +3395,19 @@ def pandas_type(self):

     @property
     def storable(self):
-        return getattr(self.group,'table',None) or self.group
+        return getattr(self.group, 'table', None) or self.group

     def get_attrs(self):
         """ retrieve our attributes """
-        self.non_index_axes   = []
-        self.nan_rep          = None
-        self.levels           = []
+        self.non_index_axes = []
+        self.nan_rep = None
+        self.levels = []
         t = self.table
-        self.index_axes = [ a.infer(t) for a in self.indexables if a.is_an_indexable ]
-        self.values_axes = [ a.infer(t) for a in self.indexables if not a.is_an_indexable ]
-        self.data_columns = [ a.name for a in self.values_axes ]
+        self.index_axes = [a.infer(t)
+                           for a in self.indexables if a.is_an_indexable]
+        self.values_axes = [a.infer(t)
+                            for a in self.indexables if not a.is_an_indexable]
+        self.data_columns = [a.name for a in self.values_axes]

     @property
     def indexables(self):
@@ -3217,11 +3417,15 @@ def indexables(self):
         d = self.description

         # the index columns is just a simple index
-        self._indexables = [ GenericIndexCol(name='index',axis=0) ]
+        self._indexables = [GenericIndexCol(name='index', axis=0)]

         for i, n in enumerate(d._v_names):
-            dc = GenericDataIndexableCol(name = n, pos=i, values = [ n ], version = self.version)
+            dc = GenericDataIndexableCol(
+                name=n,
+                pos=i,
+                values=[n],
+                version=self.version)
             self._indexables.append(dc)

         return self._indexables
@@ -3229,7 +3433,9 @@ def indexables(self):
     def write(self, **kwargs):
         raise NotImplementedError("cannot write on a generic table")

+
 class AppendableMultiFrameTable(AppendableFrameTable):
+
     """ a frame with a multi-index """
     table_type = u'appendable_multiframe'
     obj_type = DataFrame
@@ -3255,12 +3461,17 @@ def read(self, columns=None, **kwargs):
         for n in self.levels:
             if n not in columns:
                 columns.insert(0, n)
-        df = super(AppendableMultiFrameTable, self).read(columns=columns, **kwargs)
+        df = super(
+            AppendableMultiFrameTable,
+            self).read(
+            columns=columns,
+            **kwargs)
         df.set_index(self.levels, inplace=True)
         return df


 class AppendablePanelTable(AppendableTable):
+
     """ support the new appendable table formats """
     table_type = u'appendable_panel'
     ndim = 3
@@ -3278,26 +3489,28 @@ def is_transposed(self):

 class AppendableNDimTable(AppendablePanelTable):
+
     """ support the new appendable table formats """
     table_type = u'appendable_ndim'
     ndim = 4
     obj_type = Panel4D

+
 def _convert_index(index, encoding=None):
-    index_name = getattr(index,'name',None)
+    index_name = getattr(index, 'name', None)

     if isinstance(index, DatetimeIndex):
         converted = index.asi8
         return IndexCol(converted, 'datetime64', _tables().Int64Col(),
-                        freq=getattr(index,'freq',None), tz=getattr(index,'tz',None),
-                        index_name=index_name)
+                        freq=getattr(index, 'freq', None),
+                        tz=getattr(index, 'tz', None), index_name=index_name)
     elif isinstance(index, (Int64Index, PeriodIndex)):
         atom = _tables().Int64Col()
-        return IndexCol(index.values, 'integer', atom, freq=getattr(index,'freq',None),
-                        index_name=index_name)
+        return IndexCol(index.values, 'integer', atom, freq=getattr(index,
+                        'freq', None), index_name=index_name)

     if isinstance(index, MultiIndex):
-        raise Exception('MultiIndex not supported here!')
+        raise TypeError('MultiIndex not supported here!')

     inferred_type = lib.infer_dtype(index)
@@ -3306,8 +3519,8 @@ def _convert_index(index, encoding=None):
     if inferred_type == 'datetime64':
         converted = values.view('i8')
         return IndexCol(converted, 'datetime64', _tables().Int64Col(),
-                        freq=getattr(index,'freq',None), tz=getattr(index,'tz',None),
-                        index_name=index_name)
+                        freq=getattr(index, 'freq', None),
+                        tz=getattr(index, 'tz', None), index_name=index_name)
     elif inferred_type == 'datetime':
         converted = np.array([(time.mktime(v.timetuple()) +
                                v.microsecond / 1E6) for v in values],
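`_convert_index` above serializes a DatetimeIndex through its `asi8` view, i.e. int64 nanoseconds since the epoch, before building the PyTables column. For example (illustrative only):

    import pandas as pd

    idx = pd.DatetimeIndex(['2013-01-01', '2013-01-02'])
    converted = idx.asi8  # what _convert_index stores for a datetime64 index
    assert converted.dtype == 'int64'
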
@@ -3325,8 +3538,8 @@ def _convert_index(index, encoding=None):
         converted = _convert_string_array(values, encoding)
         itemsize = converted.dtype.itemsize
-        return IndexCol(converted, 'string', _tables().StringCol(itemsize), itemsize=itemsize,
-                        index_name=index_name)
+        return IndexCol(converted, 'string', _tables().StringCol(itemsize),
+                        itemsize=itemsize, index_name=index_name)
     elif inferred_type == 'unicode':
         atom = _tables().ObjectAtom()
         return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
@@ -3345,6 +3558,7 @@ def _convert_index(index, encoding=None):
         return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
                         index_name=index_name)

+
 def _unconvert_index(data, kind, encoding=None):
     kind = _ensure_decoded(kind)
     if kind == u'datetime64':
@@ -3364,6 +3578,7 @@ def _unconvert_index(data, kind, encoding=None):
         raise ValueError('unrecognized index type %s' % kind)
     return index

+
 def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):
     kind = _ensure_decoded(kind)
     if kind == u'datetime':
@@ -3376,6 +3591,7 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None):
         raise ValueError('unrecognized index type %s' % kind)
     return index

+
 def _convert_string_array(data, encoding, itemsize=None):

     # encode if needed
@@ -3387,19 +3603,20 @@ def _convert_string_array(data, encoding, itemsize=None):
     if itemsize is None:
         itemsize = lib.max_len_string_array(com._ensure_object(data.ravel()))

-    data = np.array(data,dtype="S%d" % itemsize)
+    data = np.array(data, dtype="S%d" % itemsize)
     return data

+
 def _unconvert_string_array(data, nan_rep=None, encoding=None):
     """ deserialize a string array, possibly decoding """
     shape = data.shape
-    data = np.array(data.ravel(),dtype=object)
+    data = np.array(data.ravel(), dtype=object)

     # guard against a None encoding in PY3 (because of a legacy
     # where the passed encoding is actually None)
     encoding = _ensure_encoding(encoding)
     if encoding is not None and len(data):
-        f = np.vectorize(lambda x: x.decode(encoding),otypes=[np.object])
+        f = np.vectorize(lambda x: x.decode(encoding), otypes=[np.object])
         data = f(data)

     if nan_rep is None:
@@ -3408,6 +3625,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
         data = lib.string_array_replace_from_nan_rep(data, nan_rep)
     return data.reshape(shape)

+
 def _maybe_convert(values, val_kind, encoding):
     if _need_convert(val_kind):
         conv = _get_converter(val_kind, encoding)
@@ -3415,6 +3633,7 @@ def _maybe_convert(values, val_kind, encoding):
         values = conv(values)
     return values

+
 def _get_converter(kind, encoding):
     kind = _ensure_decoded(kind)
     if kind == 'datetime64':
@@ -3422,17 +3641,20 @@ def _get_converter(kind, encoding):
     elif kind == 'datetime':
         return lib.convert_timestamps
     elif kind == 'string':
-        return lambda x: _unconvert_string_array(x,encoding=encoding)
+        return lambda x: _unconvert_string_array(x, encoding=encoding)
     else:  # pragma: no cover
         raise ValueError('invalid kind %s' % kind)

+
 def _need_convert(kind):
     kind = _ensure_decoded(kind)
     if kind in (u'datetime', u'datetime64', u'string'):
         return True
     return False

+
 class Term(StringMixin):
+
     """create a term object that holds a field, op, and value

     Parameters
@@ -3460,10 +3682,13 @@ class Term(StringMixin):
     """

     _ops = ['<=', '<', '>=', '>', '!=', '==', '=']
-    _search = re.compile("^\s*(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)\s*$" % '|'.join(_ops))
+    _search = re.compile(
+        "^\s*(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)\s*$" %
+        '|'.join(_ops))
     _max_selectors = 31

-    def __init__(self, field, op=None, value=None, queryables=None, encoding=None):
+    def __init__(self, field, op=None,
+                 value=None, queryables=None, encoding=None):
         self.field = None
         self.op = None
         self.value = None
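The `_search` pattern splits a single query string into `field`, `op` and `value` groups; the longer operators are listed before their prefixes so `<=` wins over `<`. A small demonstration (illustrative snippet, not part of the patch):

    import re

    _ops = ['<=', '<', '>=', '>', '!=', '==', '=']
    _search = re.compile(
        "^\s*(?P<field>\w+)\s*(?P<op>%s)\s*(?P<value>.+)\s*$" % '|'.join(_ops))

    m = _search.match('index>20121114')
    # m.group('field') == 'index', m.group('op') == '>',
    # m.group('value') == '20121114'
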
@@ -3528,8 +3753,10 @@ def __init__(self, field, op=None, value=None, queryables=None, encoding=None):

         # we have valid conditions
         if self.op in ['>', '>=', '<', '<=']:
-            if hasattr(self.value, '__iter__') and len(self.value) > 1 and not isinstance(self.value,basestring):
-                raise ValueError("an inequality condition cannot have multiple values [%s]" % str(self))
+            if hasattr(self.value, '__iter__') and len(self.value) > 1 and not isinstance(self.value, basestring):
+                raise ValueError(
+                    "an inequality condition cannot have multiple values [%s]" %
+                    str(self))

         if not is_list_like(self.value):
             self.value = [self.value]
@@ -3571,7 +3798,7 @@ def eval(self):
         if self.is_in_table:
             values = [self.convert_value(v) for v in self.value]
         else:
-            values = [TermValue(v,v,self.kind) for v in self.value]
+            values = [TermValue(v, v, self.kind) for v in self.value]

         # equality conditions
         if self.op in ['==', '!=']:
@@ -3582,21 +3809,26 @@ def eval(self):
             else:
                 filter_op = lambda axis, vals: axis.isin(vals)

-
             if self.is_in_table:

                 # too many values to create the expression?
                 if len(values) <= self._max_selectors:
-                    vs = [ self.generate(v) for v in values ]
+                    vs = [self.generate(v) for v in values]
                     self.condition = "(%s)" % ' | '.join(vs)

                 # use a filter after reading
                 else:
-                    self.filter = (self.field, filter_op, Index([v.value for v in values]))
+                    self.filter = (
+                        self.field,
+                        filter_op,
+                        Index([v.value for v in values]))

             else:

-                self.filter = (self.field, filter_op, Index([v.value for v in values]))
+                self.filter = (
+                    self.field,
+                    filter_op,
+                    Index([v.value for v in values]))

         else:
@@ -3606,7 +3838,9 @@ def eval(self):

         else:

-            raise TypeError("passing a filterable condition to a non-table indexer [%s]" % str(self))
+            raise TypeError(
+                "passing a filterable condition to a non-table indexer [%s]" %
+                str(self))

     def convert_value(self, v):
         """ convert the expression that is in the term to something that is accepted by pytables """
@@ -3618,34 +3852,37 @@ def stringify(value):
             return value

         kind = _ensure_decoded(self.kind)
-        if kind == u'datetime64' or kind == u'datetime' :
+        if kind == u'datetime64' or kind == u'datetime':
             v = lib.Timestamp(v)
             if v.tz is not None:
                 v = v.tz_convert('UTC')
-            return TermValue(v,v.value,kind)
+            return TermValue(v, v.value, kind)
         elif isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u'date':
             v = time.mktime(v.timetuple())
-            return TermValue(v,Timestamp(v),kind)
+            return TermValue(v, Timestamp(v), kind)
         elif kind == u'integer':
             v = int(float(v))
-            return TermValue(v,v,kind)
+            return TermValue(v, v, kind)
         elif kind == u'float':
             v = float(v)
-            return TermValue(v,v,kind)
+            return TermValue(v, v, kind)
         elif kind == u'bool':
             if isinstance(v, basestring):
-                v = not v.strip().lower() in [u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u'']
+                v = not v.strip().lower() in [
+                    u'false', u'f', u'no', u'n', u'none', u'0', u'[]', u'{}', u'']
             else:
                 v = bool(v)
-            return TermValue(v,v,kind)
+            return TermValue(v, v, kind)
         elif not isinstance(v, basestring):
             v = stringify(v)
-            return TermValue(v,stringify(v),u'string')
+            return TermValue(v, stringify(v), u'string')

         # string quoting
-        return TermValue(v,stringify(v),u'string')
+        return TermValue(v, stringify(v), u'string')

+
 class TermValue(object):
+
     """ hold a term value that we use to construct a condition/filter """

     def __init__(self, value, converted, kind):
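For context, `convert_value` coerces the right-hand side of a term to the column's kind before a condition or filter is generated. A typical construction, as a hedged sketch (the single-string form is parsed by `Term._search` shown earlier):

    from pandas.io.pytables import Term

    # field/op/value form...
    t = Term('index', '>', '20121114')
    # ...or the equivalent single-string form
    t2 = Term('index>20121114')
    # either is then handed to a table query, e.g. store.select('df', [t])
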
@@ -3662,7 +3899,9 @@ def tostring(self, encoding):
             return '"%s"' % self.converted
         return self.converted

+
 class Coordinates(object):
+
     """ holds a returned coordinates list, useful to select the
     same rows from different tables

     coordinates : holds the array of coordinates
@@ -3682,7 +3921,9 @@ def __getitem__(self, key):
         """ return a new coordinates object, sliced by the key """
         return Coordinates(self.values[key], self.group, self.where)

+
 class Selection(object):
+
     """
     Carries out a selection operation on a tables.Table object.

@@ -3693,6 +3934,7 @@ class Selection(object):
     start, stop: indices to start and/or stop selection

     """
+
     def __init__(self, table, where=None, start=None, stop=None, **kwargs):
         self.table = table
         self.where = where
@@ -3710,9 +3952,10 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs):

         # create the numexpr & the filter
         if self.terms:
-            terms = [ t for t in self.terms if t.condition is not None ]
+            terms = [t for t in self.terms if t.condition is not None]
             if len(terms):
-                self.condition = "(%s)" % ' & '.join([ t.condition for t in terms ])
+                self.condition = "(%s)" % ' & '.join(
+                    [t.condition for t in terms])
             self.filter = []
             for t in self.terms:
                 if t.filter is not None:
@@ -3757,13 +4000,13 @@ def select_coords(self):
         return self.table.table.getWhereList(self.condition, start=self.start,
                                              stop=self.stop, sort=True)

-### utilities ###
+# utilities ###

-def timeit(key,df,fn=None,remove=True,**kwargs):
+def timeit(key, df, fn=None, remove=True, **kwargs):
     if fn is None:
         fn = 'timeit.h5'
-    store = HDFStore(fn,mode='w')
-    store.append(key,df,**kwargs)
+    store = HDFStore(fn, mode='w')
+    store.append(key, df, **kwargs)
     store.close()

     if remove:
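The small `timeit` helper closing out pytables.py is a benchmarking convenience: it appends a frame to a scratch HDF5 file and optionally removes it afterwards. A usage sketch (illustrative; in practice it would typically be wrapped in IPython's %timeit):

    import numpy as np
    from pandas import DataFrame
    from pandas.io.pytables import timeit

    df = DataFrame(np.random.randn(10000, 4), columns=list('ABCD'))
    timeit('df', df, fn='timeit.h5', remove=True)  # append, close, clean up
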
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 00d8089ad2ee7..6737408081f3d 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -1,8 +1,9 @@
 import nose
 import unittest
-import os
 import sys
+import os
 import warnings
+from contextlib import contextmanager
 import datetime

 import numpy as np
@@ -19,7 +20,6 @@
 from pandas import concat, Timestamp
 from pandas.util import py3compat

-from numpy.testing.decorators import slow

 try:
     import tables
@@ -36,12 +36,12 @@
 # contextmanager to ensure the file cleanup
 def safe_remove(path):
     if path is not None:
-        import os
         try:
             os.remove(path)
         except:
             pass

+
 def safe_close(store):
     try:
         if store is not None:
@@ -49,7 +49,6 @@ def safe_close(store):
     except:
         pass

-from contextlib import contextmanager

 @contextmanager
 def ensure_clean(path, mode='a', complevel=None, complib=None,
@@ -620,7 +619,6 @@ def test_append_with_different_block_ordering(self):

             store.append('df',df)

-
     def test_ndim_indexables(self):
         """ test using ndim tables in new ways"""
@@ -1011,6 +1009,7 @@ def test_big_table_frame(self):
             store.append('df', df)
             rows = store.root.df.table.nrows
             recons = store.select('df')
+            assert isinstance(recons, DataFrame)

         print ("\nbig_table frame [%s] -> %5.2f" % (rows, time.time() - x))
@@ -1064,7 +1063,7 @@ def test_big_put_frame(self):
         with ensure_clean(self.path, mode='w') as store:
             start_time = time.time()

-            store = HDFStore(fn, mode='w')
+            store = HDFStore(self.path, mode='w')
             store.put('df', df)

             print (df.get_dtype_counts())
@@ -1092,6 +1091,7 @@ def test_big_table_panel(self):
             store.append('wp', wp)
             rows = store.root.wp.table.nrows
             recons = store.select('wp')
+            assert isinstance(recons, Panel)

         print ("\nbig_table panel [%s] -> %5.2f" % (rows, time.time() - x))
@@ -1254,7 +1254,6 @@ def test_table_values_dtypes_roundtrip(self):
             expected.sort()
             tm.assert_series_equal(result,expected)

-
     def test_table_mixed_dtypes(self):

         # frame
@@ -2352,7 +2351,6 @@ def test_string_select(self):
             expected = df[df.int!=2]
             assert_frame_equal(result,expected)

-
     def test_read_column(self):

         df = tm.makeTimeDataFrame()
@@ -2580,7 +2578,6 @@ def _check_double_roundtrip(self, obj, comparator, compression=False,
             again = store['obj']
             comparator(again, obj, **kwargs)

-
     def _check_roundtrip_table(self, obj, comparator, compression=False):
         options = {}
         if compression:
@@ -2597,6 +2594,7 @@ def test_pytables_native_read(self):
         try:
             store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native.h5'), 'r')
             d2 = store['detector/readout']
+            assert isinstance(d2, DataFrame)
         finally:
             safe_close(store)
@@ -2604,6 +2602,7 @@ def test_pytables_native_read(self):
             store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native2.h5'), 'r')
             str(store)
             d1 = store['detector']
+            assert isinstance(d1, DataFrame)
         finally:
             safe_close(store)
@@ -2653,11 +2652,18 @@ def test_legacy_0_10_read(self):
     def test_legacy_0_11_read(self):
         # legacy from 0.11
         try:
-            store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table_0.11.h5'), 'r')
+            path = os.path.join('legacy_hdf', 'legacy_table_0.11.h5')
+            store = HDFStore(tm.get_data_path(path), 'r')
             str(store)
+            assert 'df' in store
+            assert 'df1' in store
+            assert 'mi' in store
             df = store.select('df')
             df1 = store.select('df1')
             mi = store.select('mi')
+            assert isinstance(df, DataFrame)
+            assert isinstance(df1, DataFrame)
+            assert isinstance(mi, DataFrame)
         finally:
             safe_close(store)
@@ -2665,10 +2671,9 @@ def test_copy(self):

         def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs):
             try:
-                import os
-
                 if f is None:
-                    f = tm.get_data_path('legacy_hdf/legacy_0.10.h5')
+                    f = tm.get_data_path(os.path.join('legacy_hdf',
+                                                      'legacy_0.10.h5'))

                 store = HDFStore(f, 'r')
@@ -2738,6 +2743,7 @@ def test_legacy_table_write(self):
             df = DataFrame(dict(A = 'foo', B = 'bar'),index=range(10))
             store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 })
+
             store.append('wp', wp)

         store.close()
@@ -2824,6 +2830,7 @@ def _test_sort(obj):
     else:
         raise ValueError('type not supported here')

+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
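Many of the rewritten tests lean on the file-local `ensure_clean` contextmanager (moved to module scope above), which opens a store and guarantees the backing file is removed. The pattern, sketched under the assumption it runs inside test_pytables.py where `ensure_clean` is defined:

    import pandas.util.testing as tm

    df = tm.makeTimeDataFrame()
    # ensure_clean yields an open HDFStore and removes the file on exit
    with ensure_clean('__tmp_table__.h5', mode='w') as store:
        store.append('df', df)
        result = store.select('df')
    tm.assert_frame_equal(result, df)
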
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index db01545fb3c9d..974e301c5d303 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -1,20 +1,19 @@
 from datetime import datetime
-import sys
 import re

 import nose
 import unittest

-from pandas import Series, DataFrame, date_range, DatetimeIndex
+from pandas import Series, DataFrame, date_range, DatetimeIndex, Panel
 from pandas.core.common import notnull, isnull
 import pandas.core.common as com
 import pandas.util.testing as tm
 import pandas.core.config as cf

 import numpy as np
+from numpy.random import randn

 from pandas.tslib import iNaT
-from pandas.util import py3compat

 _multiprocess_can_split_ = True
@@ -33,6 +32,7 @@ def __getitem__(self):

     assert(not is_seq(A()))

+
 def test_notnull():
     assert notnull(1.)
     assert not notnull(None)
@@ -98,6 +98,61 @@ def test_isnull_lists():
     assert(not result.any())


+def test_is_string():
+    class MyString(str):
+        pass
+
+    class MyUnicode(unicode):
+        pass
+
+    strings = ('s', np.str_('a'), np.unicode_('unicode_string'),
+               MyString('a _string blah'), u'asdf', MyUnicode(u'asdf'))
+    not_strings = [], 1, {}, set(), np.array(['1']), np.array([u'1'])
+
+    for string in strings:
+        assert com.is_string(string), '{0} is not a string'.format(string)
+
+    for not_string in not_strings:
+        assert not com.is_string(not_string), ('{0} is a '
+                                               'string'.format(not_string))
+
+
+def test_is_frame():
+    df = DataFrame(randn(2, 1))
+    assert com.is_frame(df)
+    assert not com.is_frame('s')
+
+
+def test_is_series():
+    s = Series(randn(2))
+    assert com.is_series(s)
+    assert not com.is_series(s.values)
+
+
+def test_is_panel():
+    p = Panel(randn(2, 3, 4))
+    assert com.is_panel(p)
+    assert not com.is_panel(2)
+
+
+def test_is_pd_obj():
+    df = DataFrame(randn(2, 1))
+    s = Series(randn(2))
+    p = Panel(randn(2, 3, 4))
+    for obj in (df, s, p):
+        assert com.is_pd_obj(obj)
+        assert not com.is_pd_obj(obj.values)
+
+
+def test_is_ndframe():
+    df = DataFrame(randn(2, 1))
+    p = Panel(randn(2, 3, 4))
+    # should add series after @jreback's ndframe to series pr
+    for obj in (df, p):
+        assert com.is_ndframe(obj)
+        assert not com.is_ndframe(obj.values)
+
+
 def test_isnull_datetime():
     assert (not isnull(datetime.now()))
     assert notnull(datetime.now())
@@ -112,11 +167,13 @@ def test_isnull_datetime():
     assert(mask[0])
     assert(not mask[1:].any())

+
 def test_datetimeindex_from_empty_datetime64_array():
     for unit in [ 'ms', 'us', 'ns' ]:
         idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit))
         assert(len(idx) == 0)

+
 def test_any_none():
     assert(com._any_none(1, 2, 3, None))
     assert(not com._any_none(1, 2, 3, 4))
@@ -266,6 +323,7 @@ def test_ensure_int32():
     result = com._ensure_int32(values)
     assert(result.dtype == np.int32)

+
 def test_ensure_platform_int():

     # verify that when we create certain types of indices
@@ -748,6 +806,7 @@ def test_2d_datetime64(self):
         expected[:, [2, 4]] = datetime(2007, 1, 1)
         tm.assert_almost_equal(result, expected)

+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
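These tests exercise new `com.is_*` predicates, but the pandas.core.common side is outside this excerpt. A plausible minimal sketch of what they check, with names and logic inferred purely from the tests above (hypothetical, not the actual implementations):

    import pandas as pd

    # Inferred from the tests; the real pandas.core.common versions
    # are not shown in this patch excerpt.
    def is_series(obj):
        return isinstance(obj, pd.Series)

    def is_frame(obj):
        return isinstance(obj, pd.DataFrame)

    def is_panel(obj):
        return isinstance(obj, pd.Panel)

    def is_pd_obj(obj):
        # Series, DataFrame and Panel all qualify in test_is_pd_obj
        return isinstance(obj, (pd.Series, pd.DataFrame, pd.Panel))
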
diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py
deleted file mode 100644
index ba0a9926dfa78..0000000000000
--- a/pandas/tests/test_expressions.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# pylint: disable-msg=W0612,E1101
-
-import unittest
-import nose
-
-import operator
-from numpy import random, nan
-from numpy.random import randn
-import numpy as np
-from numpy.testing import assert_array_equal
-
-import pandas as pan
-from pandas.core.api import DataFrame, Series, notnull, isnull
-from pandas.core import expressions as expr
-
-from pandas.util.testing import (assert_almost_equal,
-                                 assert_series_equal,
-                                 assert_frame_equal)
-from pandas.util import py3compat
-
-import pandas.util.testing as tm
-import pandas.lib as lib
-
-from numpy.testing.decorators import slow
-
-if not expr._USE_NUMEXPR:
-    raise nose.SkipTest
-
-_frame = DataFrame(np.random.randn(10000, 4), columns = list('ABCD'), dtype='float64')
-_frame2 = DataFrame(np.random.randn(100, 4), columns = list('ABCD'), dtype='float64')
-_mixed = DataFrame({ 'A' : _frame['A'].copy(), 'B' : _frame['B'].astype('float32'), 'C' : _frame['C'].astype('int64'), 'D' : _frame['D'].astype('int32') })
-_mixed2 = DataFrame({ 'A' : _frame2['A'].copy(), 'B' : _frame2['B'].astype('float32'), 'C' : _frame2['C'].astype('int64'), 'D' : _frame2['D'].astype('int32') })
-_integer = DataFrame(np.random.randint(1, 100, size=(10001, 4)), columns = list('ABCD'), dtype='int64')
-
-class TestExpressions(unittest.TestCase):
-
-    _multiprocess_can_split_ = False
-
-    def setUp(self):
-
-        self.frame = _frame.copy()
-        self.frame2 = _frame2.copy()
-        self.mixed = _mixed.copy()
-        self.mixed2 = _mixed2.copy()
-        self.integer = _integer.copy()
-        self._MIN_ELEMENTS = expr._MIN_ELEMENTS
-
-    def tearDown(self):
-        expr._MIN_ELEMENTS = self._MIN_ELEMENTS
-
-    #TODO: add test for Panel
-    #TODO: add tests for binary operations
-    @nose.tools.nottest
-    def run_arithmetic_test(self, df, assert_func, check_dtype=False):
-        expr._MIN_ELEMENTS = 0
-        operations = ['add', 'sub', 'mul','mod','truediv','floordiv','pow']
-        if not py3compat.PY3:
-            operations.append('div')
-        for arith in operations:
-            op = getattr(operator, arith)
-            expr.set_use_numexpr(False)
-            expected = op(df, df)
-            expr.set_use_numexpr(True)
-            result = op(df, df)
-            try:
-                if check_dtype:
-                    if arith == 'div':
-                        assert expected.dtype.kind == df.dtype.kind
-                    if arith == 'truediv':
-                        assert expected.dtype.kind == 'f'
-                assert_func(expected, result)
-            except Exception:
-                print("Failed test with operator %r" % op.__name__)
-                raise
-
-    def test_integer_arithmetic(self):
-        self.run_arithmetic_test(self.integer, assert_frame_equal)
-        self.run_arithmetic_test(self.integer.icol(0), assert_series_equal,
-                                 check_dtype=True)
-
-    def test_float_arithemtic(self):
-        self.run_arithmetic_test(self.frame, assert_frame_equal)
-        self.run_arithmetic_test(self.frame.icol(0), assert_series_equal,
-                                 check_dtype=True)
-
-    def test_mixed_arithmetic(self):
-        self.run_arithmetic_test(self.mixed, assert_frame_equal)
-        for col in self.mixed.columns:
-            self.run_arithmetic_test(self.mixed[col], assert_series_equal)
-
-    def test_integer_with_zeros(self):
-        self.integer *= np.random.randint(0, 2, size=np.shape(self.integer))
-        self.run_arithmetic_test(self.integer, assert_frame_equal)
-        self.run_arithmetic_test(self.integer.icol(0), assert_series_equal)
-
-    def test_invalid(self):
-
-        # no op
-        result = expr._can_use_numexpr(operator.add, None, self.frame, self.frame, 'evaluate')
-        self.assert_(result == False)
-
-        # mixed
-        result = expr._can_use_numexpr(operator.add, '+', self.mixed, self.frame, 'evaluate')
-        self.assert_(result == False)
-
-        # min elements
-        result = expr._can_use_numexpr(operator.add, '+', self.frame2, self.frame2, 'evaluate')
-        self.assert_(result == False)
-
-        # ok, we only check on first part of expression
-        result = expr._can_use_numexpr(operator.add, '+', self.frame, self.frame2, 'evaluate')
-        self.assert_(result == True)
-
-    def test_binary_ops(self):
-
-        def testit():
-
-            for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]:
-
-                for op, op_str in [('add','+'),('sub','-'),('mul','*'),('div','/'),('pow','**')]:
-
-                    op = getattr(operator,op,None)
-                    if op is not None:
-                        result = expr._can_use_numexpr(op, op_str, f, f, 'evaluate')
-                        self.assert_(result == (not f._is_mixed_type))
-
-                        result = expr.evaluate(op, op_str, f, f, use_numexpr=True)
-                        expected = expr.evaluate(op, op_str, f, f, use_numexpr=False)
-                        assert_array_equal(result,expected.values)
-
-                        result = expr._can_use_numexpr(op, op_str, f2, f2, 'evaluate')
-                        self.assert_(result == False)
-
-
-        expr.set_use_numexpr(False)
-        testit()
-        expr.set_use_numexpr(True)
-        expr.set_numexpr_threads(1)
-        testit()
-        expr.set_numexpr_threads()
-        testit()
-
-    def test_boolean_ops(self):
-
-        def testit():
-            for f, f2 in [ (self.frame, self.frame2), (self.mixed, self.mixed2) ]:
-
-                f11 = f
-                f12 = f + 1
-
-                f21 = f2
-                f22 = f2 + 1
-
-                for op, op_str in [('gt','>'),('lt','<'),('ge','>='),('le','<='),('eq','=='),('ne','!=')]:
-
-                    op = getattr(operator,op)
-
-                    result = expr._can_use_numexpr(op, op_str, f11, f12, 'evaluate')
-                    self.assert_(result == (not f11._is_mixed_type))
-
-                    result = expr.evaluate(op, op_str, f11, f12, use_numexpr=True)
-                    expected = expr.evaluate(op, op_str, f11, f12, use_numexpr=False)
-                    assert_array_equal(result,expected.values)
-
-                    result = expr._can_use_numexpr(op, op_str, f21, f22, 'evaluate')
-                    self.assert_(result == False)
-
-        expr.set_use_numexpr(False)
-        testit()
-        expr.set_use_numexpr(True)
-        expr.set_numexpr_threads(1)
-        testit()
-        expr.set_numexpr_threads()
-        testit()
-
-    def test_where(self):
-
-        def testit():
-            for f in [ self.frame, self.frame2, self.mixed, self.mixed2 ]:
-
-                for cond in [ True, False ]:
-
-                    c = np.empty(f.shape,dtype=np.bool_)
-                    c.fill(cond)
-                    result = expr.where(c, f.values, f.values+1)
-                    expected = np.where(c, f.values, f.values+1)
-                    assert_array_equal(result,expected)
-
-        expr.set_use_numexpr(False)
-        testit()
-        expr.set_use_numexpr(True)
-        expr.set_numexpr_threads(1)
-        testit()
-        expr.set_numexpr_threads()
-        testit()
-
-if __name__ == '__main__':
-    # unittest.main()
-    import nose
-    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
-                   exit=False)
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
index 7fdb6d9d2603d..4c75ef66feb08 100644
--- a/pandas/tseries/index.py
+++ b/pandas/tseries/index.py
@@ -912,7 +912,8 @@ def join(self, other, how='left', level=None, return_indexers=False):
         See Index.join
         """
         if (not isinstance(other, DatetimeIndex) and len(other) > 0 and
-            other.inferred_type != 'mixed-integer'):
+                other.inferred_type not in ('floating', 'mixed-integer',
+                                            'mixed-integer-float', 'mixed')):
             try:
                 other = DatetimeIndex(other)
             except TypeError:
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 47bde4ecb32a7..e1b2950b5c8d3 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -17,7 +17,7 @@
 from urllib2 import urlopen
 from distutils.version import LooseVersion

-from numpy.random import randn
+from numpy.random import randn, rand
 import numpy as np

 from pandas.core.common import isnull, _is_sequence
@@ -45,6 +45,9 @@
 _RAISE_NETWORK_ERROR_DEFAULT = False

+def randbool(size=(), p=0.5):
+    return rand(*size) <= p
+

 def rands(n):
     choices = string.ascii_letters + string.digits
     return ''.join(random.choice(choices) for _ in xrange(n))
diff --git a/setup.py b/setup.py
index 7d59e0f95f0e8..3984dc075d4f7 100755
--- a/setup.py
+++ b/setup.py
@@ -85,7 +85,7 @@
 except ImportError:
     cython = False

-from os.path import splitext, basename, join as pjoin
+from os.path import join as pjoin


 class build_ext(_build_ext):
@@ -502,6 +502,7 @@ def pxd(name):
       maintainer=AUTHOR,
       packages=['pandas',
                 'pandas.compat',
+                'pandas.computation',
                 'pandas.core',
                 'pandas.io',
                 'pandas.rpy',
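The new `randbool` helper added to pandas.util.testing above draws a boolean array in which each element is True with probability `p`. For example (illustrative):

    import pandas.util.testing as tm

    mask = tm.randbool(size=(4,), p=0.25)  # each element True with prob ~0.25
    assert mask.dtype == bool
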
diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py
index 54774344520c9..3f076f9f922a3 100644
--- a/vb_suite/binary_ops.py
+++ b/vb_suite/binary_ops.py
@@ -21,7 +21,7 @@
                            start_date=datetime(2012, 1, 1))

 setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
 df = DataFrame(np.random.randn(20000, 100))
 df2 = DataFrame(np.random.randn(20000, 100))
 expr.set_numexpr_threads(1)
@@ -32,7 +32,7 @@
                                      start_date=datetime(2013, 2, 26))

 setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
 df = DataFrame(np.random.randn(20000, 100))
 df2 = DataFrame(np.random.randn(20000, 100))
 expr.set_use_numexpr(False)
@@ -53,7 +53,7 @@
                            start_date=datetime(2012, 1, 1))

 setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
 df = DataFrame(np.random.randn(20000, 100))
 df2 = DataFrame(np.random.randn(20000, 100))
 expr.set_numexpr_threads(1)
@@ -63,7 +63,7 @@
                                      start_date=datetime(2013, 2, 26))

 setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
 df = DataFrame(np.random.randn(20000, 100))
 df2 = DataFrame(np.random.randn(20000, 100))
 expr.set_use_numexpr(False)
@@ -84,7 +84,7 @@
                            start_date=datetime(2012, 1, 1))

 setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
 df = DataFrame(np.random.randn(20000, 100))
 df2 = DataFrame(np.random.randn(20000, 100))
 expr.set_numexpr_threads(1)
@@ -94,7 +94,7 @@
                                      start_date=datetime(2013, 2, 26))

 setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
 df = DataFrame(np.random.randn(20000, 100))
 df2 = DataFrame(np.random.randn(20000, 100))
 expr.set_use_numexpr(False)
diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py
index 9f07cc6ed15c3..2edb7548ebeef 100644
--- a/vb_suite/indexing.py
+++ b/vb_suite/indexing.py
@@ -103,7 +103,7 @@
                            start_date=datetime(2012, 1, 1))

 setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
 df = DataFrame(np.random.randn(50000, 100))
 df2 = DataFrame(np.random.randn(50000, 100))
 expr.set_numexpr_threads(1)
@@ -115,7 +115,7 @@

 setup = common_setup + """
-import pandas.core.expressions as expr
+import pandas.computation.expressions as expr
 df = DataFrame(np.random.randn(50000, 100))
 df2 = DataFrame(np.random.randn(50000, 100))
 expr.set_use_numexpr(False)
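As the vb_suite updates show, the expressions module now lives under pandas.computation rather than pandas.core. A minimal check of the new import path and its toggles, mirroring the benchmark setup code (illustrative only):

    import pandas.computation.expressions as expr

    expr.set_use_numexpr(True)   # enable numexpr-backed evaluation
    expr.set_numexpr_threads(1)  # pin numexpr to a single thread
    expr.set_numexpr_threads()   # restore the default thread count
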