CLN: minimize tokenizer passes #6432

Merged: 1 commit, merged Feb 22, 2014
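This PR collapses what were three separate tokenize/untokenize round-trips over the source string (`_rewrite_assign`, `_replace_booleans`, `_replace_locals`) into a single pass: each rewrite becomes a per-token function, and the composed function is mapped over the token stream once. A standalone sketch of the technique (illustrative names, not the pandas code; `untokenize` may normalize whitespace differently than shown):

```python
import tokenize
from functools import reduce
from io import StringIO


def tokenize_string(source):
    # Yield (toknum, tokval) pairs, dropping the positional fields.
    line_reader = StringIO(source).readline
    for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
        yield toknum, tokval


def rewrite_assign(tok):
    # PyTables-style '=' becomes '=='.
    toknum, tokval = tok
    return toknum, '==' if tokval == '=' else tokval


def replace_booleans(tok):
    # Bitwise '&' and '|' become boolean 'and' and 'or'.
    toknum, tokval = tok
    if toknum == tokenize.OP and tokval == '&':
        return tokenize.NAME, 'and'
    if toknum == tokenize.OP and tokval == '|':
        return tokenize.NAME, 'or'
    return toknum, tokval


def compose(*funcs):
    # Right-to-left composition: compose(f, g)(x) == f(g(x)).
    return reduce(lambda f, g: lambda x: f(g(x)), funcs)


preparse = compose(replace_booleans, rewrite_assign)
tokens = (preparse(tok) for tok in tokenize_string("a = 1 & b = 2"))
print(tokenize.untokenize(tokens))  # roughly: a == 1 and b == 2
```

The point of the cleanup is that the cost drops from three tokenizer passes, each ending in an `untokenize` that rebuilds the string, to one pass with a single `untokenize` at the end.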
2 changes: 1 addition & 1 deletion pandas/computation/eval.py
@@ -131,7 +131,7 @@ def _check_for_locals(expr, stack_level, parser):
                "prefix")

     if at_top_of_stack or not_pandas_parser:
-        for toknum, tokval, _, _, _ in tokenize_string(expr):
+        for toknum, tokval in tokenize_string(expr):
             if toknum == tokenize.OP and tokval == '@':
                 raise SyntaxError(msg)

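For context, the loop above simplifies because `tokenize_string` (defined in `pandas/computation/expr.py` in this diff) is now a generator of bare 2-tuples. A hedged sketch of the new contract against these 0.13-era internals:

```python
import tokenize
from pandas.computation.expr import tokenize_string  # 0.13-era module path

# The '@' marker that _check_for_locals rejects tokenizes as an OP token.
for toknum, tokval in tokenize_string("x + @y"):
    if toknum == tokenize.OP and tokval == '@':
        print("found a local-variable marker")
```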
135 changes: 99 additions & 36 deletions pandas/computation/expr.py
@@ -12,9 +12,10 @@

 import pandas as pd
 from pandas import compat
-from pandas.compat import StringIO, zip, reduce, string_types
+from pandas.compat import StringIO, lmap, zip, reduce, string_types
 from pandas.core.base import StringMixin
 from pandas.core import common as com
+from pandas.tools.util import compose
 from pandas.computation.ops import (_cmp_ops_syms, _bool_ops_syms,
                                     _arith_ops_syms, _unary_ops_syms, is_term)
 from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG
@@ -23,52 +24,113 @@
 from pandas.computation.scope import Scope, _ensure_scope


-def tokenize_string(s):
-    return tokenize.generate_tokens(StringIO(s).readline)
+def tokenize_string(source):
+    """Tokenize a Python source code string.
+
+    Parameters
+    ----------
+    source : str
+        A Python source code string
+    """
+    line_reader = StringIO(source).readline
+    for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
+        yield toknum, tokval


-def _rewrite_assign(source):
-    """Rewrite the assignment operator for PyTables expression that want to use
-    ``=`` as a substitute for ``==``.
-    """
-    res = []
-    for toknum, tokval, _, _, _ in tokenize_string(source):
-        res.append((toknum, '==' if tokval == '=' else tokval))
-    return tokenize.untokenize(res)
+def _rewrite_assign(tok):
+    """Rewrite the assignment operator for PyTables expressions that use ``=``
+    as a substitute for ``==``.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    t : tuple of int, str
+        Either the input token or the replacement values
+    """
+    toknum, tokval = tok
+    return toknum, '==' if tokval == '=' else tokval


-def _replace_booleans(source):
+def _replace_booleans(tok):
     """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
     precedence is changed to boolean precedence.
-    """
-    res = []
-    for toknum, tokval, _, _, _ in tokenize_string(source):
-        if toknum == tokenize.OP:
-            if tokval == '&':
-                res.append((tokenize.NAME, 'and'))
-            elif tokval == '|':
-                res.append((tokenize.NAME, 'or'))
-            else:
-                res.append((toknum, tokval))
-        else:
-            res.append((toknum, tokval))
-    return tokenize.untokenize(res)
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    t : tuple of int, str
+        Either the input token or the replacement values
+    """
+    toknum, tokval = tok
+    if toknum == tokenize.OP:
+        if tokval == '&':
+            return tokenize.NAME, 'and'
+        elif tokval == '|':
+            return tokenize.NAME, 'or'
+        return toknum, tokval
+    return toknum, tokval


-def _replace_locals(source, local_symbol='@'):
-    """Replace local variables with a syntactically valid name."""
-    res = []
-    for toknum, tokval, _, _, _ in tokenize_string(source):
-        if toknum == tokenize.OP and tokval == local_symbol:
-            res.append((tokenize.OP, _LOCAL_TAG))
-        else:
-            res.append((toknum, tokval))
-    return tokenize.untokenize(res)
+def _replace_locals(tok):
+    """Replace local variables with a syntactically valid name.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    t : tuple of int, str
+        Either the input token or the replacement values
+
+    Notes
+    -----
+    This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
+    ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
+    is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
+    """
+    toknum, tokval = tok
+    if toknum == tokenize.OP and tokval == '@':
+        return tokenize.OP, _LOCAL_TAG
+    return toknum, tokval


-def _preparse(source):
-    """Compose assignment and boolean replacement."""
-    return _replace_booleans(_rewrite_assign(source))
+def _preparse(source, f=compose(_replace_locals, _replace_booleans,
+                                _rewrite_assign)):
+    """Compose a collection of tokenization functions.
+
+    Parameters
+    ----------
+    source : str
+        A Python source code string
+    f : callable
+        This takes a tuple of (toknum, tokval) as its argument and returns a
+        tuple with the same structure but possibly different elements. Defaults
+        to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
+        ``_replace_locals``.
+
+    Returns
+    -------
+    s : str
+        Valid Python source code
+
+    Notes
+    -----
+    The `f` parameter can be any callable that takes *and* returns input of the
+    form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
+    the ``tokenize`` module and ``tokval`` is a string.
+    """
+    assert callable(f), 'f must be callable'
+    return tokenize.untokenize(lmap(f, tokenize_string(source)))


 def _is_type(t):
@@ -535,7 +597,8 @@ def visitor(x, y):
 class PandasExprVisitor(BaseExprVisitor):

     def __init__(self, env, engine, parser,
-                 preparser=lambda x: _replace_locals(_replace_booleans(x))):
+                 preparser=partial(_preparse, f=compose(_replace_locals,
+                                                        _replace_booleans))):
         super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)
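A hedged usage sketch of the single-pass `_preparse` and the visitor's narrower default (0.13-era module paths from this diff; `untokenize` spacing may differ from what the comments suggest):

```python
from functools import partial

from pandas.computation.expr import (_preparse, _replace_booleans,
                                     _replace_locals)
from pandas.tools.util import compose

# Default pipeline: '=' -> '==', '&'/'|' -> 'and'/'or', '@x' -> tagged local.
print(_preparse("b = x & @y"))

# PandasExprVisitor composes only two of the rewrites, leaving '=' alone.
preparser = partial(_preparse, f=compose(_replace_locals, _replace_booleans))
print(preparser("b == x & @y"))
```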
17 changes: 15 additions & 2 deletions pandas/tools/util.py
@@ -1,3 +1,4 @@
+from pandas.compat import reduce
 from pandas.core.index import Index
 import numpy as np

@@ -6,6 +7,7 @@ def match(needles, haystack):
     needles = Index(needles)
     return haystack.get_indexer(needles)

+
 def cartesian_product(X):
     '''
     Numpy version of itertools.product or pandas.compat.product.
@@ -27,6 +29,17 @@ def cartesian_product(X):

     b = cumprodX[-1] / cumprodX

-    return [np.tile(np.repeat(x, b[i]),
+    return [np.tile(np.repeat(x, b[i]),
                     np.product(a[i]))
-            for i, x in enumerate(X)]
+            for i, x in enumerate(X)]
+
+
+def _compose2(f, g):
+    """Compose 2 callables"""
+    return lambda *args, **kwargs: f(g(*args, **kwargs))
+
+
+def compose(*funcs):
+    """Compose 2 or more callables"""
+    assert len(funcs) > 1, 'At least 2 callables must be passed to compose'
+    return reduce(_compose2, funcs)
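`compose` folds `_compose2` left to right over the argument list, which yields right-to-left application, like mathematical composition. A standalone sanity check with the same shape as the helpers above:

```python
from functools import reduce  # pandas.compat.reduce resolves to this on Python 3


def _compose2(f, g):
    # (f . g)(x) == f(g(x))
    return lambda *args, **kwargs: f(g(*args, **kwargs))


def compose(*funcs):
    assert len(funcs) > 1, 'At least 2 callables must be passed to compose'
    return reduce(_compose2, funcs)


add_one = lambda x: x + 1
double = lambda x: x * 2
print(compose(add_one, double)(10))  # add_one(double(10)) == 21
```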