Commit d20454e

Merge pull request #6432 from cpcloud/eval-consolidate-tokenization

CLN: minimize tokenizer passes

2 parents a96b53d + 86e746d

3 files changed: +115 -39 lines changed
pandas/computation/eval.py (+1 -1)

@@ -131,7 +131,7 @@ def _check_for_locals(expr, stack_level, parser):
               "prefix")

     if at_top_of_stack or not_pandas_parser:
-        for toknum, tokval, _, _, _ in tokenize_string(expr):
+        for toknum, tokval in tokenize_string(expr):
             if toknum == tokenize.OP and tokval == '@':
                 raise SyntaxError(msg)

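The caller-side effect of the consolidation: ``tokenize_string`` now yields bare ``(toknum, tokval)`` pairs, so consumers stop unpacking and discarding the three position fields of the tokenizer's 5-tuples. A minimal stdlib-only sketch of the pattern (the real ``tokenize_string`` lives in expr.py, below):

import tokenize
from io import StringIO

def tokenize_string(source):
    # Yield (toknum, tokval) pairs, dropping the start/end/line fields
    # of the 5-tuples produced by tokenize.generate_tokens.
    for tok in tokenize.generate_tokens(StringIO(source).readline):
        yield tok[0], tok[1]

# the '@' check from _check_for_locals, in isolation:
assert any(toknum == tokenize.OP and tokval == '@'
           for toknum, tokval in tokenize_string('@a + b'))
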
pandas/computation/expr.py (+99 -36)

@@ -12,9 +12,10 @@

 import pandas as pd
 from pandas import compat
-from pandas.compat import StringIO, zip, reduce, string_types
+from pandas.compat import StringIO, lmap, zip, reduce, string_types
 from pandas.core.base import StringMixin
 from pandas.core import common as com
+from pandas.tools.util import compose
 from pandas.computation.ops import (_cmp_ops_syms, _bool_ops_syms,
                                     _arith_ops_syms, _unary_ops_syms, is_term)
 from pandas.computation.ops import _reductions, _mathops, _LOCAL_TAG
@@ -23,52 +24,113 @@
 from pandas.computation.scope import Scope, _ensure_scope


-def tokenize_string(s):
-    return tokenize.generate_tokens(StringIO(s).readline)
+def tokenize_string(source):
+    """Tokenize a Python source code string.

+    Parameters
+    ----------
+    source : str
+        A Python source code string
+    """
+    line_reader = StringIO(source).readline
+    for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
+        yield toknum, tokval
+
+
+def _rewrite_assign(tok):
+    """Rewrite the assignment operator for PyTables expressions that use ``=``
+    as a substitute for ``==``.

-def _rewrite_assign(source):
-    """Rewrite the assignment operator for PyTables expression that want to use
-    ``=`` as a substitute for ``==``.
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    t : tuple of int, str
+        Either the input token or the replacement values
     """
-    res = []
-    for toknum, tokval, _, _, _ in tokenize_string(source):
-        res.append((toknum, '==' if tokval == '=' else tokval))
-    return tokenize.untokenize(res)
+    toknum, tokval = tok
+    return toknum, '==' if tokval == '=' else tokval


-def _replace_booleans(source):
+def _replace_booleans(tok):
     """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
     precedence is changed to boolean precedence.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    t : tuple of int, str
+        Either the input token or the replacement values
     """
-    res = []
-    for toknum, tokval, _, _, _ in tokenize_string(source):
-        if toknum == tokenize.OP:
-            if tokval == '&':
-                res.append((tokenize.NAME, 'and'))
-            elif tokval == '|':
-                res.append((tokenize.NAME, 'or'))
-            else:
-                res.append((toknum, tokval))
-        else:
-            res.append((toknum, tokval))
-    return tokenize.untokenize(res)
+    toknum, tokval = tok
+    if toknum == tokenize.OP:
+        if tokval == '&':
+            return tokenize.NAME, 'and'
+        elif tokval == '|':
+            return tokenize.NAME, 'or'
+        return toknum, tokval
+    return toknum, tokval


-def _replace_locals(source, local_symbol='@'):
-    """Replace local variables with a syntactically valid name."""
-    res = []
-    for toknum, tokval, _, _, _ in tokenize_string(source):
-        if toknum == tokenize.OP and tokval == local_symbol:
-            res.append((tokenize.OP, _LOCAL_TAG))
-        else:
-            res.append((toknum, tokval))
-    return tokenize.untokenize(res)
+def _replace_locals(tok):
+    """Replace local variables with a syntactically valid name.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    t : tuple of int, str
+        Either the input token or the replacement values
+
+    Notes
+    -----
+    This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
+    ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
+    is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
+    """
+    toknum, tokval = tok
+    if toknum == tokenize.OP and tokval == '@':
+        return tokenize.OP, _LOCAL_TAG
+    return toknum, tokval


-def _preparse(source):
-    """Compose assignment and boolean replacement."""
-    return _replace_booleans(_rewrite_assign(source))
+def _preparse(source, f=compose(_replace_locals, _replace_booleans,
+                                _rewrite_assign)):
+    """Compose a collection of tokenization functions.
+
+    Parameters
+    ----------
+    source : str
+        A Python source code string
+    f : callable
+        This takes a tuple of (toknum, tokval) as its argument and returns a
+        tuple with the same structure but possibly different elements. Defaults
+        to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
+        ``_replace_locals``.
+
+    Returns
+    -------
+    s : str
+        Valid Python source code
+
+    Notes
+    -----
+    The `f` parameter can be any callable that takes *and* returns input of the
+    form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
+    the ``tokenize`` module and ``tokval`` is a string.
+    """
+    assert callable(f), 'f must be callable'
+    return tokenize.untokenize(lmap(f, tokenize_string(source)))


 def _is_type(t):
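With each rewriter now a pure function on a single ``(toknum, tokval)`` pair, the three separate tokenize/untokenize round trips collapse into one pass: tokenize once, map the composed function, untokenize once. A quick hedged check of the composed rewrite (it assumes the ``pandas.computation.expr`` module from this commit is importable, and tests substrings rather than an exact string because ``untokenize`` spacing is not guaranteed):

from pandas.computation.expr import _preparse

src = _preparse('(a = 1) & @b')
# '=' -> '==', '&' -> 'and', and '@b' -> '__pd_eval_local_b'
assert '==' in src and 'and' in src and '__pd_eval_local_b' in src
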
@@ -535,7 +597,8 @@ def visitor(x, y):
 class PandasExprVisitor(BaseExprVisitor):

     def __init__(self, env, engine, parser,
-                 preparser=lambda x: _replace_locals(_replace_booleans(x))):
+                 preparser=partial(_preparse, f=compose(_replace_locals,
+                                                        _replace_booleans))):
         super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)

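The pandas parser keeps assignment semantics for ``=``, so its default preparser pins ``f`` with ``partial`` to compose only the boolean and local-variable rewrites, skipping ``_rewrite_assign`` while still making a single tokenizer pass. Roughly, assuming the names defined in the hunks above:

from functools import partial
from pandas.computation.expr import (_preparse, _replace_booleans,
                                     _replace_locals)
from pandas.tools.util import compose

preparser = partial(_preparse, f=compose(_replace_locals, _replace_booleans))
# '&' and '@' are rewritten, but '=' stays plain assignment:
assert '==' not in preparser('x = y & @z')
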
pandas/tools/util.py (+15 -2)

@@ -1,3 +1,4 @@
+from pandas.compat import reduce
 from pandas.core.index import Index
 import numpy as np

@@ -6,6 +7,7 @@ def match(needles, haystack):
     needles = Index(needles)
     return haystack.get_indexer(needles)

+
 def cartesian_product(X):
     '''
     Numpy version of itertools.product or pandas.compat.product.

@@ -27,6 +29,17 @@ def cartesian_product(X):

     b = cumprodX[-1] / cumprodX

-    return [np.tile(np.repeat(x, b[i]),
+    return [np.tile(np.repeat(x, b[i]),
                     np.product(a[i]))
-            for i, x in enumerate(X)]
+            for i, x in enumerate(X)]
+
+
+def _compose2(f, g):
+    """Compose 2 callables"""
+    return lambda *args, **kwargs: f(g(*args, **kwargs))
+
+
+def compose(*funcs):
+    """Compose 2 or more callables"""
+    assert len(funcs) > 1, 'At least 2 callables must be passed to compose'
+    return reduce(_compose2, funcs)
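``compose`` folds right-to-left, so ``compose(f, g, h)(x)`` evaluates as ``f(g(h(x)))``, matching how ``_preparse``'s default applies ``_rewrite_assign`` first and ``_replace_locals`` last. A standalone check of the same two helpers (``pandas.compat.reduce`` is just ``reduce``, i.e. ``functools.reduce`` on Python 3):

from functools import reduce

def _compose2(f, g):
    """Compose 2 callables"""
    return lambda *args, **kwargs: f(g(*args, **kwargs))

def compose(*funcs):
    """Compose 2 or more callables"""
    assert len(funcs) > 1, 'At least 2 callables must be passed to compose'
    return reduce(_compose2, funcs)

assert compose(str, abs, int)('-5') == '5'   # str(abs(int('-5')))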
