# -*- coding: utf-8 -*-

# Copyright (c) 2008/2013 Andrey Vlasovskikh
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

__all__ = ['make_tokenizer', 'Token', 'LexerError']

import re


class LexerError(Exception):
    def __init__(self, place, msg):
        self.place = place
        self.msg = msg

    def __str__(self):
        # Renders as e.g.: cannot tokenize data: 1,5: "x := ?"
        s = u'cannot tokenize data'
        line, pos = self.place
        return u'%s: %d,%d: "%s"' % (s, line, pos, self.msg)


class Token(object):
    def __init__(self, type, value, start=None, end=None):
        self.type = type
        self.value = value
        self.start = start
        self.end = end

    def __repr__(self):
        return u'Token(%r, %r)' % (self.type, self.value)

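    # Only type and value take part in equality checks, so a token produced
    # by the tokenizer compares equal to a hand-written one without
    # positions, e.g. Token('NAME', u'x', (1, 1), (1, 1)) == Token('NAME', u'x')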
    def __eq__(self, other):
        # FIXME: Case sensitivity is assumed here
        return self.type == other.type and self.value == other.value

    def _pos_str(self):
        if self.start is None or self.end is None:
            return ''
        else:
            sl, sp = self.start
            el, ep = self.end
            return u'%d,%d-%d,%d:' % (sl, sp, el, ep)

    def __str__(self):
        s = u"%s %s '%s'" % (self._pos_str(), self.type, self.value)
        return s.strip()

    @property
    def name(self):
        return self.value

    def pformat(self):
        return u"%s %s '%s'" % (self._pos_str().ljust(20),
                                self.type.ljust(14),
                                self.value)


def make_tokenizer(specs):
    """[(str, (str, int?))] -> (str -> Iterable(Token))

    Make a tokenizer from the given list of token specs. Each spec is a pair
    of a token type name and a tuple of arguments for re.compile: a regexp
    string, optionally followed by regexp flags.
    """

    def compile_spec(spec):
        name, args = spec
        return name, re.compile(*args)

    compiled = [compile_spec(s) for s in specs]

    def match_specs(specs, str, i, position):
        line, pos = position
        for type, regexp in specs:
            m = regexp.match(str, i)
            if m is not None:
                value = m.group()
                nls = value.count(u'\n')
                n_line = line + nls
                if nls == 0:
                    n_pos = pos + len(value)
                else:
                    # The column resets after the last newline in the token
                    n_pos = len(value) - value.rfind(u'\n') - 1
                return Token(type, value, (line, pos + 1), (n_line, n_pos))
        # No spec matched at this position: report the offending source line
        errline = str.splitlines()[line - 1]
        raise LexerError((line, pos + 1), errline)

    def f(str):
        length = len(str)
        line, pos = 1, 0
        i = 0
        while i < length:
            t = match_specs(compiled, str, i, (line, pos))
            yield t
            line, pos = t.end
            i += len(t.value)

    return f

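# A minimal sketch of the calling convention (the tiny spec below is purely
# illustrative, not part of the library):
#
#     tokenize = make_tokenizer([
#         ('INT', (r'[0-9]+',)),
#         ('PLUS', (r'\+',)),
#     ])
#     list(tokenize(u'1+2'))  # three tokens: INT '1', PLUS '+', INT '2'
#
# Note that the returned function is a generator function, so tokenization is
# lazy: a LexerError for unmatchable input is raised during iteration, not
# when the tokenizer is called.
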
# This is an example of a token spec. See also [this article][1] for a
# discussion of searching for multiline comments using regexps (including `*?`).
#
# [1]: http://ostermiller.org/findcomment.html
_example_token_specs = [
    ('COMMENT', (r'\(\*(.|[\r\n])*?\*\)', re.MULTILINE)),
    ('COMMENT', (r'\{(.|[\r\n])*?\}', re.MULTILINE)),
    ('COMMENT', (r'//.*',)),
    ('NL', (r'[\r\n]+',)),
    ('SPACE', (r'[ \t\r\n]+',)),
    ('NAME', (r'[A-Za-z_][A-Za-z_0-9]*',)),
    ('REAL', (r'[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*',)),
    ('INT', (r'[0-9]+',)),
    ('INT', (r'\$[0-9A-Fa-f]+',)),
    ('OP', (r'(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]',)),
    ('STRING', (r"'([^']|(''))*'",)),
    ('CHAR', (r'#[0-9]+',)),
    ('CHAR', (r'#\$[0-9A-Fa-f]+',)),
]
#tokenize = make_tokenizer(_example_token_specs)
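
# A usage sketch for the example specs above (the input string is
# illustrative): tokenize a Pascal-like snippet, dropping the tokens that are
# insignificant for parsing.
#
#     tokenize = make_tokenizer(_example_token_specs)
#     useless = ['SPACE', 'NL', 'COMMENT']
#     for token in tokenize(u'x := x + 1; // increment x'):
#         if token.type not in useless:
#             print(token.pformat())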