# -*- coding: utf-8 -*-

# Copyright © 2009/2021 Andrey Vlasovskikh
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be included in all copies
# or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from __future__ import unicode_literals

__all__ = ["make_tokenizer", "TokenSpec", "Token", "LexerError"]

import re


class LexerError(Exception):
    def __init__(self, place, msg):
        self.place = place
        self.msg = msg

    def __str__(self):
        s = "cannot tokenize data"
        line, pos = self.place
        return '%s: %d,%d: "%s"' % (s, line, pos, self.msg)


class TokenSpec(object):
    """A token specification for generating a lexer via `make_tokenizer()`."""

    def __init__(self, type, pattern, flags=0):
        """Initialize a `TokenSpec` object.

        Parameters:
            type (str): User-defined type of the token (e.g. `"name"`, `"number"`,
                `"operator"`)
            pattern (str): Regexp for matching this token type
            flags (int, optional): Regexp flags, the second argument of `re.compile()`
        """
        self.type = type
        self.pattern = pattern
        self.flags = flags

    def __repr__(self):
        return "TokenSpec(%r, %r, %r)" % (self.type, self.pattern, self.flags)
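
# A minimal usage sketch (illustrative only, not part of the library API): a
# `TokenSpec` simply stores the arguments that `make_tokenizer()` passes on to
# `re.compile()`.
#
#     >>> TokenSpec("number", r"[0-9]+")
#     TokenSpec('number', '[0-9]+', 0)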


class Token(object):
    """A token object that represents a substring of a certain type in your text.

    You can compare tokens for equality using the `==` operator. Tokens also define
    custom `repr()` and `str()`.

    Attributes:
        type (str): User-defined type of the token (e.g. `"name"`, `"number"`,
            `"operator"`)
        value (str): Text value of the token
        start (Optional[Tuple[int, int]]): Start position (_line_, _column_)
        end (Optional[Tuple[int, int]]): End position (_line_, _column_)
    """

    def __init__(self, type, value, start=None, end=None):
        """Initialize a `Token` object."""
        self.type = type
        self.value = value
        self.start = start
        self.end = end

    def __repr__(self):
        return "Token(%r, %r)" % (self.type, self.value)

    def __eq__(self, other):
        # FIXME: Case sensitivity is assumed here
        if other is None:
            return False
        else:
            return self.type == other.type and self.value == other.value

    def _pos_str(self):
        if self.start is None or self.end is None:
            return ""
        else:
            sl, sp = self.start
            el, ep = self.end
            return "%d,%d-%d,%d:" % (sl, sp, el, ep)

    def __str__(self):
        s = "%s %s '%s'" % (self._pos_str(), self.type, self.value)
        return s.strip()

    @property
    def name(self):
        return self.value

    def pformat(self):
        return "%s %s '%s'" % (
            self._pos_str().ljust(20),  # noqa
            self.type.ljust(14),
            self.value,
        )
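
# A minimal usage sketch (illustrative only, not part of the library API): tokens
# compare by type and value, while the start/end positions only show up in `str()`.
#
#     >>> Token("op", "+") == Token("op", "+")
#     True
#     >>> print(Token("op", "+", start=(1, 1), end=(1, 1)))
#     1,1-1,1: op '+'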


def make_tokenizer(specs):
    # noinspection GrazieInspection
    """Make a function that tokenizes text based on the regexp specs.

    Type: `(Sequence[TokenSpec | Tuple]) -> Callable[[str], Iterable[Token]]`

    A token spec is a `TokenSpec` instance.

    !!! Note

        For legacy reasons, a token spec may also be a tuple of (_type_, _args_), where
        _type_ sets the value of `Token.type` for the token, and _args_ are the
        positional arguments for `re.compile()`: either just (_pattern_,) or
        (_pattern_, _flags_).

    It returns a tokenizer function that takes a string and returns an iterable of
    `Token` objects, or raises `LexerError` if it cannot tokenize the string according
    to its token specs.

    Examples:

    ```pycon
    >>> tokenize = make_tokenizer([
    ...     TokenSpec("space", r"\\s+"),
    ...     TokenSpec("id", r"\\w+"),
    ...     TokenSpec("op", r"[,!]"),
    ... ])
    >>> text = "Hello, World!"
    >>> [t for t in tokenize(text) if t.type != "space"]  # noqa
    [Token('id', 'Hello'), Token('op', ','), Token('id', 'World'), Token('op', '!')]
    >>> text = "Bye?"
    >>> list(tokenize(text))
    Traceback (most recent call last):
        ...
    lexer.LexerError: cannot tokenize data: 1,4: "Bye?"

    ```
    """
    compiled = []
    for spec in specs:
        if isinstance(spec, TokenSpec):
            c = spec.type, re.compile(spec.pattern, spec.flags)
        else:
            name, args = spec
            c = name, re.compile(*args)
        compiled.append(c)

    def match_specs(s, i, position):
        line, pos = position
        for type, regexp in compiled:
            m = regexp.match(s, i)
            if m is not None:
                value = m.group()
                # Advance the (line, column) position past the matched value
                nls = value.count("\n")
                n_line = line + nls
                if nls == 0:
                    n_pos = pos + len(value)
                else:
                    n_pos = len(value) - value.rfind("\n") - 1
                return Token(type, value, (line, pos + 1), (n_line, n_pos))
        else:
            # None of the specs matched at this offset: report the offending line
            err_line = s.splitlines()[line - 1]
            raise LexerError((line, pos + 1), err_line)

    def f(s):
        length = len(s)
        line, pos = 1, 0
        i = 0
        while i < length:
            t = match_specs(s, i, (line, pos))
            yield t
            line, pos = t.end
            i += len(t.value)

    return f
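
# A minimal sketch of the legacy tuple spec format mentioned in the docstring above
# (illustrative only; `legacy_tokenize` is a hypothetical name, not part of the
# library): each spec is a (type, (pattern,)) or (type, (pattern, flags)) tuple.
#
#     >>> legacy_tokenize = make_tokenizer([
#     ...     ("space", (r"\s+",)),
#     ...     ("name", ("[a-z]+", re.IGNORECASE)),
#     ... ])
#     >>> [t for t in legacy_tokenize("Hello World") if t.type != "space"]
#     [Token('name', 'Hello'), Token('name', 'World')]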


# This is an example of token specs. See also [this article][1] for a
# discussion of searching for multiline comments using regexps (including `*?`).
#
# [1]: http://ostermiller.org/findcomment.html
_example_token_specs = [
    TokenSpec("COMMENT", r"\(\*(.|[\r\n])*?\*\)", re.MULTILINE),
    TokenSpec("COMMENT", r"\{(.|[\r\n])*?\}", re.MULTILINE),
    TokenSpec("COMMENT", r"//.*"),
    TokenSpec("NL", r"[\r\n]+"),
    TokenSpec("SPACE", r"[ \t\r\n]+"),
    TokenSpec("NAME", r"[A-Za-z_][A-Za-z_0-9]*"),
    TokenSpec("REAL", r"[0-9]+\.[0-9]*([Ee][+\-]?[0-9]+)*"),
    TokenSpec("INT", r"[0-9]+"),
    TokenSpec("INT", r"\$[0-9A-Fa-f]+"),
    TokenSpec("OP", r"(\.\.)|(<>)|(<=)|(>=)|(:=)|[;,=\(\):\[\]\.+\-<>\*/@\^]"),
    TokenSpec("STRING", r"'([^']|(''))*'"),
    TokenSpec("CHAR", r"#[0-9]+"),
    TokenSpec("CHAR", r"#\$[0-9A-Fa-f]+"),
]
# tokenize = make_tokenizer(_example_token_specs)
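
# A minimal sketch of what the example specs above produce for a small Pascal-like
# snippet (illustrative only, not part of the module):
#
#     >>> tokenize = make_tokenizer(_example_token_specs)
#     >>> [t.type for t in tokenize("x := 42; // answer")]
#     ['NAME', 'SPACE', 'OP', 'SPACE', 'INT', 'OP', 'SPACE', 'COMMENT']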