|
| 1 | +from __future__ import unicode_literals, print_function |
| 2 | + |
| 3 | +import codecs |
| 4 | +import re |
| 5 | +import json |
| 6 | +import os |
| 7 | +import sys |
| 8 | +from collections import Counter, OrderedDict |
| 9 | +from os.path import dirname, join, pardir, relpath |
| 10 | + |
| 11 | +from funcparserlib.parser import NoParseError |
| 12 | + |
| 13 | +from . import parser |
| 14 | + |
# Py2/Py3-compatible aliases: type("") is unicode/str, type(b"") is str/bytes.
text_type = type("")
binary_type = type(b"")

# Python 3 has no unichr builtin; chr already produces text there.
try:
    unichr
except NameError:
    unichr = chr

# Repository base: the parent directory of the package containing this file.
base = join(dirname(__file__), pardir)

# Matches one literal \uXXXX escape (group 1) and, when a second escape is
# directly adjacent, captures it too (group 2) so surrogate pairs can be
# decoded together.
_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")
| 26 | + |
| 27 | + |
def clean_path(path):
    """Express *path* relative to the repository base directory."""
    rel = relpath(path, base)
    return rel
| 30 | + |
| 31 | + |
def is_subsequence(l1, l2):
    """Return True if ``l1`` is a subsequence of ``l2``.

    ``l1`` is a subsequence of ``l2`` when every element of ``l1`` occurs in
    ``l2`` in the same relative order (not necessarily contiguously).

    Fixes a crash in the previous implementation, which raised IndexError on
    an empty ``l1`` (``l1[0]`` was evaluated before any bounds check); an
    empty sequence is a subsequence of anything, so that case returns True.
    """
    wanted = iter(l2)
    # Sharing one iterator across the all()/any() scan enforces ordering:
    # each element of l1 must be found strictly after the previous match.
    return all(any(x == candidate for candidate in wanted) for x in l1)
| 40 | + return False |
| 41 | + |
| 42 | + |
def unescape_json(obj):
    """Recursively decode literal \\uXXXX escapes in a parsed JSON value.

    Returns a copy of *obj* in which every text string (including dict
    keys) has its backslash-u escapes replaced by the characters they
    denote; non-string leaves are returned unchanged.
    """
    def decode_str(inp):
        """Decode \\uXXXX escapes

        This decodes \\uXXXX escapes, possibly into non-BMP characters when
        two surrogate character escapes are adjacent to each other.
        """
        # This cannot be implemented using the unicode_escape codec
        # because that requires its input be ISO-8859-1, and we need
        # arbitrary unicode as input.
        def repl(m):
            # _surrogateRe puts the first escape's hex digits in group 1
            # and, when a second escape is directly adjacent, its digits
            # in group 2.
            if m.group(2) is not None:
                high = int(m.group(1), 16)
                low = int(m.group(2), 16)
                if (0xD800 <= high <= 0xDBFF and
                        0xDC00 <= low <= 0xDFFF and
                        sys.maxunicode == 0x10FFFF):
                    # Valid UTF-16 surrogate pair on a wide build: combine
                    # the two code units into one non-BMP code point.
                    cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
                    return unichr(cp)
                else:
                    # Not a surrogate pair (or a narrow build, where the
                    # pair is kept as two code units): decode independently.
                    return unichr(high) + unichr(low)
            else:
                return unichr(int(m.group(1), 16))
        return _surrogateRe.sub(repl, inp)

    if isinstance(obj, dict):
        return {decode_str(k): unescape_json(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [unescape_json(x) for x in obj]
    elif isinstance(obj, text_type):
        return decode_str(obj)
    else:
        return obj
| 76 | + |
| 77 | + |
def lint_dat_format(path, encoding, first_header):
    """Parse a ``.dat`` test file and report duplicated headers.

    Reads *path* as text in *encoding* (or as raw bytes when *encoding* is
    None), parses it with the package's .dat parser, prints a diagnostic for
    every header that occurs more than once within a single test, and
    returns the tests as a list of OrderedDicts.  Returns None (after
    printing) when the file does not parse.
    """
    try:
        if encoding is None:
            with open(path, "rb") as fp:
                contents = fp.read()
        else:
            with codecs.open(path, "r", encoding=encoding) as fp:
                contents = fp.read()
        tests = parser.parse(contents, first_header)
    except NoParseError as err:
        print("parse error in %s, %s" % (path, err))
        return

    for test in tests:
        occurrences = Counter(header for header, _ in test)
        for header in set(occurrences):
            count = occurrences[header]
            if count > 1:
                print("%s occurs %d times in one test in %s" % (header, count, path))

    return [OrderedDict(test) for test in tests]
| 100 | + |
| 101 | + |
def lint_encoding_test(path):
    """Lint one encoding ``.dat`` file for unexpected section headings."""
    tests = lint_dat_format(path, None, b"data")
    if not tests:
        return
    allowed_order = [b"data", b"encoding"]
    for test in tests:
        headings = list(test.keys())
        if not is_subsequence(headings, allowed_order):
            print("unexpected test headings %r in %s" % (test.keys(), path))
| 109 | + |
| 110 | + |
def lint_encoding_tests(path):
    """Recursively lint every ``.dat`` file beneath *path* as an encoding test."""
    for root, _dirs, names in os.walk(path):
        for name in names:
            if name.endswith(".dat"):
                lint_encoding_test(clean_path(join(root, name)))
| 117 | + |
| 118 | + |
def lint_tokenizer_test(path):
    """Lint one tokenizer ``.test`` JSON file.

    Prints a diagnostic when the top level is not a JSON object, when a test
    group is not a list, when a test is missing the required ``input`` or
    ``output`` properties, or when a test carries properties outside the
    known set.  Returns None.

    Fix: the "Test groups must be a lists" diagnostic had a grammatical
    error ("a lists"); corrected to "lists".
    """
    all_keys = {"description", "input", "output", "initialStates",
                "lastStartTag", "ignoreErrorOrder", "doubleEscaped"}
    required = {"input", "output"}
    with codecs.open(path, "r", "utf-8") as fp:
        parsed = json.load(fp)
    if not parsed:
        return
    if not isinstance(parsed, dict):
        print("Top-level must be an object in %s" % path)
        return
    for test_group in parsed.values():
        if not isinstance(test_group, list):
            print("Test groups must be lists in %s" % path)
            continue
        for test in test_group:
            if 'doubleEscaped' in test and test['doubleEscaped'] is True:
                # Escapes are doubled in the raw JSON; undo one level
                # before inspecting the keys.
                test = unescape_json(test)
            keys = set(test.keys())
            if not (required <= keys):
                print("missing test properties %r in %s" % (required - keys, path))
            if not (keys <= all_keys):
                print("unknown test properties %r in %s" % (keys - all_keys, path))
| 142 | + |
| 143 | + |
def lint_tokenizer_tests(path):
    """Recursively lint every ``.test`` file beneath *path*."""
    for root, _dirs, names in os.walk(path):
        for name in names:
            if name.endswith(".test"):
                lint_tokenizer_test(clean_path(join(root, name)))
| 150 | + |
| 151 | + |
def lint_tree_construction_test(path):
    """Lint one tree-construction ``.dat`` file for unexpected headings."""
    tests = lint_dat_format(path, "utf-8", "data")
    if not tests:
        return
    allowed_order = ["data", "errors", "document-fragment",
                     "script-off", "script-on", "document"]
    for test in tests:
        headings = list(test.keys())
        if not is_subsequence(headings, allowed_order):
            print("unexpected test headings %r in %s" % (test.keys(), path))
| 160 | + |
| 161 | + |
def lint_tree_construction_tests(path):
    """Recursively lint every ``.dat`` file beneath *path* as tree-construction tests."""
    for root, _dirs, names in os.walk(path):
        for name in names:
            if name.endswith(".dat"):
                lint_tree_construction_test(clean_path(join(root, name)))
| 168 | + |
| 169 | + |
if __name__ == "__main__":
    # Lint the encoding, tokenizer, and tree-construction suites that live
    # under the repository base directory (the parent of this package).
    lint_encoding_tests(join(base, "encoding"))
    lint_tokenizer_tests(join(base, "tokenizer"))
    lint_tree_construction_tests(join(base, "tree-construction"))
0 commit comments