|
| 1 | +from __future__ import unicode_literals, print_function |
| 2 | + |
| 3 | +import codecs |
| 4 | +import re |
| 5 | +import json |
| 6 | +import os |
| 7 | +import sys |
| 8 | +from collections import Counter, OrderedDict |
| 9 | +from os.path import dirname, join, pardir, relpath |
| 10 | + |
| 11 | +from funcparserlib.parser import NoParseError |
| 12 | + |
| 13 | +from . import parser |
| 14 | + |
# Py2/Py3-compatible aliases: type("") is unicode/str, type(b"") is str/bytes.
text_type = type("")
binary_type = type(b"")

# Python 3 has no unichr builtin; chr already produces text there.
try:
    unichr
except NameError:
    unichr = chr

# Repository base: the parent directory of the package containing this file.
base = join(dirname(__file__), pardir)

# Matches one literal \uXXXX escape (group 1) and, when a second escape is
# directly adjacent, captures it too (group 2) so surrogate pairs can be
# decoded together.
_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")
| 26 | + |
| 27 | + |
def clean_path(path):
    """Express *path* relative to the repository base directory."""
    rel = relpath(path, base)
    return rel
| 30 | + |
| 31 | + |
def is_subsequence(l1, l2):
    """Return True if ``l1`` is a subsequence of ``l2``.

    ``l1`` is a subsequence of ``l2`` when every element of ``l1`` occurs in
    ``l2`` in the same relative order (not necessarily contiguously).

    Fixes a crash in the previous implementation, which raised IndexError on
    an empty ``l1`` (``l1[0]`` was evaluated before any bounds check); an
    empty sequence is a subsequence of anything, so that case returns True.
    """
    wanted = iter(l2)
    # Sharing one iterator across the all()/any() scan enforces ordering:
    # each element of l1 must be found strictly after the previous match.
    return all(any(x == candidate for candidate in wanted) for x in l1)
| 40 | + return False |
| 41 | + |
| 42 | + |
def unescape_json(obj):
    """Recursively decode literal \\uXXXX escapes in a parsed JSON value.

    Returns a copy of *obj* in which every text string (including dict
    keys) has its backslash-u escapes replaced by the characters they
    denote; non-string leaves are returned unchanged.
    """
    def decode_str(inp):
        """Decode \\uXXXX escapes

        This decodes \\uXXXX escapes, possibly into non-BMP characters when
        two surrogate character escapes are adjacent to each other.
        """
        # This cannot be implemented using the unicode_escape codec
        # because that requires its input be ISO-8859-1, and we need
        # arbitrary unicode as input.
        def repl(m):
            # _surrogateRe puts the first escape's hex digits in group 1
            # and, when a second escape is directly adjacent, its digits
            # in group 2.
            if m.group(2) is not None:
                high = int(m.group(1), 16)
                low = int(m.group(2), 16)
                if (0xD800 <= high <= 0xDBFF and
                        0xDC00 <= low <= 0xDFFF and
                        sys.maxunicode == 0x10FFFF):
                    # Valid UTF-16 surrogate pair on a wide build: combine
                    # the two code units into one non-BMP code point.
                    cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
                    return unichr(cp)
                else:
                    # Not a surrogate pair (or a narrow build, where the
                    # pair is kept as two code units): decode independently.
                    return unichr(high) + unichr(low)
            else:
                return unichr(int(m.group(1), 16))
        return _surrogateRe.sub(repl, inp)

    if isinstance(obj, dict):
        return {decode_str(k): unescape_json(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [unescape_json(x) for x in obj]
    elif isinstance(obj, text_type):
        return decode_str(obj)
    else:
        return obj
| 76 | + |
| 77 | + |
def lint_dat_format(path, encoding, first_header):
    """Parse a ``.dat`` test file and report duplicated headers.

    Reads *path* as text in *encoding* (or as raw bytes when *encoding* is
    None), parses it with the package's .dat parser, prints a diagnostic for
    every header that occurs more than once within a single test, and
    returns the tests as a list of OrderedDicts.  Returns None (after
    printing) when the file does not parse.
    """
    try:
        if encoding is None:
            with open(path, "rb") as fp:
                contents = fp.read()
        else:
            with codecs.open(path, "r", encoding=encoding) as fp:
                contents = fp.read()
        tests = parser.parse(contents, first_header)
    except NoParseError as err:
        print("parse error in %s, %s" % (path, err))
        return

    for test in tests:
        occurrences = Counter(header for header, _ in test)
        for header in set(occurrences):
            count = occurrences[header]
            if count > 1:
                print("%s occurs %d times in one test in %s" % (header, count, path))

    return [OrderedDict(test) for test in tests]
| 100 | + |
| 101 | + |
def lint_encoding_test(path):
    """Lint one encoding ``.dat`` file for unexpected section headings."""
    tests = lint_dat_format(path, None, b"data")
    if not tests:
        return
    allowed_order = [b"data", b"encoding"]
    for test in tests:
        headings = list(test.keys())
        if not is_subsequence(headings, allowed_order):
            print("unexpected test headings %r in %s" % (test.keys(), path))
| 109 | + |
| 110 | + |
def lint_encoding_tests(path):
    """Recursively lint every ``.dat`` file beneath *path* as an encoding test."""
    for root, _dirs, names in os.walk(path):
        for name in names:
            if name.endswith(".dat"):
                lint_encoding_test(clean_path(join(root, name)))
| 117 | + |
| 118 | + |
def lint_tokenizer_test(path):
    """Lint one tokenizer ``.test`` JSON file.

    Prints a diagnostic when the top level is not a JSON object, when a test
    group is not a list, when a test is missing the required ``input`` or
    ``output`` properties, or when a test carries properties outside the
    known set.  Returns None.

    Fix: the "Test groups must be a lists" diagnostic had a grammatical
    error ("a lists"); corrected to "lists".
    """
    all_keys = {"description", "input", "output", "initialStates",
                "lastStartTag", "ignoreErrorOrder", "doubleEscaped"}
    required = {"input", "output"}
    with codecs.open(path, "r", "utf-8") as fp:
        parsed = json.load(fp)
    if not parsed:
        return
    if not isinstance(parsed, dict):
        print("Top-level must be an object in %s" % path)
        return
    for test_group in parsed.values():
        if not isinstance(test_group, list):
            print("Test groups must be lists in %s" % path)
            continue
        for test in test_group:
            if 'doubleEscaped' in test and test['doubleEscaped'] is True:
                # Escapes are doubled in the raw JSON; undo one level
                # before inspecting the keys.
                test = unescape_json(test)
            keys = set(test.keys())
            if not (required <= keys):
                print("missing test properties %r in %s" % (required - keys, path))
            if not (keys <= all_keys):
                print("unknown test properties %r in %s" % (keys - all_keys, path))
| 142 | + |
| 143 | + |
def lint_tokenizer_tests(path):
    """Recursively lint every ``.test`` file beneath *path*."""
    for root, _dirs, names in os.walk(path):
        for name in names:
            if name.endswith(".test"):
                lint_tokenizer_test(clean_path(join(root, name)))
| 150 | + |
| 151 | + |
def lint_tree_construction_test(path):
    """Lint one tree-construction ``.dat`` file for unexpected headings."""
    tests = lint_dat_format(path, "utf-8", "data")
    if not tests:
        return
    allowed_order = ["data", "errors", "document-fragment",
                     "script-off", "script-on", "document"]
    for test in tests:
        headings = list(test.keys())
        if not is_subsequence(headings, allowed_order):
            print("unexpected test headings %r in %s" % (test.keys(), path))
| 160 | + |
| 161 | + |
def lint_tree_construction_tests(path):
    """Recursively lint every ``.dat`` file beneath *path* as tree-construction tests."""
    for root, _dirs, names in os.walk(path):
        for name in names:
            if name.endswith(".dat"):
                lint_tree_construction_test(clean_path(join(root, name)))
| 168 | + |
| 169 | + |
if __name__ == "__main__":
    # Lint the encoding, tokenizer, and tree-construction suites that live
    # under the repository base directory (the parent of this package).
    lint_encoding_tests(join(base, "encoding"))
    lint_tokenizer_tests(join(base, "tokenizer"))
    lint_tree_construction_tests(join(base, "tree-construction"))
0 commit comments