
Commit a65a0e8

First attempt at linter
1 parent 2998f9a commit a65a0e8

File tree

4 files changed (+306, -0)


lint

+2
@@ -0,0 +1,2 @@
#!/bin/sh
python -m lint_lib.lint

lint_lib/__init__.py

Whitespace-only changes.

lint_lib/lint.py

+173
@@ -0,0 +1,173 @@
from __future__ import unicode_literals, print_function

import codecs
import re
import json
import os
import sys
from collections import Counter, OrderedDict
from os.path import dirname, join, pardir, relpath

from funcparserlib.parser import NoParseError

from . import parser

text_type = type("")
binary_type = type(b"")

try:
    unichr
except NameError:
    unichr = chr

base = join(dirname(__file__), pardir)

_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")


def clean_path(path):
    return relpath(path, base)


def is_subsequence(l1, l2):
    """Check whether l1 is a subsequence of l2."""
    i = 0
    for x in l2:
        if l1[i] == x:
            i += 1
            if i == len(l1):
                return True
    return False


def unescape_json(obj):
    def decode_str(inp):
        """Decode \\uXXXX escapes

        This decodes \\uXXXX escapes, possibly into non-BMP characters when
        two surrogate character escapes are adjacent to each other.
        """
        # This cannot be implemented using the unicode_escape codec
        # because that requires its input be ISO-8859-1, and we need
        # arbitrary unicode as input.
        def repl(m):
            if m.group(2) is not None:
                high = int(m.group(1), 16)
                low = int(m.group(2), 16)
                if (0xD800 <= high <= 0xDBFF and
                        0xDC00 <= low <= 0xDFFF and
                        sys.maxunicode == 0x10FFFF):
                    cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
                    return unichr(cp)
                else:
                    return unichr(high) + unichr(low)
            else:
                return unichr(int(m.group(1), 16))
        return _surrogateRe.sub(repl, inp)

    if isinstance(obj, dict):
        return {decode_str(k): unescape_json(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [unescape_json(x) for x in obj]
    elif isinstance(obj, text_type):
        return decode_str(obj)
    else:
        return obj


def lint_dat_format(path, encoding, first_header):
    try:
        if encoding is not None:
            with codecs.open(path, "r", encoding=encoding) as fp:
                dat = fp.read()
            parsed = parser.parse(dat, first_header)
        else:
            with open(path, "rb") as fp:
                dat = fp.read()
            parsed = parser.parse(dat, first_header)
    except NoParseError as e:
        print("parse error in %s, %s" % (path, e))
        return

    for item in parsed:
        headers = Counter(x[0] for x in item)
        headers.subtract(set(headers.elements()))  # remove one instance of each
        for header in set(headers.elements()):
            c = headers[header]
            print("%s occurs %d times in one test in %s" % (header, c + 1, path))

    return [OrderedDict(x) for x in parsed]


def lint_encoding_test(path):
    parsed = lint_dat_format(path, None, b"data")
    if not parsed:
        return
    for test in parsed:
        if not is_subsequence(list(test.keys()), [b"data", b"encoding"]):
            print("unexpected test headings %r in %s" % (test.keys(), path))


def lint_encoding_tests(path):
    for root, dirs, files in os.walk(path):
        for file in files:
            if not file.endswith(".dat"):
                continue
            lint_encoding_test(clean_path(join(root, file)))


def lint_tokenizer_test(path):
    all_keys = set(["description", "input", "output", "initialStates",
                    "lastStartTag", "ignoreErrorOrder", "doubleEscaped"])
    required = set(["input", "output"])
    with codecs.open(path, "r", "utf-8") as fp:
        parsed = json.load(fp)
    if not parsed:
        return
    if not isinstance(parsed, dict):
        print("Top-level must be an object in %s" % path)
        return
    for test_group in parsed.values():
        if not isinstance(test_group, list):
            print("Test groups must be lists in %s" % path)
            continue
        for test in test_group:
            if 'doubleEscaped' in test and test['doubleEscaped'] is True:
                test = unescape_json(test)
            keys = set(test.keys())
            if not (required <= keys):
                print("missing test properties %r in %s" % (required - keys, path))
            if not (keys <= all_keys):
                print("unknown test properties %r in %s" % (keys - all_keys, path))


def lint_tokenizer_tests(path):
    for root, dirs, files in os.walk(path):
        for file in files:
            if not file.endswith(".test"):
                continue
            lint_tokenizer_test(clean_path(join(root, file)))


def lint_tree_construction_test(path):
    parsed = lint_dat_format(path, "utf-8", "data")
    if not parsed:
        return
    for test in parsed:
        if not is_subsequence(list(test.keys()), ["data", "errors", "document-fragment",
                                                  "script-off", "script-on", "document"]):
            print("unexpected test headings %r in %s" % (test.keys(), path))


def lint_tree_construction_tests(path):
    for root, dirs, files in os.walk(path):
        for file in files:
            if not file.endswith(".dat"):
                continue
            lint_tree_construction_test(clean_path(join(root, file)))


if __name__ == "__main__":
    lint_encoding_tests(join(base, "encoding"))
    lint_tokenizer_tests(join(base, "tokenizer"))
    lint_tree_construction_tests(join(base, "tree-construction"))

lint_lib/parser.py

+131
@@ -0,0 +1,131 @@
from __future__ import unicode_literals

import re

from funcparserlib.lexer import Token, LexerError
from funcparserlib.parser import (Parser, State, NoParseError,
                                  finished, many, pure, skip, some)

text_type = type("")
binary_type = type(b"")


def _make_tokenizer(specs):
    # Forked from upstream funcparserlib.lexer to fix #44 and #46
    def compile_spec(spec):
        name, args = spec
        return name, re.compile(*args)

    compiled = [compile_spec(s) for s in specs]

    def match_specs(specs, str, i, position):
        if isinstance(str, text_type):
            lf = "\n"
        else:
            lf = b"\n"
        line, pos = position
        for type, regexp in specs:
            m = regexp.match(str, i)
            if m is not None:
                value = m.group()
                nls = value.count(lf)
                n_line = line + nls
                if nls == 0:
                    n_pos = pos + len(value)
                else:
                    n_pos = len(value) - value.rfind(lf) - 1
                return Token(type, value, (line, pos + 1), (n_line, n_pos))
        else:
            errline = str.splitlines()[line - 1]
            raise LexerError((line, pos + 1), errline)

    def f(str):
        length = len(str)
        line, pos = 1, 0
        i = 0
        r = []
        while i < length:
            t = match_specs(compiled, str, i, (line, pos))
            r.append(t)
            line, pos = t.end
            i += len(t.value)
        return r

    return f


_token_specs_u = [
    ('HEADER', (r"#[^\n]*\n",)),
    ('BODY', (r"[^#\n][^\n]*\n",)),
    ('EMPTY', (r'\n',)),
]

_token_specs_b = [(name, (regexp.encode("ascii"),))
                  for (name, (regexp,)) in _token_specs_u]

_tokenizer_u = _make_tokenizer(_token_specs_u)
_tokenizer_b = _make_tokenizer(_token_specs_b)


def _tokval(tok):
    return tok.value


def _headerval(tok):
    return tok.value[1:].strip()


def _many_merge(toks):
    x, xs = toks
    return [x] + xs


def _notFollowedBy(p):
    @Parser
    def __notFollowedBy(tokens, s):
        try:
            p.run(tokens, s)
        except NoParseError as e:
            return skip(pure(None)).run(tokens, State(s.pos, e.state.max))
        else:
            raise NoParseError(u'is followed by', s)

    __notFollowedBy.name = u'(notFollowedBy %s)' % (p,)
    return __notFollowedBy


def _parser(tokens, new_test_header, tok_type):
    first_header = (some(lambda tok: tok.type == "HEADER" and
                         _headerval(tok) == new_test_header) >>
                    _headerval)
    header = (some(lambda tok: tok.type == "HEADER" and
                   _headerval(tok) != new_test_header) >>
              _headerval)
    body = some(lambda tok: tok.type == "BODY") >> _tokval
    empty = some(lambda tok: tok.type == "EMPTY") >> _tokval

    actual_body = (many(body | (empty + _notFollowedBy(first_header))) >>
                   (lambda xs: tok_type().join(xs)[:-1]))

    first_segment = first_header + actual_body >> tuple
    rest_segment = header + actual_body >> tuple

    test = first_segment + many(rest_segment) >> _many_merge

    tests = (test + many(skip(empty) + test)) >> _many_merge

    toplevel = tests + skip(finished)

    return toplevel.parse(tokens)


def parse(s, new_test_header):
    if type(s) != type(new_test_header):
        raise TypeError("s and new_test_header must have same type")

    if isinstance(s, text_type):
        return _parser(_tokenizer_u(s), new_test_header, text_type)
    elif isinstance(s, binary_type):
        return _parser(_tokenizer_b(s), new_test_header, binary_type)
    else:
        raise TypeError("s must be unicode or bytes object")