Skip to content

Commit 71bd617

Browse files
inikulin, Diego, zcorpan
authored and committed
Refer to tokenization errors as per spec (html5lib#92)
Now that the spec defines a unique ID for each parse error in the tokenizer, refer to the errors using those IDs. This also separates them out, no longer treating them as tokens, to allow implementations to check line/col positions for each error. Co-authored-by: Diego <[email protected]> Co-authored-by: Simon Pieters <[email protected]>
1 parent ddb8a23 commit 71bd617

34 files changed

+25131
-18585
lines changed

tokenizer/README.md

+4-2
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@ Basic Structure
1414
    "output": [expected_output_tokens],
1515
    "initialStates": [initial_states],
1616
    "lastStartTag": last_start_tag,
17-
    "ignoreErrorOrder": ignore_error_order
17+
"errors": [parse_errors]
1818
    }
1919
]}
2020

2121
Multiple tests per file are allowed simply by adding more objects to the
2222
"tests" list.
2323

24+
Each parse error is an object that contains an error `code` and one-based
25+
error location indices: `line` and `col`.
26+
2427
`description`, `input` and `output` are always present. The other values
2528
are optional.
2629

@@ -65,7 +68,6 @@ tokens are:
6568
["EndTag", name]
6669
["Comment", data]
6770
["Character", data]
68-
"ParseError"
6971

7072
`public_id` and `system_id` are either strings or `null`. `correctness`
7173
is either `true` or `false`; `true` corresponds to the force-quirks flag

tokenizer/contentModelFlags.test

+8-2
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@
2222
"initialStates":["RCDATA state", "RAWTEXT state"],
2323
"lastStartTag":"xmp",
2424
"input":"foo</xmp ",
25-
"output":[["Character", "foo"], "ParseError"]},
25+
"output":[["Character", "foo"]],
26+
"errors":[
27+
{ "code": "eof-in-tag", "line": 1, "col": 10 }
28+
]},
2629

2730
{"description":"End tag closing RCDATA or RAWTEXT (ending with EOF)",
2831
"initialStates":["RCDATA state", "RAWTEXT state"],
@@ -34,7 +37,10 @@
3437
"initialStates":["RCDATA state", "RAWTEXT state"],
3538
"lastStartTag":"xmp",
3639
"input":"foo</xmp/",
37-
"output":[["Character", "foo"], "ParseError"]},
40+
"output":[["Character", "foo"]],
41+
"errors":[
42+
{ "code": "eof-in-tag", "line": 1, "col": 10 }
43+
]},
3844

3945
{"description":"End tag not closing RCDATA or RAWTEXT (ending with left-angle-bracket)",
4046
"initialStates":["RCDATA state", "RAWTEXT state"],

tokenizer/domjs.test

+125-9
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,114 @@
33
{
44
"description":"CR in bogus comment state",
55
"input":"<?\u000d",
6-
"output":["ParseError", ["Comment", "?\u000a"]]
6+
"output":[["Comment", "?\u000a"]],
7+
"errors":[
8+
{ "code": "unexpected-question-mark-instead-of-tag-name", "line": 1, "col": 2 }
9+
]
710
},
811
{
912
"description":"CRLF in bogus comment state",
1013
"input":"<?\u000d\u000a",
11-
"output":["ParseError", ["Comment", "?\u000a"]]
14+
"output":[["Comment", "?\u000a"]],
15+
"errors":[
16+
{ "code": "unexpected-question-mark-instead-of-tag-name", "line": 1, "col": 2 }
17+
]
1218
},
1319
{
1420
"description":"CRLFLF in bogus comment state",
1521
"input":"<?\u000d\u000a\u000a",
16-
"output":["ParseError", ["Comment", "?\u000a\u000a"]]
22+
"output":[["Comment", "?\u000a\u000a"]],
23+
"errors":[
24+
{ "code": "unexpected-question-mark-instead-of-tag-name", "line": 1, "col": 2 }
25+
]
1726
},
1827
{
19-
"description":"NUL in RCDATA and RAWTEXT",
28+
"description":"NUL in RCDATA, RAWTEXT, PLAINTEXT and Script data",
2029
"doubleEscaped":true,
21-
"initialStates":["RCDATA state", "RAWTEXT state"],
30+
"initialStates":["RCDATA state", "RAWTEXT state", "PLAINTEXT state", "Script data state"],
2231
"input":"\\u0000",
23-
"output":["ParseError", ["Character", "\\uFFFD"]]
32+
"output":[["Character", "\\uFFFD"]],
33+
"errors":[
34+
{ "code": "unexpected-null-character", "line": 1, "col": 1 }
35+
]
36+
},
37+
{
38+
"description":"NUL in script HTML comment",
39+
"doubleEscaped":true,
40+
"initialStates":["Script data state"],
41+
"input":"<!--test\\u0000--><!--test-\\u0000--><!--test--\\u0000-->",
42+
"output":[["Character", "<!--test\\uFFFD--><!--test-\\uFFFD--><!--test--\\uFFFD-->"]],
43+
"errors":[
44+
{ "code": "unexpected-null-character", "line": 1, "col": 9 },
45+
{ "code": "unexpected-null-character", "line": 1, "col": 22 },
46+
{ "code": "unexpected-null-character", "line": 1, "col": 36 }
47+
]
48+
},
49+
{
50+
"description":"NUL in script HTML comment - double escaped",
51+
"doubleEscaped":true,
52+
"initialStates":["Script data state"],
53+
"input":"<!--<script>\\u0000--><!--<script>-\\u0000--><!--<script>--\\u0000-->",
54+
"output":[["Character", "<!--<script>\\uFFFD--><!--<script>-\\uFFFD--><!--<script>--\\uFFFD-->"]],
55+
"errors":[
56+
{ "code": "unexpected-null-character", "line": 1, "col": 13 },
57+
{ "code": "unexpected-null-character", "line": 1, "col": 30 },
58+
{ "code": "unexpected-null-character", "line": 1, "col": 48 }
59+
]
60+
},
61+
{
62+
"description":"EOF in script HTML comment",
63+
"initialStates":["Script data state"],
64+
"input":"<!--test",
65+
"output":[["Character", "<!--test"]],
66+
"errors":[
67+
{ "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 9 }
68+
]
69+
},
70+
{
71+
"description":"EOF in script HTML comment after dash",
72+
"initialStates":["Script data state"],
73+
"input":"<!--test-",
74+
"output":[["Character", "<!--test-"]],
75+
"errors":[
76+
{ "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 10 }
77+
]
78+
},
79+
{
80+
"description":"EOF in script HTML comment after dash dash",
81+
"initialStates":["Script data state"],
82+
"input":"<!--test--",
83+
"output":[["Character", "<!--test--"]],
84+
"errors":[
85+
{ "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 11 }
86+
]
87+
},
88+
{
89+
"description":"EOF in script HTML comment double escaped after dash",
90+
"initialStates":["Script data state"],
91+
"input":"<!--<script>-",
92+
"output":[["Character", "<!--<script>-"]],
93+
"errors":[
94+
{ "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 14 }
95+
]
96+
},
97+
{
98+
"description":"EOF in script HTML comment double escaped after dash dash",
99+
"initialStates":["Script data state"],
100+
"input":"<!--<script>--",
101+
"output":[["Character", "<!--<script>--"]],
102+
"errors":[
103+
{ "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 15 }
104+
]
105+
},
106+
{
107+
"description":"EOF in script HTML comment - double escaped",
108+
"initialStates":["Script data state"],
109+
"input":"<!--<script>",
110+
"output":[["Character", "<!--<script>"]],
111+
"errors":[
112+
{ "code": "eof-in-script-html-comment-like-text", "line": 1, "col": 13 }
113+
]
24114
},
25115
{
26116
"description":"leading U+FEFF must pass through",
@@ -38,7 +128,10 @@
38128
"description":"Bad charref in in RCDATA",
39129
"initialStates":["RCDATA state"],
40130
"input":"&NotEqualTild;",
41-
"output":["ParseError", ["Character", "&NotEqualTild;"]]
131+
"output":[["Character", "&NotEqualTild;"]],
132+
"errors":[
133+
{ "code": "unknown-named-character-reference", "line": 1, "col": 14 }
134+
]
42135
},
43136
{
44137
"description":"lowercase endtags in RCDATA and RAWTEXT",
@@ -84,12 +177,35 @@
84177
"description":"--!NUL in comment ",
85178
"doubleEscaped":true,
86179
"input":"<!----!\\u0000-->",
87-
"output":["ParseError", "ParseError", ["Comment", "--!\\uFFFD"]]
180+
"output":[["Comment", "--!\\uFFFD"]],
181+
"errors":[
182+
{ "code": "unexpected-null-character", "line": 1, "col": 8 }
183+
]
88184
},
89185
{
90186
"description":"space EOF after doctype ",
91187
"input":"<!DOCTYPE html ",
92-
"output":["ParseError", ["DOCTYPE", "html", null, null , false]]
188+
"output":[["DOCTYPE", "html", null, null , false]],
189+
"errors":[
190+
{ "code": "eof-in-doctype", "line": 1, "col": 16 }
191+
]
192+
},
193+
{
194+
"description":"CDATA in HTML content",
195+
"input":"<![CDATA[foo]]>",
196+
"output":[["Comment", "[CDATA[foo]]"]],
197+
"errors":[
198+
{ "code": "cdata-in-html-content", "line": 1, "col": 9 }
199+
]
200+
},
201+
{
202+
"description":"CDATA content",
203+
"input":"foo&bar",
204+
"initialStates":["CDATA section state"],
205+
"output":[["Character", "foo&bar"]],
206+
"errors":[
207+
{ "code": "eof-in-cdata", "line": 1, "col": 8 }
208+
]
93209
}
94210

95211
]

0 commit comments

Comments
 (0)