Concatenate character tokens

RReverser · RReverser · commit 8e19e7ad2947 · 2017-07-18T15:26:43.000+01:00
It looks like these few places were missed when the ParseError token type was removed.

This PR fixes them to restore the state promised in the README:

> All adjacent character tokens are coalesced into a single ["Character", data] token.
diff --git a/tokenizer/test1.test b/tokenizer/test1.test
@@ -182,14 +182,14 @@
 
 {"description":"Entity without trailing semicolon (1)",
 "input":"I'm &notit",
-"output":[["Character","I'm "], ["Character", "\u00ACit"]],
+"output":[["Character","I'm \u00ACit"]],
 "errors": [
     {"code" : "missing-semicolon-after-character-reference", "line": 1, "col": 9 }
 ]},
 
 {"description":"Entity without trailing semicolon (2)",
 "input":"I'm &notin",
-"output":[["Character","I'm "], ["Character", "\u00ACin"]],
+"output":[["Character","I'm \u00ACin"]],
 "errors": [
     {"code" : "missing-semicolon-after-character-reference", "line": 1, "col": 9 }
 ]},
diff --git a/tokenizer/test2.test b/tokenizer/test2.test
@@ -119,7 +119,7 @@
 
 {"description":"Hexadecimal entity pair representing a surrogate pair",
 "input":"&#xD869;&#xDED6;",
-"output":[["Character", "\uFFFD"], ["Character", "\uFFFD"]],
+"output":[["Character", "\uFFFD\uFFFD"]],
 "errors":[
     { "code": "surrogate-character-reference", "line": 1, "col": 9 },
     { "code": "surrogate-character-reference", "line": 1, "col": 17 }
@@ -195,7 +195,7 @@
 
 {"description":"Unescaped <",
 "input":"foo < bar",
-"output":[["Character", "foo "], ["Character", "< bar"]],
+"output":[["Character", "foo < bar"]],
 "errors":[
     { "code": "invalid-first-character-of-tag-name", "line": 1, "col": 6 }
 ]},
@@ -242,7 +242,7 @@
 
 {"description":"Empty end tag with following characters",
 "input":"a</>bc",
-"output":[["Character", "a"], ["Character", "bc"]],
+"output":[["Character", "abc"]],
 "errors":[
     { "code": "missing-end-tag-name", "line": 1, "col": 4 }
 ]},
diff --git a/tokenizer/test3.test b/tokenizer/test3.test
@@ -88,7 +88,7 @@
 
 {"description":"<\\u0000",
 "input":"<\u0000",
-"output":[["Character", "<"], ["Character", "\u0000"]],
+"output":[["Character", "<\u0000"]],
 "errors":[
     { "code": "invalid-first-character-of-tag-name", "line": 1, "col": 2 },
     { "code": "unexpected-null-character", "line": 1, "col": 2 }
@@ -8415,7 +8415,7 @@
 
 {"description":"<<",
 "input":"<<",
-"output":[["Character", "<"], ["Character", "<"]],
+"output":[["Character", "<<"]],
 "errors":[
     { "code": "invalid-first-character-of-tag-name", "line": 1, "col": 2 },
     { "code": "eof-before-tag-name", "line": 1, "col": 3 }
diff --git a/tokenizer/test4.test b/tokenizer/test4.test
@@ -190,7 +190,7 @@
 
 {"description":"Empty hex numeric entities",
 "input":"&#x &#X ",
-"output":[["Character", "&#x "], ["Character", "&#X "]],
+"output":[["Character", "&#x &#X "]],
 "errors":[
     { "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 4 },
     { "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 8 }
@@ -205,7 +205,7 @@
 
 {"description":"Empty decimal numeric entities",
 "input":"&# &#; ",
-"output":[["Character", "&# "], ["Character", "&#; "]],
+"output":[["Character", "&# &#; "]],
 "errors":[
     { "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 3 },
     { "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 6 }
@@ -274,7 +274,7 @@
 
 {"description":"Surrogate code point edge cases",
 "input":"&#xD7FF;&#xD800;&#xD801;&#xDFFE;&#xDFFF;&#xE000;",
-"output":[["Character", "\uD7FF"], ["Character", "\uFFFD"], ["Character", "\uFFFD"], ["Character", "\uFFFD"], ["Character", "\uFFFD\uE000"]],
+"output":[["Character", "\uD7FF\uFFFD\uFFFD\uFFFD\uFFFD\uE000"]],
 "errors":[
     { "code": "surrogate-character-reference", "line": 1, "col": 17 },
     { "code": "surrogate-character-reference", "line": 1, "col": 25 },
diff --git a/tokenizer/unicodeCharsProblematic.test b/tokenizer/unicodeCharsProblematic.test
@@ -18,7 +18,7 @@
 {"description": "Invalid Unicode character U+DFFF with valid preceding character",
 "doubleEscaped":true,
 "input": "a\\uDFFF",
-"output":[["Character", "a"], ["Character", "\\uDFFF"]],
+"output":[["Character", "a\\uDFFF"]],
 "errors":[
     { "code": "surrogate-in-input-stream", "line": 1, "col": 2 }
 ]},
@@ -33,7 +33,7 @@
 
 {"description":"CR followed by U+0000",
 "input":"\r\u0000",
-"output":[["Character", "\n"], ["Character", "\u0000"]],
+"output":[["Character", "\n\u0000"]],
 "errors":[
     { "code": "unexpected-null-character", "line": 2, "col": 1 }
 ]}