From 8e19e7ad29473842154977d7624aee0097a6def2 Mon Sep 17 00:00:00 2001 From: Ingvar Stepanyan Date: Mon, 17 Jul 2017 15:56:04 +0100 Subject: [PATCH] Concatenate character tokens Looks like these few places were missed when ParseError token type was removed. This PR fixes them to restore the state promised in the README: > All adjacent character tokens are coalesced into a single ["Character", data] token. --- tokenizer/test1.test | 4 ++-- tokenizer/test2.test | 6 +++--- tokenizer/test3.test | 4 ++-- tokenizer/test4.test | 6 +++--- tokenizer/unicodeCharsProblematic.test | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tokenizer/test1.test b/tokenizer/test1.test index 09d15024..8b85050f 100644 --- a/tokenizer/test1.test +++ b/tokenizer/test1.test @@ -182,14 +182,14 @@ {"description":"Entity without trailing semicolon (1)", "input":"I'm &notit", -"output":[["Character","I'm "], ["Character", "\u00ACit"]], +"output":[["Character","I'm \u00ACit"]], "errors": [ {"code" : "missing-semicolon-after-character-reference", "line": 1, "col": 9 } ]}, {"description":"Entity without trailing semicolon (2)", "input":"I'm &notin", -"output":[["Character","I'm "], ["Character", "\u00ACin"]], +"output":[["Character","I'm \u00ACin"]], "errors": [ {"code" : "missing-semicolon-after-character-reference", "line": 1, "col": 9 } ]}, diff --git a/tokenizer/test2.test b/tokenizer/test2.test index 73f0421d..521694ca 100644 --- a/tokenizer/test2.test +++ b/tokenizer/test2.test @@ -119,7 +119,7 @@ {"description":"Hexadecimal entity pair representing a surrogate pair", "input":"��", -"output":[["Character", "\uFFFD"], ["Character", "\uFFFD"]], +"output":[["Character", "\uFFFD\uFFFD"]], "errors":[ { "code": "surrogate-character-reference", "line": 1, "col": 9 }, { "code": "surrogate-character-reference", "line": 1, "col": 17 } @@ -195,7 +195,7 @@ {"description":"Unescaped <", "input":"foo < bar", -"output":[["Character", "foo "], ["Character", "< bar"]], +"output":[["Character", 
"foo < bar"]], "errors":[ { "code": "invalid-first-character-of-tag-name", "line": 1, "col": 6 } ]}, @@ -242,7 +242,7 @@ {"description":"Empty end tag with following characters", "input":"a</>bc", -"output":[["Character", "a"], ["Character", "bc"]], +"output":[["Character", "abc"]], "errors":[ { "code": "missing-end-tag-name", "line": 1, "col": 4 } ]}, diff --git a/tokenizer/test3.test b/tokenizer/test3.test index ba3c15b3..85139d4d 100644 --- a/tokenizer/test3.test +++ b/tokenizer/test3.test @@ -88,7 +88,7 @@ {"description":"<\\u0000", "input":"<\u0000", -"output":[["Character", "<"], ["Character", "\u0000"]], +"output":[["Character", "<\u0000"]], "errors":[ { "code": "invalid-first-character-of-tag-name", "line": 1, "col": 2 }, { "code": "unexpected-null-character", "line": 1, "col": 2 } @@ -8415,7 +8415,7 @@ {"description":"<<", "input":"<<", -"output":[["Character", "<"], ["Character", "<"]], +"output":[["Character", "<<"]], "errors":[ { "code": "invalid-first-character-of-tag-name", "line": 1, "col": 2 }, { "code": "eof-before-tag-name", "line": 1, "col": 3 } diff --git a/tokenizer/test4.test b/tokenizer/test4.test index 8e55e767..dd247d54 100644 --- a/tokenizer/test4.test +++ b/tokenizer/test4.test @@ -190,7 +190,7 @@ {"description":"Empty hex numeric entities", "input":"&#x &#X ", -"output":[["Character", "&#x "], ["Character", "&#X "]], +"output":[["Character", "&#x &#X "]], "errors":[ { "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 4 }, { "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 8 } @@ -205,7 +205,7 @@ {"description":"Empty decimal numeric entities", "input":"&# &#; ", -"output":[["Character", "&# "], ["Character", "&#; "]], +"output":[["Character", "&# &#; "]], "errors":[ { "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 3 }, { "code": "absence-of-digits-in-numeric-character-reference", "line": 1, "col": 6 } @@ -274,7 +274,7 @@ {"description":"Surrogate code 
point edge cases", "input":"퟿����", -"output":[["Character", "\uD7FF"], ["Character", "\uFFFD"], ["Character", "\uFFFD"], ["Character", "\uFFFD"], ["Character", "\uFFFD\uE000"]], +"output":[["Character", "\uD7FF\uFFFD\uFFFD\uFFFD\uFFFD\uE000"]], "errors":[ { "code": "surrogate-character-reference", "line": 1, "col": 17 }, { "code": "surrogate-character-reference", "line": 1, "col": 25 }, diff --git a/tokenizer/unicodeCharsProblematic.test b/tokenizer/unicodeCharsProblematic.test index 346cad17..3ddb96c0 100644 --- a/tokenizer/unicodeCharsProblematic.test +++ b/tokenizer/unicodeCharsProblematic.test @@ -18,7 +18,7 @@ {"description": "Invalid Unicode character U+DFFF with valid preceding character", "doubleEscaped":true, "input": "a\\uDFFF", -"output":[["Character", "a"], ["Character", "\\uDFFF"]], +"output":[["Character", "a\\uDFFF"]], "errors":[ { "code": "surrogate-in-input-stream", "line": 1, "col": 2 } ]}, @@ -33,7 +33,7 @@ {"description":"CR followed by U+0000", "input":"\r\u0000", -"output":[["Character", "\n"], ["Character", "\u0000"]], +"output":[["Character", "\n\u0000"]], "errors":[ { "code": "unexpected-null-character", "line": 2, "col": 1 } ]}