From c5a800779b5af3d46aa4dbc69a171fdc47906005 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 4 May 2016 00:05:51 +0100 Subject: [PATCH 1/3] Add a better test for encoding prescan length --- html5lib/tests/test_encoding.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index 837e989f..6c996b00 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -12,6 +12,15 @@ from html5lib import HTMLParser, inputstream +def test_basic_prescan_length(): + data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8') + pad = 1024 - len(data) + 1 + data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-") + assert len(data) == 1024 # Sanity + stream = inputstream.HTMLBinaryInputStream(data, chardet=False) + assert 'utf-8' == stream.charEncoding[0].name + + def runParserEncodingTest(data, encoding): p = HTMLParser() assert p.documentEncoding is None From 1d9f391f6f92677c29803272d9a3e27831a84814 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 4 May 2016 00:06:24 +0100 Subject: [PATCH 2/3] Fix changing encoding to actually change encoding; add test for it --- html5lib/inputstream.py | 2 +- html5lib/tests/test_encoding.py | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py index ad5ca7dc..15acba0d 100644 --- a/html5lib/inputstream.py +++ b/html5lib/inputstream.py @@ -509,8 +509,8 @@ def changeEncoding(self, newEncoding): self.charEncoding = (self.charEncoding[0], "certain") else: self.rawStream.seek(0) - self.reset() self.charEncoding = (newEncoding, "certain") + self.reset() raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding)) def detectBOM(self): diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py index 6c996b00..3837fe09 100644 --- a/html5lib/tests/test_encoding.py +++ b/html5lib/tests/test_encoding.py @@ -21,6 +21,19 @@ def 
test_basic_prescan_length(): assert 'utf-8' == stream.charEncoding[0].name +def test_parser_reparse(): + data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8') + pad = 10240 - len(data) + 1 + data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-") + assert len(data) == 10240 # Sanity + stream = inputstream.HTMLBinaryInputStream(data, chardet=False) + assert 'windows-1252' == stream.charEncoding[0].name + p = HTMLParser(namespaceHTMLElements=False) + doc = p.parse(data, useChardet=False) + assert 'utf-8' == p.documentEncoding + assert doc.find(".//title").text == "Caf\u00E9" + + def runParserEncodingTest(data, encoding): p = HTMLParser() assert p.documentEncoding is None From b0ae0c2aaa5c2d74602a9dc576f10753f1383882 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Wed, 4 May 2016 00:06:48 +0100 Subject: [PATCH 3/3] Ensure we only ever reparse *once* --- html5lib/html5parser.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index a7cb98be..34f7ac5c 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -89,12 +89,11 @@ def _parse(self, stream, innerHTML=False, container="div", parser=self, **kwargs) self.reset() - while True: - try: - self.mainLoop() - break - except ReparseException: - self.reset() + try: + self.mainLoop() + except ReparseException: + self.reset() + self.mainLoop() def reset(self): self.tree.reset()