Skip to content

Commit 1d9f391

Browse files
committed
Fix changing encoding to actually change encoding; add test for it
1 parent c5a8007 commit 1d9f391

File tree

2 files changed

+14
-1
lines changed

2 files changed

+14
-1
lines changed

html5lib/inputstream.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -509,8 +509,8 @@ def changeEncoding(self, newEncoding):
509509
self.charEncoding = (self.charEncoding[0], "certain")
510510
else:
511511
self.rawStream.seek(0)
512-
self.reset()
513512
self.charEncoding = (newEncoding, "certain")
513+
self.reset()
514514
raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
515515

516516
def detectBOM(self):

html5lib/tests/test_encoding.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,19 @@ def test_basic_prescan_length():
2121
assert 'utf-8' == stream.charEncoding[0].name
2222

2323

24+
def test_parser_reparse():
25+
data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
26+
pad = 10240 - len(data) + 1
27+
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
28+
assert len(data) == 10240 # Sanity
29+
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
30+
assert 'windows-1252' == stream.charEncoding[0].name
31+
p = HTMLParser(namespaceHTMLElements=False)
32+
doc = p.parse(data, useChardet=False)
33+
assert 'utf-8' == p.documentEncoding
34+
assert doc.find(".//title").text == "Caf\u00E9"
35+
36+
2437
def runParserEncodingTest(data, encoding):
2538
p = HTMLParser()
2639
assert p.documentEncoding is None

0 commit comments

Comments
 (0)