From 04ff4c12f4098c9f1260d50e4c613864a5f8c054 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon
Date: Tue, 3 May 2016 14:59:05 +0200
Subject: [PATCH 1/2] assert that after assigning self.charEncoding it's not None

---
 html5lib/inputstream.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 5694efe3..27987a8a 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -430,6 +430,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         # Detect encoding iff no explicit "transport level" encoding is supplied
         if (self.charEncoding[0] is None):
             self.charEncoding = self.detectEncoding(parseMeta, chardet)
+        assert self.charEncoding[0] is not None
 
         # Call superclass
         self.reset()

From 9ba3b280d8548862703076bbbc2b5377718d4c83 Mon Sep 17 00:00:00 2001
From: Geoffrey Sneddon
Date: Tue, 3 May 2016 15:05:06 +0200
Subject: [PATCH 2/2] Increase encoding pre-scan length to 1024, per spec from 2011(!)

51babfe760a1dbe28c4521b2070e692ac872550a was the spec change.
---
 html5lib/inputstream.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
index 27987a8a..ad5ca7dc 100644
--- a/html5lib/inputstream.py
+++ b/html5lib/inputstream.py
@@ -421,7 +421,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         # Encoding Information
         # Number of bytes to use when looking for a meta element with
         # encoding information
-        self.numBytesMeta = 512
+        self.numBytesMeta = 1024
         # Number of bytes to use when using detecting encoding using chardet
         self.numBytesChardet = 100
         # Encoding to use if no other information can be found