Skip to content

Commit a2d2e05

Browse files
kovidgoyal authored and gsnedders committed
Speed up parsing some more by using a faster stream class
1 parent 7702d80 commit a2d2e05

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

html5lib/inputstream.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ class BufferedIOBase(object):
2727
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
2828
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
2929

30-
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
30+
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") # noqa
3131

3232
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
3333
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -118,6 +118,10 @@ def _readFromBuffer(self, bytes):
118118

119119

120120
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
121+
if (hasattr(source, 'unget') and hasattr(source, 'charsUntil') and
122+
hasattr(source, 'position') and hasattr(source, 'char') and
123+
hasattr(source, 'reset') and hasattr(source, 'errors')):
124+
return source
121125
if hasattr(source, "read"):
122126
isUnicode = isinstance(source.read(0), text_type)
123127
else:

0 commit comments

Comments
 (0)