Skip to content

Commit af0199c

Browse files
committed
Merge pull request #175 from gsnedders/fix_tokenizer_201411
Get rid of obsolete replacement of unpaired surrogates with U+FFFD.
2 parents 93ee3b3 + f27af70 commit af0199c

File tree

2 files changed

+1
-8
lines changed

2 files changed

+1
-8
lines changed

.pytest.expect

-228 Bytes
Binary file not shown.

html5lib/inputstream.py

+1 -8
Original file line number | Diff line number | Diff line change
@@ -185,14 +185,10 @@ def __init__(self, source):
185185
# Such platforms will have already checked for such
186186
# surrogate errors, so no need to do this checking.
187187
self.reportCharacterErrors = None
188-
self.replaceCharactersRegexp = None
189188
elif len("\U0010FFFF") == 1:
190189
self.reportCharacterErrors = self.characterErrorsUCS4
191-
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
192190
else:
193191
self.reportCharacterErrors = self.characterErrorsUCS2
194-
self.replaceCharactersRegexp = re.compile(
195-
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
196192

197193
# List of where new lines occur
198194
self.newLines = [0]
@@ -290,10 +286,7 @@ def readChunk(self, chunkSize=None):
290286
if self.reportCharacterErrors:
291287
self.reportCharacterErrors(data)
292288

293-
# Replace invalid characters
294-
# Note U+0000 is dealt with in the tokenizer
295-
data = self.replaceCharactersRegexp.sub("\ufffd", data)
296-
289+
# Replace invalid characters
297290
data = data.replace("\r\n", "\n")
298291
data = data.replace("\r", "\n")
299292

0 commit comments

Comments
 (0)