Skip to content

Commit a5e861a

Browse files
committed
WIP More general fix for html5lib#127 with addinfourl
See html5lib#134.
1 parent c36197d commit a5e861a

File tree

2 files changed

+57
-10
lines changed

2 files changed

+57
-10
lines changed

html5lib/inputstream.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import absolute_import, division, unicode_literals
22
from six import text_type
3-
from six.moves import http_client
43

54
import codecs
65
import re
@@ -119,22 +118,24 @@ def _readFromBuffer(self, bytes):
119118

120119

121120
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
122-
if isinstance(source, http_client.HTTPResponse):
123-
# Work around Python bug #20007: read(0) closes the connection.
121+
if hasattr(source, "read"):
122+
# Do not use .read(0) because of Python bug #20007
124123
# http://bugs.python.org/issue20007
125-
isUnicode = False
126-
elif hasattr(source, "read"):
127-
isUnicode = isinstance(source.read(0), text_type)
124+
firstChunk = source.read(HTMLUnicodeInputStream._defaultChunkSize)
125+
print(firstChunk)
126+
isUnicode = isinstance(firstChunk, text_type)
128127
else:
129128
isUnicode = isinstance(source, text_type)
129+
firstChunk = "" if isUnicode else b""
130130

131131
if isUnicode:
132132
if encoding is not None:
133133
raise TypeError("Cannot explicitly set an encoding with a unicode string")
134134

135-
return HTMLUnicodeInputStream(source)
135+
return HTMLUnicodeInputStream(source, firstChunk)
136136
else:
137-
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
137+
return HTMLBinaryInputStream(
138+
source, firstChunk, encoding, parseMeta, chardet)
138139

139140

140141
class HTMLUnicodeInputStream(object):
@@ -147,7 +148,7 @@ class HTMLUnicodeInputStream(object):
147148

148149
_defaultChunkSize = 10240
149150

150-
def __init__(self, source):
151+
def __init__(self, source, firstChunk=""):
151152
"""Initialises the HTMLInputStream.
152153
153154
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -163,6 +164,7 @@ def __init__(self, source):
163164
parseMeta - Look for a <meta> element containing encoding information
164165
165166
"""
167+
# XXX do something with firstChunk
166168

167169
# Craziness
168170
if len("\U0010FFFF") == 1:
@@ -378,7 +380,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
378380
379381
"""
380382

381-
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
383+
def __init__(self, source, firstChunk=b"", encoding=None,
384+
parseMeta=True, chardet=True):
382385
"""Initialises the HTMLInputStream.
383386
384387
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -394,6 +397,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
394397
parseMeta - Look for a <meta> element containing encoding information
395398
396399
"""
400+
# XXX do something with firstChunk
401+
397402
# Raw Stream - for unicode objects this will encode to utf-8 and set
398403
# self.charEncoding as appropriate
399404
self.rawStream = self.openStream(source)

html5lib/tests/test_stream.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# coding: utf8
2+
13
from __future__ import absolute_import, division, unicode_literals
24

35
from . import support # flake8: noqa
@@ -6,6 +8,7 @@
68
from io import BytesIO
79

810
from six.moves import http_client
11+
from six.moves.urllib.response import addinfourl
912

1013
from html5lib.inputstream import (BufferedStream, HTMLInputStream,
1114
HTMLUnicodeInputStream, HTMLBinaryInputStream)
@@ -156,6 +159,25 @@ def test_position2(self):
156159
self.assertEqual(stream.char(), "d")
157160
self.assertEqual(stream.position(), (2, 1))
158161

162+
def test_non_seekable_stream(self):
163+
class Stream(object):
164+
def __init__(self, data):
165+
self.data = data
166+
167+
def read(self, n=None):
168+
if n is None:
169+
data = self.data
170+
self.data = b''
171+
return data
172+
else:
173+
data = self.data[:n]
174+
self.data = self.data[n:]
175+
return data
176+
177+
# Fails when firstChunk is ignored
178+
stream = HTMLInputStream(Stream(b"Test"))
179+
self.assertEqual(stream.charsUntil(" "), "Test")
180+
159181
def test_python_issue_20007(self):
160182
"""
161183
Make sure we have a work-around for Python bug #20007
@@ -170,6 +192,26 @@ def makefile(self, _mode, _bufsize=None):
170192
stream = HTMLInputStream(source)
171193
self.assertEqual(stream.charsUntil(" "), "Text")
172194

195+
def test_python_issue_20007_addinfourl(self):
196+
"""
197+
Same as above, but the source is not necessarily an instance
198+
of HTTPResponse.
199+
"""
200+
class FakeSocket(object):
201+
def makefile(self, _mode, _bufsize=None):
202+
return BytesIO(b"HTTP/1.1 200 Ok\r\n\r\nText")
203+
204+
source = http_client.HTTPResponse(FakeSocket())
205+
source.begin()
206+
try:
207+
source = addinfourl(source, None, None)
208+
except AttributeError:
209+
# Fails on Python 2.x where HTTPResponse does not have .readline()
210+
# Apparently, addinfourl is only used with HTTPResponse on 3.x
211+
pass
212+
else:
213+
stream = HTMLInputStream(source)
214+
self.assertEqual(stream.charsUntil(" "), "Text")
173215

174216
def buildTestSuite():
175217
return unittest.defaultTestLoader.loadTestsFromName(__name__)

0 commit comments

Comments
 (0)