1
1
from __future__ import absolute_import , division , unicode_literals
2
2
from six import text_type
3
- from six .moves import http_client
4
3
5
4
import codecs
6
5
import re
@@ -119,22 +118,24 @@ def _readFromBuffer(self, bytes):
119
118
120
119
121
120
def HTMLInputStream (source , encoding = None , parseMeta = True , chardet = True ):
122
- if isinstance (source , http_client . HTTPResponse ):
123
- # Work around Python bug #20007: read(0) closes the connection.
121
+ if hasattr (source , "read" ):
122
+ # Do not use .read(0) because of Python bug #20007
124
123
# http://bugs.python.org/issue20007
125
- isUnicode = False
126
- elif hasattr ( source , "read" ):
127
- isUnicode = isinstance (source . read ( 0 ) , text_type )
124
+ firstChunk = source . read ( HTMLUnicodeInputStream . _defaultChunkSize )
125
+ print ( firstChunk )
126
+ isUnicode = isinstance (firstChunk , text_type )
128
127
else :
129
128
isUnicode = isinstance (source , text_type )
129
+ firstChunk = "" if isUnicode else b""
130
130
131
131
if isUnicode :
132
132
if encoding is not None :
133
133
raise TypeError ("Cannot explicitly set an encoding with a unicode string" )
134
134
135
- return HTMLUnicodeInputStream (source )
135
+ return HTMLUnicodeInputStream (source , firstChunk )
136
136
else :
137
- return HTMLBinaryInputStream (source , encoding , parseMeta , chardet )
137
+ return HTMLBinaryInputStream (
138
+ source , firstChunk , encoding , parseMeta , chardet )
138
139
139
140
140
141
class HTMLUnicodeInputStream (object ):
@@ -147,7 +148,7 @@ class HTMLUnicodeInputStream(object):
147
148
148
149
_defaultChunkSize = 10240
149
150
150
- def __init__ (self , source ):
151
+ def __init__ (self , source , firstChunk = "" ):
151
152
"""Initialises the HTMLInputStream.
152
153
153
154
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -163,6 +164,7 @@ def __init__(self, source):
163
164
parseMeta - Look for a <meta> element containing encoding information
164
165
165
166
"""
167
+ # XXX do something with firstChunk
166
168
167
169
# Craziness
168
170
if len ("\U0010FFFF " ) == 1 :
@@ -378,7 +380,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
378
380
379
381
"""
380
382
381
- def __init__ (self , source , encoding = None , parseMeta = True , chardet = True ):
383
+ def __init__ (self , source , firstChunk = b"" , encoding = None ,
384
+ parseMeta = True , chardet = True ):
382
385
"""Initialises the HTMLInputStream.
383
386
384
387
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -394,6 +397,8 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
394
397
parseMeta - Look for a <meta> element containing encoding information
395
398
396
399
"""
400
+ # XXX do something with firstChunk
401
+
397
402
# Raw Stream - for unicode objects this will encode to utf-8 and set
398
403
# self.charEncoding as appropriate
399
404
self .rawStream = self .openStream (source )
0 commit comments