Skip to content

Commit f1d1d0a

Browse files
committed
Merge remote-tracking branch 'origin/master' into escape-characters-serializer
2 parents 9dac020 + f130676 commit f1d1d0a

File tree

6 files changed

+60
-38
lines changed

6 files changed

+60
-38
lines changed

CHANGES.rst

+25
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,31 @@
11
Change Log
22
----------
33

4+
0.99
5+
~~~~
6+
7+
Released on September 10, 2013
8+
9+
* No library changes from 1.0b3; released as 0.99 as pip has changed
10+
behaviour from 1.4 to avoid installing pre-release versions per
11+
PEP 440.
12+
13+
14+
1.0b3
15+
~~~~~
16+
17+
Released on July 24, 2013
18+
19+
* Removed ``RecursiveTreeWalker`` from ``treewalkers._base``. Any
20+
implementation using it should be moved to
21+
``NonRecursiveTreeWalker``, as everything bundled with html5lib has
22+
for years.
23+
24+
* Fix #67 so that ``BufferedStream`` correctly returns a bytes
25+
object, thereby fixing any case where html5lib is passed a
26+
non-seekable RawIOBase-like object.
27+
28+
429
1.0b2
530
~~~~~
631

README.rst

+23
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,29 @@ a treebuilder:
4141
with open("mydocument.html", "rb") as f:
4242
lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
4343
44+
When using with ``urllib2`` (Python 2), the charset from HTTP should be
45+
passed into html5lib as follows:
46+
47+
.. code-block:: python
48+
49+
from contextlib import closing
50+
from urllib2 import urlopen
51+
import html5lib
52+
53+
with closing(urlopen("http://example.com/")) as f:
54+
document = html5lib.parse(f, encoding=f.info().getparam("charset"))
55+
56+
When using with ``urllib.request`` (Python 3), the charset from HTTP
57+
should be passed into html5lib as follows:
58+
59+
.. code-block:: python
60+
61+
from urllib.request import urlopen
62+
import html5lib
63+
64+
with urlopen("http://example.com/") as f:
65+
document = html5lib.parse(f, encoding=f.info().get_content_charset())
66+
4467
To have more control over the parser, create a parser object explicitly.
4568
For instance, to make the parser raise exceptions on parse errors, use:
4669

html5lib/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@
2020

2121
__all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
2222
"getTreeWalker", "serialize"]
23-
__version__ = "1.0b2"
23+
__version__ = "0.999-dev"

html5lib/tests/test_treewalkers.py

-10
Original file line numberDiff line numberDiff line change
@@ -83,16 +83,6 @@ def PullDOMAdapter(node):
8383
"walker": treewalkers.getTreeWalker("lxml")}
8484

8585

86-
# Try whatever etree implementations are available from a list that are
87-
#"supposed" to work
88-
try:
89-
import pxdom
90-
treeTypes['pxdom'] = \
91-
{"builder": treebuilders.getTreeBuilder("dom", pxdom),
92-
"walker": treewalkers.getTreeWalker("dom")}
93-
except ImportError:
94-
pass
95-
9686
try:
9787
from genshi.core import QName, Attrs
9888
from genshi.core import START, END, TEXT, COMMENT, DOCTYPE

html5lib/treewalkers/_base.py

+10-26
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,16 @@
44
import gettext
55
_ = gettext.gettext
66

7+
from xml.dom import Node
8+
9+
DOCUMENT = Node.DOCUMENT_NODE
10+
DOCTYPE = Node.DOCUMENT_TYPE_NODE
11+
TEXT = Node.TEXT_NODE
12+
ELEMENT = Node.ELEMENT_NODE
13+
COMMENT = Node.COMMENT_NODE
14+
ENTITY = Node.ENTITY_NODE
15+
UNKNOWN = "<#UNKNOWN#>"
16+
717
from ..constants import voidElements, spaceCharacters
818
spaceCharacters = "".join(spaceCharacters)
919

@@ -115,32 +125,6 @@ def unknown(self, nodeType):
115125
return self.error(_("Unknown node type: ") + nodeType)
116126

117127

118-
class RecursiveTreeWalker(TreeWalker):
119-
def walkChildren(self, node):
120-
raise NotImplementedError
121-
122-
def element(self, node, namespace, name, attrs, hasChildren):
123-
if name in voidElements:
124-
for token in self.emptyTag(namespace, name, attrs, hasChildren):
125-
yield token
126-
else:
127-
yield self.startTag(name, attrs)
128-
if hasChildren:
129-
for token in self.walkChildren(node):
130-
yield token
131-
yield self.endTag(name)
132-
133-
from xml.dom import Node
134-
135-
DOCUMENT = Node.DOCUMENT_NODE
136-
DOCTYPE = Node.DOCUMENT_TYPE_NODE
137-
TEXT = Node.TEXT_NODE
138-
ELEMENT = Node.ELEMENT_NODE
139-
COMMENT = Node.COMMENT_NODE
140-
ENTITY = Node.ENTITY_NODE
141-
UNKNOWN = "<#UNKNOWN#>"
142-
143-
144128
class NonRecursiveTreeWalker(TreeWalker):
145129
def getNodeDetails(self, node):
146130
raise NotImplementedError

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
long_description = readme_file.read() + '\n' + changes_file.read()
3030

3131
setup(name='html5lib',
32-
version='1.0b2',
32+
version='0.999-dev',
3333
url='https://github.com/html5lib/html5lib-python',
3434
license="MIT License",
3535
description='HTML parser based on the WHATWG HTML specifcation',

0 commit comments

Comments
 (0)