Skip to content

Commit 73215c5

Browse files
committed
Merge pull request #222 from gsnedders/lint_fixes
Various fixes for the lint filter, and use it to validate treewalker sanity in tests.
2 parents af0199c + ca6591c commit 73215c5

File tree

5 files changed

+74
-132
lines changed

5 files changed

+74
-132
lines changed

html5lib/filters/lint.py

+44-57
Original file line numberDiff line numberDiff line change
@@ -1,90 +1,77 @@
11
from __future__ import absolute_import, division, unicode_literals
22

3+
from six import text_type
4+
35
from . import _base
4-
from ..constants import cdataElements, rcdataElements, voidElements
6+
from ..constants import namespaces, voidElements
57

68
from ..constants import spaceCharacters
79
spaceCharacters = "".join(spaceCharacters)
810

911

10-
class LintError(Exception):
11-
pass
12-
13-
1412
class Filter(_base.Filter):
1513
def __iter__(self):
1614
open_elements = []
17-
contentModelFlag = "PCDATA"
1815
for token in _base.Filter.__iter__(self):
1916
type = token["type"]
2017
if type in ("StartTag", "EmptyTag"):
18+
namespace = token["namespace"]
2119
name = token["name"]
22-
if contentModelFlag != "PCDATA":
23-
raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
24-
if not isinstance(name, str):
25-
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
26-
if not name:
27-
raise LintError("Empty tag name")
28-
if type == "StartTag" and name in voidElements:
29-
raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
30-
elif type == "EmptyTag" and name not in voidElements:
31-
raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
20+
assert namespace is None or isinstance(namespace, text_type)
21+
assert namespace != ""
22+
assert isinstance(name, text_type)
23+
assert name != ""
24+
assert isinstance(token["data"], dict)
25+
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
26+
assert type == "EmptyTag"
27+
else:
28+
assert type == "StartTag"
3229
if type == "StartTag":
33-
open_elements.append(name)
34-
for name, value in token["data"]:
35-
if not isinstance(name, str):
36-
raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
37-
if not name:
38-
raise LintError("Empty attribute name")
39-
if not isinstance(value, str):
40-
raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
41-
if name in cdataElements:
42-
contentModelFlag = "CDATA"
43-
elif name in rcdataElements:
44-
contentModelFlag = "RCDATA"
45-
elif name == "plaintext":
46-
contentModelFlag = "PLAINTEXT"
30+
open_elements.append((namespace, name))
31+
for (namespace, name), value in token["data"].items():
32+
assert namespace is None or isinstance(namespace, text_type)
33+
assert namespace != ""
34+
assert isinstance(name, text_type)
35+
assert name != ""
36+
assert isinstance(value, text_type)
4737

4838
elif type == "EndTag":
39+
namespace = token["namespace"]
4940
name = token["name"]
50-
if not isinstance(name, str):
51-
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
52-
if not name:
53-
raise LintError("Empty tag name")
54-
if name in voidElements:
55-
raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
56-
start_name = open_elements.pop()
57-
if start_name != name:
58-
raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
59-
contentModelFlag = "PCDATA"
41+
assert namespace is None or isinstance(namespace, text_type)
42+
assert namespace != ""
43+
assert isinstance(name, text_type)
44+
assert name != ""
45+
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
46+
assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
47+
else:
48+
start = open_elements.pop()
49+
assert start == (namespace, name)
6050

6151
elif type == "Comment":
62-
if contentModelFlag != "PCDATA":
63-
raise LintError("Comment not in PCDATA content model flag")
52+
data = token["data"]
53+
assert isinstance(data, text_type)
6454

6555
elif type in ("Characters", "SpaceCharacters"):
6656
data = token["data"]
67-
if not isinstance(data, str):
68-
raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
69-
if not data:
70-
raise LintError("%(type)s token with empty data" % {"type": type})
57+
assert isinstance(data, text_type)
58+
assert data != ""
7159
if type == "SpaceCharacters":
72-
data = data.strip(spaceCharacters)
73-
if data:
74-
raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
60+
assert data.strip(spaceCharacters) == ""
7561

7662
elif type == "Doctype":
7763
name = token["name"]
78-
if contentModelFlag != "PCDATA":
79-
raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
80-
if not isinstance(name, str):
81-
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
82-
# XXX: what to do with token["data"] ?
64+
assert name is None or isinstance(name, text_type)
65+
assert token["publicId"] is None or isinstance(token["publicId"], text_type)
66+
assert token["systemId"] is None or isinstance(token["systemId"], text_type)
67+
68+
elif type == "Entity":
69+
assert isinstance(token["name"], text_type)
8370

84-
elif type in ("ParseError", "SerializeError"):
85-
pass
71+
elif type == "SerializeError":
72+
assert isinstance(token["data"], text_type)
8673

8774
else:
88-
raise LintError("Unknown token type: %(type)s" % {"type": type})
75+
assert False, "Unknown token type: %(type)s" % {"type": type}
8976

9077
yield token

html5lib/tests/test_treewalkers.py

+7-6
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from .support import get_data_files, TestData, convertExpected
1515

1616
from html5lib import html5parser, treewalkers, treebuilders, treeadapters, constants
17+
from html5lib.filters.lint import Filter as Lint
1718

1819

1920
treeTypes = {
@@ -77,21 +78,21 @@ def test_all_tokens(self):
7778
expected = [
7879
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'},
7980
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
80-
{'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
81+
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'head'},
8182
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
8283
{'data': 'a', 'type': 'Characters'},
8384
{'data': {}, 'type': 'StartTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
8485
{'data': 'b', 'type': 'Characters'},
85-
{'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
86+
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'div'},
8687
{'data': 'c', 'type': 'Characters'},
87-
{'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
88-
{'data': {}, 'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
88+
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'body'},
89+
{'type': 'EndTag', 'namespace': 'http://www.w3.org/1999/xhtml', 'name': 'html'}
8990
]
9091
for treeName, treeCls in sorted(treeTypes.items()):
9192
p = html5parser.HTMLParser(tree=treeCls["builder"])
9293
document = p.parse("<html><head></head><body>a<div>b</div>c</body></html>")
9394
document = treeCls.get("adapter", lambda x: x)(document)
94-
output = treeCls["walker"](document)
95+
output = Lint(treeCls["walker"](document))
9596
for expectedToken, outputToken in zip(expected, output):
9697
self.assertEqual(expectedToken, outputToken)
9798

@@ -111,7 +112,7 @@ def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
111112

112113
document = treeClass.get("adapter", lambda x: x)(document)
113114
try:
114-
output = treewalkers.pprint(treeClass["walker"](document))
115+
output = treewalkers.pprint(Lint(treeClass["walker"](document)))
115116
output = attrlist.sub(sortattrs, output)
116117
expected = attrlist.sub(sortattrs, convertExpected(expected))
117118
diff = "".join(unified_diff([line + "\n" for line in expected.splitlines()],

html5lib/treewalkers/_base.py

+17-67
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
from __future__ import absolute_import, division, unicode_literals
2-
from six import text_type, string_types
32

43
from xml.dom import Node
5-
from ..constants import voidElements, spaceCharacters
4+
from ..constants import namespaces, voidElements, spaceCharacters
65

76
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
87
"TreeWalker", "NonRecursiveTreeWalker"]
@@ -18,24 +17,6 @@
1817
spaceCharacters = "".join(spaceCharacters)
1918

2019

21-
def to_text(s, blank_if_none=True):
22-
"""Wrapper around six.text_type to convert None to empty string"""
23-
if s is None:
24-
if blank_if_none:
25-
return ""
26-
else:
27-
return None
28-
elif isinstance(s, text_type):
29-
return s
30-
else:
31-
return text_type(s)
32-
33-
34-
def is_text_or_none(string):
35-
"""Wrapper around isinstance(string_types) or is None"""
36-
return string is None or isinstance(string, string_types)
37-
38-
3920
class TreeWalker(object):
4021
def __init__(self, tree):
4122
self.tree = tree
@@ -47,47 +28,25 @@ def error(self, msg):
4728
return {"type": "SerializeError", "data": msg}
4829

4930
def emptyTag(self, namespace, name, attrs, hasChildren=False):
50-
assert namespace is None or isinstance(namespace, string_types), type(namespace)
51-
assert isinstance(name, string_types), type(name)
52-
assert all((namespace is None or isinstance(namespace, string_types)) and
53-
isinstance(name, string_types) and
54-
isinstance(value, string_types)
55-
for (namespace, name), value in attrs.items())
56-
57-
yield {"type": "EmptyTag", "name": to_text(name, False),
58-
"namespace": to_text(namespace),
31+
yield {"type": "EmptyTag", "name": name,
32+
"namespace": namespace,
5933
"data": attrs}
6034
if hasChildren:
6135
yield self.error("Void element has children")
6236

6337
def startTag(self, namespace, name, attrs):
64-
assert namespace is None or isinstance(namespace, string_types), type(namespace)
65-
assert isinstance(name, string_types), type(name)
66-
assert all((namespace is None or isinstance(namespace, string_types)) and
67-
isinstance(name, string_types) and
68-
isinstance(value, string_types)
69-
for (namespace, name), value in attrs.items())
70-
7138
return {"type": "StartTag",
72-
"name": text_type(name),
73-
"namespace": to_text(namespace),
74-
"data": dict(((to_text(namespace, False), to_text(name)),
75-
to_text(value, False))
76-
for (namespace, name), value in attrs.items())}
39+
"name": name,
40+
"namespace": namespace,
41+
"data": attrs}
7742

7843
def endTag(self, namespace, name):
79-
assert namespace is None or isinstance(namespace, string_types), type(namespace)
80-
assert isinstance(name, string_types), type(namespace)
81-
8244
return {"type": "EndTag",
83-
"name": to_text(name, False),
84-
"namespace": to_text(namespace),
85-
"data": {}}
45+
"name": name,
46+
"namespace": namespace}
8647

8748
def text(self, data):
88-
assert isinstance(data, string_types), type(data)
89-
90-
data = to_text(data)
49+
data = data
9150
middle = data.lstrip(spaceCharacters)
9251
left = data[:len(data) - len(middle)]
9352
if left:
@@ -101,25 +60,16 @@ def text(self, data):
10160
yield {"type": "SpaceCharacters", "data": right}
10261

10362
def comment(self, data):
104-
assert isinstance(data, string_types), type(data)
105-
106-
return {"type": "Comment", "data": text_type(data)}
107-
108-
def doctype(self, name, publicId=None, systemId=None, correct=True):
109-
assert is_text_or_none(name), type(name)
110-
assert is_text_or_none(publicId), type(publicId)
111-
assert is_text_or_none(systemId), type(systemId)
63+
return {"type": "Comment", "data": data}
11264

65+
def doctype(self, name, publicId=None, systemId=None):
11366
return {"type": "Doctype",
114-
"name": to_text(name),
115-
"publicId": to_text(publicId),
116-
"systemId": to_text(systemId),
117-
"correct": to_text(correct)}
67+
"name": name,
68+
"publicId": publicId,
69+
"systemId": systemId}
11870

11971
def entity(self, name):
120-
assert isinstance(name, string_types), type(name)
121-
122-
return {"type": "Entity", "name": text_type(name)}
72+
return {"type": "Entity", "name": name}
12373

12474
def unknown(self, nodeType):
12575
return self.error("Unknown node type: " + nodeType)
@@ -154,7 +104,7 @@ def __iter__(self):
154104

155105
elif type == ELEMENT:
156106
namespace, name, attributes, hasChildren = details
157-
if name in voidElements:
107+
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
158108
for token in self.emptyTag(namespace, name, attributes,
159109
hasChildren):
160110
yield token
@@ -187,7 +137,7 @@ def __iter__(self):
187137
type, details = details[0], details[1:]
188138
if type == ELEMENT:
189139
namespace, name, attributes, hasChildren = details
190-
if name not in voidElements:
140+
if (namespace and namespace != namespaces["html"]) or name not in voidElements:
191141
yield self.endTag(namespace, name)
192142
if self.tree is currentNode:
193143
currentNode = None

html5lib/treewalkers/genshistream.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def tokens(self, event, next):
4848
elif kind == END:
4949
name = data.localname
5050
namespace = data.namespace
51-
if name not in voidElements:
51+
if namespace != namespaces["html"] or name not in voidElements:
5252
yield self.endTag(namespace, name)
5353

5454
elif kind == COMMENT:

html5lib/treewalkers/lxmletree.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,10 @@ def __len__(self):
118118
class TreeWalker(_base.NonRecursiveTreeWalker):
119119
def __init__(self, tree):
120120
if hasattr(tree, "getroot"):
121+
self.fragmentChildren = set()
121122
tree = Root(tree)
122123
elif isinstance(tree, list):
124+
self.fragmentChildren = set(tree)
123125
tree = FragmentRoot(tree)
124126
_base.NonRecursiveTreeWalker.__init__(self, tree)
125127
self.filter = ihatexml.InfosetFilter()
@@ -137,7 +139,7 @@ def getNodeDetails(self, node):
137139
return _base.DOCTYPE, node.name, node.public_id, node.system_id
138140

139141
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
140-
return _base.TEXT, node.obj
142+
return _base.TEXT, ensure_str(node.obj)
141143

142144
elif node.tag == etree.Comment:
143145
return _base.COMMENT, ensure_str(node.text)
@@ -197,5 +199,7 @@ def getParentNode(self, node):
197199
if key == "text":
198200
return node
199201
# else: fallback to "normal" processing
202+
elif node in self.fragmentChildren:
203+
return None
200204

201205
return node.getparent()

0 commit comments

Comments
 (0)