diff --git a/README.rst b/README.rst index e73b1639..47eb90d3 100644 --- a/README.rst +++ b/README.rst @@ -116,10 +116,6 @@ functionality: - ``chardet`` can be used as a fallback when character encoding cannot be determined. -- ``ordereddict`` can be used under Python 2.6 - (``collections.OrderedDict`` is used instead on later versions) to - serialize attributes in alphabetical order. - Bugs ---- diff --git a/html5lib/constants.py b/html5lib/constants.py index df1f061e..9e7541d3 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -437,6 +437,73 @@ (namespaces["mathml"], "mtext") ]) +adjustSVGAttributes = { + "attributename": "attributeName", + "attributetype": "attributeType", + "basefrequency": "baseFrequency", + "baseprofile": "baseProfile", + "calcmode": "calcMode", + "clippathunits": "clipPathUnits", + "contentscripttype": "contentScriptType", + "contentstyletype": "contentStyleType", + "diffuseconstant": "diffuseConstant", + "edgemode": "edgeMode", + "externalresourcesrequired": "externalResourcesRequired", + "filterres": "filterRes", + "filterunits": "filterUnits", + "glyphref": "glyphRef", + "gradienttransform": "gradientTransform", + "gradientunits": "gradientUnits", + "kernelmatrix": "kernelMatrix", + "kernelunitlength": "kernelUnitLength", + "keypoints": "keyPoints", + "keysplines": "keySplines", + "keytimes": "keyTimes", + "lengthadjust": "lengthAdjust", + "limitingconeangle": "limitingConeAngle", + "markerheight": "markerHeight", + "markerunits": "markerUnits", + "markerwidth": "markerWidth", + "maskcontentunits": "maskContentUnits", + "maskunits": "maskUnits", + "numoctaves": "numOctaves", + "pathlength": "pathLength", + "patterncontentunits": "patternContentUnits", + "patterntransform": "patternTransform", + "patternunits": "patternUnits", + "pointsatx": "pointsAtX", + "pointsaty": "pointsAtY", + "pointsatz": "pointsAtZ", + "preservealpha": "preserveAlpha", + "preserveaspectratio": "preserveAspectRatio", + "primitiveunits": "primitiveUnits", + "refx": "refX", + "refy": "refY", + "repeatcount": "repeatCount", + "repeatdur": "repeatDur", + "requiredextensions": "requiredExtensions", + "requiredfeatures": "requiredFeatures", + "specularconstant": "specularConstant", + "specularexponent": "specularExponent", + "spreadmethod": "spreadMethod", + "startoffset": "startOffset", + "stddeviation": "stdDeviation", + "stitchtiles": "stitchTiles", + "surfacescale": "surfaceScale", + "systemlanguage": "systemLanguage", + "tablevalues": "tableValues", + "targetx": "targetX", + "targety": "targetY", + "textlength": "textLength", + "viewbox": "viewBox", + "viewtarget": "viewTarget", + "xchannelselector": "xChannelSelector", + "ychannelselector": "yChannelSelector", + "zoomandpan": "zoomAndPan" +} + +adjustMathMLAttributes = {"definitionurl": "definitionURL"} + adjustForeignAttributes = { "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]), "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]), diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 331b8fd7..df2a6cf7 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -1,8 +1,13 @@ from __future__ import absolute_import, division, unicode_literals -from six import with_metaclass +from six import with_metaclass, viewkeys, PY3 import types +try: + from collections import OrderedDict +except ImportError: + from ordereddict import OrderedDict + from . import inputstream from . import tokenizer @@ -10,15 +15,17 @@ from .treebuilders._base import Marker from . import utils -from . 
import constants -from .constants import spaceCharacters, asciiUpper2Lower -from .constants import specialElements -from .constants import headingElements -from .constants import cdataElements, rcdataElements -from .constants import tokenTypes, ReparseException, namespaces -from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements -from .constants import adjustForeignAttributes as adjustForeignAttributesMap -from .constants import E +from .constants import ( + spaceCharacters, asciiUpper2Lower, + specialElements, headingElements, cdataElements, rcdataElements, + tokenTypes, tagTokenTypes, + namespaces, + htmlIntegrationPointElements, mathmlTextIntegrationPointElements, + adjustForeignAttributes as adjustForeignAttributesMap, + adjustMathMLAttributes, adjustSVGAttributes, + E, + ReparseException +) def parse(doc, treebuilder="etree", encoding=None, @@ -272,96 +279,18 @@ def normalizeToken(self, token): """ HTML5 specific normalizations to the token stream """ if token["type"] == tokenTypes["StartTag"]: - token["data"] = dict(token["data"][::-1]) + token["data"] = OrderedDict(token['data'][::-1]) return token def adjustMathMLAttributes(self, token): - replacements = {"definitionurl": "definitionURL"} - for k, v in replacements.items(): - if k in token["data"]: - token["data"][v] = token["data"][k] - del token["data"][k] + adjust_attributes(token, adjustMathMLAttributes) def adjustSVGAttributes(self, token): - replacements = { - "attributename": "attributeName", - "attributetype": "attributeType", - "basefrequency": "baseFrequency", - "baseprofile": "baseProfile", - "calcmode": "calcMode", - "clippathunits": "clipPathUnits", - "contentscripttype": "contentScriptType", - "contentstyletype": "contentStyleType", - "diffuseconstant": "diffuseConstant", - "edgemode": "edgeMode", - "externalresourcesrequired": "externalResourcesRequired", - "filterres": "filterRes", - "filterunits": "filterUnits", - "glyphref": "glyphRef", - "gradienttransform": "gradientTransform", - "gradientunits": "gradientUnits", - "kernelmatrix": "kernelMatrix", - "kernelunitlength": "kernelUnitLength", - "keypoints": "keyPoints", - "keysplines": "keySplines", - "keytimes": "keyTimes", - "lengthadjust": "lengthAdjust", - "limitingconeangle": "limitingConeAngle", - "markerheight": "markerHeight", - "markerunits": "markerUnits", - "markerwidth": "markerWidth", - "maskcontentunits": "maskContentUnits", - "maskunits": "maskUnits", - "numoctaves": "numOctaves", - "pathlength": "pathLength", - "patterncontentunits": "patternContentUnits", - "patterntransform": "patternTransform", - "patternunits": "patternUnits", - "pointsatx": "pointsAtX", - "pointsaty": "pointsAtY", - "pointsatz": "pointsAtZ", - "preservealpha": "preserveAlpha", - "preserveaspectratio": "preserveAspectRatio", - "primitiveunits": "primitiveUnits", - "refx": "refX", - "refy": "refY", - "repeatcount": "repeatCount", - "repeatdur": "repeatDur", - "requiredextensions": "requiredExtensions", - "requiredfeatures": "requiredFeatures", - "specularconstant": "specularConstant", - "specularexponent": "specularExponent", - "spreadmethod": "spreadMethod", - "startoffset": "startOffset", - "stddeviation": "stdDeviation", - "stitchtiles": "stitchTiles", - "surfacescale": "surfaceScale", - "systemlanguage": "systemLanguage", - "tablevalues": "tableValues", - "targetx": "targetX", - "targety": "targetY", - "textlength": "textLength", - "viewbox": "viewBox", - "viewtarget": "viewTarget", - "xchannelselector": "xChannelSelector", - "ychannelselector": 
"yChannelSelector", - "zoomandpan": "zoomAndPan" - } - for originalName in list(token["data"].keys()): - if originalName in replacements: - svgName = replacements[originalName] - token["data"][svgName] = token["data"][originalName] - del token["data"][originalName] + adjust_attributes(token, adjustSVGAttributes) def adjustForeignAttributes(self, token): - replacements = adjustForeignAttributesMap - - for originalName in token["data"].keys(): - if originalName in replacements: - foreignName = replacements[originalName] - token["data"][foreignName] = token["data"][originalName] - del token["data"][originalName] + adjust_attributes(token, adjustForeignAttributesMap) def reparseTokenNormal(self, token): # pylint:disable=unused-argument @@ -434,7 +363,7 @@ def getPhases(debug): def log(function): """Logger that records which phase processes each token""" type_names = dict((value, key) for key, value in - constants.tokenTypes.items()) + tokenTypes.items()) def wrapped(self, *args, **kwargs): if function.__name__.startswith("process") and len(args) > 0: @@ -443,7 +372,7 @@ def wrapped(self, *args, **kwargs): info = {"type": type_names[token['type']]} except: raise - if token['type'] in constants.tagTokenTypes: + if token['type'] in tagTokenTypes: info["name"] = token['name'] self.parser.log.append((self.parser.tokenizer.state.__name__, @@ -1022,17 +951,9 @@ def __init__(self, parser, tree): self.endTagHandler.default = self.endTagOther def isMatchingFormattingElement(self, node1, node2): - if node1.name != node2.name or node1.namespace != node2.namespace: - return False - elif len(node1.attributes) != len(node2.attributes): - return False - else: - attributes1 = sorted(node1.attributes.items()) - attributes2 = sorted(node2.attributes.items()) - for attr1, attr2 in zip(attributes1, attributes2): - if attr1 != attr2: - return False - return True + return (node1.name == node2.name and + node1.namespace == node2.namespace and + node1.attributes == node2.attributes) # helper def addFormattingElement(self, token): @@ -2798,6 +2719,16 @@ def processEndTag(self, token): } +def adjust_attributes(token, replacements): + if PY3 or utils.PY27: + needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) + else: + needs_adjustment = frozenset(token['data']) & frozenset(replacements) + if needs_adjustment: + token['data'] = OrderedDict((replacements.get(k, k), v) + for k, v in token['data'].items()) + + def impliedTagToken(name, type="EndTag", attributes=None, selfClosing=False): if attributes is None: diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index f8e1ac43..0ec5b049 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -1,11 +1,13 @@ from __future__ import absolute_import, division, unicode_literals +from six import PY2, text_type + import io from . import support # noqa from html5lib.constants import namespaces -from html5lib import parse +from html5lib import parse, HTMLParser # tests that aren't autogenerated from text files @@ -49,3 +51,40 @@ def test_namespace_html_elements_1_etree(): def test_unicode_file(): assert parse(io.StringIO("a")) is not None + + +def test_duplicate_attribute(): + # This is here because we impl it in parser and not tokenizer + doc = parse('
<p class=a class=b>')
+    el = doc[1][0]
+    assert el.get("class") == "a"
+
+
+def test_debug_log():
+    parser = HTMLParser(debug=True)
+    parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>
e") + + expected = [('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}), + ('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}), + ('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}), + ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}), + ('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}), + ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}), + ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}), + ('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}), + ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}), + ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}), + ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}), + ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}), + ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}), + ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}), + ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}), + ('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}), + ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})] + + if PY2: + for i, log in enumerate(expected): + log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log] + expected[i] = tuple(log) + + assert parser.log == expected diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py index b7df74b2..9d7f4824 100644 --- a/html5lib/treebuilders/dom.py +++ b/html5lib/treebuilders/dom.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, unicode_literals +from collections import MutableMapping from xml.dom import minidom, Node import weakref @@ -13,34 +14,41 @@ def getDomBuilder(DomImplementation): Dom = DomImplementation - class AttrList(object): + class AttrList(MutableMapping): def __init__(self, element): self.element = element def __iter__(self): - return list(self.element.attributes.items()).__iter__() + return iter(self.element.attributes.keys()) def __setitem__(self, name, value): - self.element.setAttribute(name, value) + if isinstance(name, tuple): + raise NotImplementedError + else: + attr = self.element.ownerDocument.createAttribute(name) + attr.value = value + self.element.attributes[name] = attr def __len__(self): - return len(list(self.element.attributes.items())) + return len(self.element.attributes) def items(self): - return [(item[0], item[1]) for item in - list(self.element.attributes.items())] + return list(self.element.attributes.items()) - def keys(self): - return list(self.element.attributes.keys()) + def values(self): + return list(self.element.attributes.values()) def __getitem__(self, name): - return self.element.getAttribute(name) + if isinstance(name, tuple): + raise NotImplementedError + else: + return self.element.attributes[name].value - def __contains__(self, name): + def __delitem__(self, name): if isinstance(name, tuple): raise NotImplementedError else: - return self.element.hasAttribute(name) 
+ del self.element.attributes[name] class NodeBuilder(_base.Node): def __init__(self, element): diff --git a/html5lib/utils.py b/html5lib/utils.py index 5fe237a0..ea65ab6b 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -1,5 +1,6 @@ from __future__ import absolute_import, division, unicode_literals +import sys from types import ModuleType from six import text_type @@ -12,9 +13,11 @@ __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", "surrogatePairToCodepoint", "moduleFactoryFactory", - "supports_lone_surrogates"] + "supports_lone_surrogates", "PY27"] +PY27 = sys.version_info[0] == 2 and sys.version_info[1] >= 7 + # Platforms not supporting lone surrogates (\uD800-\uDFFF) should be # caught by the below test. In general this would be any platform # using UTF-16 as its encoding of unicode strings, such as diff --git a/requirements-optional.txt b/requirements-optional.txt index 781ab8c2..c00fd242 100644 --- a/requirements-optional.txt +++ b/requirements-optional.txt @@ -15,7 +15,3 @@ lxml ; platform_python_implementation == 'CPython' # DATrie can be used in place of our Python trie implementation for # slightly better parsing performance. datrie ; platform_python_implementation == 'CPython' - -# Can be used to force attributes to be serialized in alphabetical -# order. -ordereddict ; python_version < '2.7' diff --git a/requirements.txt b/requirements.txt index 15cae9dc..745993b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ six webencodings +ordereddict ; python_version < '2.7'
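
Note (not part of the patch): a minimal sketch of what the new adjust_attributes helper does, using only the simple frozenset branch from the hunk above; the sample token and replacement map here are made up for illustration. Because token["data"] is now an OrderedDict, renamed attributes keep their source order.

    from collections import OrderedDict

    def adjust_attributes(token, replacements):
        # Rebuild the mapping only if at least one key actually needs renaming.
        if frozenset(token['data']) & frozenset(replacements):
            token['data'] = OrderedDict((replacements.get(k, k), v)
                                        for k, v in token['data'].items())

    # Hypothetical SVG start-tag token: "viewbox" is renamed, order is preserved.
    token = {'data': OrderedDict([('viewbox', '0 0 10 10'), ('fill', 'red')])}
    adjust_attributes(token, {'viewbox': 'viewBox'})
    assert list(token['data'].items()) == [('viewBox', '0 0 10 10'), ('fill', 'red')]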