html5lib · gsnedders · May 22, 2016 · May 7, 2016 · Oct 26, 2013 · May 7, 2016
diff --git a/README.rst b/README.rst
@@ -116,10 +116,6 @@ functionality:
 - ``chardet`` can be used as a fallback when character encoding cannot
   be determined.
 
-- ``ordereddict`` can be used under Python 2.6
-  (``collections.OrderedDict`` is used instead on later versions) to
-  serialize attributes in alphabetical order.
-
 
 Bugs
 ----

diff --git a/html5lib/constants.py b/html5lib/constants.py
@@ -437,6 +437,73 @@
     (namespaces["mathml"], "mtext")
 ])
 
+adjustSVGAttributes = {
+    "attributename": "attributeName",
+    "attributetype": "attributeType",
+    "basefrequency": "baseFrequency",
+    "baseprofile": "baseProfile",
+    "calcmode": "calcMode",
+    "clippathunits": "clipPathUnits",
+    "contentscripttype": "contentScriptType",
+    "contentstyletype": "contentStyleType",
+    "diffuseconstant": "diffuseConstant",
+    "edgemode": "edgeMode",
+    "externalresourcesrequired": "externalResourcesRequired",
+    "filterres": "filterRes",
+    "filterunits": "filterUnits",
+    "glyphref": "glyphRef",
+    "gradienttransform": "gradientTransform",
+    "gradientunits": "gradientUnits",
+    "kernelmatrix": "kernelMatrix",
+    "kernelunitlength": "kernelUnitLength",
+    "keypoints": "keyPoints",
+    "keysplines": "keySplines",
+    "keytimes": "keyTimes",
+    "lengthadjust": "lengthAdjust",
+    "limitingconeangle": "limitingConeAngle",
+    "markerheight": "markerHeight",
+    "markerunits": "markerUnits",
+    "markerwidth": "markerWidth",
+    "maskcontentunits": "maskContentUnits",
+    "maskunits": "maskUnits",
+    "numoctaves": "numOctaves",
+    "pathlength": "pathLength",
+    "patterncontentunits": "patternContentUnits",
+    "patterntransform": "patternTransform",
+    "patternunits": "patternUnits",
+    "pointsatx": "pointsAtX",
+    "pointsaty": "pointsAtY",
+    "pointsatz": "pointsAtZ",
+    "preservealpha": "preserveAlpha",
+    "preserveaspectratio": "preserveAspectRatio",
+    "primitiveunits": "primitiveUnits",
+    "refx": "refX",
+    "refy": "refY",
+    "repeatcount": "repeatCount",
+    "repeatdur": "repeatDur",
+    "requiredextensions": "requiredExtensions",
+    "requiredfeatures": "requiredFeatures",
+    "specularconstant": "specularConstant",
+    "specularexponent": "specularExponent",
+    "spreadmethod": "spreadMethod",
+    "startoffset": "startOffset",
+    "stddeviation": "stdDeviation",
+    "stitchtiles": "stitchTiles",
+    "surfacescale": "surfaceScale",
+    "systemlanguage": "systemLanguage",
+    "tablevalues": "tableValues",
+    "targetx": "targetX",
+    "targety": "targetY",
+    "textlength": "textLength",
+    "viewbox": "viewBox",
+    "viewtarget": "viewTarget",
+    "xchannelselector": "xChannelSelector",
+    "ychannelselector": "yChannelSelector",
+    "zoomandpan": "zoomAndPan"
+}
+
+adjustMathMLAttributes = {"definitionurl": "definitionURL"}
+
 adjustForeignAttributes = {
     "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
     "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),

diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
@@ -1,24 +1,31 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import with_metaclass
+from six import with_metaclass, viewkeys, PY3
 
 import types
 
+try:
+    from collections import OrderedDict
+except ImportError:
+    from ordereddict import OrderedDict
+
 from . import inputstream
 from . import tokenizer
 
 from . import treebuilders
 from .treebuilders._base import Marker
 
 from . import utils
-from . import constants
-from .constants import spaceCharacters, asciiUpper2Lower
-from .constants import specialElements
-from .constants import headingElements
-from .constants import cdataElements, rcdataElements
-from .constants import tokenTypes, ReparseException, namespaces
-from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
-from .constants import adjustForeignAttributes as adjustForeignAttributesMap
-from .constants import E
+from .constants import (
+    spaceCharacters, asciiUpper2Lower,
+    specialElements, headingElements, cdataElements, rcdataElements,
+    tokenTypes, tagTokenTypes,
+    namespaces,
+    htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
+    adjustForeignAttributes as adjustForeignAttributesMap,
+    adjustMathMLAttributes, adjustSVGAttributes,
+    E,
+    ReparseException
+)
 
 
 def parse(doc, treebuilder="etree", encoding=None,
@@ -272,96 +279,18 @@ def normalizeToken(self, token):
         """ HTML5 specific normalizations to the token stream """
 
         if token["type"] == tokenTypes["StartTag"]:
-            token["data"] = dict(token["data"][::-1])
+            token["data"] = OrderedDict(token['data'][::-1])
 
         return token
 
     def adjustMathMLAttributes(self, token):
-        replacements = {"definitionurl": "definitionURL"}
-        for k, v in replacements.items():
-            if k in token["data"]:
-                token["data"][v] = token["data"][k]
-                del token["data"][k]
+        adjust_attributes(token, adjustMathMLAttributes)
 
     def adjustSVGAttributes(self, token):
-        replacements = {
-            "attributename": "attributeName",
-            "attributetype": "attributeType",
-            "basefrequency": "baseFrequency",
-            "baseprofile": "baseProfile",
-            "calcmode": "calcMode",
-            "clippathunits": "clipPathUnits",
-            "contentscripttype": "contentScriptType",
-            "contentstyletype": "contentStyleType",
-            "diffuseconstant": "diffuseConstant",
-            "edgemode": "edgeMode",
-            "externalresourcesrequired": "externalResourcesRequired",
-            "filterres": "filterRes",
-            "filterunits": "filterUnits",
-            "glyphref": "glyphRef",
-            "gradienttransform": "gradientTransform",
-            "gradientunits": "gradientUnits",
-            "kernelmatrix": "kernelMatrix",
-            "kernelunitlength": "kernelUnitLength",
-            "keypoints": "keyPoints",
-            "keysplines": "keySplines",
-            "keytimes": "keyTimes",
-            "lengthadjust": "lengthAdjust",
-            "limitingconeangle": "limitingConeAngle",
-            "markerheight": "markerHeight",
-            "markerunits": "markerUnits",
-            "markerwidth": "markerWidth",
-            "maskcontentunits": "maskContentUnits",
-            "maskunits": "maskUnits",
-            "numoctaves": "numOctaves",
-            "pathlength": "pathLength",
-            "patterncontentunits": "patternContentUnits",
-            "patterntransform": "patternTransform",
-            "patternunits": "patternUnits",
-            "pointsatx": "pointsAtX",
-            "pointsaty": "pointsAtY",
-            "pointsatz": "pointsAtZ",
-            "preservealpha": "preserveAlpha",
-            "preserveaspectratio": "preserveAspectRatio",
-            "primitiveunits": "primitiveUnits",
-            "refx": "refX",
-            "refy": "refY",
-            "repeatcount": "repeatCount",
-            "repeatdur": "repeatDur",
-            "requiredextensions": "requiredExtensions",
-            "requiredfeatures": "requiredFeatures",
-            "specularconstant": "specularConstant",
-            "specularexponent": "specularExponent",
-            "spreadmethod": "spreadMethod",
-            "startoffset": "startOffset",
-            "stddeviation": "stdDeviation",
-            "stitchtiles": "stitchTiles",
-            "surfacescale": "surfaceScale",
-            "systemlanguage": "systemLanguage",
-            "tablevalues": "tableValues",
-            "targetx": "targetX",
-            "targety": "targetY",
-            "textlength": "textLength",
-            "viewbox": "viewBox",
-            "viewtarget": "viewTarget",
-            "xchannelselector": "xChannelSelector",
-            "ychannelselector": "yChannelSelector",
-            "zoomandpan": "zoomAndPan"
-        }
-        for originalName in list(token["data"].keys()):
-            if originalName in replacements:
-                svgName = replacements[originalName]
-                token["data"][svgName] = token["data"][originalName]
-                del token["data"][originalName]
+        adjust_attributes(token, adjustSVGAttributes)
 
     def adjustForeignAttributes(self, token):
-        replacements = adjustForeignAttributesMap
-
-        for originalName in token["data"].keys():
-            if originalName in replacements:
-                foreignName = replacements[originalName]
-                token["data"][foreignName] = token["data"][originalName]
-                del token["data"][originalName]
+        adjust_attributes(token, adjustForeignAttributesMap)
 
     def reparseTokenNormal(self, token):
         # pylint:disable=unused-argument
@@ -434,7 +363,7 @@ def getPhases(debug):
     def log(function):
         """Logger that records which phase processes each token"""
         type_names = dict((value, key) for key, value in
-                          constants.tokenTypes.items())
+                          tokenTypes.items())
 
         def wrapped(self, *args, **kwargs):
             if function.__name__.startswith("process") and len(args) > 0:
@@ -443,7 +372,7 @@ def wrapped(self, *args, **kwargs):
                     info = {"type": type_names[token['type']]}
                 except:
                     raise
-                if token['type'] in constants.tagTokenTypes:
+                if token['type'] in tagTokenTypes:
                     info["name"] = token['name']
 
                 self.parser.log.append((self.parser.tokenizer.state.__name__,
@@ -1022,17 +951,9 @@ def __init__(self, parser, tree):
             self.endTagHandler.default = self.endTagOther
 
         def isMatchingFormattingElement(self, node1, node2):
-            if node1.name != node2.name or node1.namespace != node2.namespace:
-                return False
-            elif len(node1.attributes) != len(node2.attributes):
-                return False
-            else:
-                attributes1 = sorted(node1.attributes.items())
-                attributes2 = sorted(node2.attributes.items())
-                for attr1, attr2 in zip(attributes1, attributes2):
-                    if attr1 != attr2:
-                        return False
-            return True
+            return (node1.name == node2.name and
+                    node1.namespace == node2.namespace and
+                    node1.attributes == node2.attributes)
 
         # helper
         def addFormattingElement(self, token):
@@ -2798,6 +2719,16 @@ def processEndTag(self, token):
     }
 
 
+def adjust_attributes(token, replacements):
+    if PY3 or utils.PY27:
+        needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
+    else:
+        needs_adjustment = frozenset(token['data']) & frozenset(replacements)
+    if needs_adjustment:
+        token['data'] = OrderedDict((replacements.get(k, k), v)
+                                    for k, v in token['data'].items())
+
+
 def impliedTagToken(name, type="EndTag", attributes=None,
                     selfClosing=False):
     if attributes is None:

diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py
@@ -1,11 +1,13 @@
 from __future__ import absolute_import, division, unicode_literals
 
+from six import PY2, text_type
+
 import io
 
 from . import support  # noqa
 
 from html5lib.constants import namespaces
-from html5lib import parse
+from html5lib import parse, HTMLParser
 
 
 # tests that aren't autogenerated from text files
@@ -49,3 +51,40 @@ def test_namespace_html_elements_1_etree():
 
 def test_unicode_file():
     assert parse(io.StringIO("a")) is not None
+
+
+def test_duplicate_attribute():
+    # This is here because we impl it in parser and not tokenizer
+    doc = parse('<p class=a class=b>')
+    el = doc[1][0]
+    assert el.get("class") == "a"
+
+
+def test_debug_log():
+    parser = HTMLParser(debug=True)
+    parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")
+
+    expected = [('dataState', 'InitialPhase', 'InitialPhase', 'processDoctype', {'type': 'Doctype'}),
+                ('dataState', 'BeforeHtmlPhase', 'BeforeHtmlPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
+                ('dataState', 'BeforeHeadPhase', 'BeforeHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
+                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'title', 'type': 'StartTag'}),
+                ('rcdataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
+                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'title', 'type': 'EndTag'}),
+                ('dataState', 'InHeadPhase', 'InHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
+                ('dataState', 'AfterHeadPhase', 'AfterHeadPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
+                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}),
+                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
+                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
+                ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}),
+                ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}),
+                ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}),
+                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),
+                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processEndTag', {'name': 'p', 'type': 'EndTag'}),
+                ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'})]
+
+    if PY2:
+        for i, log in enumerate(expected):
+            log = [x.encode("ascii") if isinstance(x, text_type) else x for x in log]
+            expected[i] = tuple(log)
+
+    assert parser.log == expected