Merge pull request #275 from gsnedders/attr_order

gsnedders · web-flow · commit 14d4851ef707 · 2016-07-15T02:34:38.000+01:00
Fix attribute order
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -4,9 +4,10 @@ Change Log
 0.999999999/1.0b10
 ~~~~~~~~~~~~~~~~~~
 
-Released on XXX
+Released on July 15, 2016
 
-* XXX
+* Fix attribute order going to the tree builder to be document order
+  instead of reverse document order(!).
 
 
 0.99999999/1.0b9
diff --git a/html5lib/__init__.py b/html5lib/__init__.py
@@ -22,4 +22,4 @@
            "getTreeWalker", "serialize"]
 
 # this has to be at the top level, see how setup.py parses this
-__version__ = "0.999999999-dev"
+__version__ = "0.9999999999-dev"
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
@@ -265,7 +265,11 @@ def normalizeToken(self, token):
         """ HTML5 specific normalizations to the token stream """
 
         if token["type"] == tokenTypes["StartTag"]:
-            token["data"] = OrderedDict(token['data'][::-1])
+            raw = token["data"]
+            token["data"] = OrderedDict(raw)
+            if len(raw) > len(token["data"]):
+                # at least one attribute was duplicated; make the first occurrence win
+                token["data"].update(raw[::-1])
 
         return token
 
diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py
@@ -1,12 +1,12 @@
 from __future__ import absolute_import, division, unicode_literals
 
-from six import PY2, text_type
+from six import PY2, text_type, unichr
 
 import io
 
 from . import support  # noqa
 
-from html5lib.constants import namespaces
+from html5lib.constants import namespaces, tokenTypes
 from html5lib import parse, parseFragment, HTMLParser
 
 
@@ -53,13 +53,42 @@ def test_unicode_file():
     assert parse(io.StringIO("a")) is not None
 
 
+def test_maintain_attribute_order():
+    # This is here because we impl it in parser and not tokenizer
+    p = HTMLParser()
+    # generate many attributes to maximize the chance a hash-based reordering occurs
+    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
+    token = {'name': 'html',
+             'selfClosing': False,
+             'selfClosingAcknowledged': False,
+             'type': tokenTypes["StartTag"],
+             'data': attrs}
+    out = p.normalizeToken(token)
+    attr_order = list(out["data"].keys())
+    assert attr_order == [x for x, i in attrs]
+
+
 def test_duplicate_attribute():
     # This is here because we impl it in parser and not tokenizer
     doc = parse('<p class=a class=b>')
     el = doc[1][0]
     assert el.get("class") == "a"
 
 
+def test_maintain_duplicate_attribute_order():
+    # This is here because we impl it in parser and not tokenizer
+    p = HTMLParser()
+    attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))]
+    token = {'name': 'html',
+             'selfClosing': False,
+             'selfClosingAcknowledged': False,
+             'type': tokenTypes["StartTag"],
+             'data': attrs + [('a', len(attrs))]}
+    out = p.normalizeToken(token)
+    attr_order = list(out["data"].keys())
+    assert attr_order == [x for x, i in attrs]
+
+
 def test_debug_log():
     parser = HTMLParser(debug=True)
     parser.parse("<!doctype html><title>a</title><p>b<script>c</script>d</p>e")