From e0dc25f335d3df610f752df29d5c4301717eb452 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 15 Jul 2016 02:23:19 +0100 Subject: [PATCH 1/3] Fix attribute order to the treebuilder to be document order Somehow I managed to screw this up so it became reverse document order! --- CHANGES.rst | 5 +++-- html5lib/html5parser.py | 6 +++++- html5lib/tests/test_parser2.py | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 483bdedb..570c9605 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,9 +4,10 @@ Change Log 0.999999999/1.0b10 ~~~~~~~~~~~~~~~~~~ -Released on XXX +Released on July 15, 2016 -* XXX +* Fix attribute order going to the tree builder to be document order + instead of reverse document order(!). 0.99999999/1.0b9 diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 470c8a7d..2abd63e4 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -265,7 +265,11 @@ def normalizeToken(self, token): """ HTML5 specific normalizations to the token stream """ if token["type"] == tokenTypes["StartTag"]: - token["data"] = OrderedDict(token['data'][::-1]) + raw = token["data"] + token["data"] = OrderedDict(raw) + if len(raw) > len(token["data"]): + # we had some duplicated attribute, fix so first wins + token["data"].update(raw[::-1]) return token diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index 21dc59d9..bcc0bf48 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -1,12 +1,12 @@ from __future__ import absolute_import, division, unicode_literals -from six import PY2, text_type +from six import PY2, text_type, unichr import io from . import support # noqa -from html5lib.constants import namespaces +from html5lib.constants import namespaces, tokenTypes from html5lib import parse, parseFragment, HTMLParser @@ -53,6 +53,21 @@ def test_unicode_file(): assert parse(io.StringIO("a")) is not None +def test_maintain_attribute_order(): + # This is here because we impl it in parser and not tokenizer + p = HTMLParser() + # generate loads to maximize the chance a hash-based mutation will occur + attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))] + token = {'name': 'html', + 'selfClosing': False, + 'selfClosingAcknowledged': False, + 'type': tokenTypes["StartTag"], + 'data': attrs} + out = p.normalizeToken(token) + attr_order = list(out["data"].keys()) + assert attr_order == [x for x, i in attrs] + + def test_duplicate_attribute(): # This is here because we impl it in parser and not tokenizer doc = parse('

') @@ -60,6 +75,20 @@ def test_duplicate_attribute(): assert el.get("class") == "a" +def test_maintain_duplicate_attribute_order(): + # This is here because we impl it in parser and not tokenizer + p = HTMLParser() + attrs = [(unichr(x), i) for i, x in enumerate(range(ord('a'), ord('z')))] + token = {'name': 'html', + 'selfClosing': False, + 'selfClosingAcknowledged': False, + 'type': tokenTypes["StartTag"], + 'data': attrs + [('a', len(attrs))]} + out = p.normalizeToken(token) + attr_order = list(out["data"].keys()) + assert attr_order == [x for x, i in attrs] + + def test_debug_log(): parser = HTMLParser(debug=True) parser.parse("a

bd

e") From 6a73efa01754253605284b5a5688de3961b120fa Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 15 Jul 2016 02:24:18 +0100 Subject: [PATCH 2/3] Yes, another release, already. :( --- html5lib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/__init__.py b/html5lib/__init__.py index 473c265f..8ee9b53e 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -22,4 +22,4 @@ "getTreeWalker", "serialize"] # this has to be at the top level, see how setup.py parses this -__version__ = "0.999999999-dev" +__version__ = "0.999999999" From 983a9355ea66a8c1626a42fd0682b48e246685bd Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Fri, 15 Jul 2016 02:24:33 +0100 Subject: [PATCH 3/3] And back to dev. --- html5lib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/__init__.py b/html5lib/__init__.py index 8ee9b53e..f3cd9455 100644 --- a/html5lib/__init__.py +++ b/html5lib/__init__.py @@ -22,4 +22,4 @@ "getTreeWalker", "serialize"] # this has to be at the top level, see how setup.py parses this -__version__ = "0.999999999" +__version__ = "0.9999999999-dev"