html5lib · May 17, 2016
diff --git a/‎html5lib/serializer/htmlserializer.py
Lines changed: 6 additions & 5 deletions b/‎html5lib/serializer/htmlserializer.py
Lines changed: 6 additions & 5 deletions
diff --git a/‎html5lib/tests/conftest.py
Lines changed: 5 additions & 0 deletions b/‎html5lib/tests/conftest.py
Lines changed: 5 additions & 0 deletions
diff --git a/‎html5lib/tests/sanitizer-testdata/tests1.dat
Lines changed: 433 additions & 0 deletions b/‎html5lib/tests/sanitizer-testdata/tests1.dat
Lines changed: 433 additions & 0 deletions
diff --git a/‎html5lib/tests/sanitizer.py
Lines changed: 50 additions & 0 deletions b/‎html5lib/tests/sanitizer.py
Lines changed: 50 additions & 0 deletions
@@ -184,6 +184,12 @@ def serialize(self, treewalker, encoding=None):
         if encoding and self.inject_meta_charset:
             from ..filters.inject_meta_charset import Filter
             treewalker = Filter(treewalker, encoding)
+        # Alphabetical attributes is here under the assumption that none of
+        # the later filters add or change order of attributes; it needs to be
+        # before the sanitizer so escaped elements come out correctly
+        if self.alphabetical_attributes:
+            from ..filters.alphabeticalattributes import Filter
+            treewalker = Filter(treewalker)
         # WhitespaceFilter should be used before OptionalTagFilter
         # for maximum efficiently of this latter filter
         if self.strip_whitespace:
@@ -195,11 +201,6 @@ def serialize(self, treewalker, encoding=None):
         if self.omit_optional_tags:
             from ..filters.optionaltags import Filter
             treewalker = Filter(treewalker)
-        # Alphabetical attributes must be last, as other filters
-        # could add attributes and alter the order
-        if self.alphabetical_attributes:
-            from ..filters.alphabeticalattributes import Filter
-            treewalker = Filter(treewalker)
 
         for token in treewalker:
             type = token["type"]
 
@@ -2,11 +2,13 @@
 
 from .tree_construction import TreeConstructionFile
 from .tokenizer import TokenizerFile
+from .sanitizer import SanitizerFile
 
 _dir = os.path.abspath(os.path.dirname(__file__))
 _testdata = os.path.join(_dir, "testdata")
 _tree_construction = os.path.join(_testdata, "tree-construction")
 _tokenizer = os.path.join(_testdata, "tokenizer")
+_sanitizer_testdata = os.path.join(_dir, "sanitizer-testdata")
 
 
 def pytest_collectstart():
@@ -24,3 +26,6 @@ def pytest_collect_file(path, parent):
     elif dir == _tokenizer:
         if path.ext == ".test":
             return TokenizerFile(path, parent)
+    elif dir == _sanitizer_testdata:
+        if path.ext == ".dat":
+            return SanitizerFile(path, parent)
@@ -0,0 +1,433 @@
+[
+  {
+    "name": "IE_Comments",
+    "input": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->",
+    "output": ""
+  },
+
+  {
+    "name": "IE_Comments_2",
+    "input": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
+    "output": "&lt;script&gt;alert('XSS');&lt;/script&gt;"
+  },
+
+  {
+    "name": "allow_colons_in_path_component",
+    "input": "<a href=\"./this:that\">foo</a>",
+    "output": "<a href='./this:that'>foo</a>"
+  },
+
+  {
+    "name": "background_attribute",
+    "input": "<div background=\"javascript:alert('XSS')\"></div>",
+    "output": "<div></div>"
+  },
+
+  {
+    "name": "bgsound",
+    "input": "<bgsound src=\"javascript:alert('XSS');\" />",
+    "output": "&lt;bgsound src=\"javascript:alert('XSS');\"&gt;&lt;/bgsound&gt;"
+  },
+
+  {
+    "name": "div_background_image_unicode_encoded",
+    "input": "<div style=\"background-image:\u00a5\u00a2\u006C\u0028'\u006a\u0061\u00a6\u0061\u00a3\u0063\u00a2\u0069\u00a0\u00a4\u003a\u0061\u006c\u0065\u00a2\u00a4\u0028.1027\u0058.1053\u0053\u0027\u0029'\u0029\">foo</div>",
+    "output": "<div style=''>foo</div>"
+  },
+
+  {
+    "name": "div_expression",
+    "input": "<div style=\"width: expression(alert('XSS'));\">foo</div>",
+    "output": "<div style=''>foo</div>"
+  },
+
+  {
+    "name": "double_open_angle_brackets",
+    "input": "<img src=http://ha.ckers.org/scriptlet.html <",
+    "output": ""
+  },
+
+  {
+    "name": "double_open_angle_brackets_2",
+    "input": "<script src=http://ha.ckers.org/scriptlet.html <",
+    "output": ""
+  },
+
+  {
+    "name": "grave_accents",
+    "input": "<img src=`javascript:alert('XSS')` />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "img_dynsrc_lowsrc",
+    "input": "<img dynsrc=\"javascript:alert('XSS')\" />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "img_vbscript",
+    "input": "<img src='vbscript:msgbox(\"XSS\")' />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "input_image",
+    "input": "<input type=\"image\" src=\"javascript:alert('XSS');\" />",
+    "output": "<input type='image'/>"
+  },
+
+  {
+    "name": "link_stylesheets",
+    "input": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\" />",
+    "output": "&lt;link href=\"javascript:alert('XSS');\" rel=\"stylesheet\"&gt;"
+  },
+
+  {
+    "name": "link_stylesheets_2",
+    "input": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\" />",
+    "output": "&lt;link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\"&gt;"
+  },
+
+  {
+    "name": "list_style_image",
+    "input": "<li style=\"list-style-image: url(javascript:alert('XSS'))\">foo</li>",
+    "output": "<li style=''>foo</li>"
+  },
+
+  {
+    "name": "no_closing_script_tags",
+    "input": "<script src=http://ha.ckers.org/xss.js?<b>",
+    "output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;"
+  },
+
+  {
+    "name": "non_alpha_non_digit",
+    "input": "<script/XSS src=\"http://ha.ckers.org/xss.js\"></script>",
+    "output": "&lt;script src=\"http://ha.ckers.org/xss.js\" xss=\"\"&gt;&lt;/script&gt;"
+  },
+
+  {
+    "name": "non_alpha_non_digit_2",
+    "input": "<a onclick!\\#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>",
+    "output": "<a>foo</a>"
+  },
+
+  {
+    "name": "non_alpha_non_digit_3",
+    "input": "<img/src=\"http://ha.ckers.org/xss.js\"/>",
+    "output": "<img src='http://ha.ckers.org/xss.js'/>"
+  },
+
+  {
+    "name": "non_alpha_non_digit_II",
+    "input": "<a href!\\#$%&()*~+-_.,:;?@[/|]^`=alert('XSS')>foo</a>",
+    "output": "<a>foo</a>"
+  },
+
+  {
+    "name": "non_alpha_non_digit_III",
+    "input": "<a/href=\"javascript:alert('XSS');\">foo</a>",
+    "output": "<a>foo</a>"
+  },
+
+  {
+    "name": "platypus",
+    "input": "<a href=\"http://www.ragingplatypus.com/\" style=\"display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;\">never trust your upstream platypus</a>",
+    "output": "<a href='http://www.ragingplatypus.com/' style='display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;'>never trust your upstream platypus</a>"
+  },
+
+  {
+    "name": "protocol_resolution_in_script_tag",
+    "input": "<script src=//ha.ckers.org/.j></script>",
+    "output": "&lt;script src=\"//ha.ckers.org/.j\"&gt;&lt;/script&gt;"
+  },
+
+  {
+    "name": "should_allow_anchors",
+    "input": "<a href='foo' onclick='bar'><script>baz</script></a>",
+    "output": "<a href='foo'>&lt;script&gt;baz&lt;/script&gt;</a>"
+  },
+
+  {
+    "name": "should_allow_image_alt_attribute",
+    "input": "<img alt='foo' onclick='bar' />",
+    "output": "<img alt='foo'/>"
+  },
+
+  {
+    "name": "should_allow_image_height_attribute",
+    "input": "<img height='foo' onclick='bar' />",
+    "output": "<img height='foo'/>"
+  },
+
+  {
+    "name": "should_allow_image_src_attribute",
+    "input": "<img src='foo' onclick='bar' />",
+    "output": "<img src='foo'/>"
+  },
+
+  {
+    "name": "should_allow_image_width_attribute",
+    "input": "<img width='foo' onclick='bar' />",
+    "output": "<img width='foo'/>"
+  },
+
+  {
+    "name": "should_handle_blank_text",
+    "input": "",
+    "output": ""
+  },
+
+  {
+    "name": "should_handle_malformed_image_tags",
+    "input": "<img \"\"\"><script>alert(\"XSS\")</script>\">",
+    "output": "<img/>&lt;script&gt;alert(\"XSS\")&lt;/script&gt;\"&gt;"
+  },
+
+  {
+    "name": "should_handle_non_html",
+    "input": "abc",
+    "output": "abc"
+  },
+
+  {
+    "name": "should_not_fall_for_ridiculous_hack",
+    "input": "<img\nsrc\n=\n\"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n\"\n />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_0",
+    "input": "<img src=\"javascript:alert('XSS');\" />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_1",
+    "input": "<img src=javascript:alert('XSS') />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_10",
+    "input": "<img src=\"jav&#x0A;ascript:alert('XSS');\" />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_11",
+    "input": "<img src=\"jav&#x0D;ascript:alert('XSS');\" />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_12",
+    "input": "<img src=\" &#14;  javascript:alert('XSS');\" />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_13",
+    "input": "<img src=\"&#x20;javascript:alert('XSS');\" />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_14",
+    "input": "<img src=\"&#xA0;javascript:alert('XSS');\" />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_2",
+    "input": "<img src=\"JaVaScRiPt:alert('XSS')\" />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_3",
+    "input": "<img src='javascript:alert(&quot;XSS&quot;)' />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_4",
+    "input": "<img src='javascript:alert(String.fromCharCode(88,83,83))' />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_5",
+    "input": "<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_6",
+    "input": "<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_7",
+    "input": "<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_8",
+    "input": "<img src=\"jav\tascript:alert('XSS');\" />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_not_fall_for_xss_image_hack_9",
+    "input": "<img src=\"jav&#x09;ascript:alert('XSS');\" />",
+    "output": "<img/>"
+  },
+
+  {
+    "name": "should_sanitize_half_open_scripts",
+    "input": "<img src=\"javascript:alert('XSS')\"",
+    "output": ""
+  },
+
+  {
+    "name": "should_sanitize_invalid_script_tag",
+    "input": "<script/XSS SRC=\"http://ha.ckers.org/xss.js\"></script>",
+    "output": "&lt;script src=\"http://ha.ckers.org/xss.js\" xss=\"\"&gt;&lt;/script&gt;"
+  },
+
+  {
+    "name": "should_sanitize_script_tag_with_multiple_open_brackets",
+    "input": "<<script>alert(\"XSS\");//<</script>",
+    "output": "&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;"
+  },
+
+  {
+    "name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
+    "input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
+    "output": ""
+  },
+
+  {
+    "name": "should_sanitize_tag_broken_up_by_null",
+    "input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
+    "output": "&lt;scr\ufffdipt&gt;alert(\"XSS\")&lt;/scr\ufffdipt&gt;"
+  },
+
+  {
+    "name": "should_sanitize_unclosed_script",
+    "input": "<script src=http://ha.ckers.org/xss.js?<b>",
+    "output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;"
+  },
+
+  {
+    "name": "should_strip_href_attribute_in_a_with_bad_protocols",
+    "input": "<a href=\"javascript:XSS\" title=\"1\">boo</a>",
+    "output": "<a title='1'>boo</a>"
+  },
+
+  {
+    "name": "should_strip_href_attribute_in_a_with_bad_protocols_and_whitespace",
+    "input": "<a href=\" javascript:XSS\" title=\"1\">boo</a>",
+    "output": "<a title='1'>boo</a>"
+  },
+
+  {
+    "name": "should_strip_src_attribute_in_img_with_bad_protocols",
+    "input": "<img src=\"javascript:XSS\" title=\"1\">boo</img>",
+    "output": "<img title='1'/>boo"
+  },
+
+  {
+    "name": "should_strip_src_attribute_in_img_with_bad_protocols_and_whitespace",
+    "input": "<img src=\" javascript:XSS\" title=\"1\">boo</img>",
+    "output": "<img title='1'/>boo"
+  },
+
+  {
+    "name": "xml_base",
+    "input": "<div xml:base=\"javascript:alert('XSS');//\">foo</div>",
+    "output": "<div>foo</div>"
+  },
+
+  {
+    "name": "xul",
+    "input": "<p style=\"-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')\">fubar</p>",
+    "output": "<p style=''>fubar</p>"
+  },
+
+  {
+    "name": "quotes_in_attributes",
+    "input": "<img src='foo' title='\"foo\" bar' />",
+    "output": "<img src='foo' title='\"foo\" bar'/>"
+  },
+
+  {
+    "name": "uri_refs_in_svg_attributes",
+    "input": "<svg><rect fill='url(#foo)' />",
+    "output": "<svg><rect fill='url(#foo)'></rect></svg>"
+  },
+
+  {
+    "name": "absolute_uri_refs_in_svg_attributes",
+    "input": "<svg><rect fill='url(http://bad.com/) #fff' />",
+    "output": "<svg><rect fill='  #fff'></rect></svg>"
+  },
+
+  {
+    "name": "uri_ref_with_space_in svg_attribute",
+    "input": "<svg><rect fill='url(\n#foo)' />",
+    "output": "<svg><rect fill='url(\n#foo)'></rect></svg>"
+  },
+
+  {
+    "name": "absolute_uri_ref_with_space_in svg_attribute",
+    "input": "<svg><rect fill=\"url(\nhttp://bad.com/)\" />",
+    "output": "<svg><rect fill=' '></rect></svg>"
+  },
+
+  {
+    "name": "allow_html5_image_tag",
+    "input": "<image src='foo' />",
+    "output": "<img src='foo'/>"
+  },
+
+  {
+    "name": "style_attr_end_with_nothing",
+    "input": "<div style=\"color: blue\" />",
+    "output": "<div style='color: blue;'></div>"
+  },
+
+  {
+    "name": "style_attr_end_with_space",
+    "input": "<div style=\"color: blue \" />",
+    "output": "<div style='color: blue ;'></div>"
+  },
+
+  {
+    "name": "style_attr_end_with_semicolon",
+    "input": "<div style=\"color: blue;\" />",
+    "output": "<div style='color: blue;'></div>"
+  },
+
+  {
+    "name": "style_attr_end_with_semicolon_space",
+    "input": "<div style=\"color: blue; \" />",
+    "output": "<div style='color: blue;'></div>"
+  },
+  
+  {
+   "name": "attributes_with_embedded_quotes",
+   "input": "<img src=doesntexist.jpg\"'onerror=\"alert(1) />",
+   "output": "<img src='doesntexist.jpg\"&#39;onerror=\"alert(1)'/>"
+  },
+  
+  {
+   "name": "attributes_with_embedded_quotes_II",
+   "input": "<img src=notthere.jpg\"\"onerror=\"alert(2) />",
+   "output": "<img src='notthere.jpg\"\"onerror=\"alert(2)'/>"
+  }
+]
@@ -0,0 +1,50 @@
+from __future__ import absolute_import, division, unicode_literals
+
+import codecs
+import json
+
+import pytest
+
+from html5lib import parseFragment, serialize
+
+
+class SanitizerFile(pytest.File):
+    def collect(self):
+        with codecs.open(str(self.fspath), "r", encoding="utf-8") as fp:
+            tests = json.load(fp)
+        for i, test in enumerate(tests):
+            yield SanitizerTest(str(i), self, test=test)
+
+
+class SanitizerTest(pytest.Item):
+    def __init__(self, name, parent, test):
+        super(SanitizerTest, self).__init__(name, parent)
+        self.obj = lambda: 1  # this is to hack around skipif needing a function!
+        self.test = test
+
+    def runtest(self):
+        input = self.test["input"]
+        expected = self.test["output"]
+
+        parsed = parseFragment(input)
+        serialized = serialize(parsed,
+                               sanitize=True,
+                               omit_optional_tags=False,
+                               use_trailing_solidus=True,
+                               space_before_trailing_solidus=False,
+                               quote_attr_values="always",
+                               quote_char="'",
+                               alphabetical_attributes=True)
+        errorMsg = "\n".join(["\n\nInput:", input,
+                              "\nExpected:", expected,
+                              "\nReceived:", serialized])
+        assert expected == serialized, errorMsg
+
+    def repr_failure(self, excinfo):
+        traceback = excinfo.traceback
+        ntraceback = traceback.cut(path=__file__)
+        excinfo.traceback = ntraceback.filter()
+
+        return excinfo.getrepr(funcargs=True,
+                               showlocals=False,
+                               style="short", tbfilter=False)