Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit 556043b

Browse files
committed May 17, 2016
Reintroduce the old sanitizer testsuite from html5lib-tests
This is imported into this repo because its expectations are very much implementation-dependent
1 parent 516227e commit 556043b

File tree

4 files changed

+494
-5
lines changed

4 files changed

+494
-5
lines changed
 

‎html5lib/serializer/htmlserializer.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,12 @@ def serialize(self, treewalker, encoding=None):
184184
if encoding and self.inject_meta_charset:
185185
from ..filters.inject_meta_charset import Filter
186186
treewalker = Filter(treewalker, encoding)
187+
# Alphabetical attributes is here under the assumption that none of
188+
# the later filters add or change order of attributes; it needs to be
189+
# before the sanitizer so escaped elements come out correctly
190+
if self.alphabetical_attributes:
191+
from ..filters.alphabeticalattributes import Filter
192+
treewalker = Filter(treewalker)
187193
# WhitespaceFilter should be used before OptionalTagFilter
188194
# for maximum efficiency of this latter filter
189195
if self.strip_whitespace:
@@ -195,11 +201,6 @@ def serialize(self, treewalker, encoding=None):
195201
if self.omit_optional_tags:
196202
from ..filters.optionaltags import Filter
197203
treewalker = Filter(treewalker)
198-
# Alphabetical attributes must be last, as other filters
199-
# could add attributes and alter the order
200-
if self.alphabetical_attributes:
201-
from ..filters.alphabeticalattributes import Filter
202-
treewalker = Filter(treewalker)
203204

204205
for token in treewalker:
205206
type = token["type"]

‎html5lib/tests/conftest.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@
22

33
from .tree_construction import TreeConstructionFile
44
from .tokenizer import TokenizerFile
5+
from .sanitizer import SanitizerFile
56

67
_dir = os.path.abspath(os.path.dirname(__file__))
78
_testdata = os.path.join(_dir, "testdata")
89
_tree_construction = os.path.join(_testdata, "tree-construction")
910
_tokenizer = os.path.join(_testdata, "tokenizer")
11+
_sanitizer_testdata = os.path.join(_dir, "sanitizer-testdata")
1012

1113

1214
def pytest_collectstart():
@@ -24,3 +26,6 @@ def pytest_collect_file(path, parent):
2426
elif dir == _tokenizer:
2527
if path.ext == ".test":
2628
return TokenizerFile(path, parent)
29+
elif dir == _sanitizer_testdata:
30+
if path.ext == ".dat":
31+
return SanitizerFile(path, parent)
Lines changed: 433 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,433 @@
1+
[
2+
{
3+
"name": "IE_Comments",
4+
"input": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->",
5+
"output": ""
6+
},
7+
8+
{
9+
"name": "IE_Comments_2",
10+
"input": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
11+
"output": "&lt;script&gt;alert('XSS');&lt;/script&gt;"
12+
},
13+
14+
{
15+
"name": "allow_colons_in_path_component",
16+
"input": "<a href=\"./this:that\">foo</a>",
17+
"output": "<a href='./this:that'>foo</a>"
18+
},
19+
20+
{
21+
"name": "background_attribute",
22+
"input": "<div background=\"javascript:alert('XSS')\"></div>",
23+
"output": "<div></div>"
24+
},
25+
26+
{
27+
"name": "bgsound",
28+
"input": "<bgsound src=\"javascript:alert('XSS');\" />",
29+
"output": "&lt;bgsound src=\"javascript:alert('XSS');\"&gt;&lt;/bgsound&gt;"
30+
},
31+
32+
{
33+
"name": "div_background_image_unicode_encoded",
34+
"input": "<div style=\"background-image:\u00a5\u00a2\u006C\u0028'\u006a\u0061\u00a6\u0061\u00a3\u0063\u00a2\u0069\u00a0\u00a4\u003a\u0061\u006c\u0065\u00a2\u00a4\u0028.1027\u0058.1053\u0053\u0027\u0029'\u0029\">foo</div>",
35+
"output": "<div style=''>foo</div>"
36+
},
37+
38+
{
39+
"name": "div_expression",
40+
"input": "<div style=\"width: expression(alert('XSS'));\">foo</div>",
41+
"output": "<div style=''>foo</div>"
42+
},
43+
44+
{
45+
"name": "double_open_angle_brackets",
46+
"input": "<img src=http://ha.ckers.org/scriptlet.html <",
47+
"output": ""
48+
},
49+
50+
{
51+
"name": "double_open_angle_brackets_2",
52+
"input": "<script src=http://ha.ckers.org/scriptlet.html <",
53+
"output": ""
54+
},
55+
56+
{
57+
"name": "grave_accents",
58+
"input": "<img src=`javascript:alert('XSS')` />",
59+
"output": "<img/>"
60+
},
61+
62+
{
63+
"name": "img_dynsrc_lowsrc",
64+
"input": "<img dynsrc=\"javascript:alert('XSS')\" />",
65+
"output": "<img/>"
66+
},
67+
68+
{
69+
"name": "img_vbscript",
70+
"input": "<img src='vbscript:msgbox(\"XSS\")' />",
71+
"output": "<img/>"
72+
},
73+
74+
{
75+
"name": "input_image",
76+
"input": "<input type=\"image\" src=\"javascript:alert('XSS');\" />",
77+
"output": "<input type='image'/>"
78+
},
79+
80+
{
81+
"name": "link_stylesheets",
82+
"input": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\" />",
83+
"output": "&lt;link href=\"javascript:alert('XSS');\" rel=\"stylesheet\"&gt;"
84+
},
85+
86+
{
87+
"name": "link_stylesheets_2",
88+
"input": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\" />",
89+
"output": "&lt;link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\"&gt;"
90+
},
91+
92+
{
93+
"name": "list_style_image",
94+
"input": "<li style=\"list-style-image: url(javascript:alert('XSS'))\">foo</li>",
95+
"output": "<li style=''>foo</li>"
96+
},
97+
98+
{
99+
"name": "no_closing_script_tags",
100+
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
101+
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;"
102+
},
103+
104+
{
105+
"name": "non_alpha_non_digit",
106+
"input": "<script/XSS src=\"http://ha.ckers.org/xss.js\"></script>",
107+
"output": "&lt;script src=\"http://ha.ckers.org/xss.js\" xss=\"\"&gt;&lt;/script&gt;"
108+
},
109+
110+
{
111+
"name": "non_alpha_non_digit_2",
112+
"input": "<a onclick!\\#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>",
113+
"output": "<a>foo</a>"
114+
},
115+
116+
{
117+
"name": "non_alpha_non_digit_3",
118+
"input": "<img/src=\"http://ha.ckers.org/xss.js\"/>",
119+
"output": "<img src='http://ha.ckers.org/xss.js'/>"
120+
},
121+
122+
{
123+
"name": "non_alpha_non_digit_II",
124+
"input": "<a href!\\#$%&()*~+-_.,:;?@[/|]^`=alert('XSS')>foo</a>",
125+
"output": "<a>foo</a>"
126+
},
127+
128+
{
129+
"name": "non_alpha_non_digit_III",
130+
"input": "<a/href=\"javascript:alert('XSS');\">foo</a>",
131+
"output": "<a>foo</a>"
132+
},
133+
134+
{
135+
"name": "platypus",
136+
"input": "<a href=\"http://www.ragingplatypus.com/\" style=\"display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;\">never trust your upstream platypus</a>",
137+
"output": "<a href='http://www.ragingplatypus.com/' style='display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;'>never trust your upstream platypus</a>"
138+
},
139+
140+
{
141+
"name": "protocol_resolution_in_script_tag",
142+
"input": "<script src=//ha.ckers.org/.j></script>",
143+
"output": "&lt;script src=\"//ha.ckers.org/.j\"&gt;&lt;/script&gt;"
144+
},
145+
146+
{
147+
"name": "should_allow_anchors",
148+
"input": "<a href='foo' onclick='bar'><script>baz</script></a>",
149+
"output": "<a href='foo'>&lt;script&gt;baz&lt;/script&gt;</a>"
150+
},
151+
152+
{
153+
"name": "should_allow_image_alt_attribute",
154+
"input": "<img alt='foo' onclick='bar' />",
155+
"output": "<img alt='foo'/>"
156+
},
157+
158+
{
159+
"name": "should_allow_image_height_attribute",
160+
"input": "<img height='foo' onclick='bar' />",
161+
"output": "<img height='foo'/>"
162+
},
163+
164+
{
165+
"name": "should_allow_image_src_attribute",
166+
"input": "<img src='foo' onclick='bar' />",
167+
"output": "<img src='foo'/>"
168+
},
169+
170+
{
171+
"name": "should_allow_image_width_attribute",
172+
"input": "<img width='foo' onclick='bar' />",
173+
"output": "<img width='foo'/>"
174+
},
175+
176+
{
177+
"name": "should_handle_blank_text",
178+
"input": "",
179+
"output": ""
180+
},
181+
182+
{
183+
"name": "should_handle_malformed_image_tags",
184+
"input": "<img \"\"\"><script>alert(\"XSS\")</script>\">",
185+
"output": "<img/>&lt;script&gt;alert(\"XSS\")&lt;/script&gt;\"&gt;"
186+
},
187+
188+
{
189+
"name": "should_handle_non_html",
190+
"input": "abc",
191+
"output": "abc"
192+
},
193+
194+
{
195+
"name": "should_not_fall_for_ridiculous_hack",
196+
"input": "<img\nsrc\n=\n\"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n\"\n />",
197+
"output": "<img/>"
198+
},
199+
200+
{
201+
"name": "should_not_fall_for_xss_image_hack_0",
202+
"input": "<img src=\"javascript:alert('XSS');\" />",
203+
"output": "<img/>"
204+
},
205+
206+
{
207+
"name": "should_not_fall_for_xss_image_hack_1",
208+
"input": "<img src=javascript:alert('XSS') />",
209+
"output": "<img/>"
210+
},
211+
212+
{
213+
"name": "should_not_fall_for_xss_image_hack_10",
214+
"input": "<img src=\"jav&#x0A;ascript:alert('XSS');\" />",
215+
"output": "<img/>"
216+
},
217+
218+
{
219+
"name": "should_not_fall_for_xss_image_hack_11",
220+
"input": "<img src=\"jav&#x0D;ascript:alert('XSS');\" />",
221+
"output": "<img/>"
222+
},
223+
224+
{
225+
"name": "should_not_fall_for_xss_image_hack_12",
226+
"input": "<img src=\" &#14; javascript:alert('XSS');\" />",
227+
"output": "<img/>"
228+
},
229+
230+
{
231+
"name": "should_not_fall_for_xss_image_hack_13",
232+
"input": "<img src=\"&#x20;javascript:alert('XSS');\" />",
233+
"output": "<img/>"
234+
},
235+
236+
{
237+
"name": "should_not_fall_for_xss_image_hack_14",
238+
"input": "<img src=\"&#xA0;javascript:alert('XSS');\" />",
239+
"output": "<img/>"
240+
},
241+
242+
{
243+
"name": "should_not_fall_for_xss_image_hack_2",
244+
"input": "<img src=\"JaVaScRiPt:alert('XSS')\" />",
245+
"output": "<img/>"
246+
},
247+
248+
{
249+
"name": "should_not_fall_for_xss_image_hack_3",
250+
"input": "<img src='javascript:alert(&quot;XSS&quot;)' />",
251+
"output": "<img/>"
252+
},
253+
254+
{
255+
"name": "should_not_fall_for_xss_image_hack_4",
256+
"input": "<img src='javascript:alert(String.fromCharCode(88,83,83))' />",
257+
"output": "<img/>"
258+
},
259+
260+
{
261+
"name": "should_not_fall_for_xss_image_hack_5",
262+
"input": "<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />",
263+
"output": "<img/>"
264+
},
265+
266+
{
267+
"name": "should_not_fall_for_xss_image_hack_6",
268+
"input": "<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />",
269+
"output": "<img/>"
270+
},
271+
272+
{
273+
"name": "should_not_fall_for_xss_image_hack_7",
274+
"input": "<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />",
275+
"output": "<img/>"
276+
},
277+
278+
{
279+
"name": "should_not_fall_for_xss_image_hack_8",
280+
"input": "<img src=\"jav\tascript:alert('XSS');\" />",
281+
"output": "<img/>"
282+
},
283+
284+
{
285+
"name": "should_not_fall_for_xss_image_hack_9",
286+
"input": "<img src=\"jav&#x09;ascript:alert('XSS');\" />",
287+
"output": "<img/>"
288+
},
289+
290+
{
291+
"name": "should_sanitize_half_open_scripts",
292+
"input": "<img src=\"javascript:alert('XSS')\"",
293+
"output": ""
294+
},
295+
296+
{
297+
"name": "should_sanitize_invalid_script_tag",
298+
"input": "<script/XSS SRC=\"http://ha.ckers.org/xss.js\"></script>",
299+
"output": "&lt;script src=\"http://ha.ckers.org/xss.js\" xss=\"\"&gt;&lt;/script&gt;"
300+
},
301+
302+
{
303+
"name": "should_sanitize_script_tag_with_multiple_open_brackets",
304+
"input": "<<script>alert(\"XSS\");//<</script>",
305+
"output": "&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;"
306+
},
307+
308+
{
309+
"name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
310+
"input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
311+
"output": ""
312+
},
313+
314+
{
315+
"name": "should_sanitize_tag_broken_up_by_null",
316+
"input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
317+
"output": "&lt;scr\ufffdipt&gt;alert(\"XSS\")&lt;/scr\ufffdipt&gt;"
318+
},
319+
320+
{
321+
"name": "should_sanitize_unclosed_script",
322+
"input": "<script src=http://ha.ckers.org/xss.js?<b>",
323+
"output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;&lt;/script&gt;"
324+
},
325+
326+
{
327+
"name": "should_strip_href_attribute_in_a_with_bad_protocols",
328+
"input": "<a href=\"javascript:XSS\" title=\"1\">boo</a>",
329+
"output": "<a title='1'>boo</a>"
330+
},
331+
332+
{
333+
"name": "should_strip_href_attribute_in_a_with_bad_protocols_and_whitespace",
334+
"input": "<a href=\" javascript:XSS\" title=\"1\">boo</a>",
335+
"output": "<a title='1'>boo</a>"
336+
},
337+
338+
{
339+
"name": "should_strip_src_attribute_in_img_with_bad_protocols",
340+
"input": "<img src=\"javascript:XSS\" title=\"1\">boo</img>",
341+
"output": "<img title='1'/>boo"
342+
},
343+
344+
{
345+
"name": "should_strip_src_attribute_in_img_with_bad_protocols_and_whitespace",
346+
"input": "<img src=\" javascript:XSS\" title=\"1\">boo</img>",
347+
"output": "<img title='1'/>boo"
348+
},
349+
350+
{
351+
"name": "xml_base",
352+
"input": "<div xml:base=\"javascript:alert('XSS');//\">foo</div>",
353+
"output": "<div>foo</div>"
354+
},
355+
356+
{
357+
"name": "xul",
358+
"input": "<p style=\"-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')\">fubar</p>",
359+
"output": "<p style=''>fubar</p>"
360+
},
361+
362+
{
363+
"name": "quotes_in_attributes",
364+
"input": "<img src='foo' title='\"foo\" bar' />",
365+
"output": "<img src='foo' title='\"foo\" bar'/>"
366+
},
367+
368+
{
369+
"name": "uri_refs_in_svg_attributes",
370+
"input": "<svg><rect fill='url(#foo)' />",
371+
"output": "<svg><rect fill='url(#foo)'></rect></svg>"
372+
},
373+
374+
{
375+
"name": "absolute_uri_refs_in_svg_attributes",
376+
"input": "<svg><rect fill='url(http://bad.com/) #fff' />",
377+
"output": "<svg><rect fill=' #fff'></rect></svg>"
378+
},
379+
380+
{
381+
"name": "uri_ref_with_space_in svg_attribute",
382+
"input": "<svg><rect fill='url(\n#foo)' />",
383+
"output": "<svg><rect fill='url(\n#foo)'></rect></svg>"
384+
},
385+
386+
{
387+
"name": "absolute_uri_ref_with_space_in svg_attribute",
388+
"input": "<svg><rect fill=\"url(\nhttp://bad.com/)\" />",
389+
"output": "<svg><rect fill=' '></rect></svg>"
390+
},
391+
392+
{
393+
"name": "allow_html5_image_tag",
394+
"input": "<image src='foo' />",
395+
"output": "<img src='foo'/>"
396+
},
397+
398+
{
399+
"name": "style_attr_end_with_nothing",
400+
"input": "<div style=\"color: blue\" />",
401+
"output": "<div style='color: blue;'></div>"
402+
},
403+
404+
{
405+
"name": "style_attr_end_with_space",
406+
"input": "<div style=\"color: blue \" />",
407+
"output": "<div style='color: blue ;'></div>"
408+
},
409+
410+
{
411+
"name": "style_attr_end_with_semicolon",
412+
"input": "<div style=\"color: blue;\" />",
413+
"output": "<div style='color: blue;'></div>"
414+
},
415+
416+
{
417+
"name": "style_attr_end_with_semicolon_space",
418+
"input": "<div style=\"color: blue; \" />",
419+
"output": "<div style='color: blue;'></div>"
420+
},
421+
422+
{
423+
"name": "attributes_with_embedded_quotes",
424+
"input": "<img src=doesntexist.jpg\"'onerror=\"alert(1) />",
425+
"output": "<img src='doesntexist.jpg\"&#39;onerror=\"alert(1)'/>"
426+
},
427+
428+
{
429+
"name": "attributes_with_embedded_quotes_II",
430+
"input": "<img src=notthere.jpg\"\"onerror=\"alert(2) />",
431+
"output": "<img src='notthere.jpg\"\"onerror=\"alert(2)'/>"
432+
}
433+
]

‎html5lib/tests/sanitizer.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from __future__ import absolute_import, division, unicode_literals
2+
3+
import codecs
4+
import json
5+
6+
import pytest
7+
8+
from html5lib import parseFragment, serialize
9+
10+
11+
class SanitizerFile(pytest.File):
    """Pytest collector for a single JSON file of sanitizer test cases."""

    def collect(self):
        """Yield one ``SanitizerTest`` per entry of the JSON file.

        The file is decoded as UTF-8 and parsed as JSON; each resulting
        item is named after its zero-based position in the parsed list.
        """
        path = str(self.fspath)
        with codecs.open(path, "r", encoding="utf-8") as handle:
            cases = json.load(handle)
        for index, case in enumerate(cases):
            yield SanitizerTest(str(index), self, test=case)
17+
18+
19+
class SanitizerTest(pytest.Item):
    """One sanitizer test case: parse, sanitize while serializing, compare."""

    def __init__(self, name, parent, test):
        """Store the test case mapping (read via its "input"/"output" keys)."""
        super(SanitizerTest, self).__init__(name, parent)
        # Hack: skipif needs a function on the item, so provide a dummy one.
        self.obj = lambda: 1
        self.test = test

    def runtest(self):
        """Serialize the parsed input with sanitizing on and check the result.

        Fails with a message listing the input, the expected output and
        the received output when the two strings differ.
        """
        source = self.test["input"]
        wanted = self.test["output"]

        # Serializer options used for every sanitizer test case.
        serializer_options = dict(
            sanitize=True,
            omit_optional_tags=False,
            use_trailing_solidus=True,
            space_before_trailing_solidus=False,
            quote_attr_values="always",
            quote_char="'",
            alphabetical_attributes=True,
        )

        fragment = parseFragment(source)
        received = serialize(fragment, **serializer_options)

        report_lines = [
            "\n\nInput:", source,
            "\nExpected:", wanted,
            "\nReceived:", received,
        ]
        assert wanted == received, "\n".join(report_lines)

    def repr_failure(self, excinfo):
        """Return a short failure report with the traceback cut to this file."""
        trimmed = excinfo.traceback.cut(path=__file__)
        excinfo.traceback = trimmed.filter()

        return excinfo.getrepr(
            funcargs=True,
            showlocals=False,
            style="short",
            tbfilter=False,
        )

0 commit comments

Comments
 (0)
Please sign in to comment.