html5lib · gsnedders · Jun 7, 2020 · May 21, 2020 · May 21, 2020 · May 21, 2020
diff --git a/.pytest.expect b/.pytest.expect
@@ -1,7 +1,7 @@
 pytest-expect file v1
-(2, 7, 11, 'final', 0)
-b'html5lib/tests/test_encoding.py::test_encoding::[110]': FAIL
-b'html5lib/tests/test_encoding.py::test_encoding::[111]': FAIL
+(2, 7, 18, 'final', 0)
+b'html5lib/tests/test_encoding.py::test_parser_encoding[<!DOCTYPE HTML>\\n<script>document.write(\'<meta charset="ISO-8859-\' + \'2">\')</script>-iso-8859-2]': FAIL
+b'html5lib/tests/test_encoding.py::test_prescan_encoding[<!DOCTYPE HTML>\\n<script>document.write(\'<meta charset="ISO-8859-\' + \'2">\')</script>-iso-8859-2]': FAIL
 u'html5lib/tests/testdata/tokenizer/test2.test::0::dataState': FAIL
 u'html5lib/tests/testdata/tokenizer/test3.test::228::dataState': FAIL
 u'html5lib/tests/testdata/tokenizer/test3.test::231::dataState': FAIL

diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
@@ -75,7 +75,15 @@ def test_parser_args_raises(kwargs):
     assert exc_info.value.args[0].startswith("Cannot set an encoding with a unicode input")
 
 
-def runParserEncodingTest(data, encoding):
+def param_encoding():
+    """Yield a (data, encoding) pair for each test found in the "encoding" data files."""
+    for path in get_data_files("encoding"):
+        for case in _TestData(path, b"data", encoding=None):
+            yield case[b'data'], case[b'encoding']
+
+
+@pytest.mark.parametrize("data, encoding", param_encoding())
+def test_parser_encoding(data, encoding):
     p = HTMLParser()
     assert p.documentEncoding is None
     p.parse(data, useChardet=False)
@@ -84,7 +92,8 @@ def runParserEncodingTest(data, encoding):
     assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
 
 
-def runPreScanEncodingTest(data, encoding):
+@pytest.mark.parametrize("data, encoding", param_encoding())
+def test_prescan_encoding(data, encoding):
     stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
     encoding = encoding.lower().decode("ascii")
 
@@ -95,14 +104,6 @@ def runPreScanEncodingTest(data, encoding):
     assert encoding == stream.charEncoding[0].name, errorMessage(data, encoding, stream.charEncoding[0].name)
 
 
-def test_encoding():
-    for filename in get_data_files("encoding"):
-        tests = _TestData(filename, b"data", encoding=None)
-        for test in tests:
-            yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
-            yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
-
-
 # pylint:disable=wrong-import-position
 try:
     import chardet  # noqa

diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py
@@ -1,21 +1,11 @@
 from __future__ import absolute_import, division, unicode_literals
 
+import pytest
+
 from html5lib import constants, parseFragment, serialize
 from html5lib.filters import sanitizer
 
 
-def runSanitizerTest(_, expected, input):
-    parsed = parseFragment(expected)
-    expected = serialize(parsed,
-                         omit_optional_tags=False,
-                         use_trailing_solidus=True,
-                         space_before_trailing_solidus=False,
-                         quote_attr_values="always",
-                         quote_char='"',
-                         alphabetical_attributes=True)
-    assert expected == sanitize_html(input)
-
-
 def sanitize_html(stream):
     parsed = parseFragment(stream)
     serialized = serialize(parsed,
@@ -59,27 +49,27 @@ def test_data_uri_disallowed_type():
     assert expected == sanitized
 
 
-def test_sanitizer():
+def param_sanitizer():
     for ns, tag_name in sanitizer.allowed_elements:
         if ns != constants.namespaces["html"]:
             continue
         if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td',
                         'tfoot', 'th', 'thead', 'tr', 'select']:
             continue  # TODO
         if tag_name == 'image':
-            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
+            yield ("test_should_allow_%s_tag" % tag_name,
                    "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
                    "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
         elif tag_name == 'br':
-            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
+            yield ("test_should_allow_%s_tag" % tag_name,
                    "<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
                    "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
         elif tag_name in constants.voidElements:
-            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
+            yield ("test_should_allow_%s_tag" % tag_name,
                    "<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
                    "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
         else:
-            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
+            yield ("test_should_allow_%s_tag" % tag_name,
                    "<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name),
                    "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
 
@@ -93,15 +83,15 @@ def test_sanitizer():
         attribute_value = 'foo'
         if attribute_name in sanitizer.attr_val_is_uri:
             attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.allowed_protocols[0]
-        yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
+        yield ("test_should_allow_%s_attribute" % attribute_name,
                "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
                "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value))
 
     for protocol in sanitizer.allowed_protocols:
         rest_of_uri = '//sub.domain.tld/path/object.ext'
         if protocol == 'data':
             rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
-        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
+        yield ("test_should_allow_uppercase_%s_uris" % protocol,
                "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
                """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))
 
@@ -110,11 +100,26 @@ def test_sanitizer():
         if protocol == 'data':
             rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
         protocol = protocol.upper()
-        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
+        yield ("test_should_allow_uppercase_%s_uris" % protocol,
                "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
                """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))
 
 
+@pytest.mark.parametrize(
+    "expected, input",
+    (pytest.param(want, markup, id=case_id)
+     for case_id, want, markup in param_sanitizer()))
+def test_sanitizer(expected, input):
+    """Sanitizing `input` must produce `expected` once it is re-serialized."""
+    # Parse and re-serialize the expected markup with fixed serializer options
+    # before comparing it against the sanitizer's output.
+    options = dict(omit_optional_tags=False, use_trailing_solidus=True,
+                   space_before_trailing_solidus=False, quote_attr_values="always",
+                   quote_char='"', alphabetical_attributes=True)
+    normalized = serialize(parseFragment(expected), **options)
+    assert normalized == sanitize_html(input)
+
+
 def test_lowercase_color_codes_in_style():
     sanitized = sanitize_html("<p style=\"border: 1px solid #a2a2a2;\"></p>")
     expected = '<p style=\"border: 1px solid #a2a2a2;\"></p>'

diff --git a/html5lib/tests/test_serializer.py b/html5lib/tests/test_serializer.py
@@ -89,19 +89,6 @@ def serialize_html(input, options):
     return serializer.render(stream, encoding)
 
 
-def runSerializerTest(input, expected, options):
-    encoding = options.get("encoding", None)
-
-    if encoding:
-        expected = list(map(lambda x: x.encode(encoding), expected))
-
-    result = serialize_html(input, options)
-    if len(expected) == 1:
-        assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (expected[0], result, str(options))
-    elif result not in expected:
-        assert False, "Expected: %s, Received: %s" % (expected, result)
-
-
 def throwsWithLatin1(input):
     with pytest.raises(UnicodeEncodeError):
         serialize_html(input, {"encoding": "iso-8859-1"})
@@ -120,13 +107,13 @@ def testDoctypeSystemId():
 
 
 def testCdataCharacters():
-    runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\u0101"]],
-                      ["<style>&amacr;"], {"encoding": "iso-8859-1"})
+    test_serializer([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\u0101"]],
+                    ["<style>&amacr;"], {"encoding": "iso-8859-1"})
 
 
 def testCharacters():
-    runSerializerTest([["Characters", "\u0101"]],
-                      ["&amacr;"], {"encoding": "iso-8859-1"})
+    test_serializer([["Characters", "\u0101"]],
+                    ["&amacr;"], {"encoding": "iso-8859-1"})
 
 
 def testStartTagName():
@@ -138,9 +125,9 @@ def testAttributeName():
 
 
 def testAttributeValue():
-    runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "span",
-                        [{"namespace": None, "name": "potato", "value": "\u0101"}]]],
-                      ["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})
+    test_serializer([["StartTag", "http://www.w3.org/1999/xhtml", "span",
+                      [{"namespace": None, "name": "potato", "value": "\u0101"}]]],
+                    ["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})
 
 
 def testEndTagName():
@@ -165,7 +152,7 @@ def testSpecQuoteAttribute(c):
     else:
         output_ = ['<span foo="%s">' % c]
     options_ = {"quote_attr_values": "spec"}
-    runSerializerTest(input_, output_, options_)
+    test_serializer(input_, output_, options_)
 
 
 @pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"
@@ -184,7 +171,7 @@ def testLegacyQuoteAttribute(c):
     else:
         output_ = ['<span foo="%s">' % c]
     options_ = {"quote_attr_values": "legacy"}
-    runSerializerTest(input_, output_, options_)
+    test_serializer(input_, output_, options_)
 
 
 @pytest.fixture
@@ -217,9 +204,23 @@ def testEntityNoResolve(lxml_parser):
     assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
 
 
-def test_serializer():
+def param_serializer():
     for filename in get_data_files('serializer-testdata', '*.test', os.path.dirname(__file__)):
         with open(filename) as fp:
             tests = json.load(fp)
             for test in tests['tests']:
-                yield runSerializerTest, test["input"], test["expected"], test.get("options", {})
+                yield test["input"], test["expected"], test.get("options", {})
+
+
+@pytest.mark.parametrize("input, expected, options", param_serializer())
+def test_serializer(input, expected, options):
+    """Serialize `input` with `options` and require the result to be one of `expected`."""
+    charset = options.get("encoding", None)
+    if charset:
+        # Encode every expected string with the same encoding before comparing.
+        expected = [candidate.encode(charset) for candidate in expected]
+    actual = serialize_html(input, options)
+    if len(expected) != 1:
+        assert actual in expected, "Expected: %s, Received: %s" % (expected, actual)
+    else:
+        assert expected[0] == actual, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (expected[0], actual, str(options))
diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
@@ -61,24 +61,7 @@ def set_attribute_on_first_child(docfrag, name, value, treeName):
         setter['ElementTree'](docfrag)(name, value)
 
 
-def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
-    """tests what happens when we add attributes to the intext"""
-    treeName, treeClass = tree
-    if treeClass is None:
-        pytest.skip("Treebuilder not loaded")
-    parser = html5parser.HTMLParser(tree=treeClass["builder"])
-    document = parser.parseFragment(intext)
-    for nom, val in attrs_to_add:
-        set_attribute_on_first_child(document, nom, val, treeName)
-
-    document = treeClass.get("adapter", lambda x: x)(document)
-    output = treewalkers.pprint(treeClass["walker"](document))
-    output = attrlist.sub(sortattrs, output)
-    if output not in expected:
-        raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, output))
-
-
-def test_treewalker_six_mix():
+def param_treewalker_six_mix():
     """Str/Unicode mix. If str attrs added to tree"""
 
     # On Python 2.x string literals are of type str. Unless, like this
@@ -99,7 +82,25 @@ def test_treewalker_six_mix():
 
     for tree in sorted(treeTypes.items()):
         for intext, attrs, expected in sm_tests:
-            yield runTreewalkerEditTest, intext, expected, attrs, tree
+            yield intext, expected, attrs, tree
+
+
+@pytest.mark.parametrize("intext, expected, attrs_to_add, tree", param_treewalker_six_mix())
+def test_treewalker_six_mix(intext, expected, attrs_to_add, tree):
+    """Parse `intext`, apply `attrs_to_add`, and require the walker dump to be found in `expected`."""
+    treeName, treeAPIs = tree
+    if treeAPIs is None:
+        pytest.skip("Treebuilder not loaded")
+
+    fragment = html5parser.HTMLParser(tree=treeAPIs["builder"]).parseFragment(intext)
+    for attr_name, attr_value in attrs_to_add:
+        set_attribute_on_first_child(fragment, attr_name, attr_value, treeName)
+    if "adapter" in treeAPIs:
+        fragment = treeAPIs["adapter"](fragment)
+
+    dump = attrlist.sub(sortattrs, treewalkers.pprint(treeAPIs["walker"](fragment)))
+    if dump not in expected:
+        raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, dump))
 
 
 @pytest.mark.parametrize("tree,char", itertools.product(sorted(treeTypes.items()), ["x", "\u1234"]))

diff --git a/html5lib/tests/tree_construction.py b/html5lib/tests/tree_construction.py
@@ -57,8 +57,6 @@ def _getParserTests(self, treeName, treeAPIs):
             item.add_marker(pytest.mark.parser)
             if namespaceHTMLElements:
                 item.add_marker(pytest.mark.namespaced)
-            if treeAPIs is None:
-                item.add_marker(pytest.mark.skipif(True, reason="Treebuilder not loaded"))
             yield item
 
     def _getTreeWalkerTests(self, treeName, treeAPIs):
@@ -69,8 +67,6 @@ def _getTreeWalkerTests(self, treeName, treeAPIs):
                               treeAPIs)
         item.add_marker(getattr(pytest.mark, treeName))
         item.add_marker(pytest.mark.treewalker)
-        if treeAPIs is None:
-            item.add_marker(pytest.mark.skipif(True, reason="Treebuilder not loaded"))
         yield item
 
 
@@ -84,12 +80,14 @@ def convertTreeDump(data):
 class ParserTest(pytest.Item):
     def __init__(self, name, parent, test, treeClass, namespaceHTMLElements):
         super(ParserTest, self).__init__(name, parent)
-        self.obj = lambda: 1  # this is to hack around skipif needing a function!
         self.test = test
         self.treeClass = treeClass
         self.namespaceHTMLElements = namespaceHTMLElements
 
     def runtest(self):
+        if self.treeClass is None:
+            pytest.skip("Treebuilder not loaded")
+
         p = html5parser.HTMLParser(tree=self.treeClass,
                                    namespaceHTMLElements=self.namespaceHTMLElements)
 
@@ -147,11 +145,13 @@ def repr_failure(self, excinfo):
 class TreeWalkerTest(pytest.Item):
     def __init__(self, name, parent, test, treeAPIs):
         super(TreeWalkerTest, self).__init__(name, parent)
-        self.obj = lambda: 1  # this is to hack around skipif needing a function!
         self.test = test
         self.treeAPIs = treeAPIs
 
     def runtest(self):
+        if self.treeAPIs is None:
+            pytest.skip("Treebuilder not loaded")
+
         p = html5parser.HTMLParser(tree=self.treeAPIs["builder"])
 
         input = self.test['data']