html5lib · gsnedders · May 17, 2016 · May 8, 2016 · May 8, 2016 · May 8, 2016
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -33,6 +33,13 @@ Released on XXX
 * **Use scripting disabled by default (as we don't implement
   scripting).**
 
+* **Fix #11, avoiding the XSS bug potentially caused by serializer
+  allowing attribute values to be escaped out of in old browser versions,
+  changing the quote_attr_values option on serializer to take one of
+  three values, "always" (the old True value), "legacy" (the new option,
+  and the new default), and "spec" (the old False value, and the old
+  default).**
+
 
 0.9999999/1.0b8
 ~~~~~~~~~~~~~~~

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
@@ -10,6 +10,10 @@
 
 
 class Filter(_base.Filter):
+    def __init__(self, source, require_matching_tags=True):
+        super(Filter, self).__init__(source)
+        self.require_matching_tags = require_matching_tags
+
     def __iter__(self):
         open_elements = []
         for token in _base.Filter.__iter__(self):
@@ -26,7 +30,7 @@ def __iter__(self):
                     assert type == "EmptyTag"
                 else:
                     assert type == "StartTag"
-                if type == "StartTag":
+                if type == "StartTag" and self.require_matching_tags:
                     open_elements.append((namespace, name))
                 for (namespace, name), value in token["data"].items():
                     assert namespace is None or isinstance(namespace, text_type)
@@ -44,7 +48,7 @@ def __iter__(self):
                 assert name != ""
                 if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                     assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
-                else:
+                elif self.require_matching_tags:
                     start = open_elements.pop()
                     assert start == (namespace, name)
 

diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
@@ -1,10 +1,7 @@
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type
 
-try:
-    from functools import reduce
-except ImportError:
-    pass
+import re
 
 from ..constants import voidElements, booleanAttributes, spaceCharacters
 from ..constants import rcdataElements, entities, xmlEntities
@@ -13,6 +10,17 @@
 
 spaceCharacters = "".join(spaceCharacters)
 
+quoteAttributeSpecChars = spaceCharacters + "\"'=<>`"
+quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]")
+quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars +
+                                  "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
+                                  "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
+                                  "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
+                                  "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
+                                  "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
+                                  "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
+                                  "\u3000]")
+
 try:
     from codecs import register_error, xmlcharrefreplace_errors
 except ImportError:
@@ -73,7 +81,7 @@ def htmlentityreplace_errors(exc):
 class HTMLSerializer(object):
 
     # attribute quoting options
-    quote_attr_values = False
+    quote_attr_values = "legacy"  # be secure by default
     quote_char = '"'
     use_best_quote_char = True
 
@@ -109,9 +117,9 @@ def __init__(self, **kwargs):
         inject_meta_charset=True|False
           Whether it insert a meta element to define the character set of the
           document.
-        quote_attr_values=True|False
+        quote_attr_values="legacy"|"spec"|"always"
           Whether to quote attribute values that don't require quoting
-          per HTML5 parsing rules.
+          per legacy browser behaviour, when required by the standard, or always.
         quote_char=u'"'|u"'"
           Use given quote character for attribute quoting. Default is to
           use double quote unless attribute value contains a double quote,
@@ -240,11 +248,15 @@ def serialize(self, treewalker, encoding=None):
                         (k not in booleanAttributes.get(name, tuple()) and
                          k not in booleanAttributes.get("", tuple())):
                         yield self.encodeStrict("=")
-                        if self.quote_attr_values or not v:
+                        if self.quote_attr_values == "always" or len(v) == 0:
                             quote_attr = True
+                        elif self.quote_attr_values == "spec":
+                            quote_attr = quoteAttributeSpec.search(v) is not None
+                        elif self.quote_attr_values == "legacy":
+                            quote_attr = quoteAttributeLegacy.search(v) is not None
                         else:
-                            quote_attr = reduce(lambda x, y: x or (y in v),
-                                                spaceCharacters + ">\"'=", False)
+                            raise ValueError("quote_attr_values must be one of: "
+                                             "'always', 'spec', or 'legacy'")
                         v = v.replace("&", "&amp;")
                         if self.escape_lt_in_attrs:
                             v = v.replace("<", "&lt;")