Skip to content

Commit a337d3b

Browse files
committed
Fix html5lib#11, html5lib#12: quote attributes that need escaping in legacy browsers
Such legacy browsers are mostly out of the market now, so this isn't massively needed any more; nevertheless, avoiding XSS as much as possible is still desirable. This alters the API so that quote_attr_values is now a ternary setting, choosing between legacy-safe behaviour, spec behaviour, and always quoting.
1 parent f628385 commit a337d3b

File tree

1 file changed

+13
-6
lines changed

1 file changed

+13
-6
lines changed

html5lib/serializer/htmlserializer.py

+13-6
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@
1414
spaceCharacters = "".join(spaceCharacters)
1515

1616
quoteAttributeSpec = re.compile("[" + spaceCharacters + "\"'=<>`]")
17+
quoteAttributeLegacy = re.compile("[\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
18+
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
19+
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
20+
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
21+
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
22+
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
23+
"\u3000]")
1724

1825
try:
1926
from codecs import register_error, xmlcharrefreplace_errors
@@ -75,7 +82,7 @@ def htmlentityreplace_errors(exc):
7582
class HTMLSerializer(object):
7683

7784
# attribute quoting options
78-
quote_attr_values = False
85+
quote_attr_values = "legacy"
7986
quote_char = '"'
8087
use_best_quote_char = True
8188

@@ -111,9 +118,9 @@ def __init__(self, **kwargs):
111118
inject_meta_charset=True|False
112119
Whether to insert a meta element to define the character set of the
113120
document.
114-
quote_attr_values=True|False
121+
quote_attr_values="legacy"|"spec"|True
115122
Whether to quote attribute values that don't require quoting
116-
per HTML5 parsing rules.
123+
per legacy browser behaviour, HTML authoring rules, or always.
117124
quote_char=u'"'|u"'"
118125
Use given quote character for attribute quoting. Default is to
119126
use double quote unless attribute value contains a double quote,
@@ -242,10 +249,10 @@ def serialize(self, treewalker, encoding=None):
242249
(k not in booleanAttributes.get(name, tuple())
243250
and k not in booleanAttributes.get("", tuple())):
244251
yield self.encodeStrict("=")
245-
if self.quote_attr_values:
252+
if self.quote_attr_values or len(v) == 0:
246253
quote_attr = True
247-
else:
248-
quote_attr = len(v) == 0 or quoteAttributeSpec.search(v)
254+
elif :
255+
quoteAttributeSpec.search(v)
249256
v = v.replace("&", "&amp;")
250257
if self.escape_lt_in_attrs:
251258
v = v.replace("<", "&lt;")

0 commit comments

Comments
 (0)