Skip to content

Commit fd5a486

Browse files
committed
Fix html5lib#11, html5lib#12: quote attributes that need escaping in legacy browsers
These are mostly out of the market now, so this isn't massively needed any more; nevertheless, avoiding XSS as much as possible is inevitably desirable. This alters the API so that quote_attr_values is now a ternary setting, choosing between legacy-safe behaviour, spec behaviour, and always quoting.
1 parent 3960479 commit fd5a486

File tree

2 files changed

+26
-6
lines changed

2 files changed

+26
-6
lines changed

CHANGES.rst

+6
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ Released on XXX
2828

2929
* Cease supporting Python 3.2 (in both CPython and PyPy forms).
3030

31+
* Fix #11, avoiding the XSS bug potentially caused by serializer allowing
32+
attribute values to be escaped out of in old browser versions, changing
33+
the quote_attr_values option on serializer to take one of three values,
34+
"always" (the old True value), "legacy" (the new option, and the new
35+
default), and "spec" (the old False value, and the old default).
36+
3137

3238
0.9999999/1.0b8
3339
~~~~~~~~~~~~~~~

html5lib/serializer/htmlserializer.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,16 @@
1010

1111
spaceCharacters = "".join(spaceCharacters)
1212

13-
quoteAttributeSpec = re.compile("[" + spaceCharacters + "\"'=<>`]")
13+
quoteAttributeSpecChars = spaceCharacters + "\"'=<>`"
14+
quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]")
15+
quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars +
16+
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
17+
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
18+
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
19+
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
20+
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
21+
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
22+
"\u3000]")
1423

1524
try:
1625
from codecs import register_error, xmlcharrefreplace_errors
@@ -72,7 +81,7 @@ def htmlentityreplace_errors(exc):
7281
class HTMLSerializer(object):
7382

7483
# attribute quoting options
75-
quote_attr_values = False
84+
quote_attr_values = "legacy" # be secure by default
7685
quote_char = '"'
7786
use_best_quote_char = True
7887

@@ -108,9 +117,9 @@ def __init__(self, **kwargs):
108117
inject_meta_charset=True|False
109118
Whether it insert a meta element to define the character set of the
110119
document.
111-
quote_attr_values=True|False
120+
quote_attr_values="legacy"|"spec"|"always"
112121
Whether to quote attribute values that don't require quoting
113-
per HTML5 parsing rules.
122+
per legacy browser behaviour, when required by the standard, or always.
114123
quote_char=u'"'|u"'"
115124
Use given quote character for attribute quoting. Default is to
116125
use double quote unless attribute value contains a double quote,
@@ -239,10 +248,15 @@ def serialize(self, treewalker, encoding=None):
239248
(k not in booleanAttributes.get(name, tuple()) and
240249
k not in booleanAttributes.get("", tuple())):
241250
yield self.encodeStrict("=")
242-
if self.quote_attr_values:
251+
if self.quote_attr_values == "always" or len(v) == 0:
243252
quote_attr = True
253+
elif self.quote_attr_values == "spec":
254+
quote_attr = quoteAttributeSpec.search(v) is not None
255+
elif self.quote_attr_values == "legacy":
256+
quote_attr = quoteAttributeLegacy.search(v) is not None
244257
else:
245-
quote_attr = len(v) == 0 or quoteAttributeSpec.search(v)
258+
raise ValueError("quote_attr_values must be one of: "
259+
"'always', 'spec', or 'legacy'")
246260
v = v.replace("&", "&amp;")
247261
if self.escape_lt_in_attrs:
248262
v = v.replace("<", "&lt;")

0 commit comments

Comments
 (0)