Skip to content

Fix #11 by escaping enough to be safe in legacy browsers #95

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 17, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ Released on XXX
* **Use scripting disabled by default (as we don't implement
scripting).**

* **Fix #11, avoiding the XSS bug potentially caused by serializer
allowing attribute values to be escaped out of in old browser versions,
changing the quote_attr_values option on serializer to take one of
three values, "always" (the old True value), "legacy" (the new option,
and the new default), and "spec" (the old False value, and the old
default).**


0.9999999/1.0b8
~~~~~~~~~~~~~~~
Expand Down
8 changes: 6 additions & 2 deletions html5lib/filters/lint.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@


class Filter(_base.Filter):
def __init__(self, source, require_matching_tags=True):
super(Filter, self).__init__(source)
self.require_matching_tags = require_matching_tags

def __iter__(self):
open_elements = []
for token in _base.Filter.__iter__(self):
Expand All @@ -26,7 +30,7 @@ def __iter__(self):
assert type == "EmptyTag"
else:
assert type == "StartTag"
if type == "StartTag":
if type == "StartTag" and self.require_matching_tags:
open_elements.append((namespace, name))
for (namespace, name), value in token["data"].items():
assert namespace is None or isinstance(namespace, text_type)
Expand All @@ -44,7 +48,7 @@ def __iter__(self):
assert name != ""
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
else:
elif self.require_matching_tags:
start = open_elements.pop()
assert start == (namespace, name)

Expand Down
32 changes: 22 additions & 10 deletions html5lib/serializer/htmlserializer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type

try:
from functools import reduce
except ImportError:
pass
import re

from ..constants import voidElements, booleanAttributes, spaceCharacters
from ..constants import rcdataElements, entities, xmlEntities
Expand All @@ -13,6 +10,17 @@

spaceCharacters = "".join(spaceCharacters)

quoteAttributeSpecChars = spaceCharacters + "\"'=<>`"
quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]")
quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars +
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000]")

try:
from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
Expand Down Expand Up @@ -73,7 +81,7 @@ def htmlentityreplace_errors(exc):
class HTMLSerializer(object):

# attribute quoting options
quote_attr_values = False
quote_attr_values = "legacy" # be secure by default
quote_char = '"'
use_best_quote_char = True

Expand Down Expand Up @@ -109,9 +117,9 @@ def __init__(self, **kwargs):
inject_meta_charset=True|False
Whether it insert a meta element to define the character set of the
document.
quote_attr_values=True|False
quote_attr_values="legacy"|"spec"|"always"
Whether to quote attribute values that don't require quoting
per HTML5 parsing rules.
per legacy browser behaviour, when required by the standard, or always.
quote_char=u'"'|u"'"
Use given quote character for attribute quoting. Default is to
use double quote unless attribute value contains a double quote,
Expand Down Expand Up @@ -240,11 +248,15 @@ def serialize(self, treewalker, encoding=None):
(k not in booleanAttributes.get(name, tuple()) and
k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict("=")
if self.quote_attr_values or not v:
if self.quote_attr_values == "always" or len(v) == 0:
quote_attr = True
elif self.quote_attr_values == "spec":
quote_attr = quoteAttributeSpec.search(v) is not None
elif self.quote_attr_values == "legacy":
quote_attr = quoteAttributeLegacy.search(v) is not None
else:
quote_attr = reduce(lambda x, y: x or (y in v),
spaceCharacters + ">\"'=", False)
raise ValueError("quote_attr_values must be one of: "
"'always', 'spec', or 'legacy'")
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs:
v = v.replace("<", "&lt;")
Expand Down
Loading