Skip to content

Commit 3990a6d

Browse files
committed
fix(ngSanitize): follow HTML parser rules for start tags / allow < in text content
ngSanitize will now permit opening braces in text content, provided they are not followed by either an unescaped backslash, or by an ASCII letter (u+0041 - u+005A, u+0061 - u+007A), in compliance with rules of the parsing spec, without taking insertion mode into account. BREAKING CHANGE Previously, $sanitize would "fix" invalid markup in which a space preceded alphanumeric characters in a start-tag. Following this change, any opening angle bracket which is not followed by either a forward slash, or by an ASCII letter (a-z | A-Z) will not be considered a start tag delimiter, per the HTML parsing spec (http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html). Closes angular#8193
1 parent 8781c72 commit 3990a6d

File tree

2 files changed

+39
-19
lines changed

2 files changed

+39
-19
lines changed

src/ngSanitize/sanitize.js

+13-9
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,11 @@ function sanitizeText(chars) {
154154

155155
// Regular Expressions for parsing tags and attributes
156156
var START_TAG_REGEXP =
157-
/^<\s*([\w:-]+)((?:\s+[\w:-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)\s*>/,
158-
END_TAG_REGEXP = /^<\s*\/\s*([\w:-]+)[^>]*>/,
157+
/^<((?:[a-zA-Z])[\w:-]*)((?:\s+[\w:-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)\s*(>?)/,
158+
END_TAG_REGEXP = /^<\/\s*([\w:-]+)[^>]*>/,
159159
ATTR_REGEXP = /([\w:-]+)(?:\s*=\s*(?:(?:"((?:[^"])*)")|(?:'((?:[^'])*)')|([^>\s]+)))?/g,
160160
BEGIN_TAG_REGEXP = /^</,
161-
BEGING_END_TAGE_REGEXP = /^<\s*\//,
161+
BEGING_END_TAGE_REGEXP = /^<\//,
162162
COMMENT_REGEXP = /<!--(.*?)-->/g,
163163
DOCTYPE_REGEXP = /<!DOCTYPE([^>]*?)>/i,
164164
CDATA_REGEXP = /<!\[CDATA\[(.*?)]]>/g,
@@ -232,10 +232,11 @@ function makeMap(str) {
232232
* @param {object} handler
233233
*/
234234
function htmlParser( html, handler ) {
235-
var index, chars, match, stack = [], last = html;
235+
var index, chars, match, stack = [], last = html, text;
236236
stack.last = function() { return stack[ stack.length - 1 ]; };
237237

238238
while ( html ) {
239+
text = '';
239240
chars = true;
240241

241242
// Make sure we're not in a script or style element
@@ -274,20 +275,23 @@ function htmlParser( html, handler ) {
274275
match = html.match( START_TAG_REGEXP );
275276

276277
if ( match ) {
277-
html = html.substring( match[0].length );
278-
match[0].replace( START_TAG_REGEXP, parseStartTag );
278+
// We only have a valid start-tag if there is a '>'.
279+
if ( match[4] ) {
280+
html = html.substring( match[0].length );
281+
match[0].replace( START_TAG_REGEXP, parseStartTag );
282+
}
279283
chars = false;
280284
} else {
281-
// no ending tag found
282-
if (handler.chars) handler.chars( '<' );
285+
// no ending tag found --- this piece should be encoded as an entity.
286+
text += '<';
283287
html = html.substring(1);
284288
}
285289
}
286290

287291
if ( chars ) {
288292
index = html.indexOf("<");
289293

290-
var text = index < 0 ? html : html.substring( 0, index );
294+
text += index < 0 ? html : html.substring( 0, index );
291295
html = index < 0 ? "" : html.substring( index );
292296

293297
if (handler.chars) handler.chars( decodeEntities(text) );

test/ngSanitize/sanitizeSpec.js

+26-10
Original file line numberDiff line numberDiff line change
@@ -82,18 +82,31 @@ describe('HTML', function() {
8282
expect(text).toEqual('text');
8383
});
8484

85-
it('should parse unterminated tags as regular content', function() {
86-
htmlParser('<a text1 text2 <a text1 text2', handler);
87-
expect(text).toEqual('<a text1 text2 <a text1 text2');
85+
it('should not treat "<" followed by a non-/ or non-letter as a tag', function() {
86+
expectHTML('<- text1 text2 <1 text1 text2 <{', handler).
87+
toBe('&lt;- text1 text2 &lt;1 text1 text2 &lt;{');
88+
});
89+
90+
it('should throw badparse if text content contains "<" followed by "/" without matching ">"', function() {
91+
expect(function() {
92+
htmlParser('foo </ bar', handler);
93+
}).toThrowMinErr('$sanitize', 'badparse', 'The sanitizer was unable to parse the following block of html: </ bar');
94+
});
95+
96+
it('should throw badparse if text content contains "<" followed by an ASCII letter without matching ">"', function() {
97+
expect(function() {
98+
htmlParser('foo <a bar', handler);
99+
}).toThrowMinErr('$sanitize', 'badparse', 'The sanitizer was unable to parse the following block of html: <a bar');
88100
});
89101

90102
it('should accept tag delimiters such as "<" inside real tags', function() {
103+
// Assert that the < is part of the text node content, and not part of a tag name.
91104
htmlParser('<p> 10 < 100 </p>', handler);
92105
expect(text).toEqual(' 10 < 100 ');
93106
});
94107

95108
it('should parse newlines in tags', function() {
96-
htmlParser('<\ntag\n attr="value"\n>text<\n/\ntag\n>', handler);
109+
htmlParser('<tag\n attr="value"\n>text</\ntag\n>', handler);
97110
expect(start).toEqual({tag:'tag', attrs:{attr:'value'}, unary:false});
98111
expect(text).toEqual('text');
99112
});
@@ -134,8 +147,9 @@ describe('HTML', function() {
134147
expectHTML('a<!DocTyPe html>c.').toEqual('ac.');
135148
});
136149

137-
it('should remove nested script', function() {
138-
expectHTML('a< SCRIPT >A< SCRIPT >evil< / scrIpt >B< / scrIpt >c.').toEqual('ac.');
150+
it('should escape non-start tags', function() {
151+
expectHTML('a< SCRIPT >A< SCRIPT >evil< / scrIpt >B< / scrIpt >c.').
152+
toBe('a&lt; SCRIPT &gt;A&lt; SCRIPT &gt;evil&lt; / scrIpt &gt;B&lt; / scrIpt &gt;c.');
139153
});
140154

141155
it('should remove attrs', function() {
@@ -176,14 +190,16 @@ describe('HTML', function() {
176190
expectHTML(everything).toEqual(everything);
177191
});
178192

179-
it('should handle improper html', function() {
193+
it('should mangle improper html', function() {
194+
// This text is encoded more than a real HTML parser would, but it should render the same.
180195
expectHTML('< div rel="</div>" alt=abc dir=\'"\' >text< /div>').
181-
toEqual('<div rel="&lt;/div&gt;" alt="abc" dir="&#34;">text</div>');
196+
toBe('&lt; div rel=&#34;&#34; alt=abc dir=\'&#34;\' &gt;text&lt; /div&gt;');
182197
});
183198

184-
it('should handle improper html2', function() {
199+
it('should mangle improper html2', function() {
200+
// A proper HTML parser would clobber this more in most cases, but it looks reasonable.
185201
expectHTML('< div rel="</div>" / >').
186-
toEqual('<div rel="&lt;/div&gt;"/>');
202+
toBe('&lt; div rel=&#34;&#34; / &gt;');
187203
});
188204

189205
it('should ignore back slash as escape', function() {

0 commit comments

Comments
 (0)