Skip to content

Commit ba0ed00

Browse files
committed
url: Properly parse certain oddly formed urls
In cases where there are multiple @-chars in a url, Node currently parses the hostname and auth sections differently than web browsers. This part of the bug is serious, and should be landed in v0.10, and also ported to v0.8, and releases made as soon as possible. The less serious issue is that there are many other sorts of malformed urls which Node either accepts when it should reject, or interprets differently than web browsers. For example, `http://a.com*foo` is interpreted by Node like `http://a.com/*foo` when web browsers treat this as `http://a.com%3Bfoo/`. In general, *only* the `hostEndingChars` should be the characters that delimit the host portion of the URL. Most of the current `nonHostChars` that appear in the hostname should be escaped, but some of them (such as `;` and `%` when it does not introduce a hex pair) should raise an error. We need to have a broader discussion about whether it's best to throw in these cases, and potentially break extant programs, or return an object that has every field set to `null` so that any attempt to read the hostname/auth/etc. will appear to be empty.
1 parent 4dc5b13 commit ba0ed00

File tree

2 files changed

+110
-57
lines changed

2 files changed

+110
-57
lines changed

lib/url.js

+73-57
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ var protocolPattern = /^([a-z0-9.+-]+:)/i,
4848
// them.
4949
nonHostChars = ['%', '/', '?', ';', '#']
5050
.concat(unwise).concat(autoEscape),
51-
nonAuthChars = ['/', '@', '?', '#'].concat(delims),
51+
hostEndingChars = ['/', '?', '#'],
5252
hostnameMaxLen = 255,
5353
hostnamePartPattern = /^[a-z0-9A-Z_-]{0,63}$/,
5454
hostnamePartStart = /^([a-z0-9A-Z_-]{0,63})(.*)$/,
@@ -90,8 +90,6 @@ var protocolPattern = /^([a-z0-9.+-]+:)/i,
9090
querystring = require('querystring');
9191

9292
function urlParse(url, parseQueryString, slashesDenoteHost) {
93-
if (url && typeof(url) === 'object' && url.href) return url;
94-
9593
if (typeof url !== 'string') {
9694
throw new TypeError("Parameter 'url' must be a string, not " + typeof url);
9795
}
@@ -125,57 +123,66 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
125123

126124
if (!hostlessProtocol[proto] &&
127125
(slashes || (proto && !slashedProtocol[proto]))) {
126+
128127
// there's a hostname.
129128
// the first instance of /, ?, ;, or # ends the host.
130-
// don't enforce full RFC correctness, just be unstupid about it.
131-
129+
//
132130
// If there is an @ in the hostname, then non-host chars *are* allowed
133-
// to the left of the first @ sign, unless some non-auth character
131+
// to the left of the last @ sign, unless some host-ending character
134132
// comes *before* the @-sign.
135133
// URLs are obnoxious.
136-
var atSign = rest.indexOf('@');
137-
if (atSign !== -1) {
138-
var auth = rest.slice(0, atSign);
139-
140-
// there *may be* an auth
141-
var hasAuth = true;
142-
for (var i = 0, l = nonAuthChars.length; i < l; i++) {
143-
if (auth.indexOf(nonAuthChars[i]) !== -1) {
144-
// not a valid auth. Something like http://foo.com/bar@baz/
145-
hasAuth = false;
146-
break;
147-
}
148-
}
134+
//
135+
// ex:
136+
// http://a@b@c/ => user:a@b host:c
137+
// http://a@b?@c => user:a host:c path:/?@c
138+
139+
// v0.12 TODO(isaacs): This is not quite how Chrome does things.
140+
// Review our test case against browsers more comprehensively.
141+
142+
// find the first instance of any hostEndingChars
143+
var hostEnd = -1;
144+
for (var i = 0; i < hostEndingChars.length; i++) {
145+
var hec = rest.indexOf(hostEndingChars[i]);
146+
if (hec !== -1 && (hostEnd === -1 || hec < hostEnd))
147+
hostEnd = hec;
148+
}
149149

150-
if (hasAuth) {
151-
// pluck off the auth portion.
152-
out.auth = decodeURIComponent(auth);
153-
rest = rest.substr(atSign + 1);
154-
}
150+
// at this point, either we have an explicit point where the
151+
// auth portion cannot go past, or the last @ char is the decider.
152+
var auth, atSign;
153+
if (hostEnd === -1) {
154+
// atSign can be anywhere.
155+
atSign = rest.lastIndexOf('@');
156+
} else {
157+
// atSign must be in auth portion.
158+
// http://a@b/c@d => host:b auth:a path:/c@d
159+
atSign = rest.lastIndexOf('@', hostEnd);
155160
}
156161

157-
var firstNonHost = -1;
158-
for (var i = 0, l = nonHostChars.length; i < l; i++) {
159-
var index = rest.indexOf(nonHostChars[i]);
160-
if (index !== -1 &&
161-
(firstNonHost < 0 || index < firstNonHost)) firstNonHost = index;
162+
// Now we have a portion which is definitely the auth.
163+
// Pull that off.
164+
if (atSign !== -1) {
165+
auth = rest.slice(0, atSign);
166+
rest = rest.slice(atSign + 1);
167+
out.auth = decodeURIComponent(auth);
162168
}
163169

164-
if (firstNonHost !== -1) {
165-
out.host = rest.substr(0, firstNonHost);
166-
rest = rest.substr(firstNonHost);
167-
} else {
168-
out.host = rest;
169-
rest = '';
170+
// the host is the remaining to the left of the first non-host char
171+
hostEnd = -1;
172+
for (var i = 0; i < nonHostChars.length; i++) {
173+
var hec = rest.indexOf(nonHostChars[i]);
174+
if (hec !== -1 && (hostEnd === -1 || hec < hostEnd))
175+
hostEnd = hec;
170176
}
177+
// if we still have not hit it, then the entire thing is a host.
178+
if (hostEnd === -1)
179+
hostEnd = rest.length;
180+
181+
out.host = rest.slice(0, hostEnd);
182+
rest = rest.slice(hostEnd);
171183

172184
// pull out port.
173-
var p = parseHost(out.host);
174-
var keys = Object.keys(p);
175-
for (var i = 0, l = keys.length; i < l; i++) {
176-
var key = keys[i];
177-
out[key] = p[key];
178-
}
185+
parseHost(out);
179186

180187
// we've indicated that there is a hostname,
181188
// so even if it's empty, it has to be present.
@@ -187,9 +194,7 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
187194
out.hostname[out.hostname.length - 1] === ']';
188195

189196
// validate a little.
190-
if (out.hostname.length > hostnameMaxLen) {
191-
out.hostname = '';
192-
} else if (!ipv6Hostname) {
197+
if (!ipv6Hostname) {
193198
var hostparts = out.hostname.split(/\./);
194199
for (var i = 0, l = hostparts.length; i < l; i++) {
195200
var part = hostparts[i];
@@ -225,8 +230,12 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
225230
}
226231
}
227232

228-
// hostnames are always lower case.
229-
out.hostname = out.hostname.toLowerCase();
233+
if (out.hostname.length > hostnameMaxLen) {
234+
out.hostname = '';
235+
} else {
236+
// hostnames are always lower case.
237+
out.hostname = out.hostname.toLowerCase();
238+
}
230239

231240
if (!ipv6Hostname) {
232241
// IDNA Support: Returns a puny coded representation of "domain".
@@ -243,11 +252,13 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
243252
out.hostname = newOut.join('.');
244253
}
245254

246-
out.host = (out.hostname || '') +
247-
((out.port) ? ':' + out.port : '');
255+
var p = out.port ? ':' + out.port : '';
256+
var h = out.hostname || '';
257+
out.host = h + p;
248258
out.href += out.host;
249259

250260
// strip [ and ] from the hostname
261+
// the host field still retains them, though
251262
if (ipv6Hostname) {
252263
out.hostname = out.hostname.substr(1, out.hostname.length - 2);
253264
if (rest[0] !== '/') {
@@ -302,8 +313,9 @@ function urlParse(url, parseQueryString, slashesDenoteHost) {
302313

303314
//to support http.request
304315
if (out.pathname || out.search) {
305-
out.path = (out.pathname ? out.pathname : '') +
306-
(out.search ? out.search : '');
316+
var p = out.pathname || '';
317+
var s = out.search || '';
318+
out.path = p + s;
307319
}
308320

309321
// finally, reconstruct the href based on what has been validated.
@@ -332,9 +344,9 @@ function urlFormat(obj) {
332344
host = false,
333345
query = '';
334346

335-
if (obj.host !== undefined) {
347+
if (obj.host) {
336348
host = auth + obj.host;
337-
} else if (obj.hostname !== undefined) {
349+
} else if (obj.hostname) {
338350
host = auth + (obj.hostname.indexOf(':') === -1 ?
339351
obj.hostname :
340352
'[' + obj.hostname + ']');
@@ -365,6 +377,11 @@ function urlFormat(obj) {
365377
if (hash && hash.charAt(0) !== '#') hash = '#' + hash;
366378
if (search && search.charAt(0) !== '?') search = '?' + search;
367379

380+
pathname = pathname.replace(/[?#]/g, function(match) {
381+
return encodeURIComponent(match);
382+
});
383+
search = search.replace('#', '%23');
384+
368385
return protocol + host + pathname + search + hash;
369386
}
370387

@@ -610,16 +627,15 @@ function urlResolveObject(source, relative) {
610627
return source;
611628
}
612629

613-
function parseHost(host) {
614-
var out = {};
630+
function parseHost(obj) {
631+
var host = obj.host;
615632
var port = portPattern.exec(host);
616633
if (port) {
617634
port = port[0];
618635
if (port !== ':') {
619-
out.port = port.substr(1);
636+
obj.port = port.substr(1);
620637
}
621638
host = host.substr(0, host.length - port.length);
622639
}
623-
if (host) out.hostname = host;
624-
return out;
640+
if (host) obj.hostname = host;
625641
}

test/simple/test-url.js

+37
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,43 @@ var parseTests = {
741741
'path': '/test',
742742
},
743743

744+
'http://a@b@c/': {
745+
protocol: 'http:',
746+
slashes: true,
747+
auth: 'a@b',
748+
host: 'c',
749+
hostname: 'c',
750+
href: 'http://a%40b@c/',
751+
path: '/',
752+
pathname: '/'
753+
},
754+
755+
'http://a@b?@c': {
756+
protocol: 'http:',
757+
slashes: true,
758+
auth: 'a',
759+
host: 'b',
760+
hostname: 'b',
761+
href: 'http://a@b/?@c',
762+
path: '/?@c',
763+
pathname: '/',
764+
search: '?@c',
765+
query: '@c'
766+
},
767+
768+
'http://a\r" \t\n<\'b:b@c\r\nd/e?f':{
769+
protocol: 'http:',
770+
slashes: true,
771+
auth: 'a\r" \t\n<\'b:b',
772+
host: 'c',
773+
hostname: 'c',
774+
search: '?f',
775+
query: 'f',
776+
pathname: '%0D%0Ad/e',
777+
path: '%0D%0Ad/e?f',
778+
href: 'http://a%0D%22%20%09%0A%3C\'b:b@c/%0D%0Ad/e?f'
779+
}
780+
744781
};
745782

746783
for (var u in parseTests) {

0 commit comments

Comments
 (0)