Skip to content

Commit db9072f

Browse files
Add URL parsing tests from WHATWG (#3188)
Co-authored-by: Kar Petrosyan <[email protected]>
1 parent 92e9dfb commit db9072f

File tree

4 files changed

+9819
-23
lines changed

4 files changed

+9819
-23
lines changed

httpx/_urlparse.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -253,22 +253,27 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
253253
parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
254254
)
255255
validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
256-
if has_authority:
256+
if has_scheme or has_authority:
257257
path = normalize_path(path)
258258

259259
# The GEN_DELIMS set is... : / ? # [ ] @
260260
# These do not need to be percent-quoted unless they serve as delimiters for the
261261
# specific component.
262+
WHATWG_SAFE = '`{}%|^\\"'
262263

263264
# For 'path' we need to drop ? and # from the GEN_DELIMS set.
264-
parsed_path: str = quote(path, safe=SUB_DELIMS + ":/[]@")
265+
parsed_path: str = quote(path, safe=SUB_DELIMS + WHATWG_SAFE + ":/[]@")
265266
# For 'query' we need to drop '#' from the GEN_DELIMS set.
266267
parsed_query: str | None = (
267-
None if query is None else quote(query, safe=SUB_DELIMS + ":/?[]@")
268+
None
269+
if query is None
270+
else quote(query, safe=SUB_DELIMS + WHATWG_SAFE + ":/?[]@")
268271
)
269272
# For 'fragment' we can include all of the GEN_DELIMS set.
270273
parsed_fragment: str | None = (
271-
None if fragment is None else quote(fragment, safe=SUB_DELIMS + ":/?#[]@")
274+
None
275+
if fragment is None
276+
else quote(fragment, safe=SUB_DELIMS + WHATWG_SAFE + ":/?#[]@")
272277
)
273278

274279
# The parsed ASCII bytestrings are our canonical form.
@@ -321,7 +326,8 @@ def encode_host(host: str) -> str:
321326
# From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
322327
#
323328
# reg-name = *( unreserved / pct-encoded / sub-delims )
324-
return quote(host.lower(), safe=SUB_DELIMS)
329+
WHATWG_SAFE = '"`{}%|\\'
330+
return quote(host.lower(), safe=SUB_DELIMS + WHATWG_SAFE)
325331

326332
# IDNA hostnames
327333
try:
@@ -369,19 +375,17 @@ def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
369375
# must either be empty or begin with a slash ("/") character."
370376
if path and not path.startswith("/"):
371377
raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
372-
else:
378+
379+
if not has_scheme and not has_authority:
373380
# If a URI does not contain an authority component, then the path cannot begin
374381
# with two slash characters ("//").
375382
if path.startswith("//"):
376-
raise InvalidURL(
377-
"URLs with no authority component cannot have a path starting with '//'"
378-
)
383+
raise InvalidURL("Relative URLs cannot have a path starting with '//'")
384+
379385
# In addition, a URI reference (Section 4.1) may be a relative-path reference,
380386
# in which case the first path segment cannot contain a colon (":") character.
381-
if path.startswith(":") and not has_scheme:
382-
raise InvalidURL(
383-
"URLs with no scheme component cannot have a path starting with ':'"
384-
)
387+
if path.startswith(":"):
388+
raise InvalidURL("Relative URLs cannot have a path starting with ':'")
385389

386390

387391
def normalize_path(path: str) -> str:

tests/models/test_url.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -230,8 +230,8 @@ def test_url_normalized_host():
230230

231231

232232
def test_url_percent_escape_host():
233-
url = httpx.URL("https://exam%le.com/")
234-
assert url.host == "exam%25le.com"
233+
url = httpx.URL("https://exam le.com/")
234+
assert url.host == "exam%20le.com"
235235

236236

237237
def test_url_ipv4_like_host():
@@ -415,17 +415,11 @@ def test_urlparse_with_invalid_path():
415415

416416
with pytest.raises(httpx.InvalidURL) as exc:
417417
httpx.URL(path="//abc")
418-
assert (
419-
str(exc.value)
420-
== "URLs with no authority component cannot have a path starting with '//'"
421-
)
418+
assert str(exc.value) == "Relative URLs cannot have a path starting with '//'"
422419

423420
with pytest.raises(httpx.InvalidURL) as exc:
424421
httpx.URL(path=":abc")
425-
assert (
426-
str(exc.value)
427-
== "URLs with no scheme component cannot have a path starting with ':'"
428-
)
422+
assert str(exc.value) == "Relative URLs cannot have a path starting with ':'"
429423

430424

431425
def test_url_with_relative_path():

tests/models/test_whatwg.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# The WHATWG provides various tests that can be used to validate URL parsing.
2+
#
3+
# https://url.spec.whatwg.org/
4+
5+
import json
6+
7+
import pytest
8+
9+
from httpx._urlparse import urlparse
10+
11+
# URL test cases from...
12+
# https://github.com/web-platform-tests/wpt/blob/master/url/resources/urltestdata.json
13+
with open("tests/models/whatwg.json", "r") as input:
14+
test_cases = json.load(input)
15+
test_cases = [
16+
item
17+
for item in test_cases
18+
if not isinstance(item, str) and not item.get("failure")
19+
]
20+
21+
22+
@pytest.mark.parametrize("test_case", test_cases)
23+
def test_urlparse(test_case):
24+
if test_case["href"] in ("a: foo.com", "lolscheme:x x#x%20x"):
25+
# Skip these two test cases.
26+
# These are WHATWG cases that do not use percent-encoding for the space character.
27+
# Anyone know what's going on here?
28+
return
29+
30+
p = urlparse(test_case["href"])
31+
32+
# Test cases include the protocol with the trailing ":"
33+
protocol = p.scheme + ":"
34+
# Include the square brackets for IPv6 addresses.
35+
hostname = f"[{p.host}]" if ":" in p.host else p.host
36+
# The test cases use a string representation of the port.
37+
port = "" if p.port is None else str(p.port)
38+
# I have nothing to say about this one.
39+
path = p.path
40+
# The 'search' and 'hash' components in the WHATWG tests are semantic, not literal.
41+
# Our parsing differentiates between no query/hash and empty-string query/hash.
42+
search = "" if p.query in (None, "") else "?" + str(p.query)
43+
hash = "" if p.fragment in (None, "") else "#" + str(p.fragment)
44+
45+
# URL hostnames are case-insensitive.
46+
# We normalize these, unlike the WHATWG test cases.
47+
assert protocol == test_case["protocol"]
48+
assert hostname.lower() == test_case["hostname"].lower()
49+
assert port == test_case["port"]
50+
assert path == test_case["pathname"]
51+
assert search == test_case["search"]
52+
assert hash == test_case["hash"]

0 commit comments

Comments
 (0)