Skip to content

Commit 33ccdfb

Browse files
Dreamsorcererpajodwebknjaz
authored
Improve validation in HTTP parser (#8074)
Co-authored-by: Paul J. Dorn <[email protected]> Co-authored-by: Sviatoslav Sydorenko (Святослав Сидоренко) <[email protected]>
1 parent 822fbc7 commit 33ccdfb

File tree

4 files changed

+164
-17
lines changed

4 files changed

+164
-17
lines changed

CHANGES/8074.bugfix.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fixed an unhandled exception in the Python HTTP parser on header lines starting with a colon -- by :user:`pajod`.
2+
3+
Invalid request lines with anything but a dot between the HTTP major and minor version are now rejected. Invalid header field names containing question mark or slash are now rejected. Such requests are incompatible with :rfc:`9110#section-5.6.2` and are not known to be of any legitimate use.
4+
5+
(BACKWARD INCOMPATIBLE)

CONTRIBUTORS.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,7 @@ Pankaj Pandey
263263
Parag Jain
264264
Pau Freixes
265265
Paul Colomiets
266+
Paul J. Dorn
266267
Paulius Šileikis
267268
Paulus Schoutsen
268269
Pavel Kamaev

aiohttp/http_parser.py

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,11 @@
6969
# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
7070
# "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA
7171
# token = 1*tchar
72-
METHRE: Final[Pattern[str]] = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+")
73-
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)")
74-
HDRRE: Final[Pattern[bytes]] = re.compile(
75-
rb"[\x00-\x1F\x7F-\xFF()<>@,;:\[\]={} \t\"\\]"
76-
)
77-
HEXDIGIT = re.compile(rb"[0-9a-fA-F]+")
72+
_TCHAR_SPECIALS: Final[str] = re.escape("!#$%&'*+-.^_`|~")
73+
TOKENRE: Final[Pattern[str]] = re.compile(f"[0-9A-Za-z{_TCHAR_SPECIALS}]+")
74+
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d)\.(\d)", re.ASCII)
75+
DIGITS: Final[Pattern[str]] = re.compile(r"\d+", re.ASCII)
76+
HEXDIGITS: Final[Pattern[bytes]] = re.compile(rb"[0-9a-fA-F]+")
7877

7978

8079
class RawRequestMessage(NamedTuple):
@@ -133,6 +132,7 @@ def parse_headers(
133132
self, lines: List[bytes]
134133
) -> Tuple["CIMultiDictProxy[str]", RawHeaders]:
135134
headers: CIMultiDict[str] = CIMultiDict()
135+
# note: "raw" does not mean inclusion of OWS before/after the field value
136136
raw_headers = []
137137

138138
lines_idx = 1
@@ -146,13 +146,14 @@ def parse_headers(
146146
except ValueError:
147147
raise InvalidHeader(line) from None
148148

149+
if len(bname) == 0:
150+
raise InvalidHeader(bname)
151+
149152
# https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2
150153
if {bname[0], bname[-1]} & {32, 9}: # {" ", "\t"}
151154
raise InvalidHeader(line)
152155

153156
bvalue = bvalue.lstrip(b" \t")
154-
if HDRRE.search(bname):
155-
raise InvalidHeader(bname)
156157
if len(bname) > self.max_field_size:
157158
raise LineTooLong(
158159
"request header name {}".format(
@@ -161,6 +162,9 @@ def parse_headers(
161162
str(self.max_field_size),
162163
str(len(bname)),
163164
)
165+
name = bname.decode("utf-8", "surrogateescape")
166+
if not TOKENRE.fullmatch(name):
167+
raise InvalidHeader(bname)
164168

165169
header_length = len(bvalue)
166170

@@ -207,7 +211,6 @@ def parse_headers(
207211
)
208212

209213
bvalue = bvalue.strip(b" \t")
210-
name = bname.decode("utf-8", "surrogateescape")
211214
value = bvalue.decode("utf-8", "surrogateescape")
212215

213216
# https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
@@ -331,7 +334,8 @@ def get_content_length() -> Optional[int]:
331334

332335
# Shouldn't allow +/- or other number formats.
333336
# https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
334-
if not length_hdr.strip(" \t").isdecimal():
337+
# msg.headers is already stripped of leading/trailing wsp
338+
if not DIGITS.fullmatch(length_hdr):
335339
raise InvalidHeader(CONTENT_LENGTH)
336340

337341
return int(length_hdr)
@@ -559,7 +563,7 @@ def parse_message(self, lines: List[bytes]) -> RawRequestMessage:
559563
)
560564

561565
# method
562-
if not METHRE.fullmatch(method):
566+
if not TOKENRE.fullmatch(method):
563567
raise BadStatusLine(method)
564568

565569
# version
@@ -676,8 +680,8 @@ def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
676680
raise BadStatusLine(line)
677681
version_o = HttpVersion(int(match.group(1)), int(match.group(2)))
678682

679-
# The status code is a three-digit number
680-
if len(status) != 3 or not status.isdecimal():
683+
# The status code is a three-digit ASCII number, no padding
684+
if len(status) != 3 or not DIGITS.fullmatch(status):
681685
raise BadStatusLine(line)
682686
status_i = int(status)
683687

@@ -818,7 +822,7 @@ def feed_data(
818822
if self._lax: # Allow whitespace in lax mode.
819823
size_b = size_b.strip()
820824

821-
if not re.fullmatch(HEXDIGIT, size_b):
825+
if not re.fullmatch(HEXDIGITS, size_b):
822826
exc = TransferEncodingError(
823827
chunk[:pos].decode("ascii", "surrogateescape")
824828
)

tests/test_http_parser.py

Lines changed: 140 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33

44
import asyncio
55
import re
6-
from typing import Any, List
6+
from contextlib import nullcontext
7+
from typing import Any, Dict, List
78
from unittest import mock
89
from urllib.parse import quote
910

@@ -168,11 +169,27 @@ def test_cve_2023_37276(parser: Any) -> None:
168169
parser.feed_data(text)
169170

170171

172+
@pytest.mark.parametrize(
173+
"rfc9110_5_6_2_token_delim",
174+
r'"(),/:;<=>?@[\]{}',
175+
)
176+
def test_bad_header_name(parser: Any, rfc9110_5_6_2_token_delim: str) -> None:
177+
text = f"POST / HTTP/1.1\r\nhead{rfc9110_5_6_2_token_delim}er: val\r\n\r\n".encode()
178+
expectation = pytest.raises(http_exceptions.BadHttpMessage)
179+
if rfc9110_5_6_2_token_delim == ":":
180+
# Inserting colon into header just splits name/value earlier.
181+
expectation = nullcontext()
182+
with expectation:
183+
parser.feed_data(text)
184+
185+
171186
@pytest.mark.parametrize(
172187
"hdr",
173188
(
174189
"Content-Length: -5", # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length
175190
"Content-Length: +256",
191+
"Content-Length: \N{superscript one}",
192+
"Content-Length: \N{mathematical double-struck digit one}",
176193
"Foo: abc\rdef", # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
177194
"Bar: abc\ndef",
178195
"Baz: abc\x00def",
@@ -265,6 +282,20 @@ def test_parse_headers_longline(parser: Any) -> None:
265282
parser.feed_data(text)
266283

267284

285+
def test_parse_unusual_request_line(parser: Any) -> None:
286+
if not isinstance(response, HttpResponseParserPy):
287+
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
288+
text = b"#smol //a HTTP/1.3\r\n\r\n"
289+
messages, upgrade, tail = parser.feed_data(text)
290+
assert len(messages) == 1
291+
msg, _ = messages[0]
292+
assert msg.compression is None
293+
assert not msg.upgrade
294+
assert msg.method == "#smol"
295+
assert msg.path == "//a"
296+
assert msg.version == (1, 3)
297+
298+
268299
def test_parse(parser: Any) -> None:
269300
text = b"GET /test HTTP/1.1\r\n\r\n"
270301
messages, upgrade, tail = parser.feed_data(text)
@@ -567,6 +598,45 @@ def test_headers_content_length_err_2(parser: Any) -> None:
567598
parser.feed_data(text)
568599

569600

601+
_pad: Dict[bytes, str] = {
602+
b"": "empty",
603+
# not a typo. Python likes triple zero
604+
b"\000": "NUL",
605+
b" ": "SP",
606+
b" ": "SPSP",
607+
# not a typo: both 0xa0 and 0x0a in case of 8-bit fun
608+
b"\n": "LF",
609+
b"\xa0": "NBSP",
610+
b"\t ": "TABSP",
611+
}
612+
613+
614+
@pytest.mark.parametrize("hdr", [b"", b"foo"], ids=["name-empty", "with-name"])
615+
@pytest.mark.parametrize("pad2", _pad.keys(), ids=["post-" + n for n in _pad.values()])
616+
@pytest.mark.parametrize("pad1", _pad.keys(), ids=["pre-" + n for n in _pad.values()])
617+
def test_invalid_header_spacing(
618+
parser: Any, pad1: bytes, pad2: bytes, hdr: bytes
619+
) -> None:
620+
text = b"GET /test HTTP/1.1\r\n" b"%s%s%s: value\r\n\r\n" % (pad1, hdr, pad2)
621+
expectation = pytest.raises(http_exceptions.BadHttpMessage)
622+
if pad1 == pad2 == b"" and hdr != b"":
623+
# one entry in param matrix is correct: non-empty name, not padded
624+
expectation = nullcontext()
625+
if pad1 == pad2 == hdr == b"":
626+
if not isinstance(response, HttpResponseParserPy):
627+
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
628+
with expectation:
629+
parser.feed_data(text)
630+
631+
632+
def test_empty_header_name(parser: Any) -> None:
633+
if not isinstance(response, HttpResponseParserPy):
634+
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
635+
text = b"GET /test HTTP/1.1\r\n" b":test\r\n\r\n"
636+
with pytest.raises(http_exceptions.BadHttpMessage):
637+
parser.feed_data(text)
638+
639+
570640
def test_invalid_header(parser: Any) -> None:
571641
text = b"GET /test HTTP/1.1\r\n" b"test line\r\n\r\n"
572642
with pytest.raises(http_exceptions.BadHttpMessage):
@@ -689,6 +759,34 @@ def test_http_request_bad_status_line(parser: Any) -> None:
689759
assert r"\n" not in exc_info.value.message
690760

691761

762+
_num: Dict[bytes, str] = {
763+
# dangerous: accepted by Python int()
764+
# unicodedata.category("\U0001D7D9") == 'Nd'
765+
"\N{mathematical double-struck digit one}".encode(): "utf8digit",
766+
# only added for interop tests, refused by Python int()
767+
# unicodedata.category("\U000000B9") == 'No'
768+
"\N{superscript one}".encode(): "utf8number",
769+
"\N{superscript one}".encode("latin-1"): "latin1number",
770+
}
771+
772+
773+
@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
774+
def test_http_request_bad_status_line_number(
775+
parser: Any, nonascii_digit: bytes
776+
) -> None:
777+
text = b"GET /digit HTTP/1." + nonascii_digit + b"\r\n\r\n"
778+
with pytest.raises(http_exceptions.BadStatusLine):
779+
parser.feed_data(text)
780+
781+
782+
def test_http_request_bad_status_line_separator(parser: Any) -> None:
783+
# single code point, old, multibyte NFKC, multibyte NFKD
784+
utf8sep = "\N{arabic ligature sallallahou alayhe wasallam}".encode()
785+
text = b"GET /ligature HTTP/1" + utf8sep + b"1\r\n\r\n"
786+
with pytest.raises(http_exceptions.BadStatusLine):
787+
parser.feed_data(text)
788+
789+
692790
def test_http_request_bad_status_line_whitespace(parser: Any) -> None:
693791
text = b"GET\n/path\fHTTP/1.1\r\n\r\n"
694792
with pytest.raises(http_exceptions.BadStatusLine):
@@ -710,6 +808,31 @@ def test_http_request_upgrade(parser: Any) -> None:
710808
assert tail == b"some raw data"
711809

712810

811+
def test_http_request_parser_utf8_request_line(parser: Any) -> None:
812+
if not isinstance(response, HttpResponseParserPy):
813+
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
814+
messages, upgrade, tail = parser.feed_data(
815+
# note the truncated unicode sequence
816+
b"GET /P\xc3\xbcnktchen\xa0\xef\xb7 HTTP/1.1\r\n" +
817+
# for easier grep: ASCII 0xA0 more commonly known as non-breaking space
818+
# note the leading and trailing spaces
819+
"sTeP: \N{latin small letter sharp s}nek\t\N{no-break space} "
820+
"\r\n\r\n".encode()
821+
)
822+
msg = messages[0][0]
823+
824+
assert msg.method == "GET"
825+
assert msg.path == "/Pünktchen\udca0\udcef\udcb7"
826+
assert msg.version == (1, 1)
827+
assert msg.headers == CIMultiDict([("STEP", "ßnek\t\xa0")])
828+
assert msg.raw_headers == ((b"sTeP", "ßnek\t\xa0".encode()),)
829+
assert not msg.should_close
830+
assert msg.compression is None
831+
assert not msg.upgrade
832+
assert not msg.chunked
833+
assert msg.url.path == URL("/P%C3%BCnktchen\udca0\udcef\udcb7").path
834+
835+
713836
def test_http_request_parser_utf8(parser: Any) -> None:
714837
text = "GET /path HTTP/1.1\r\nx-test:тест\r\n\r\n".encode()
715838
messages, upgrade, tail = parser.feed_data(text)
@@ -759,9 +882,15 @@ def test_http_request_parser_two_slashes(parser: Any) -> None:
759882
assert not msg.chunked
760883

761884

762-
def test_http_request_parser_bad_method(parser: Any) -> None:
885+
@pytest.mark.parametrize(
886+
"rfc9110_5_6_2_token_delim",
887+
[bytes([i]) for i in rb'"(),/:;<=>?@[\]{}'],
888+
)
889+
def test_http_request_parser_bad_method(
890+
parser: Any, rfc9110_5_6_2_token_delim: bytes
891+
) -> None:
763892
with pytest.raises(http_exceptions.BadStatusLine):
764-
parser.feed_data(b'G=":<>(e),[T];?" /get HTTP/1.1\r\n\r\n')
893+
parser.feed_data(rfc9110_5_6_2_token_delim + b'ET" /get HTTP/1.1\r\n\r\n')
765894

766895

767896
def test_http_request_parser_bad_version(parser: Any) -> None:
@@ -979,6 +1108,14 @@ def test_http_response_parser_code_not_int(response: Any) -> None:
9791108
response.feed_data(b"HTTP/1.1 ttt test\r\n\r\n")
9801109

9811110

1111+
@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
1112+
def test_http_response_parser_code_not_ascii(
1113+
response: Any, nonascii_digit: bytes
1114+
) -> None:
1115+
with pytest.raises(http_exceptions.BadStatusLine):
1116+
response.feed_data(b"HTTP/1.1 20" + nonascii_digit + b" test\r\n\r\n")
1117+
1118+
9821119
def test_http_request_chunked_payload(parser: Any) -> None:
9831120
text = b"GET /test HTTP/1.1\r\n" b"transfer-encoding: chunked\r\n\r\n"
9841121
msg, payload = parser.feed_data(text)[0][0]

0 commit comments

Comments
 (0)