Skip to content

Commit d33bc21

Browse files
Improve validation in HTTP parser (#8074) (#8078)
Co-authored-by: Paul J. Dorn <[email protected]> Co-authored-by: Sviatoslav Sydorenko (Святослав Сидоренко) <[email protected]> (cherry picked from commit 33ccdfb)
1 parent 0d945d1 commit d33bc21

File tree

4 files changed

+160
-17
lines changed

4 files changed

+160
-17
lines changed

CHANGES/8074.bugfix.rst

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fixed an unhandled exception in the Python HTTP parser on header lines starting with a colon -- by :user:`pajod`.
2+
3+
Invalid request lines with anything but a dot between the HTTP major and minor version are now rejected. Invalid header field names containing question mark or slash are now rejected. Such requests are incompatible with :rfc:`9110#section-5.6.2` and are not known to be of any legitimate use.
4+
5+
(BACKWARD INCOMPATIBLE)

CONTRIBUTORS.txt

+1
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ Pankaj Pandey
255255
Parag Jain
256256
Pau Freixes
257257
Paul Colomiets
258+
Paul J. Dorn
258259
Paulius Šileikis
259260
Paulus Schoutsen
260261
Pavel Kamaev

aiohttp/http_parser.py

+18-14
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,11 @@
6969
# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
7070
# "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA
7171
# token = 1*tchar
72-
METHRE: Final[Pattern[str]] = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+")
73-
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)")
74-
HDRRE: Final[Pattern[bytes]] = re.compile(
75-
rb"[\x00-\x1F\x7F-\xFF()<>@,;:\[\]={} \t\"\\]"
76-
)
77-
HEXDIGIT = re.compile(rb"[0-9a-fA-F]+")
72+
_TCHAR_SPECIALS: Final[str] = re.escape("!#$%&'*+-.^_`|~")
73+
TOKENRE: Final[Pattern[str]] = re.compile(f"[0-9A-Za-z{_TCHAR_SPECIALS}]+")
74+
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d)\.(\d)", re.ASCII)
75+
DIGITS: Final[Pattern[str]] = re.compile(r"\d+", re.ASCII)
76+
HEXDIGITS: Final[Pattern[bytes]] = re.compile(rb"[0-9a-fA-F]+")
7877

7978

8079
class RawRequestMessage(NamedTuple):
@@ -136,6 +135,7 @@ def parse_headers(
136135
self, lines: List[bytes]
137136
) -> Tuple["CIMultiDictProxy[str]", RawHeaders]:
138137
headers: CIMultiDict[str] = CIMultiDict()
138+
# note: "raw" does not mean inclusion of OWS before/after the field value
139139
raw_headers = []
140140

141141
lines_idx = 1
@@ -149,13 +149,14 @@ def parse_headers(
149149
except ValueError:
150150
raise InvalidHeader(line) from None
151151

152+
if len(bname) == 0:
153+
raise InvalidHeader(bname)
154+
152155
# https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2
153156
if {bname[0], bname[-1]} & {32, 9}: # {" ", "\t"}
154157
raise InvalidHeader(line)
155158

156159
bvalue = bvalue.lstrip(b" \t")
157-
if HDRRE.search(bname):
158-
raise InvalidHeader(bname)
159160
if len(bname) > self.max_field_size:
160161
raise LineTooLong(
161162
"request header name {}".format(
@@ -164,6 +165,9 @@ def parse_headers(
164165
str(self.max_field_size),
165166
str(len(bname)),
166167
)
168+
name = bname.decode("utf-8", "surrogateescape")
169+
if not TOKENRE.fullmatch(name):
170+
raise InvalidHeader(bname)
167171

168172
header_length = len(bvalue)
169173

@@ -210,7 +214,6 @@ def parse_headers(
210214
)
211215

212216
bvalue = bvalue.strip(b" \t")
213-
name = bname.decode("utf-8", "surrogateescape")
214217
value = bvalue.decode("utf-8", "surrogateescape")
215218

216219
# https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
@@ -338,7 +341,8 @@ def get_content_length() -> Optional[int]:
338341

339342
# Shouldn't allow +/- or other number formats.
340343
# https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
341-
if not length_hdr.strip(" \t").isdecimal():
344+
# msg.headers is already stripped of leading/trailing wsp
345+
if not DIGITS.fullmatch(length_hdr):
342346
raise InvalidHeader(CONTENT_LENGTH)
343347

344348
return int(length_hdr)
@@ -566,7 +570,7 @@ def parse_message(self, lines: List[bytes]) -> RawRequestMessage:
566570
)
567571

568572
# method
569-
if not METHRE.fullmatch(method):
573+
if not TOKENRE.fullmatch(method):
570574
raise BadStatusLine(method)
571575

572576
# version
@@ -683,8 +687,8 @@ def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
683687
raise BadStatusLine(line)
684688
version_o = HttpVersion(int(match.group(1)), int(match.group(2)))
685689

686-
# The status code is a three-digit number
687-
if len(status) != 3 or not status.isdecimal():
690+
# The status code is a three-digit ASCII number, no padding
691+
if len(status) != 3 or not DIGITS.fullmatch(status):
688692
raise BadStatusLine(line)
689693
status_i = int(status)
690694

@@ -826,7 +830,7 @@ def feed_data(
826830
if self._lax: # Allow whitespace in lax mode.
827831
size_b = size_b.strip()
828832

829-
if not re.fullmatch(HEXDIGIT, size_b):
833+
if not re.fullmatch(HEXDIGITS, size_b):
830834
exc = TransferEncodingError(
831835
chunk[:pos].decode("ascii", "surrogateescape")
832836
)

tests/test_http_parser.py

+136-3
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22

33
import asyncio
44
import re
5-
from typing import Any, List
5+
from contextlib import nullcontext
6+
from typing import Any, Dict, List
67
from unittest import mock
78
from urllib.parse import quote
89

@@ -169,11 +170,27 @@ def test_cve_2023_37276(parser: Any) -> None:
169170
parser.feed_data(text)
170171

171172

173+
@pytest.mark.parametrize(
174+
"rfc9110_5_6_2_token_delim",
175+
r'"(),/:;<=>?@[\]{}',
176+
)
177+
def test_bad_header_name(parser: Any, rfc9110_5_6_2_token_delim: str) -> None:
178+
text = f"POST / HTTP/1.1\r\nhead{rfc9110_5_6_2_token_delim}er: val\r\n\r\n".encode()
179+
expectation = pytest.raises(http_exceptions.BadHttpMessage)
180+
if rfc9110_5_6_2_token_delim == ":":
181+
# Inserting colon into header just splits name/value earlier.
182+
expectation = nullcontext()
183+
with expectation:
184+
parser.feed_data(text)
185+
186+
172187
@pytest.mark.parametrize(
173188
"hdr",
174189
(
175190
"Content-Length: -5", # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length
176191
"Content-Length: +256",
192+
"Content-Length: \N{superscript one}",
193+
"Content-Length: \N{mathematical double-struck digit one}",
177194
"Foo: abc\rdef", # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
178195
"Bar: abc\ndef",
179196
"Baz: abc\x00def",
@@ -266,6 +283,20 @@ def test_parse_headers_longline(parser: Any) -> None:
266283
parser.feed_data(text)
267284

268285

286+
def test_parse_unusual_request_line(parser) -> None:
287+
if not isinstance(response, HttpResponseParserPy):
288+
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
289+
text = b"#smol //a HTTP/1.3\r\n\r\n"
290+
messages, upgrade, tail = parser.feed_data(text)
291+
assert len(messages) == 1
292+
msg, _ = messages[0]
293+
assert msg.compression is None
294+
assert not msg.upgrade
295+
assert msg.method == "#smol"
296+
assert msg.path == "//a"
297+
assert msg.version == (1, 3)
298+
299+
269300
def test_parse(parser) -> None:
270301
text = b"GET /test HTTP/1.1\r\n\r\n"
271302
messages, upgrade, tail = parser.feed_data(text)
@@ -568,6 +599,43 @@ def test_headers_content_length_err_2(parser) -> None:
568599
parser.feed_data(text)
569600

570601

602+
_pad: Dict[bytes, str] = {
603+
b"": "empty",
604+
# not a typo. Python likes triple zero
605+
b"\000": "NUL",
606+
b" ": "SP",
607+
b" ": "SPSP",
608+
# not a typo: both 0xa0 and 0x0a in case of 8-bit fun
609+
b"\n": "LF",
610+
b"\xa0": "NBSP",
611+
b"\t ": "TABSP",
612+
}
613+
614+
615+
@pytest.mark.parametrize("hdr", [b"", b"foo"], ids=["name-empty", "with-name"])
616+
@pytest.mark.parametrize("pad2", _pad.keys(), ids=["post-" + n for n in _pad.values()])
617+
@pytest.mark.parametrize("pad1", _pad.keys(), ids=["pre-" + n for n in _pad.values()])
618+
def test_invalid_header_spacing(parser, pad1: bytes, pad2: bytes, hdr: bytes) -> None:
619+
text = b"GET /test HTTP/1.1\r\n" b"%s%s%s: value\r\n\r\n" % (pad1, hdr, pad2)
620+
expectation = pytest.raises(http_exceptions.BadHttpMessage)
621+
if pad1 == pad2 == b"" and hdr != b"":
622+
# one entry in param matrix is correct: non-empty name, not padded
623+
expectation = nullcontext()
624+
if pad1 == pad2 == hdr == b"":
625+
if not isinstance(response, HttpResponseParserPy):
626+
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
627+
with expectation:
628+
parser.feed_data(text)
629+
630+
631+
def test_empty_header_name(parser) -> None:
632+
if not isinstance(response, HttpResponseParserPy):
633+
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
634+
text = b"GET /test HTTP/1.1\r\n" b":test\r\n\r\n"
635+
with pytest.raises(http_exceptions.BadHttpMessage):
636+
parser.feed_data(text)
637+
638+
571639
def test_invalid_header(parser) -> None:
572640
text = b"GET /test HTTP/1.1\r\n" b"test line\r\n\r\n"
573641
with pytest.raises(http_exceptions.BadHttpMessage):
@@ -690,6 +758,34 @@ def test_http_request_bad_status_line(parser) -> None:
690758
assert r"\n" not in exc_info.value.message
691759

692760

761+
_num: Dict[bytes, str] = {
762+
# dangerous: accepted by Python int()
763+
# unicodedata.category("\U0001D7D9") == 'Nd'
764+
"\N{mathematical double-struck digit one}".encode(): "utf8digit",
765+
# only added for interop tests, refused by Python int()
766+
# unicodedata.category("\U000000B9") == 'No'
767+
"\N{superscript one}".encode(): "utf8number",
768+
"\N{superscript one}".encode("latin-1"): "latin1number",
769+
}
770+
771+
772+
@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
773+
def test_http_request_bad_status_line_number(
774+
parser: Any, nonascii_digit: bytes
775+
) -> None:
776+
text = b"GET /digit HTTP/1." + nonascii_digit + b"\r\n\r\n"
777+
with pytest.raises(http_exceptions.BadStatusLine):
778+
parser.feed_data(text)
779+
780+
781+
def test_http_request_bad_status_line_separator(parser: Any) -> None:
782+
# single code point, old, multibyte NFKC, multibyte NFKD
783+
utf8sep = "\N{arabic ligature sallallahou alayhe wasallam}".encode()
784+
text = b"GET /ligature HTTP/1" + utf8sep + b"1\r\n\r\n"
785+
with pytest.raises(http_exceptions.BadStatusLine):
786+
parser.feed_data(text)
787+
788+
693789
def test_http_request_bad_status_line_whitespace(parser: Any) -> None:
694790
text = b"GET\n/path\fHTTP/1.1\r\n\r\n"
695791
with pytest.raises(http_exceptions.BadStatusLine):
@@ -711,6 +807,31 @@ def test_http_request_upgrade(parser: Any) -> None:
711807
assert tail == b"some raw data"
712808

713809

810+
def test_http_request_parser_utf8_request_line(parser) -> None:
811+
if not isinstance(response, HttpResponseParserPy):
812+
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
813+
messages, upgrade, tail = parser.feed_data(
814+
# note the truncated unicode sequence
815+
b"GET /P\xc3\xbcnktchen\xa0\xef\xb7 HTTP/1.1\r\n" +
816+
# for easier grep: ASCII 0xA0 more commonly known as non-breaking space
817+
# note the leading and trailing spaces
818+
"sTeP: \N{latin small letter sharp s}nek\t\N{no-break space} "
819+
"\r\n\r\n".encode()
820+
)
821+
msg = messages[0][0]
822+
823+
assert msg.method == "GET"
824+
assert msg.path == "/Pünktchen\udca0\udcef\udcb7"
825+
assert msg.version == (1, 1)
826+
assert msg.headers == CIMultiDict([("STEP", "ßnek\t\xa0")])
827+
assert msg.raw_headers == ((b"sTeP", "ßnek\t\xa0".encode()),)
828+
assert not msg.should_close
829+
assert msg.compression is None
830+
assert not msg.upgrade
831+
assert not msg.chunked
832+
assert msg.url.path == URL("/P%C3%BCnktchen\udca0\udcef\udcb7").path
833+
834+
714835
def test_http_request_parser_utf8(parser) -> None:
715836
text = "GET /path HTTP/1.1\r\nx-test:тест\r\n\r\n".encode()
716837
messages, upgrade, tail = parser.feed_data(text)
@@ -760,9 +881,15 @@ def test_http_request_parser_two_slashes(parser) -> None:
760881
assert not msg.chunked
761882

762883

763-
def test_http_request_parser_bad_method(parser) -> None:
884+
@pytest.mark.parametrize(
885+
"rfc9110_5_6_2_token_delim",
886+
[bytes([i]) for i in rb'"(),/:;<=>?@[\]{}'],
887+
)
888+
def test_http_request_parser_bad_method(
889+
parser, rfc9110_5_6_2_token_delim: bytes
890+
) -> None:
764891
with pytest.raises(http_exceptions.BadStatusLine):
765-
parser.feed_data(b'G=":<>(e),[T];?" /get HTTP/1.1\r\n\r\n')
892+
parser.feed_data(rfc9110_5_6_2_token_delim + b'ET" /get HTTP/1.1\r\n\r\n')
766893

767894

768895
def test_http_request_parser_bad_version(parser) -> None:
@@ -974,6 +1101,12 @@ def test_http_response_parser_code_not_int(response) -> None:
9741101
response.feed_data(b"HTTP/1.1 ttt test\r\n\r\n")
9751102

9761103

1104+
@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
1105+
def test_http_response_parser_code_not_ascii(response, nonascii_digit: bytes) -> None:
1106+
with pytest.raises(http_exceptions.BadStatusLine):
1107+
response.feed_data(b"HTTP/1.1 20" + nonascii_digit + b" test\r\n\r\n")
1108+
1109+
9771110
def test_http_request_chunked_payload(parser) -> None:
9781111
text = b"GET /test HTTP/1.1\r\n" b"transfer-encoding: chunked\r\n\r\n"
9791112
msg, payload = parser.feed_data(text)[0][0]

0 commit comments

Comments
 (0)