Skip to content

Commit bd5f924

Browse files
Allow lax response parsing on Py parser (#7663)
1 parent 454092d commit bd5f924

File tree

4 files changed

+205
-86
lines changed

4 files changed

+205
-86
lines changed

CHANGES/7663.feature

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Updated Python parser to comply with the latest HTTP specs and allow lax response parsing -- by :user:`Dreamsorcerer`

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ aiohttp/_find_header.c: $(call to-hash,aiohttp/hdrs.py ./tools/gen.py)
5858

5959
# _find_headers generator creates _headers.pyi as well
6060
aiohttp/%.c: aiohttp/%.pyx $(call to-hash,$(CYS)) aiohttp/_find_header.c
61-
cython -3 -o $@ $< -I aiohttp
61+
cython -3 -o $@ $< -I aiohttp -Werror
6262

6363
vendor/llhttp/node_modules: vendor/llhttp/package.json
6464
cd vendor/llhttp; npm install

aiohttp/http_parser.py

+66-20
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@
55
from contextlib import suppress
66
from enum import IntEnum
77
from typing import (
8+
Any,
9+
ClassVar,
810
Final,
911
Generic,
1012
List,
13+
Literal,
1114
NamedTuple,
1215
Optional,
1316
Pattern,
@@ -24,7 +27,7 @@
2427
from . import hdrs
2528
from .base_protocol import BaseProtocol
2629
from .compression_utils import HAS_BROTLI, BrotliDecompressor, ZLibDecompressor
27-
from .helpers import NO_EXTENSIONS, BaseTimerContext
30+
from .helpers import DEBUG, NO_EXTENSIONS, BaseTimerContext
2831
from .http_exceptions import (
2932
BadHttpMessage,
3033
BadStatusLine,
@@ -48,6 +51,8 @@
4851
"RawResponseMessage",
4952
)
5053

54+
_SEP = Literal[b"\r\n", b"\n"]
55+
5156
ASCIISET: Final[Set[str]] = set(string.printable)
5257

5358
# See https://www.rfc-editor.org/rfc/rfc9110.html#name-overview
@@ -60,6 +65,7 @@
6065
METHRE: Final[Pattern[str]] = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+")
6166
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)")
6267
HDRRE: Final[Pattern[bytes]] = re.compile(rb"[\x00-\x1F\x7F()<>@,;:\[\]={} \t\"\\]")
68+
HEXDIGIT = re.compile(rb"[0-9a-fA-F]+")
6369

6470

6571
class RawRequestMessage(NamedTuple):
@@ -206,6 +212,8 @@ def parse_headers(
206212

207213

208214
class HttpParser(abc.ABC, Generic[_MsgT]):
215+
lax: ClassVar[bool] = False
216+
209217
def __init__(
210218
self,
211219
protocol: BaseProtocol,
@@ -266,7 +274,7 @@ def feed_eof(self) -> Optional[_MsgT]:
266274
def feed_data(
267275
self,
268276
data: bytes,
269-
SEP: bytes = b"\r\n",
277+
SEP: _SEP = b"\r\n",
270278
EMPTY: bytes = b"",
271279
CONTENT_LENGTH: istr = hdrs.CONTENT_LENGTH,
272280
METH_CONNECT: str = hdrs.METH_CONNECT,
@@ -288,13 +296,16 @@ def feed_data(
288296
pos = data.find(SEP, start_pos)
289297
# consume \r\n
290298
if pos == start_pos and not self._lines:
291-
start_pos = pos + 2
299+
start_pos = pos + len(SEP)
292300
continue
293301

294302
if pos >= start_pos:
295303
# line found
296-
self._lines.append(data[start_pos:pos])
297-
start_pos = pos + 2
304+
line = data[start_pos:pos]
305+
if SEP == b"\n": # For lax response parsing
306+
line = line.rstrip(b"\r")
307+
self._lines.append(line)
308+
start_pos = pos + len(SEP)
298309

299310
# \r\n\r\n found
300311
if self._lines[-1] == EMPTY:
@@ -311,7 +322,7 @@ def get_content_length() -> Optional[int]:
311322

312323
# Shouldn't allow +/- or other number formats.
313324
# https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
314-
if not length_hdr.strip(" \t").isdigit():
325+
if not length_hdr.strip(" \t").isdecimal():
315326
raise InvalidHeader(CONTENT_LENGTH)
316327

317328
return int(length_hdr)
@@ -348,6 +359,7 @@ def get_content_length() -> Optional[int]:
348359
readall=self.readall,
349360
response_with_body=self.response_with_body,
350361
auto_decompress=self._auto_decompress,
362+
lax=self.lax,
351363
)
352364
if not payload_parser.done:
353365
self._payload_parser = payload_parser
@@ -366,6 +378,7 @@ def get_content_length() -> Optional[int]:
366378
compression=msg.compression,
367379
readall=True,
368380
auto_decompress=self._auto_decompress,
381+
lax=self.lax,
369382
)
370383
else:
371384
if (
@@ -389,6 +402,7 @@ def get_content_length() -> Optional[int]:
389402
readall=True,
390403
response_with_body=self.response_with_body,
391404
auto_decompress=self._auto_decompress,
405+
lax=self.lax,
392406
)
393407
if not payload_parser.done:
394408
self._payload_parser = payload_parser
@@ -411,7 +425,7 @@ def get_content_length() -> Optional[int]:
411425
assert not self._lines
412426
assert self._payload_parser is not None
413427
try:
414-
eof, data = self._payload_parser.feed_data(data[start_pos:])
428+
eof, data = self._payload_parser.feed_data(data[start_pos:], SEP)
415429
except BaseException as exc:
416430
if self.payload_exception is not None:
417431
self._payload_parser.payload.set_exception(
@@ -456,12 +470,21 @@ def parse_headers(
456470

457471
# https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-6
458472
# https://www.rfc-editor.org/rfc/rfc9110.html#name-collected-abnf
459-
singletons = (hdrs.CONTENT_LENGTH, hdrs.CONTENT_LOCATION, hdrs.CONTENT_RANGE,
460-
hdrs.CONTENT_TYPE, hdrs.ETAG, hdrs.HOST, hdrs.MAX_FORWARDS,
461-
hdrs.SERVER, hdrs.TRANSFER_ENCODING, hdrs.USER_AGENT)
473+
singletons = (
474+
hdrs.CONTENT_LENGTH,
475+
hdrs.CONTENT_LOCATION,
476+
hdrs.CONTENT_RANGE,
477+
hdrs.CONTENT_TYPE,
478+
hdrs.ETAG,
479+
hdrs.HOST,
480+
hdrs.MAX_FORWARDS,
481+
hdrs.SERVER,
482+
hdrs.TRANSFER_ENCODING,
483+
hdrs.USER_AGENT,
484+
)
462485
bad_hdr = next((h for h in singletons if len(headers.getall(h, ())) > 1), None)
463486
if bad_hdr is not None:
464-
raise BadHttpMessage("Duplicate '{}' header found.".format(bad_hdr))
487+
raise BadHttpMessage(f"Duplicate '{bad_hdr}' header found.")
465488

466489
# keep-alive
467490
conn = headers.get(hdrs.CONNECTION)
@@ -597,6 +620,20 @@ class HttpResponseParser(HttpParser[RawResponseMessage]):
597620
Returns RawResponseMessage.
598621
"""
599622

623+
# Lax mode should only be enabled on response parser.
624+
lax = not DEBUG
625+
626+
def feed_data(
627+
self,
628+
data: bytes,
629+
SEP: Optional[_SEP] = None,
630+
*args: Any,
631+
**kwargs: Any,
632+
) -> Tuple[List[Tuple[RawResponseMessage, StreamReader]], bool, bytes]:
633+
if SEP is None:
634+
SEP = b"\r\n" if DEBUG else b"\n"
635+
return super().feed_data(data, SEP, *args, **kwargs)
636+
600637
def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
601638
line = lines[0].decode("utf-8", "surrogateescape")
602639
try:
@@ -621,7 +658,7 @@ def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
621658
version_o = HttpVersion(int(match.group(1)), int(match.group(2)))
622659

623660
# The status code is a three-digit number
624-
if len(status) != 3 or not status.isdigit():
661+
if len(status) != 3 or not status.isdecimal():
625662
raise BadStatusLine(line)
626663
status_i = int(status)
627664

@@ -663,13 +700,15 @@ def __init__(
663700
readall: bool = False,
664701
response_with_body: bool = True,
665702
auto_decompress: bool = True,
703+
lax: bool = False,
666704
) -> None:
667705
self._length = 0
668706
self._type = ParseState.PARSE_NONE
669707
self._chunk = ChunkState.PARSE_CHUNKED_SIZE
670708
self._chunk_size = 0
671709
self._chunk_tail = b""
672710
self._auto_decompress = auto_decompress
711+
self._lax = lax
673712
self.done = False
674713

675714
# payload decompression wrapper
@@ -721,7 +760,7 @@ def feed_eof(self) -> None:
721760
)
722761

723762
def feed_data(
724-
self, chunk: bytes, SEP: bytes = b"\r\n", CHUNK_EXT: bytes = b";"
763+
self, chunk: bytes, SEP: _SEP = b"\r\n", CHUNK_EXT: bytes = b";"
725764
) -> Tuple[bool, bytes]:
726765
# Read specified amount of bytes
727766
if self._type == ParseState.PARSE_LENGTH:
@@ -757,17 +796,22 @@ def feed_data(
757796
else:
758797
size_b = chunk[:pos]
759798

760-
if not size_b.isdigit():
799+
if self._lax: # Allow whitespace in lax mode.
800+
size_b = size_b.strip()
801+
802+
if not re.fullmatch(HEXDIGIT, size_b):
761803
exc = TransferEncodingError(
762804
chunk[:pos].decode("ascii", "surrogateescape")
763805
)
764806
self.payload.set_exception(exc)
765807
raise exc
766808
size = int(bytes(size_b), 16)
767809

768-
chunk = chunk[pos + 2 :]
810+
chunk = chunk[pos + len(SEP) :]
769811
if size == 0: # eof marker
770812
self._chunk = ChunkState.PARSE_MAYBE_TRAILERS
813+
if self._lax and chunk.startswith(b"\r"):
814+
chunk = chunk[1:]
771815
else:
772816
self._chunk = ChunkState.PARSE_CHUNKED_CHUNK
773817
self._chunk_size = size
@@ -789,13 +833,15 @@ def feed_data(
789833
self._chunk_size = 0
790834
self.payload.feed_data(chunk[:required], required)
791835
chunk = chunk[required:]
836+
if self._lax and chunk.startswith(b"\r"):
837+
chunk = chunk[1:]
792838
self._chunk = ChunkState.PARSE_CHUNKED_CHUNK_EOF
793839
self.payload.end_http_chunk_receiving()
794840

795841
# toss the CRLF at the end of the chunk
796842
if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK_EOF:
797-
if chunk[:2] == SEP:
798-
chunk = chunk[2:]
843+
if chunk[: len(SEP)] == SEP:
844+
chunk = chunk[len(SEP) :]
799845
self._chunk = ChunkState.PARSE_CHUNKED_SIZE
800846
else:
801847
self._chunk_tail = chunk
@@ -805,11 +851,11 @@ def feed_data(
805851
# we should get another \r\n otherwise
806852
# trailers need to be skipped until \r\n\r\n
807853
if self._chunk == ChunkState.PARSE_MAYBE_TRAILERS:
808-
head = chunk[:2]
854+
head = chunk[: len(SEP)]
809855
if head == SEP:
810856
# end of stream
811857
self.payload.feed_eof()
812-
return True, chunk[2:]
858+
return True, chunk[len(SEP) :]
813859
# Both CR and LF, or only LF may not be received yet. It is
814860
# expected that CRLF or LF will be shown at the very first
815861
# byte next time, otherwise trailers should come. The last
@@ -827,7 +873,7 @@ def feed_data(
827873
if self._chunk == ChunkState.PARSE_TRAILERS:
828874
pos = chunk.find(SEP)
829875
if pos >= 0:
830-
chunk = chunk[pos + 2 :]
876+
chunk = chunk[pos + len(SEP) :]
831877
self._chunk = ChunkState.PARSE_MAYBE_TRAILERS
832878
else:
833879
self._chunk_tail = chunk

0 commit comments

Comments
 (0)