Skip to content

Commit 89b7df1

Browse files
Allow lax response parsing on Py parser (#7663) (#7664)
(cherry picked from commit bd5f924) --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent d5c12ba commit 89b7df1

File tree

4 files changed

+198
-86
lines changed

4 files changed

+198
-86
lines changed

CHANGES/7663.feature

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Updated Python parser to comply with the latest HTTP specs and allow lax response parsing -- by :user:`Dreamsorcerer`

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ aiohttp/_find_header.c: $(call to-hash,aiohttp/hdrs.py ./tools/gen.py)
5858

5959
# _find_headers generator creates _headers.pyi as well
6060
aiohttp/%.c: aiohttp/%.pyx $(call to-hash,$(CYS)) aiohttp/_find_header.c
61-
cython -3 -o $@ $< -I aiohttp
61+
cython -3 -o $@ $< -I aiohttp -Werror
6262

6363
vendor/llhttp/node_modules: vendor/llhttp/package.json
6464
cd vendor/llhttp; npm install

aiohttp/http_parser.py

Lines changed: 59 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
import collections
44
import re
55
import string
6+
import sys
67
import zlib
78
from contextlib import suppress
89
from enum import IntEnum
910
from typing import (
1011
Any,
12+
ClassVar,
1113
Generic,
1214
List,
1315
NamedTuple,
@@ -26,7 +28,7 @@
2628

2729
from . import hdrs
2830
from .base_protocol import BaseProtocol
29-
from .helpers import NO_EXTENSIONS, BaseTimerContext
31+
from .helpers import DEBUG, NO_EXTENSIONS, BaseTimerContext
3032
from .http_exceptions import (
3133
BadHttpMessage,
3234
BadStatusLine,
@@ -41,6 +43,11 @@
4143
from .streams import EMPTY_PAYLOAD, StreamReader
4244
from .typedefs import Final, RawHeaders
4345

46+
if sys.version_info >= (3, 8):
47+
from typing import Literal
48+
else:
49+
from typing_extensions import Literal
50+
4451
try:
4552
import brotli
4653

@@ -58,6 +65,8 @@
5865
"RawResponseMessage",
5966
)
6067

68+
_SEP = Literal[b"\r\n", b"\n"]
69+
6170
ASCIISET: Final[Set[str]] = set(string.printable)
6271

6372
# See https://www.rfc-editor.org/rfc/rfc9110.html#name-overview
@@ -70,6 +79,7 @@
7079
METHRE: Final[Pattern[str]] = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+")
7180
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)")
7281
HDRRE: Final[Pattern[bytes]] = re.compile(rb"[\x00-\x1F\x7F()<>@,;:\[\]={} \t\"\\]")
82+
HEXDIGIT = re.compile(rb"[0-9a-fA-F]+")
7383

7484

7585
class RawRequestMessage(NamedTuple):
@@ -173,7 +183,8 @@ def parse_headers(
173183
# consume continuation lines
174184
continuation = line and line[0] in (32, 9) # (' ', '\t')
175185

176-
# Deprecated: https://www.rfc-editor.org/rfc/rfc9112.html#name-obsolete-line-folding
186+
# Deprecated:
187+
# https://www.rfc-editor.org/rfc/rfc9112.html#name-obsolete-line-folding
177188
if continuation:
178189
bvalue_lst = [bvalue]
179190
while continuation:
@@ -223,6 +234,8 @@ def parse_headers(
223234

224235

225236
class HttpParser(abc.ABC, Generic[_MsgT]):
237+
lax: ClassVar[bool] = False
238+
226239
def __init__(
227240
self,
228241
protocol: Optional[BaseProtocol] = None,
@@ -285,7 +298,7 @@ def feed_eof(self) -> Optional[_MsgT]:
285298
def feed_data(
286299
self,
287300
data: bytes,
288-
SEP: bytes = b"\r\n",
301+
SEP: _SEP = b"\r\n",
289302
EMPTY: bytes = b"",
290303
CONTENT_LENGTH: istr = hdrs.CONTENT_LENGTH,
291304
METH_CONNECT: str = hdrs.METH_CONNECT,
@@ -309,13 +322,16 @@ def feed_data(
309322
pos = data.find(SEP, start_pos)
310323
# consume \r\n
311324
if pos == start_pos and not self._lines:
312-
start_pos = pos + 2
325+
start_pos = pos + len(SEP)
313326
continue
314327

315328
if pos >= start_pos:
316329
# line found
317-
self._lines.append(data[start_pos:pos])
318-
start_pos = pos + 2
330+
line = data[start_pos:pos]
331+
if SEP == b"\n": # For lax response parsing
332+
line = line.rstrip(b"\r")
333+
self._lines.append(line)
334+
start_pos = pos + len(SEP)
319335

320336
# \r\n\r\n found
321337
if self._lines[-1] == EMPTY:
@@ -332,7 +348,7 @@ def get_content_length() -> Optional[int]:
332348

333349
# Shouldn't allow +/- or other number formats.
334350
# https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
335-
if not length_hdr.strip(" \t").isdigit():
351+
if not length_hdr.strip(" \t").isdecimal():
336352
raise InvalidHeader(CONTENT_LENGTH)
337353

338354
return int(length_hdr)
@@ -369,6 +385,7 @@ def get_content_length() -> Optional[int]:
369385
readall=self.readall,
370386
response_with_body=self.response_with_body,
371387
auto_decompress=self._auto_decompress,
388+
lax=self.lax,
372389
)
373390
if not payload_parser.done:
374391
self._payload_parser = payload_parser
@@ -387,6 +404,7 @@ def get_content_length() -> Optional[int]:
387404
compression=msg.compression,
388405
readall=True,
389406
auto_decompress=self._auto_decompress,
407+
lax=self.lax,
390408
)
391409
else:
392410
if (
@@ -410,6 +428,7 @@ def get_content_length() -> Optional[int]:
410428
readall=True,
411429
response_with_body=self.response_with_body,
412430
auto_decompress=self._auto_decompress,
431+
lax=self.lax,
413432
)
414433
if not payload_parser.done:
415434
self._payload_parser = payload_parser
@@ -432,7 +451,7 @@ def get_content_length() -> Optional[int]:
432451
assert not self._lines
433452
assert self._payload_parser is not None
434453
try:
435-
eof, data = self._payload_parser.feed_data(data[start_pos:])
454+
eof, data = self._payload_parser.feed_data(data[start_pos:], SEP)
436455
except BaseException as exc:
437456
if self.payload_exception is not None:
438457
self._payload_parser.payload.set_exception(
@@ -627,6 +646,20 @@ class HttpResponseParser(HttpParser[RawResponseMessage]):
627646
Returns RawResponseMessage.
628647
"""
629648

649+
# Lax mode should only be enabled on response parser.
650+
lax = not DEBUG
651+
652+
def feed_data(
653+
self,
654+
data: bytes,
655+
SEP: Optional[_SEP] = None,
656+
*args: Any,
657+
**kwargs: Any,
658+
) -> Tuple[List[Tuple[RawResponseMessage, StreamReader]], bool, bytes]:
659+
if SEP is None:
660+
SEP = b"\r\n" if DEBUG else b"\n"
661+
return super().feed_data(data, SEP, *args, **kwargs)
662+
630663
def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
631664
line = lines[0].decode("utf-8", "surrogateescape")
632665
try:
@@ -651,7 +684,7 @@ def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
651684
version_o = HttpVersion(int(match.group(1)), int(match.group(2)))
652685

653686
# The status code is a three-digit number
654-
if len(status) != 3 or not status.isdigit():
687+
if len(status) != 3 or not status.isdecimal():
655688
raise BadStatusLine(line)
656689
status_i = int(status)
657690

@@ -693,13 +726,15 @@ def __init__(
693726
readall: bool = False,
694727
response_with_body: bool = True,
695728
auto_decompress: bool = True,
729+
lax: bool = False,
696730
) -> None:
697731
self._length = 0
698732
self._type = ParseState.PARSE_NONE
699733
self._chunk = ChunkState.PARSE_CHUNKED_SIZE
700734
self._chunk_size = 0
701735
self._chunk_tail = b""
702736
self._auto_decompress = auto_decompress
737+
self._lax = lax
703738
self.done = False
704739

705740
# payload decompression wrapper
@@ -751,7 +786,7 @@ def feed_eof(self) -> None:
751786
)
752787

753788
def feed_data(
754-
self, chunk: bytes, SEP: bytes = b"\r\n", CHUNK_EXT: bytes = b";"
789+
self, chunk: bytes, SEP: _SEP = b"\r\n", CHUNK_EXT: bytes = b";"
755790
) -> Tuple[bool, bytes]:
756791
# Read specified amount of bytes
757792
if self._type == ParseState.PARSE_LENGTH:
@@ -788,17 +823,22 @@ def feed_data(
788823
else:
789824
size_b = chunk[:pos]
790825

791-
if not size_b.isdigit():
826+
if self._lax: # Allow whitespace in lax mode.
827+
size_b = size_b.strip()
828+
829+
if not re.fullmatch(HEXDIGIT, size_b):
792830
exc = TransferEncodingError(
793831
chunk[:pos].decode("ascii", "surrogateescape")
794832
)
795833
self.payload.set_exception(exc)
796834
raise exc
797835
size = int(bytes(size_b), 16)
798836

799-
chunk = chunk[pos + 2 :]
837+
chunk = chunk[pos + len(SEP) :]
800838
if size == 0: # eof marker
801839
self._chunk = ChunkState.PARSE_MAYBE_TRAILERS
840+
if self._lax and chunk.startswith(b"\r"):
841+
chunk = chunk[1:]
802842
else:
803843
self._chunk = ChunkState.PARSE_CHUNKED_CHUNK
804844
self._chunk_size = size
@@ -820,13 +860,15 @@ def feed_data(
820860
self._chunk_size = 0
821861
self.payload.feed_data(chunk[:required], required)
822862
chunk = chunk[required:]
863+
if self._lax and chunk.startswith(b"\r"):
864+
chunk = chunk[1:]
823865
self._chunk = ChunkState.PARSE_CHUNKED_CHUNK_EOF
824866
self.payload.end_http_chunk_receiving()
825867

826868
# toss the CRLF at the end of the chunk
827869
if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK_EOF:
828-
if chunk[:2] == SEP:
829-
chunk = chunk[2:]
870+
if chunk[: len(SEP)] == SEP:
871+
chunk = chunk[len(SEP) :]
830872
self._chunk = ChunkState.PARSE_CHUNKED_SIZE
831873
else:
832874
self._chunk_tail = chunk
@@ -836,11 +878,11 @@ def feed_data(
836878
# we should get another \r\n otherwise
837879
# trailers need to be skipped until \r\n\r\n
838880
if self._chunk == ChunkState.PARSE_MAYBE_TRAILERS:
839-
head = chunk[:2]
881+
head = chunk[: len(SEP)]
840882
if head == SEP:
841883
# end of stream
842884
self.payload.feed_eof()
843-
return True, chunk[2:]
885+
return True, chunk[len(SEP) :]
844886
# Both CR and LF, or only LF may not be received yet. It is
845887
# expected that CRLF or LF will be shown at the very first
846888
# byte next time, otherwise trailers should come. The last
@@ -858,7 +900,7 @@ def feed_data(
858900
if self._chunk == ChunkState.PARSE_TRAILERS:
859901
pos = chunk.find(SEP)
860902
if pos >= 0:
861-
chunk = chunk[pos + 2 :]
903+
chunk = chunk[pos + len(SEP) :]
862904
self._chunk = ChunkState.PARSE_MAYBE_TRAILERS
863905
else:
864906
self._chunk_tail = chunk

0 commit comments

Comments
 (0)