5
5
from contextlib import suppress
6
6
from enum import IntEnum
7
7
from typing import (
8
+ Any ,
9
+ ClassVar ,
8
10
Final ,
9
11
Generic ,
10
12
List ,
13
+ Literal ,
11
14
NamedTuple ,
12
15
Optional ,
13
16
Pattern ,
24
27
from . import hdrs
25
28
from .base_protocol import BaseProtocol
26
29
from .compression_utils import HAS_BROTLI , BrotliDecompressor , ZLibDecompressor
27
- from .helpers import NO_EXTENSIONS , BaseTimerContext
30
+ from .helpers import DEBUG , NO_EXTENSIONS , BaseTimerContext
28
31
from .http_exceptions import (
29
32
BadHttpMessage ,
30
33
BadStatusLine ,
48
51
"RawResponseMessage" ,
49
52
)
50
53
54
+ _SEP = Literal [b"\r \n " , b"\n " ]
55
+
51
56
ASCIISET : Final [Set [str ]] = set (string .printable )
52
57
53
58
# See https://www.rfc-editor.org/rfc/rfc9110.html#name-overview
60
65
METHRE : Final [Pattern [str ]] = re .compile (r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+" )
61
66
VERSRE : Final [Pattern [str ]] = re .compile (r"HTTP/(\d).(\d)" )
62
67
HDRRE : Final [Pattern [bytes ]] = re .compile (rb"[\x00-\x1F\x7F()<>@,;:\[\]={} \t\"\\]" )
68
+ HEXDIGIT = re .compile (rb"[0-9a-fA-F]+" )
63
69
64
70
65
71
class RawRequestMessage (NamedTuple ):
@@ -206,6 +212,8 @@ def parse_headers(
206
212
207
213
208
214
class HttpParser (abc .ABC , Generic [_MsgT ]):
215
+ lax : ClassVar [bool ] = False
216
+
209
217
def __init__ (
210
218
self ,
211
219
protocol : BaseProtocol ,
@@ -266,7 +274,7 @@ def feed_eof(self) -> Optional[_MsgT]:
266
274
def feed_data (
267
275
self ,
268
276
data : bytes ,
269
- SEP : bytes = b"\r \n " ,
277
+ SEP : _SEP = b"\r \n " ,
270
278
EMPTY : bytes = b"" ,
271
279
CONTENT_LENGTH : istr = hdrs .CONTENT_LENGTH ,
272
280
METH_CONNECT : str = hdrs .METH_CONNECT ,
@@ -288,13 +296,16 @@ def feed_data(
288
296
pos = data .find (SEP , start_pos )
289
297
# consume \r\n
290
298
if pos == start_pos and not self ._lines :
291
- start_pos = pos + 2
299
+ start_pos = pos + len ( SEP )
292
300
continue
293
301
294
302
if pos >= start_pos :
295
303
# line found
296
- self ._lines .append (data [start_pos :pos ])
297
- start_pos = pos + 2
304
+ line = data [start_pos :pos ]
305
+ if SEP == b"\n " : # For lax response parsing
306
+ line = line .rstrip (b"\r " )
307
+ self ._lines .append (line )
308
+ start_pos = pos + len (SEP )
298
309
299
310
# \r\n\r\n found
300
311
if self ._lines [- 1 ] == EMPTY :
@@ -311,7 +322,7 @@ def get_content_length() -> Optional[int]:
311
322
312
323
# Shouldn't allow +/- or other number formats.
313
324
# https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
314
- if not length_hdr .strip (" \t " ).isdigit ():
325
+ if not length_hdr .strip (" \t " ).isdecimal ():
315
326
raise InvalidHeader (CONTENT_LENGTH )
316
327
317
328
return int (length_hdr )
@@ -348,6 +359,7 @@ def get_content_length() -> Optional[int]:
348
359
readall = self .readall ,
349
360
response_with_body = self .response_with_body ,
350
361
auto_decompress = self ._auto_decompress ,
362
+ lax = self .lax ,
351
363
)
352
364
if not payload_parser .done :
353
365
self ._payload_parser = payload_parser
@@ -366,6 +378,7 @@ def get_content_length() -> Optional[int]:
366
378
compression = msg .compression ,
367
379
readall = True ,
368
380
auto_decompress = self ._auto_decompress ,
381
+ lax = self .lax ,
369
382
)
370
383
else :
371
384
if (
@@ -389,6 +402,7 @@ def get_content_length() -> Optional[int]:
389
402
readall = True ,
390
403
response_with_body = self .response_with_body ,
391
404
auto_decompress = self ._auto_decompress ,
405
+ lax = self .lax ,
392
406
)
393
407
if not payload_parser .done :
394
408
self ._payload_parser = payload_parser
@@ -411,7 +425,7 @@ def get_content_length() -> Optional[int]:
411
425
assert not self ._lines
412
426
assert self ._payload_parser is not None
413
427
try :
414
- eof , data = self ._payload_parser .feed_data (data [start_pos :])
428
+ eof , data = self ._payload_parser .feed_data (data [start_pos :], SEP )
415
429
except BaseException as exc :
416
430
if self .payload_exception is not None :
417
431
self ._payload_parser .payload .set_exception (
@@ -456,12 +470,21 @@ def parse_headers(
456
470
457
471
# https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-6
458
472
# https://www.rfc-editor.org/rfc/rfc9110.html#name-collected-abnf
459
- singletons = (hdrs .CONTENT_LENGTH , hdrs .CONTENT_LOCATION , hdrs .CONTENT_RANGE ,
460
- hdrs .CONTENT_TYPE , hdrs .ETAG , hdrs .HOST , hdrs .MAX_FORWARDS ,
461
- hdrs .SERVER , hdrs .TRANSFER_ENCODING , hdrs .USER_AGENT )
473
+ singletons = (
474
+ hdrs .CONTENT_LENGTH ,
475
+ hdrs .CONTENT_LOCATION ,
476
+ hdrs .CONTENT_RANGE ,
477
+ hdrs .CONTENT_TYPE ,
478
+ hdrs .ETAG ,
479
+ hdrs .HOST ,
480
+ hdrs .MAX_FORWARDS ,
481
+ hdrs .SERVER ,
482
+ hdrs .TRANSFER_ENCODING ,
483
+ hdrs .USER_AGENT ,
484
+ )
462
485
bad_hdr = next ((h for h in singletons if len (headers .getall (h , ())) > 1 ), None )
463
486
if bad_hdr is not None :
464
- raise BadHttpMessage ("Duplicate '{}' header found." . format ( bad_hdr ) )
487
+ raise BadHttpMessage (f "Duplicate '{ bad_hdr } ' header found." )
465
488
466
489
# keep-alive
467
490
conn = headers .get (hdrs .CONNECTION )
@@ -597,6 +620,20 @@ class HttpResponseParser(HttpParser[RawResponseMessage]):
597
620
Returns RawResponseMessage.
598
621
"""
599
622
623
+ # Lax mode should only be enabled on response parser.
624
+ lax = not DEBUG
625
+
626
def feed_data(
    self,
    data: bytes,
    SEP: Optional[_SEP] = None,
    *args: Any,
    **kwargs: Any,
) -> Tuple[List[Tuple[RawResponseMessage, StreamReader]], bool, bytes]:
    """Feed ``data`` to the base parser, choosing a default line separator.

    When the caller does not pass ``SEP``, the separator is picked from
    ``DEBUG``: CRLF when ``DEBUG`` is truthy, a bare LF otherwise.  An
    explicitly supplied ``SEP`` is forwarded untouched, as are any extra
    positional and keyword arguments.
    """
    separator = SEP
    if separator is None:
        # No explicit separator: bare LF unless DEBUG is set.
        separator = b"\n" if not DEBUG else b"\r\n"
    return super().feed_data(data, separator, *args, **kwargs)
636
+
600
637
def parse_message (self , lines : List [bytes ]) -> RawResponseMessage :
601
638
line = lines [0 ].decode ("utf-8" , "surrogateescape" )
602
639
try :
@@ -621,7 +658,7 @@ def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
621
658
version_o = HttpVersion (int (match .group (1 )), int (match .group (2 )))
622
659
623
660
# The status code is a three-digit number
624
- if len (status ) != 3 or not status .isdigit ():
661
+ if len (status ) != 3 or not status .isdecimal ():
625
662
raise BadStatusLine (line )
626
663
status_i = int (status )
627
664
@@ -663,13 +700,15 @@ def __init__(
663
700
readall : bool = False ,
664
701
response_with_body : bool = True ,
665
702
auto_decompress : bool = True ,
703
+ lax : bool = False ,
666
704
) -> None :
667
705
self ._length = 0
668
706
self ._type = ParseState .PARSE_NONE
669
707
self ._chunk = ChunkState .PARSE_CHUNKED_SIZE
670
708
self ._chunk_size = 0
671
709
self ._chunk_tail = b""
672
710
self ._auto_decompress = auto_decompress
711
+ self ._lax = lax
673
712
self .done = False
674
713
675
714
# payload decompression wrapper
@@ -721,7 +760,7 @@ def feed_eof(self) -> None:
721
760
)
722
761
723
762
def feed_data (
724
- self , chunk : bytes , SEP : bytes = b"\r \n " , CHUNK_EXT : bytes = b";"
763
+ self , chunk : bytes , SEP : _SEP = b"\r \n " , CHUNK_EXT : bytes = b";"
725
764
) -> Tuple [bool , bytes ]:
726
765
# Read specified amount of bytes
727
766
if self ._type == ParseState .PARSE_LENGTH :
@@ -757,17 +796,22 @@ def feed_data(
757
796
else :
758
797
size_b = chunk [:pos ]
759
798
760
- if not size_b .isdigit ():
799
+ if self ._lax : # Allow whitespace in lax mode.
800
+ size_b = size_b .strip ()
801
+
802
+ if not re .fullmatch (HEXDIGIT , size_b ):
761
803
exc = TransferEncodingError (
762
804
chunk [:pos ].decode ("ascii" , "surrogateescape" )
763
805
)
764
806
self .payload .set_exception (exc )
765
807
raise exc
766
808
size = int (bytes (size_b ), 16 )
767
809
768
- chunk = chunk [pos + 2 :]
810
+ chunk = chunk [pos + len ( SEP ) :]
769
811
if size == 0 : # eof marker
770
812
self ._chunk = ChunkState .PARSE_MAYBE_TRAILERS
813
+ if self ._lax and chunk .startswith (b"\r " ):
814
+ chunk = chunk [1 :]
771
815
else :
772
816
self ._chunk = ChunkState .PARSE_CHUNKED_CHUNK
773
817
self ._chunk_size = size
@@ -789,13 +833,15 @@ def feed_data(
789
833
self ._chunk_size = 0
790
834
self .payload .feed_data (chunk [:required ], required )
791
835
chunk = chunk [required :]
836
+ if self ._lax and chunk .startswith (b"\r " ):
837
+ chunk = chunk [1 :]
792
838
self ._chunk = ChunkState .PARSE_CHUNKED_CHUNK_EOF
793
839
self .payload .end_http_chunk_receiving ()
794
840
795
841
# toss the CRLF at the end of the chunk
796
842
if self ._chunk == ChunkState .PARSE_CHUNKED_CHUNK_EOF :
797
- if chunk [:2 ] == SEP :
798
- chunk = chunk [2 :]
843
+ if chunk [: len ( SEP ) ] == SEP :
844
+ chunk = chunk [len ( SEP ) :]
799
845
self ._chunk = ChunkState .PARSE_CHUNKED_SIZE
800
846
else :
801
847
self ._chunk_tail = chunk
@@ -805,11 +851,11 @@ def feed_data(
805
851
# we should get another \r\n otherwise
806
852
# trailers need to be skipped until \r\n\r\n
807
853
if self ._chunk == ChunkState .PARSE_MAYBE_TRAILERS :
808
- head = chunk [:2 ]
854
+ head = chunk [: len ( SEP ) ]
809
855
if head == SEP :
810
856
# end of stream
811
857
self .payload .feed_eof ()
812
- return True , chunk [2 :]
858
+ return True , chunk [len ( SEP ) :]
813
859
# Both CR and LF, or only LF may not be received yet. It is
814
860
# expected that CRLF or LF will be shown at the very first
815
861
# byte next time, otherwise trailers should come. The last
@@ -827,7 +873,7 @@ def feed_data(
827
873
if self ._chunk == ChunkState .PARSE_TRAILERS :
828
874
pos = chunk .find (SEP )
829
875
if pos >= 0 :
830
- chunk = chunk [pos + 2 :]
876
+ chunk = chunk [pos + len ( SEP ) :]
831
877
self ._chunk = ChunkState .PARSE_MAYBE_TRAILERS
832
878
else :
833
879
self ._chunk_tail = chunk
0 commit comments