@@ -1278,25 +1278,25 @@ def proofread_canonicals(
1278
1278
purge (http , * paths_to_purge )
1279
1279
1280
1280
1281
- _canonical_re = re .compile (
1282
- """<link rel="canonical" href="https://docs.python.org/([^"]*)" />"""
1283
- )
1284
-
1285
-
1286
1281
def _check_canonical_rel(file: Path, www_root: Path):
    """Remove a canonical link from *file* when its target does not exist.

    The file is scanned as raw bytes for the first element of the exact form
    ``<link rel="canonical" href="https://docs.python.org/TARGET" />``.
    ``TARGET`` is resolved relative to *www_root*.

    Args:
        file: HTML file to inspect; rewritten in place if the link is broken.
        www_root: Directory against which the canonical target is resolved.

    Returns:
        *file* if a broken canonical link was removed from it, otherwise
        ``None`` (no well-formed canonical link, or its target exists).
    """
    prefix = b'<link rel="canonical" href="https://docs.python.org/'
    suffix = b'" />'
    # Work on bytes so that the rest of the file is written back untouched
    # (no decoding, re-encoding or newline translation).
    html = file.read_bytes()

    start = html.find(prefix)
    while start != -1:
        # The href value ends at the first double quote. Searching for the
        # whole suffix instead could run past a differently closed tag and
        # swallow unrelated markup up to some later '" />'.
        quote = html.find(b'"', start + len(prefix))
        if quote == -1:
            return None
        if html.startswith(suffix, quote):
            break
        # Not the exact element form; try the next occurrence of the prefix.
        start = html.find(prefix, start + 1)
    else:
        return None

    target = html[start + len(prefix):quote].decode(errors="surrogateescape")
    if (www_root / target).exists():
        return None
    logging.info("Removing broken canonical from %s to %s", file, target)
    file.write_bytes(html[:start] + html[quote + len(suffix):])
    return file
1301
1301
1302
1302
0 commit comments