@@ -18,11 +18,11 @@ def recurse_while_none(element):
18
18
19
19
def clean_references (obj , url , html_raw_response = False ):
20
20
"""
21
- Rewrite (internal) links to make them absolute.
21
+ Rewrite (internal) links (href) and images (src) to make them absolute.
22
22
23
- 1. external links are not changed
23
+ 1. external links/images are not changed
24
24
2. prepend URL to links that are just fragments (e.g. #section)
25
- 3. prepend URL (without filename) to internal relative links
25
+ 3. prepend URL (without filename) to internal relative links/images
26
26
"""
27
27
28
28
# TODO: do not depend on PyQuery
@@ -31,23 +31,25 @@ def clean_references(obj, url, html_raw_response=False):
31
31
if url is None :
32
32
return obj
33
33
34
- for link in obj .find ('a' ):
34
+ for tag in obj .find ("a" ) + obj . find ( "img" ):
35
35
base_url = urlparse (url )
36
- # We need to make all internal links, to be absolute
37
- href = link .attrib ['href' ]
38
- parsed_href = urlparse (href )
36
+ attribute = "href" if tag .tag == "a" else "src"
37
+ value = tag .attrib [attribute ]
38
+
39
+ # We need to make all internal links/images absolute
40
+ parsed_href = urlparse (value )
39
41
if parsed_href .scheme or parsed_href .path .startswith ('/' ):
40
- # don't change external links
42
+ # don't change external links/images
41
43
continue
42
44
43
- if not parsed_href .path and parsed_href .fragment :
44
- # href="#section-link"
45
- new_href = base_url .geturl () + href
46
- link .attrib ['href' ] = new_href
45
+ if tag . tag == "a" and not parsed_href .path and parsed_href .fragment :
46
+ # It's a link pointing to a specific section inside the target, e.g. ``href="#section-link"``
47
+ cleaned_value = base_url .geturl () + value
48
+ tag .attrib [attribute ] = cleaned_value
47
49
continue
48
50
49
51
if not base_url .path .endswith ('/' ):
50
- # internal relative link
52
+ # internal relative link/image
51
53
# href="../../another.html" and ``base_url`` is not HTMLDir
52
54
# (e.g. /en/latest/deep/internal/section/page.html)
53
55
# we want to remove the trailing filename (page.html) and use the rest as base URL
@@ -56,11 +58,11 @@ def clean_references(obj, url, html_raw_response=False):
56
58
57
59
# remove the filename (page.html) from the original document URL (base_url) and,
58
60
path , _ = base_url .path .rsplit ('/' , 1 )
59
- # append the value of href (../../another.html) to the base URL.
61
+ # append the value of href/src (../../another.html) to the base URL.
60
62
base_url = base_url ._replace (path = path + '/' )
61
63
62
- new_href = base_url .geturl () + href
63
- link .attrib ['href' ] = new_href
64
+ cleaned_value = base_url .geturl () + value
65
+ tag .attrib [attribute ] = cleaned_value
64
66
65
67
if html_raw_response :
66
68
return obj .outerHtml ()
0 commit comments