Skip to content

Commit ee18557

Browse files
authored
EmbedAPI: clean images (src) properly from inside a tooltip (#9337)
* EmbedAPI: small refactor - rename `clean_links` as `clean_references` - rename `URLData.href` as `URLData.ref` This is done because this function will clean `a.href` and also `img.src` now. * EmbedAPI: clean images (src) properly from inside a tooltip When returning the content to be injected in a tooltip, besides cleaning the `a.href` we are also cleaning the `img.src` to make them absolute and render the images properly. See readthedocs/sphinx-hoverxref#200 * EmbedAPI: use absolute URL on HTML data for tests These `src=` fields are not cleaned by `clean_references`.
1 parent 05799f6 commit ee18557

File tree

7 files changed

+59
-33
lines changed

7 files changed

+59
-33
lines changed

readthedocs/embed/tests/data/sphinx/latest/page-sub-title-one.html

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ <h3>Subsub title<a class="headerlink" href="http://project.readthedocs.io/en/lat
99
<!-- Figure with permalink and caption -->
1010
<div class="figure align-center" id="id1">
1111
<a class="reference internal image-reference" href="http://project.readthedocs.io/en/latest/_images/search-analytics-demo.png">
12-
<img alt="Search analytics demo" src="_images/figure.png" style="width: 50%;">
12+
<img alt="Search analytics demo" src="http://project.readthedocs.io/en/latest/_images/figure.png" style="width: 50%;">
1313
</a>
1414
<p class="caption">
1515
<span class="caption-number">Fig. 4 </span><span class="caption-text">I'm a figure!</span>

readthedocs/embed/tests/data/sphinx/latest/page-subsub-title.html

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ <h3>Subsub title<a class="headerlink" href="http://project.readthedocs.io/en/lat
55
<!-- Figure with permalink and caption -->
66
<div class="figure align-center" id="id1">
77
<a class="reference internal image-reference" href="http://project.readthedocs.io/en/latest/_images/search-analytics-demo.png">
8-
<img alt="Search analytics demo" src="_images/figure.png" style="width: 50%;">
8+
<img alt="Search analytics demo" src="http://project.readthedocs.io/en/latest/_images/figure.png" style="width: 50%;">
99
</a>
1010
<p class="caption">
1111
<span class="caption-number">Fig. 4 </span><span class="caption-text">I'm a figure!</span>

readthedocs/embed/tests/data/sphinx/latest/page-title-one.html

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ <h3>Subsub title<a class="headerlink" href="http://project.readthedocs.io/en/lat
1313
<!-- Figure with permalink and caption -->
1414
<div class="figure align-center" id="id1">
1515
<a class="reference internal image-reference" href="http://project.readthedocs.io/en/latest/_images/search-analytics-demo.png">
16-
<img alt="Search analytics demo" src="_images/figure.png" style="width: 50%;">
16+
<img alt="Search analytics demo" src="http://project.readthedocs.io/en/latest/_images/figure.png" style="width: 50%;">
1717
</a>
1818
<p class="caption">
1919
<span class="caption-number">Fig. 4 </span><span class="caption-text">I'm a figure!</span>

readthedocs/embed/tests/test_links.py

+32-6
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
import pytest
44
from pyquery import PyQuery
55

6-
from readthedocs.embed.utils import clean_links
6+
from readthedocs.embed.utils import clean_references
77

8-
URLData = namedtuple('URLData', ['docurl', 'href', 'expected'])
8+
URLData = namedtuple("URLData", ["docurl", "ref", "expected"])
99

1010
html_base_url = 'https://t.readthedocs.io/en/latest/page.html'
1111
dirhtml_base_url = 'https://t.readthedocs.io/en/latest/page/'
@@ -90,14 +90,38 @@
9090
),
9191
]
9292

93+
imagedata = [
94+
URLData(
95+
html_base_url,
96+
"/_images/image.png",
97+
"/_images/image.png",
98+
),
99+
URLData(
100+
html_base_url,
101+
"relative/section/image.png",
102+
"https://t.readthedocs.io/en/latest/relative/section/image.png",
103+
),
104+
URLData(
105+
"https://t.readthedocs.io/en/latest/internal/deep/page/topic.html",
106+
"../../../_images/image.png",
107+
"https://t.readthedocs.io/en/latest/internal/deep/page/../../../_images/image.png",
108+
),
109+
]
93110

94111
@pytest.mark.parametrize('url', htmldata + dirhtmldata)
95112
def test_clean_links(url):
96-
pq = PyQuery(f'<body><a href="{url.href}">Click here</a></body>')
97-
response = clean_links(pq, url.docurl)
113+
pq = PyQuery(f'<body><a href="{url.ref}">Click here</a></body>')
114+
response = clean_references(pq, url.docurl)
98115
assert response.find('a').attr['href'] == url.expected
99116

100117

118+
@pytest.mark.parametrize("url", imagedata)
119+
def test_clean_images(url):
120+
pq = PyQuery(f'<body><img alt="image alt content" src="{url.ref}"></img></body>')
121+
response = clean_references(pq, url.docurl)
122+
assert response.find("img").attr["src"] == url.expected
123+
124+
101125
def test_two_links():
102126
"""
103127
First link does not affect the second one.
@@ -115,7 +139,9 @@ def test_two_links():
115139
'#to-a-section',
116140
'https://t.readthedocs.io/en/latest/internal/deep/page/section.html#to-a-section',
117141
)
118-
pq = PyQuery(f'<body><a href="{firsturl.href}">Click here</a><a href="{secondurl.href}">Click here</a></body>')
119-
response = clean_links(pq, firsturl.docurl)
142+
pq = PyQuery(
143+
f'<body><a href="{firsturl.ref}">Click here</a><a href="{secondurl.ref}">Click here</a></body>'
144+
)
145+
response = clean_references(pq, firsturl.docurl)
120146
firstlink, secondlink = response.find('a')
121147
assert (firstlink.attrib['href'], secondlink.attrib['href']) == (firsturl.expected, secondurl.expected)

readthedocs/embed/utils.py

+20-17
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Embed utils."""
22

33
from urllib.parse import urlparse
4+
45
from pyquery import PyQuery as PQ # noqa
56

67

@@ -15,13 +16,13 @@ def recurse_while_none(element):
1516
return {element.text: href}
1617

1718

18-
def clean_links(obj, url, html_raw_response=False):
19+
def clean_references(obj, url, html_raw_response=False):
1920
"""
20-
Rewrite (internal) links to make them absolute.
21+
Rewrite (internal) links (href) and images (src) to make them absolute.
2122
22-
1. external links are not changed
23+
1. external links/images are not changed
2324
2. prepend URL to links that are just fragments (e.g. #section)
24-
3. prepend URL (without filename) to internal relative links
25+
3. prepend URL (without filename) to internal relative links/images
2526
"""
2627

2728
# TODO: do not depend on PyQuery
@@ -30,23 +31,25 @@ def clean_links(obj, url, html_raw_response=False):
3031
if url is None:
3132
return obj
3233

33-
for link in obj.find('a'):
34+
for tag in obj.find("a") + obj.find("img"):
3435
base_url = urlparse(url)
35-
# We need to make all internal links, to be absolute
36-
href = link.attrib['href']
37-
parsed_href = urlparse(href)
36+
attribute = "href" if tag.tag == "a" else "src"
37+
value = tag.attrib[attribute]
38+
39+
# We need to make all internal links/images, to be absolute
40+
parsed_href = urlparse(value)
3841
if parsed_href.scheme or parsed_href.path.startswith('/'):
39-
# don't change external links
42+
# don't change external links/images
4043
continue
4144

42-
if not parsed_href.path and parsed_href.fragment:
43-
# href="#section-link"
44-
new_href = base_url.geturl() + href
45-
link.attrib['href'] = new_href
45+
if tag.tag == "a" and not parsed_href.path and parsed_href.fragment:
46+
# It's a link pointing to a specific section inside the target ``href="#section-link"``
47+
cleaned_value = base_url.geturl() + value
48+
tag.attrib[attribute] = cleaned_value
4649
continue
4750

4851
if not base_url.path.endswith('/'):
49-
# internal relative link
52+
# internal relative link/image
5053
# href="../../another.html" and ``base_url`` is not HTMLDir
5154
# (e.g. /en/latest/deep/internal/section/page.html)
5255
# we want to remove the trailing filename (page.html) and use the rest as base URL
@@ -55,11 +58,11 @@ def clean_links(obj, url, html_raw_response=False):
5558

5659
# remove the filename (page.html) from the original document URL (base_url) and,
5760
path, _ = base_url.path.rsplit('/', 1)
58-
# append the value of href (../../another.html) to the base URL.
61+
# append the value of href/src (../../another.html) to the base URL.
5962
base_url = base_url._replace(path=path + '/')
6063

61-
new_href = base_url.geturl() + href
62-
link.attrib['href'] = new_href
64+
cleaned_value = base_url.geturl() + value
65+
tag.attrib[attribute] = cleaned_value
6366

6467
if html_raw_response:
6568
return obj.outerHtml()

readthedocs/embed/v3/views.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
from readthedocs.api.mixins import CDNCacheTagsMixin, EmbedAPIMixin
1919
from readthedocs.core.utils.extend import SettingsOverrideObject
20-
from readthedocs.embed.utils import clean_links
20+
from readthedocs.embed.utils import clean_references
2121
from readthedocs.projects.constants import PUBLIC
2222
from readthedocs.storage import build_media_storage
2323

@@ -359,7 +359,7 @@ def get(self, request): # noqa
359359
# Sanitize the URL before requesting it
360360
sanitized_url = urlparse(url)._replace(fragment='', query='').geturl()
361361
# Make links from the content to be absolute
362-
content = clean_links(
362+
content = clean_references(
363363
content_requested,
364364
sanitized_url,
365365
html_raw_response=True,

readthedocs/embed/views.py

+2-5
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from readthedocs.builds.constants import EXTERNAL
1818
from readthedocs.core.resolver import resolve
1919
from readthedocs.core.utils.extend import SettingsOverrideObject
20-
from readthedocs.embed.utils import clean_links, recurse_while_none
20+
from readthedocs.embed.utils import clean_references, recurse_while_none
2121
from readthedocs.storage import build_media_storage
2222

2323
log = structlog.get_logger(__name__)
@@ -312,10 +312,7 @@ def dump(obj):
312312
return obj.parent().outerHtml()
313313
return obj.outerHtml()
314314

315-
ret = [
316-
dump(clean_links(obj, url))
317-
for obj in query_result
318-
]
315+
ret = [dump(clean_references(obj, url)) for obj in query_result]
319316
return ret, headers, section
320317

321318

0 commit comments

Comments
 (0)