EmbedAPI: clean images (src) properly from inside a tooltip (#9337)

humitos · web-flow · commit ee18557b913b · 2022-06-16T23:38:11.000+02:00
* EmbedAPI: small refactor

  - rename `clean_links` as `clean_references`
  - rename `URLData.href` as `URLData.ref`

  This is done because this function will clean `a.href` and also `img.src` now.

* EmbedAPI: clean images (src) properly from inside a tooltip

  When returning the content to be injected in a tooltip, besides cleaning the `a.href` we are also cleaning the `img.src` to make them absolute and render the images properly.

  See readthedocs/sphinx-hoverxref#200

* EmbedAPI: use absolute URL on HTML data for tests

  These `src=` fields are not cleaned by `clean_references`.
diff --git a/readthedocs/embed/tests/data/sphinx/latest/page-sub-title-one.html b/readthedocs/embed/tests/data/sphinx/latest/page-sub-title-one.html
@@ -9,7 +9,7 @@ <h3>Subsub title<a class="headerlink" href="http://project.readthedocs.io/en/lat
       <!-- Figure with permalink and caption -->
       <div class="figure align-center" id="id1">
         <a class="reference internal image-reference" href="http://project.readthedocs.io/en/latest/_images/search-analytics-demo.png">
-          <img alt="Search analytics demo" src="_images/figure.png" style="width: 50%;">
+          <img alt="Search analytics demo" src="http://project.readthedocs.io/en/latest/_images/figure.png" style="width: 50%;">
         </a>
         <p class="caption">
           <span class="caption-number">Fig. 4 </span><span class="caption-text">I'm a figure!</span>
diff --git a/readthedocs/embed/tests/data/sphinx/latest/page-subsub-title.html b/readthedocs/embed/tests/data/sphinx/latest/page-subsub-title.html
@@ -5,7 +5,7 @@ <h3>Subsub title<a class="headerlink" href="http://project.readthedocs.io/en/lat
       <!-- Figure with permalink and caption -->
       <div class="figure align-center" id="id1">
         <a class="reference internal image-reference" href="http://project.readthedocs.io/en/latest/_images/search-analytics-demo.png">
-          <img alt="Search analytics demo" src="_images/figure.png" style="width: 50%;">
+          <img alt="Search analytics demo" src="http://project.readthedocs.io/en/latest/_images/figure.png" style="width: 50%;">
         </a>
         <p class="caption">
           <span class="caption-number">Fig. 4 </span><span class="caption-text">I'm a figure!</span>
diff --git a/readthedocs/embed/tests/data/sphinx/latest/page-title-one.html b/readthedocs/embed/tests/data/sphinx/latest/page-title-one.html
@@ -13,7 +13,7 @@ <h3>Subsub title<a class="headerlink" href="http://project.readthedocs.io/en/lat
       <!-- Figure with permalink and caption -->
       <div class="figure align-center" id="id1">
         <a class="reference internal image-reference" href="http://project.readthedocs.io/en/latest/_images/search-analytics-demo.png">
-          <img alt="Search analytics demo" src="_images/figure.png" style="width: 50%;">
+          <img alt="Search analytics demo" src="http://project.readthedocs.io/en/latest/_images/figure.png" style="width: 50%;">
         </a>
         <p class="caption">
           <span class="caption-number">Fig. 4 </span><span class="caption-text">I'm a figure!</span>
diff --git a/readthedocs/embed/tests/test_links.py b/readthedocs/embed/tests/test_links.py
@@ -3,9 +3,9 @@
 import pytest
 from pyquery import PyQuery
 
-from readthedocs.embed.utils import clean_links
+from readthedocs.embed.utils import clean_references
 
-URLData = namedtuple('URLData', ['docurl', 'href', 'expected'])
+URLData = namedtuple("URLData", ["docurl", "ref", "expected"])
 
 html_base_url = 'https://t.readthedocs.io/en/latest/page.html'
 dirhtml_base_url = 'https://t.readthedocs.io/en/latest/page/'
@@ -90,14 +90,38 @@
     ),
 ]
 
+imagedata = [
+    URLData(
+        html_base_url,
+        "/_images/image.png",
+        "/_images/image.png",
+    ),
+    URLData(
+        html_base_url,
+        "relative/section/image.png",
+        "https://t.readthedocs.io/en/latest/relative/section/image.png",
+    ),
+    URLData(
+        "https://t.readthedocs.io/en/latest/internal/deep/page/topic.html",
+        "../../../_images/image.png",
+        "https://t.readthedocs.io/en/latest/internal/deep/page/../../../_images/image.png",
+    ),
+]
 
 @pytest.mark.parametrize('url', htmldata + dirhtmldata)
 def test_clean_links(url):
-    pq = PyQuery(f'<body><a href="{url.href}">Click here</a></body>')
-    response = clean_links(pq, url.docurl)
+    pq = PyQuery(f'<body><a href="{url.ref}">Click here</a></body>')
+    response = clean_references(pq, url.docurl)
     assert response.find('a').attr['href'] == url.expected
 
 
+@pytest.mark.parametrize("url", imagedata)
+def test_clean_images(url):
+    pq = PyQuery(f'<body><img alt="image alt content" src="{url.ref}"></img></body>')
+    response = clean_references(pq, url.docurl)
+    assert response.find("img").attr["src"] == url.expected
+
+
 def test_two_links():
     """
     First link does not affect the second one.
@@ -115,7 +139,9 @@ def test_two_links():
         '#to-a-section',
         'https://t.readthedocs.io/en/latest/internal/deep/page/section.html#to-a-section',
     )
-    pq = PyQuery(f'<body><a href="{firsturl.href}">Click here</a><a href="{secondurl.href}">Click here</a></body>')
-    response = clean_links(pq, firsturl.docurl)
+    pq = PyQuery(
+        f'<body><a href="{firsturl.ref}">Click here</a><a href="{secondurl.ref}">Click here</a></body>'
+    )
+    response = clean_references(pq, firsturl.docurl)
     firstlink, secondlink = response.find('a')
     assert (firstlink.attrib['href'], secondlink.attrib['href']) == (firsturl.expected, secondurl.expected)
diff --git a/readthedocs/embed/utils.py b/readthedocs/embed/utils.py
@@ -1,6 +1,7 @@
 """Embed utils."""
 
 from urllib.parse import urlparse
+
 from pyquery import PyQuery as PQ  # noqa
 
 
@@ -15,13 +16,13 @@ def recurse_while_none(element):
     return {element.text: href}
 
 
-def clean_links(obj, url, html_raw_response=False):
+def clean_references(obj, url, html_raw_response=False):
     """
-    Rewrite (internal) links to make them absolute.
+    Rewrite (internal) links (href) and images (src) to make them absolute.
 
-    1. external links are not changed
+    1. external links/images are not changed
     2. prepend URL to links that are just fragments (e.g. #section)
-    3. prepend URL (without filename) to internal relative links
+    3. prepend URL (without filename) to internal relative links/images
     """
 
     # TODO: do not depend on PyQuery
@@ -30,23 +31,25 @@ def clean_links(obj, url, html_raw_response=False):
     if url is None:
         return obj
 
-    for link in obj.find('a'):
+    for tag in obj.find("a") + obj.find("img"):
         base_url = urlparse(url)
-        # We need to make all internal links, to be absolute
-        href = link.attrib['href']
-        parsed_href = urlparse(href)
+        attribute = "href" if tag.tag == "a" else "src"
+        value = tag.attrib[attribute]
+
+        # We need to make all internal links/images, to be absolute
+        parsed_href = urlparse(value)
         if parsed_href.scheme or parsed_href.path.startswith('/'):
-            # don't change external links
+            # don't change external links/images
             continue
 
-        if not parsed_href.path and parsed_href.fragment:
-            # href="#section-link"
-            new_href = base_url.geturl() + href
-            link.attrib['href'] = new_href
+        if tag.tag == "a" and not parsed_href.path and parsed_href.fragment:
+            # It's a link pointing to a specific section inside the target ``href="#section-link"``
+            cleaned_value = base_url.geturl() + value
+            tag.attrib[attribute] = cleaned_value
             continue
 
         if not base_url.path.endswith('/'):
-            # internal relative link
+            # internal relative link/image
             # href="../../another.html" and ``base_url`` is not HTMLDir
             # (e.g. /en/latest/deep/internal/section/page.html)
             # we want to remove the trailing filename (page.html) and use the rest as base URL
@@ -55,11 +58,11 @@ def clean_links(obj, url, html_raw_response=False):
 
             # remove the filename (page.html) from the original document URL (base_url) and,
             path, _ = base_url.path.rsplit('/', 1)
-            # append the value of href (../../another.html) to the base URL.
+            # append the value of href/src (../../another.html) to the base URL.
             base_url = base_url._replace(path=path + '/')
 
-        new_href = base_url.geturl() + href
-        link.attrib['href'] = new_href
+        cleaned_value = base_url.geturl() + value
+        tag.attrib[attribute] = cleaned_value
 
     if html_raw_response:
         return obj.outerHtml()
diff --git a/readthedocs/embed/v3/views.py b/readthedocs/embed/v3/views.py
@@ -17,7 +17,7 @@
 
 from readthedocs.api.mixins import CDNCacheTagsMixin, EmbedAPIMixin
 from readthedocs.core.utils.extend import SettingsOverrideObject
-from readthedocs.embed.utils import clean_links
+from readthedocs.embed.utils import clean_references
 from readthedocs.projects.constants import PUBLIC
 from readthedocs.storage import build_media_storage
 
@@ -359,7 +359,7 @@ def get(self, request):  # noqa
         # Sanitize the URL before requesting it
         sanitized_url = urlparse(url)._replace(fragment='', query='').geturl()
         # Make links from the content to be absolute
-        content = clean_links(
+        content = clean_references(
             content_requested,
             sanitized_url,
             html_raw_response=True,
diff --git a/readthedocs/embed/views.py b/readthedocs/embed/views.py
@@ -17,7 +17,7 @@
 from readthedocs.builds.constants import EXTERNAL
 from readthedocs.core.resolver import resolve
 from readthedocs.core.utils.extend import SettingsOverrideObject
-from readthedocs.embed.utils import clean_links, recurse_while_none
+from readthedocs.embed.utils import clean_references, recurse_while_none
 from readthedocs.storage import build_media_storage
 
 log = structlog.get_logger(__name__)
@@ -312,10 +312,7 @@ def dump(obj):
             return obj.parent().outerHtml()
         return obj.outerHtml()
 
-    ret = [
-        dump(clean_links(obj, url))
-        for obj in query_result
-    ]
+    ret = [dump(clean_references(obj, url)) for obj in query_result]
     return ret, headers, section