readthedocs · humitos · Sep 27, 2021 · Sep 22, 2021 · Sep 22, 2021 · Sep 23, 2021
diff --git a/readthedocs/embed/v3/views.py b/readthedocs/embed/v3/views.py
@@ -6,7 +6,6 @@
 import requests
 
 from selectolax.parser import HTMLParser
-from pyquery import PyQuery as PQ  # noqa
 
 from django.conf import settings
 from django.core.cache import cache
@@ -71,12 +70,15 @@ def _download_page_content(self, url):
 
         response = requests.get(url, timeout=settings.RTD_EMBED_API_DEFAULT_REQUEST_TIMEOUT)
         if response.ok:
+            # NOTE: we use ``response.content`` to get the raw bytes of the
+            # page. ``selectolax`` is then in charge of auto-detecting the
+            # encoding; we trust it more than ``requests`` for this.
             cache.set(
                 cache_key,
-                response.text,
+                response.content,
                 timeout=settings.RTD_EMBED_API_PAGE_CACHE_TIMEOUT,
             )
-            return response.text
+            return response.content
 
     def _get_page_content_from_storage(self, project, version_slug, filename):
         version = get_object_or_404(
@@ -138,7 +140,11 @@ def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversio
 
         node = None
         if fragment:
-            selector = f'#{fragment}'
+            # NOTE: we use the `[id=]` selector because using `#{id}` requires
+            # escaping the selector since CSS does not support the same
+            # characters as the `id=` HTML attribute
+            # https://www.w3.org/TR/CSS21/syndata.html#value-def-identifier
+            selector = f'[id="{fragment}"]'
             node = HTMLParser(page_content).css_first(selector)
         else:
             html = HTMLParser(page_content)
@@ -150,7 +156,10 @@ def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversio
         if doctool == 'sphinx':
             # Handle ``dt`` special cases
             if node.tag == 'dt':
-                if 'glossary' in node.parent.attributes.get('class'):
+                # The parent may have no ``class`` attribute (``None``), so
+                # fall back to an empty string before the membership tests
+                parent_classes = node.parent.attributes.get('class') or ''
+                if 'glossary' in parent_classes or 'citation' in parent_classes:
                     # Sphinx HTML structure for term glossary puts the ``id`` in the
                     # ``dt`` element with the title of the term. In this case, we
                     # return the parent node which contains the definition list
@@ -163,32 +172,26 @@ def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversio
                     # ...
                     # </dl>
 
-                    # TODO: figure it out if it's needed to remove the siblings here
-                    # parent = node.parent
-                    # for n in parent.traverse():
-                    #     if n not in (node, node.next):
-                    #         n.remove()
-                    node = node.parent
-
-                elif 'citation' in node.parent.attributes.get('class'):
-                    # Sphinx HTML structure for sphinxcontrib-bibtex puts the ``id`` in the
-                    # ``dt`` element with the title of the cite. In this case, we
-                    # return the parent node which contains the definition list
-                    # and remove all ``dt/dd`` that are not the requested one
-
-                    # Structure:
-                    # <dl class="citation">
-                    # <dt id="cite-id"><span><a>Title of the cite</a></span></dt>
-                    # <dd>Content of the cite</dd>
-                    # ...
-                    # </dl>
-
-                    # TODO: figure it out if it's needed to remove the siblings here
-                    # parent = node.parent
-                    # for n in parent.traverse():
-                    #     if n not in (node, node.next):
-                    #         n.remove()
-                    node = node.parent
+                    parent_node = node.parent
+                    if 'glossary' in node.parent.attributes.get('class'):
+                        next_node = node.next
+
+                    elif 'citation' in node.parent.attributes.get('class'):
+                        next_node = node.next.next
+
+                    # Iterate over all the children of the parent node
+                    # (``.iter()``) and remove the ``dt`` and ``dd`` that are
+                    # not the ones we are looking for. Then return the parent
+                    # node as result.
+                    #
+                    # Note that ``.iter()`` returns a generator and we modify
+                    # the HTML in-place, so we have to convert it to a list
+                    # before removing elements. Otherwise we break the
+                    # iteration before completing it
+                    for n in list(parent_node.iter()):  # pylint: disable=invalid-name
+                        if n not in (node, next_node):
+                            n.remove()
+                    node = parent_node
 
                 else:
                     # Sphinx HTML structure for definition list puts the ``id``

diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py
@@ -818,6 +818,8 @@ def DOCKER_LIMITS(self):
         r'docs\.python\.org',
         r'docs\.scipy\.org',
         r'docs\.sympy\.org',
+        r'www\.sphinx-doc\.org',
+        r'numpy\.org',
     ]
     RTD_EMBED_API_PAGE_CACHE_TIMEOUT = 5 * 10
     RTD_EMBED_API_DEFAULT_REQUEST_TIMEOUT = 1

diff --git a/readthedocs/settings/docker_compose.py b/readthedocs/settings/docker_compose.py
@@ -66,7 +66,14 @@ def RTD_EXT_THEME_DEV_SERVER(self):
     @property
     def RTD_EMBED_API_EXTERNAL_DOMAINS(self):
         domains = super().RTD_EMBED_API_EXTERNAL_DOMAINS
-        domains.append(r'.*\.readthedocs\.io')
+        # Build a new list rather than extending in place, so the list
+        # object returned by the parent settings class is never mutated.
+        domains = domains + [
+            r'.*\.readthedocs\.io',
+            r'.*\.org\.readthedocs\.build',
+            r'.*\.readthedocs-hosted\.com',
+            r'.*\.com\.readthedocs\.build',
+        ]
         return domains
 
     @property

diff --git a/tox.embedapi.ini b/tox.embedapi.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = sphinx-{18,20,21,22,23,24,30,31,32,33,34,35,40,41,latest}
+envlist = sphinx-{18,20,21,22,23,24,30,31,32,33,34,35,40,41,42,latest}
 
 [testenv]
 description = run test suite for the EmbedAPIv3
@@ -27,6 +27,7 @@ deps =
     sphinx-35: Sphinx~=3.5.0
     sphinx-40: Sphinx~=4.0.0
     sphinx-41: Sphinx~=4.1.0
+    sphinx-42: Sphinx~=4.2.0
     sphinx-latest: Sphinx
 setenv =
     DJANGO_SETTINGS_MODULE=readthedocs.settings.test