6
6
import requests
7
7
8
8
from selectolax .parser import HTMLParser
9
- from pyquery import PyQuery as PQ # noqa
10
9
11
10
from django .conf import settings
12
11
from django .core .cache import cache
@@ -71,12 +70,15 @@ def _download_page_content(self, url):
71
70
72
71
response = requests .get (url , timeout = settings .RTD_EMBED_API_DEFAULT_REQUEST_TIMEOUT )
73
72
if response .ok :
73
+ # NOTE: we use ``response.content`` to get its binary
74
+ # representation. Then ``selectolax`` is in charge of auto-detecting
75
+ # its encoding. We trust selectolax more than requests for this.
74
76
cache .set (
75
77
cache_key ,
76
- response .text ,
78
+ response .content ,
77
79
timeout = settings .RTD_EMBED_API_PAGE_CACHE_TIMEOUT ,
78
80
)
79
- return response .text
81
+ return response .content
80
82
81
83
def _get_page_content_from_storage (self , project , version_slug , filename ):
82
84
version = get_object_or_404 (
@@ -138,7 +140,11 @@ def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversio
138
140
139
141
node = None
140
142
if fragment :
141
- selector = f'#{ fragment } '
143
+ # NOTE: we use the `[id=]` selector because using `#{id}` requires
144
+ # escaping the selector since CSS does not support the same
145
+ # characters as the `id=` HTML attribute
146
+ # https://www.w3.org/TR/CSS21/syndata.html#value-def-identifier
147
+ selector = f'[id="{ fragment } "]'
142
148
node = HTMLParser (page_content ).css_first (selector )
143
149
else :
144
150
html = HTMLParser (page_content )
@@ -150,7 +156,10 @@ def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversio
150
156
if doctool == 'sphinx' :
151
157
# Handle ``dt`` special cases
152
158
if node .tag == 'dt' :
153
- if 'glossary' in node .parent .attributes .get ('class' ):
159
+ if any ([
160
+ 'glossary' in node .parent .attributes .get ('class' ),
161
+ 'citation' in node .parent .attributes .get ('class' ),
162
+ ]):
154
163
# Sphinx HTML structure for term glossary puts the ``id`` in the
155
164
# ``dt`` element with the title of the term. In this case, we
156
165
# return the parent node which contains the definition list
@@ -163,32 +172,26 @@ def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversio
163
172
# ...
164
173
# </dl>
165
174
166
- # TODO: figure it out if it's needed to remove the siblings here
167
- # parent = node.parent
168
- # for n in parent.traverse():
169
- # if n not in (node, node.next):
170
- # n.remove()
171
- node = node .parent
172
-
173
- elif 'citation' in node .parent .attributes .get ('class' ):
174
- # Sphinx HTML structure for sphinxcontrib-bibtex puts the ``id`` in the
175
- # ``dt`` element with the title of the cite. In this case, we
176
- # return the parent node which contains the definition list
177
- # and remove all ``dt/dd`` that are not the requested one
178
-
179
- # Structure:
180
- # <dl class="citation">
181
- # <dt id="cite-id"><span><a>Title of the cite</a></span></dt>
182
- # <dd>Content of the cite</dd>
183
- # ...
184
- # </dl>
185
-
186
- # TODO: figure it out if it's needed to remove the siblings here
187
- # parent = node.parent
188
- # for n in parent.traverse():
189
- # if n not in (node, node.next):
190
- # n.remove()
191
- node = node .parent
175
+ parent_node = node .parent
176
+ if 'glossary' in node .parent .attributes .get ('class' ):
177
+ next_node = node .next
178
+
179
+ elif 'citation' in node .parent .attributes .get ('class' ):
180
+ next_node = node .next .next
181
+
182
+ # Iterate over all the children (``.iter()``) of the parent
183
+ # node and remove ``dt`` and ``dd`` that are not the ones
184
+ # we are looking for. Then return the parent node as
185
+ # result.
186
+ #
187
+ # Note that ``.iter()`` returns a generator and we modify
188
+ # the HTML in-place, so we have to convert it to a list
189
+ # before removing elements. Otherwise we break the
190
+ # iteration before completing it
191
+ for n in list (parent_node .iter ()): # pylint: disable=invalid-name
192
+ if n not in (node , next_node ):
193
+ n .remove ()
194
+ node = parent_node
192
195
193
196
else :
194
197
# Sphinx HTML structure for definition list puts the ``id``
0 commit comments