Skip to content

Commit 27676b7

Browse files
authored
Search: fix parsing of footnotes (#9154)
Our code to detect Sphinx domains is a little naive, and Sphinx doesn't make it easy to detect them either.
1 parent 97a3a62 commit 27676b7

File tree

3 files changed

+92
-17
lines changed

3 files changed

+92
-17
lines changed

readthedocs/search/parsers.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
"""JSON/HTML parsers for search indexing."""
22

33
import itertools
4-
import structlog
54
import os
65
import re
76
from urllib.parse import urlparse
87

98
import orjson as json
9+
import structlog
1010
from selectolax.parser import HTMLParser
1111

1212
from readthedocs.storage import build_media_storage
@@ -386,14 +386,14 @@ def _process_fjson(self, fjson_path):
386386
try:
387387
body = HTMLParser(data['body'])
388388
sections = self._get_sections(title=title, body=body.body)
389-
except Exception as e:
389+
except Exception:
390390
log.info('Unable to index sections.', path=fjson_path)
391391

392392
try:
393393
# Create a new html object, since the previous one could have been modified.
394394
body = HTMLParser(data['body'])
395395
domain_data = self._generate_domains_data(body)
396-
except Exception as e:
396+
except Exception:
397397
log.info('Unable to index domains.', path=fjson_path)
398398
else:
399399
log.info('Unable to index content.', path=fjson_path)
@@ -405,6 +405,21 @@ def _process_fjson(self, fjson_path):
405405
'domain_data': domain_data,
406406
}
407407

408+
def _get_sphinx_domains(self, body):
409+
"""
410+
Get all nodes that are a sphinx domain.
411+
412+
A Sphinx domain is a <dl> tag which contains <dt> tags with an 'id' attribute,
413+
dl tags that have the "footnote" class aren't domains.
414+
"""
415+
domains = []
416+
dl_tags = body.css("dl:has(dt[id])")
417+
for tag in dl_tags:
418+
classes = tag.attributes.get("class", "").split()
419+
if "footnote" not in classes:
420+
domains.append(tag)
421+
return domains
422+
408423
def _clean_body(self, body):
409424
"""
410425
Removes sphinx domain nodes.
@@ -414,16 +429,7 @@ def _clean_body(self, body):
414429
We already index those in another step.
415430
"""
416431
body = super()._clean_body(body)
417-
418-
nodes_to_be_removed = []
419-
420-
# remove all <dl> tags which contains <dt> tags having 'id' attribute
421-
dt_tags = body.css('dt[id]')
422-
for tag in dt_tags:
423-
parent = tag.parent
424-
if parent.tag == 'dl':
425-
nodes_to_be_removed.append(parent)
426-
432+
nodes_to_be_removed = self._get_sphinx_domains(body)
427433
# TODO: see if we really need to remove these
428434
# remove `Table of Contents` elements
429435
nodes_to_be_removed += body.css('.toctree-wrapper') + body.css('.contents.local.topic')
@@ -452,7 +458,7 @@ def _generate_domains_data(self, body):
452458
"""
453459

454460
domain_data = {}
455-
dl_tags = body.css('dl')
461+
dl_tags = self._get_sphinx_domains(body)
456462
number_of_domains = 0
457463

458464
for dl_tag in dl_tags:
@@ -466,7 +472,10 @@ def _generate_domains_data(self, body):
466472
try:
467473
id_ = title.attributes.get('id', '')
468474
if id_:
469-
docstrings = self._parse_domain_tag(desc)
475+
# Create a copy of the node,
476+
# since _parse_domain_tag will modify it.
477+
copy_desc = HTMLParser(desc.html).body.child
478+
docstrings = self._parse_domain_tag(copy_desc)
470479
domain_data[id_] = docstrings
471480
number_of_domains += 1
472481
if number_of_domains >= self.max_inner_documents:
@@ -490,7 +499,8 @@ def _parse_domain_tag(self, tag):
490499
# and all 'dl' tags are already captured.
491500
nodes_to_be_removed = tag.css('dl') + tag.css('dt') + tag.css('dd')
492501
for node in nodes_to_be_removed:
493-
node.decompose()
502+
if tag != node:
503+
node.decompose()
494504

495505
docstring = self._parse_content(tag.text())
496506
return docstring

readthedocs/search/tests/data/sphinx/in/page.html

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,4 +100,61 @@ <h2>Adding a new scenario to the repository<a class="headerlink" href="#adding-a
100100
</div>
101101
</div>
102102
<!--End code blocks-->
103+
104+
<div class="section" id="footnotes-and-domains">
105+
<h2>Footnotes and domains<a class="headerlink" href="#footnotes-and-domains" title="Permalink to this headline"></a></h2>
106+
<!-- This is a sphinx domain, won't be parsed. -->
107+
<dl class="py class">
108+
<dt class="sig sig-object py" id="test_py_module.test.Foo">
109+
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">test_py_module.test.</span></span><span class="sig-name descname"><span class="pre">Foo</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">qux</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">spam</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/test_py_module/test.html#Foo"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#test_py_module.test.Foo" title="Permalink to this definition"></a>
110+
</dt>
111+
<dd>
112+
<p>Docstring for class Foo.</p>
113+
<p>This text tests for the formatting of docstrings generated from output
114+
<code class="docutils literal notranslate"><span class="pre">sphinx.ext.autodoc</span></code>.
115+
</p>
116+
<dl class="py method">
117+
<dt class="sig sig-object py" id="test_py_module.test.Foo.__init__">
118+
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span>
119+
<em class="sig-param"><span class="n"><span class="pre">qux</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">spam</span></span><span class="o"><span class="pre">=</span></span>
120+
<span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span>
121+
<a class="reference internal" href="../_modules/test_py_module/test.html#Foo.__init__">
122+
<span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#test_py_module.test.Foo.__init__" title="Permalink to this definition"></a>
123+
</dt>
124+
<dd>
125+
<p>Start the Foo.</p>
126+
<dl class="field-list simple">
127+
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
128+
<dd class="field-odd">
129+
<ul class="simple">
130+
<li>
131+
<p><strong>qux</strong> (<em>string</em>) – The first argument to initialize class.</p>
132+
</li>
133+
<li>
134+
<p><strong>spam</strong> (<em>bool</em>) – Spam me yes or no…</p>
135+
</li>
136+
</ul>
137+
</dd>
138+
</dl>
139+
</dd>
140+
</dl>
141+
</dd>
142+
</dl>
143+
<!-- End of sphinx domain -->
144+
145+
<!-- These are footnotes, shouldn't be confused with a sphinx domain. -->
146+
<dl class="footnote brackets">
147+
<dt class="label" id="id6"><span class="brackets">1</span><span class="fn-backref">(<a href="#id1">1</a>,<a href="#id7">2</a>)</span></dt>
148+
<dd>
149+
<p>A <span class="highlighted">footnote</span> contains body elements, consistently indented by at least 3 spaces.</p>
150+
<p>This is the <span class="highlighted">footnote</span>’s second paragraph.</p>
151+
</dd>
152+
<dt class="label" id="id9"><span class="brackets"><a class="fn-backref" href="#id2">3</a></span></dt>
153+
<dd>
154+
<p>This <span class="highlighted">footnote</span> is numbered automatically and anonymously using a label of “#” only.</p>
155+
</dd>
156+
</dl>
157+
</div>
158+
<!-- End of footnote -->
159+
103160
</div>

readthedocs/search/tests/data/sphinx/out/page.json

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,19 @@
2727
"title": "Adding a new scenario to the repository",
2828
"content": "Sphinx configuration file used to build this docs: # -*- coding: utf-8 -*- # Default settings project = 'Test Builds' extensions = [ 'sphinx_autorun', ] latex_engine = 'xelatex' # allow us to build Unicode chars # Include all your settings here html_theme = 'sphinx_rtd_theme' >>> # Build at >>> import datetime >>> datetime.datetime.utcnow() # UTC datetime.datetime(2020, 5, 3, 16, 38, 11, 137311)"
2929
},
30+
{
31+
"id": "footnotes-and-domains",
32+
"title": "Footnotes and domains",
33+
"content": "1(1,2) A footnote contains body elements, consistently indented by at least 3 spaces. This is the footnote’s second paragraph. 3 This footnote is numbered automatically and anonymously using a label of “#” only."
34+
},
3035
{
3136
"content": "This is a H3 title. Fig. 4 I'm a figure!",
3237
"id": "subsub-title",
3338
"title": "Subsub title"
3439
}
3540
],
36-
"domain_data": {}
41+
"domain_data": {
42+
"test_py_module.test.Foo": "Docstring for class Foo. This text tests for the formatting of docstrings generated from output sphinx.ext.autodoc.",
43+
"test_py_module.test.Foo.__init__": "Start the Foo."
44+
}
3745
}

0 commit comments

Comments
 (0)