Search: fix parsing of footnotes (#9154)

stsewd · web-flow · commit 27676b7ad0b8 · 2022-05-02T16:18:57.000-05:00
Our code to detect Sphinx domains is a little naive,
and Sphinx doesn't make it easy to detect them either.
diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py
@@ -1,12 +1,12 @@
 """JSON/HTML parsers for search indexing."""
 
 import itertools
-import structlog
 import os
 import re
 from urllib.parse import urlparse
 
 import orjson as json
+import structlog
 from selectolax.parser import HTMLParser
 
 from readthedocs.storage import build_media_storage
@@ -386,14 +386,14 @@ def _process_fjson(self, fjson_path):
             try:
                 body = HTMLParser(data['body'])
                 sections = self._get_sections(title=title, body=body.body)
-            except Exception as e:
+            except Exception:
                 log.info('Unable to index sections.', path=fjson_path)
 
             try:
                 # Create a new html object, since the previous one could have been modified.
                 body = HTMLParser(data['body'])
                 domain_data = self._generate_domains_data(body)
-            except Exception as e:
+            except Exception:
                 log.info('Unable to index domains.', path=fjson_path)
         else:
             log.info('Unable to index content.', path=fjson_path)
@@ -405,6 +405,21 @@ def _process_fjson(self, fjson_path):
             'domain_data': domain_data,
         }
 
+    def _get_sphinx_domains(self, body):
+        """
+        Return the <dl> nodes in ``body`` that are treated as Sphinx domains.
+
+        A node is treated as a domain when it is a <dl> tag containing a <dt> tag
+        with an 'id' attribute; <dl> tags with the "footnote" class are excluded.
+        """
+        domains = []
+        dl_tags = body.css("dl:has(dt[id])")
+        for tag in dl_tags:
+            classes = tag.attributes.get("class", "").split()  # no class attribute -> []
+            if "footnote" not in classes:
+                domains.append(tag)
+        return domains
+
     def _clean_body(self, body):
         """
         Removes sphinx domain nodes.
@@ -414,16 +429,7 @@ def _clean_body(self, body):
         We already index those in another step.
         """
         body = super()._clean_body(body)
-
-        nodes_to_be_removed = []
-
-        # remove all <dl> tags which contains <dt> tags having 'id' attribute
-        dt_tags = body.css('dt[id]')
-        for tag in dt_tags:
-            parent = tag.parent
-            if parent.tag == 'dl':
-                nodes_to_be_removed.append(parent)
-
+        nodes_to_be_removed = self._get_sphinx_domains(body)
         # TODO: see if we really need to remove these
         # remove `Table of Contents` elements
         nodes_to_be_removed += body.css('.toctree-wrapper') + body.css('.contents.local.topic')
@@ -452,7 +458,7 @@ def _generate_domains_data(self, body):
         """
 
         domain_data = {}
-        dl_tags = body.css('dl')
+        dl_tags = self._get_sphinx_domains(body)
         number_of_domains = 0
 
         for dl_tag in dl_tags:
@@ -466,7 +472,10 @@ def _generate_domains_data(self, body):
                 try:
                     id_ = title.attributes.get('id', '')
                     if id_:
-                        docstrings = self._parse_domain_tag(desc)
+                        # Create a copy of the node,
+                        # since _parse_domain_tag will modify it.
+                        copy_desc = HTMLParser(desc.html).body.child
+                        docstrings = self._parse_domain_tag(copy_desc)
                         domain_data[id_] = docstrings
                         number_of_domains += 1
                     if number_of_domains >= self.max_inner_documents:
@@ -490,7 +499,8 @@ def _parse_domain_tag(self, tag):
         # and all 'dl' tags are already captured.
         nodes_to_be_removed = tag.css('dl') + tag.css('dt') + tag.css('dd')
         for node in nodes_to_be_removed:
-            node.decompose()
+            if tag != node:
+                node.decompose()
 
         docstring = self._parse_content(tag.text())
         return docstring
diff --git a/readthedocs/search/tests/data/sphinx/in/page.html b/readthedocs/search/tests/data/sphinx/in/page.html
@@ -100,4 +100,61 @@ <h2>Adding a new scenario to the repository<a class="headerlink" href="#adding-a
     </div>
   </div>
   <!--End code blocks-->
+
+  <div class="section" id="footnotes-and-domains">
+    <h2>Footnotes and domains<a class="headerlink" href="#footnotes-and-domains" title="Permalink to this headline">¶</a></h2>
+    <!-- This is a sphinx domain, won't be parsed. -->
+    <dl class="py class">
+      <dt class="sig sig-object py" id="test_py_module.test.Foo">
+        <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">test_py_module.test.</span></span><span class="sig-name descname"><span class="pre">Foo</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">qux</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">spam</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/test_py_module/test.html#Foo"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#test_py_module.test.Foo" title="Permalink to this definition"></a>
+      </dt>
+      <dd>
+        <p>Docstring for class Foo.</p>
+        <p>This text tests for the formatting of docstrings generated from output
+          <code class="docutils literal notranslate"><span class="pre">sphinx.ext.autodoc</span></code>.
+        </p>
+        <dl class="py method">
+          <dt class="sig sig-object py" id="test_py_module.test.Foo.__init__">
+            <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span>
+            <em class="sig-param"><span class="n"><span class="pre">qux</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">spam</span></span><span class="o"><span class="pre">=</span></span>
+              <span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span>
+            <a class="reference internal" href="../_modules/test_py_module/test.html#Foo.__init__">
+              <span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#test_py_module.test.Foo.__init__" title="Permalink to this definition"></a>
+          </dt>
+          <dd>
+            <p>Start the Foo.</p>
+            <dl class="field-list simple">
+              <dt class="field-odd">Parameters<span class="colon">:</span></dt>
+              <dd class="field-odd">
+                <ul class="simple">
+                  <li>
+                    <p><strong>qux</strong> (<em>string</em>) – The first argument to initialize class.</p>
+                  </li>
+                  <li>
+                    <p><strong>spam</strong> (<em>bool</em>) – Spam me yes or no…</p>
+                  </li>
+                </ul>
+              </dd>
+            </dl>
+          </dd>
+        </dl>
+      </dd>
+    </dl>
+    <!-- End of sphinx domain -->
+
+    <!-- These are footnotes, shouldn't be confused with a sphinx domain. -->
+    <dl class="footnote brackets">
+      <dt class="label" id="id6"><span class="brackets">1</span><span class="fn-backref">(<a href="#id1">1</a>,<a href="#id7">2</a>)</span></dt>
+      <dd>
+        <p>A <span class="highlighted">footnote</span> contains body elements, consistently indented by at least 3 spaces.</p>
+        <p>This is the <span class="highlighted">footnote</span>’s second paragraph.</p>
+      </dd>
+      <dt class="label" id="id9"><span class="brackets"><a class="fn-backref" href="#id2">3</a></span></dt>
+      <dd>
+        <p>This <span class="highlighted">footnote</span> is numbered automatically and anonymously using a label of “#” only.</p>
+      </dd>
+    </dl>
+  </div>
+  <!-- End of footnote -->
+
 </div>
diff --git a/readthedocs/search/tests/data/sphinx/out/page.json b/readthedocs/search/tests/data/sphinx/out/page.json
@@ -27,11 +27,19 @@
       "title": "Adding a new scenario to the repository",
       "content": "Sphinx configuration file used to build this docs: # -*- coding: utf-8 -*- # Default settings project = 'Test Builds' extensions = [ 'sphinx_autorun', ] latex_engine = 'xelatex' # allow us to build Unicode chars # Include all your settings here html_theme = 'sphinx_rtd_theme' >>> # Build at >>> import datetime >>> datetime.datetime.utcnow() # UTC datetime.datetime(2020, 5, 3, 16, 38, 11, 137311)"
     },
+    {
+      "id": "footnotes-and-domains",
+      "title": "Footnotes and domains",
+      "content": "1(1,2) A footnote contains body elements, consistently indented by at least 3 spaces. This is the footnote’s second paragraph. 3 This footnote is numbered automatically and anonymously using a label of “#” only."
+    },
     {
       "content": "This is a H3 title. Fig. 4 I'm a figure!",
       "id": "subsub-title",
       "title": "Subsub title"
     }
   ],
-  "domain_data": {}
+  "domain_data": {
+    "test_py_module.test.Foo": "Docstring for class Foo. This text tests for the formatting of docstrings generated from output sphinx.ext.autodoc.",
+    "test_py_module.test.Foo.__init__": "Start the Foo."
+  }
 }