Search: support section titles inside header tags (#9339)

stsewd · ericholscher · web-flow · commit 05799f661564 · 2022-06-16T14:51:56.000-05:00
Another convention to single `h` headers is to put them inside a `header` tag. https://developer.mozilla.org/en-US/docs/Web/HTML/Element/header#usage_notes Co-authored-by: Eric Holscher <25510+ericholscher@users.noreply.github.com>
diff --git a/docs/dev/search-integration.rst b/docs/dev/search-integration.rst
@@ -30,8 +30,9 @@ Read the Docs makes use of ARIA_ roles and other heuristics in order to process
 Main content node
 ~~~~~~~~~~~~~~~~~
 
-The main content node should have a main role (or a ``main`` tag), and there should only be one per page.
-This node is the one that contains all the page content. Example:
+The main content should be inside a ``<main>`` tag or an element with ``role=main``,
+and there should only be one per page.
+This node is the one that contains all the page content to be indexed. Example:
 
 .. code-block:: html
    :emphasize-lines: 10-12
@@ -55,6 +56,51 @@ This node is the one that contains all the page content. Example:
       </body>
    </html>
 
+If a main node isn't found,
+we try to infer the main node from the parent of the first section with a ``h1`` tag.
+Example:
+
+.. code-block:: html
+   :emphasize-lines: 10-20
+
+   <html>
+      <head>
+         ...
+      </head>
+      <body>
+         <div>
+            This content isn't processed
+         </div>
+
+         <div id="parent">
+            <h1>First title</h1>
+            <p>
+               The parent of the h1 title will
+               be taken as the main node,
+               this is the div tag.
+            </p>
+
+            <h2>Second title</h2>
+            <p>More content</p>
+         </div>
+      </body>
+   </html>
+
+If a section title isn't found, we default to the ``body`` tag.
+Example:
+
+.. code-block:: html
+   :emphasize-lines: 5-7
+
+   <html>
+      <head>
+         ...
+      </head>
+      <body>
+         <p>Content</p>
+      </body>
+   </html>
+
 Irrelevant content
 ~~~~~~~~~~~~~~~~~~
 
@@ -87,12 +133,15 @@ Example:
 Sections
 ~~~~~~~~
 
-Sections are ``h`` tags, and sections of the same level should be neighbors.
-Additionally, sections should have an unique ``id`` attribute per page (this is used to link to the section).
-All content below the section, till the new section will be indexed as part of the section. Example:
+Sections are composed of a title, and a content.
+A section title can be a ``h`` tag, or a ``header`` tag containing a ``h`` tag,
+the ``h`` tag or its parent can contain an ``id`` attribute, which will be used to link to the section.
+
+All content below the title, until a new section is found, will be indexed as part of the section content.
+Example:
 
 .. code-block:: html
-   :emphasize-lines: 2-10
+   :emphasize-lines: 2-10, 12-17, 21-26
 
    <div role="main">
       <h1 id="section-title">
@@ -114,17 +163,17 @@ All content below the section, till the new section will be indexed as part of t
 
       ...
 
-      <h1 id="neigbor-section">
-         This section is neighbor of "section-title"
-      </h1>
+      <header>
+         <h1 id="3">This is also a valid section title</h1>
+      </header>
       <p>
-         ...
+         Thi is the content of the third section.
       </p>
    </div>
 
-Sections can be inside till two nested tags (and have nested sections),
-and its immediate parent can contain the ``id`` attribute.
-Note that the section content still needs to be below the ``h`` tag. Example:
+Sections can be contained in up to two nested tags, and can contain other sections (nested sections).
+Note that the section content still needs to be below the section title.
+Example:
 
 .. code-block:: html
    :emphasize-lines: 3-11,14-21
diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py
@@ -88,10 +88,23 @@ def _get_main_node(self, html):
         # checking for common parents between all h nodes.
         first_header = body.css_first("h1")
         if first_header:
-            return first_header.parent
+            return self._get_header_container(first_header).parent
 
         return body
 
+    def _get_header_container(self, h_tag):
+        """
+        Get the *real* container of a header tag or title.
+
+        If the parent of the ``h`` tag is a ``header`` tag,
+        then we return the ``header`` tag,
+        since the header tag acts as a container for the title of the section.
+        Otherwise, we return the tag itself.
+        """
+        if h_tag.parent.tag == "header":
+            return h_tag.parent
+        return h_tag
+
     def _parse_content(self, content):
         """Converts all new line characters and multiple spaces to a single space."""
         content = content.strip().split()
@@ -110,8 +123,6 @@ def _parse_sections(self, title, body):
         We can have pages that have content before the first title or that don't have a title,
         we index that content first under the title of the original page.
         """
-        body = self._clean_body(body)
-
         # Index content for pages that don't start with a title.
         # We check for sections till 3 levels to avoid indexing all the content
         # in this step.
@@ -135,7 +146,8 @@ def _parse_sections(self, title, body):
             for tag in tags:
                 try:
                     title, id = self._parse_section_title(tag)
-                    content, _ = self._parse_section_content(tag.next, depth=2)
+                    next_tag = self._get_header_container(tag).next
+                    content, _ = self._parse_section_content(next_tag, depth=2)
                     yield {
                         'id': id,
                         'title': title,
@@ -186,10 +198,10 @@ def _is_section(self, tag):
         """
         Check if `tag` is a section (linkeable header).
 
-        The tag is a section if it's a ``h`` tag.
+        The tag is a section if it's a ``h`` or a ``header`` tag.
         """
-        is_header_tag = re.match(r'h\d$', tag.tag)
-        return is_header_tag
+        is_h_tag = re.match(r"h\d$", tag.tag)
+        return is_h_tag or tag.tag == "header"
 
     def _parse_section_title(self, tag):
         """
@@ -199,15 +211,7 @@ def _parse_section_title(self, tag):
 
         - Get the id from the node itself.
         - Get the id from the parent node.
-
-        Additionally:
-
-        - Removes permalink values
         """
-        nodes_to_be_removed = tag.css('.headerlink')
-        for node in nodes_to_be_removed:
-            node.decompose()
-
         section_id = tag.attributes.get('id', '')
         if not section_id:
             parent = tag.parent
@@ -328,6 +332,7 @@ def _process_content(self, page, content):
         title = ""
         sections = []
         if body:
+            body = self._clean_body(body)
             title = self._get_page_title(body, html) or page
             sections = self._get_sections(title=title, body=body)
         else:
@@ -417,7 +422,7 @@ def _process_fjson(self, fjson_path):
 
         if 'body' in data:
             try:
-                body = HTMLParser(data["body"])
+                body = self._clean_body(HTMLParser(data["body"]))
                 sections = self._get_sections(title=title, body=body.body)
             except Exception:
                 log.info('Unable to index sections.', path=fjson_path)
diff --git a/readthedocs/search/tests/data/pelican/in/default/index.html b/readthedocs/search/tests/data/pelican/in/default/index.html
@@ -0,0 +1,98 @@
+<!DOCTYPE html>
+
+<!--
+  Default pelican theme.
+  From https://blog.getpelican.com/pelican-4.7-released.html
+-->
+
+<html lang="en">
+<head>
+        <meta charset="utf-8">
+        <title>Pelican 4.7 released</title>
+        <link rel="stylesheet" href="https://blog.getpelican.com/theme/css/A.main.css.pagespeed.cf.zFbdR40MwZ.css" type="text/css"/>
+        <link href="https://blog.getpelican.com/feeds/all.atom.xml" type="application/atom+xml" rel="alternate" title="Pelican Development Blog Atom Feed"/>
+
+        <!--[if IE]>
+            <script src="https://html5shiv.googlecode.com/svn/trunk/html5.js"></script>
+        <![endif]-->
+</head>
+
+<body id="index" class="home">
+        <header id="banner" class="body">
+                <h1><a href="https://blog.getpelican.com/">Pelican Development Blog </a></h1>
+                <nav><ul>
+                    <li class="active"><a href="https://blog.getpelican.com/category/news.html">news</a></li>
+                    <li><a href="https://docs.getpelican.com">documentation</a></li>
+                    <li><a href="https://donate.getpelican.com">contribute</a></li>
+                    <li><a href="/pages/gratitude.html">gratitude</a></li>
+                </ul></nav>
+        </header><!-- /#banner -->
+<section id="content" class="body">
+  <article>
+    <header>
+      <h1 class="entry-title">
+        <a href="https://blog.getpelican.com/pelican-4.7-released.html" rel="bookmark" title="Permalink to Pelican 4.7 released">Pelican 4.7 released</a></h1>
+    </header>
+
+    <div class="entry-content">
+<footer class="post-info">
+        <abbr class="published" title="2021-10-01T00:00:00+02:00">
+                Fri 01 October 2021
+        </abbr>
+
+        <address class="vcard author">
+                By <a class="url fn" href="https://blog.getpelican.com/author/pelican-contributors.html">Pelican Contributors</a>
+        </address>
+<p>In <a href="https://blog.getpelican.com/category/news.html">news</a>. </p>
+
+</footer><!-- /.post-info -->      <p>Pelican 4.7 is now available. This new release includes the following enhancements, fixes, and tweaks:</p>
+<ul class="simple">
+<li>Improve default theme rendering on mobile and other small screen devices <a class="reference external" href="https://github.com/getpelican/pelican/pull/2914">(#2914)</a></li>
+</ul>
+<p>For more info, please refer to the release page.</p>
+<div class="section" id="upgrading-from-previous-releases">
+<h2>Upgrading from previous releases</h2>
+<p>Upgrading from Pelican 4.6.x should be smooth and require few (if any) changes to
+your environment.</p>
+<p>If you run into problems, please see the <a class="reference external" href="https://docs.getpelican.com/en/latest/contribute.html#how-to-get-help">How to Get Help</a> section
+of the documentation, and we will update this post with any upgrade tips
+contributed by the Pelican community.</p>
+</div>
+
+    </div><!-- /.entry-content -->
+
+  </article>
+</section>
+        <section id="extras" class="body">
+                <div class="blogroll">
+                        <h2>links</h2>
+                        <ul>
+                            <li><a href="https://docs.getpelican.com/">Pelican Docs</a></li>
+                            <li><a href="https://donate.getpelican.com/">Support Pelican</a></li>
+                            <li><a href="https://justinmayer.com/">Justin Mayer</a></li>
+                        </ul>
+                </div><!-- /.blogroll -->
+                <div class="social">
+                        <h2>follow</h2>
+                        <ul>
+                            <li><a href="https://blog.getpelican.com/feeds/all.atom.xml" type="application/atom+xml" rel="alternate">atom feed</a></li>
+
+                            <li><a href="https://twitter.com/getpelican">@getpelican</a></li>
+                            <li><a href="https://twitter.com/jmayer">@jmayer</a></li>
+                            <li><a href="https://github.com/getpelican">github</a></li>
+                        </ul>
+                </div><!-- /.social -->
+        </section><!-- /#extras -->
+
+        <footer id="contentinfo" class="body">
+                <address id="about" class="vcard body">
+                Proudly powered by <a href="http://getpelican.com/">Pelican</a>, which takes great advantage of <a href="http://python.org">Python</a>.
+                </address><!-- /#about -->
+
+                <p>The theme is by <a href="http://coding.smashingmagazine.com/2009/08/04/designing-a-html-5-layout-from-scratch/">Smashing Magazine</a>, thanks!</p>
+        </footer><!-- /#contentinfo -->
+
+    <script>(function(f,a,t,h,o,m){a[h]=a[h]||function(){(a[h].q=a[h].q||[]).push(arguments)};o=f.createElement('script'),m=f.getElementsByTagName('script')[0];o.async=1;o.src=t;o.id='fathom-script';m.parentNode.insertBefore(o,m)})(document,window,'https://stats.justinmayer.com/tracker.js','fathom');fathom('set','siteId','EWNWB');fathom('trackPageview');</script>
+    <script type="text/javascript">var _gaq=_gaq||[];_gaq.push(['_setAccount','UA-295694-7']);_gaq.push(['_trackPageview']);(function(){var ga=document.createElement('script');ga.type='text/javascript';ga.async=true;ga.src=('https:'==document.location.protocol?'https://ssl':'http://www')+'.google-analytics.com/ga.js';var s=document.getElementsByTagName('script')[0];s.parentNode.insertBefore(ga,s);})();</script>
+</body>
+</html>
diff --git a/readthedocs/search/tests/data/pelican/out/default.json b/readthedocs/search/tests/data/pelican/out/default.json
@@ -0,0 +1,34 @@
+[
+  {
+    "path": "index.html",
+    "title": "Pelican Development Blog",
+    "sections": [
+      {
+        "id": "banner",
+        "title": "Pelican Development Blog",
+        "content": ""
+      },
+      {
+        "id": "",
+        "title": "Pelican 4.7 released",
+        "content": "Fri 01 October 2021 By Pelican Contributors In news. Pelican 4.7 is now available. This new release includes the following enhancements, fixes, and tweaks: Improve default theme rendering on mobile and other small screen devices (#2914) For more info, please refer to the release page."
+      },
+      {
+        "id": "upgrading-from-previous-releases",
+        "title": "Upgrading from previous releases",
+        "content": "Upgrading from Pelican 4.6.x should be smooth and require few (if any) changes to your environment. If you run into problems, please see the How to Get Help section of the documentation, and we will update this post with any upgrade tips contributed by the Pelican community."
+      },
+      {
+        "id": "",
+        "title": "links",
+        "content": "Pelican Docs Support Pelican Justin Mayer"
+      },
+      {
+        "id": "",
+        "title": "follow",
+        "content": "atom feed @getpelican @jmayer github"
+      }
+    ],
+    "domain_data": {}
+  }
+]
diff --git a/readthedocs/search/tests/test_parsers.py b/readthedocs/search/tests/test_parsers.py
@@ -303,3 +303,22 @@ def test_generic_simple_page(self, storage_open, storage_exists):
         parsed_json = [file.processed_json]
         expected_json = json.load(open(data_path / "generic/out/basic.json"))
         assert parsed_json == expected_json
+
+    @mock.patch.object(BuildMediaFileSystemStorage, "exists")
+    @mock.patch.object(BuildMediaFileSystemStorage, "open")
+    def test_generic_pelican_default_theme(self, storage_open, storage_exists):
+        file = data_path / "pelican/in/default/index.html"
+        storage_exists.return_value = True
+        self.version.documentation_type = GENERIC
+        self.version.save()
+
+        storage_open.side_effect = self._mock_open(file.open().read())
+        file = get(
+            HTMLFile,
+            project=self.project,
+            version=self.version,
+            path="index.html",
+        )
+        parsed_json = [file.processed_json]
+        expected_json = json.load(open(data_path / "pelican/out/default.json"))
+        assert parsed_json == expected_json