readthedocs · stsewd · Jun 23, 2020 · Jun 19, 2020 · Jun 23, 2020 · ericholscher
diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py
@@ -1540,6 +1540,7 @@ def add_features(sender, **kwargs):
     DEDUPLICATE_BUILDS = 'deduplicate_builds'
     USE_SPHINX_RTD_EXT_LATEST = 'rtd_sphinx_ext_latest'
     DEFAULT_TO_FUZZY_SEARCH = 'default_to_fuzzy_search'
+    INDEX_FROM_HTML_FILES = 'index_from_html_files'
 
     FEATURES = (
         (USE_SPHINX_LATEST, _('Use latest version of Sphinx')),
@@ -1661,6 +1662,10 @@ def add_features(sender, **kwargs):
             DEFAULT_TO_FUZZY_SEARCH,
             _('Default to fuzzy search for simple search queries'),
         ),
+        (
+            INDEX_FROM_HTML_FILES,
+            _('Index content directly from html files instead of relying on other sources'),
+        ),
     )
 
     projects = models.ManyToManyField(

diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py
@@ -21,6 +21,68 @@ def __init__(self, version):
         self.project = self.version.project
         self.storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
 
+    def _get_page_content(self, page):
+        """Gets the page content from storage."""
+        content = None
+        try:
+            storage_path = self.project.get_storage_path(
+                type_='html',
+                version_slug=self.version.slug,
+                include_file=False,
+            )
+            file_path = self.storage.join(storage_path, page)
+            with self.storage.open(file_path, mode='r') as f:
+                content = f.read()
+        except Exception:
+            log.warning(
+                'Unhandled exception during search processing file: %s',
+                page,
+            )
+        return content
+
+    def _get_page_title(self, body, html):
+        """
+        Gets the title from the html page.
+
+        The title is the first section in the document,
+        falling back to the ``title`` tag.
+        """
+        first_header = body.css_first('h1')
+        if first_header:
+            title, _ = self._parse_section_title(first_header)
+            return title
+
+        title = html.css_first('title')
+        if title:
+            return self._parse_content(title.text())
+
+        return None
+
+    def _get_main_node(self, html):
+        """
+        Gets the main node from where to start indexing content.
+
+        The main node is tested in the following order:
+
+        - Try with a tag with the ``main`` role.
+          This role is used by several static sites and themes.
+        - Try the first ``h1`` node and return its parent.
+          Usually all sections are neighbors,
+          so they are children of the same parent node.
+        """
+        body = html.body
+        main_node = body.css_first('[role=main]')
+        if main_node:
+            return main_node
+
+        # TODO: this could be done in a smarter way,
+        # checking for common parents between all h nodes.
+        first_header = body.css_first('h1')
+        if first_header:
+            return first_header.parent
+
+        return None
+
     def _parse_content(self, content):
         """Removes new line characters and strips all whitespaces."""
         content = content.strip().split('\n')
@@ -404,9 +466,56 @@ def _parse_domain_tag(self, tag):
 
 class MkDocsParser(BaseParser):
 
-    """MkDocs parser, it relies on the json index files."""
+    """
+    MkDocs parser.
+
+    Index from the json index file or directly from the html content.
+    """
 
     def parse(self, page):
+        # Avoid circular import
+        from readthedocs.projects.models import Feature
+        if self.project.has_feature(Feature.INDEX_FROM_HTML_FILES):
+            return self.parse_from_html(page)
+        return self.parse_from_index_file(page)
+
+    def parse_from_html(self, page):
+        try:
+            content = self._get_page_content(page)
+            if content:
+                return self._process_content(page, content)
+        except Exception as e:
+            log.info('Failed to index page %s, %s', page, str(e))
+        return {
+            'path': page,
+            'title': '',
+            'sections': [],
+            'domain_data': {},
+        }
+
+    def _process_content(self, page, content):
+        """Parses the content into a structured dict."""
+        html = HTMLParser(content)
+        body = self._get_main_node(html)
+        title = ""
+        sections = []
+        if body:
+            title = self._get_page_title(body, html) or page
+            sections = list(self._parse_sections(title, body))
+        else:
+            log.info(
+                'Page doesn\'t look like it has valid content, skipping. '
+                'page=%s',
+                page,
+            )
+        return {
+            'path': page,
+            'title': title,
+            'sections': sections,
+            'domain_data': {},
+        }
+
+    def parse_from_index_file(self, page):
         storage_path = self.project.get_storage_path(
             type_='html',
             version_slug=self.version.slug,

diff --git a/readthedocs/search/tests/data/mkdocs/in/gitbook/index.html b/readthedocs/search/tests/data/mkdocs/in/gitbook/index.html
@@ -0,0 +1,121 @@
+<!DOCTYPE html>
+
+<!--
+  Gitbook theme: https://gitlab.com/lramage/mkdocs-gitbook-theme
+  From https://lramage.gitlab.io/mkdocs-gitbook-theme/
+-->
+
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
+    <title>Mkdocs - GitBook Theme - Mkdocs - GitBook Theme</title>
+    <meta http-equiv="X-UA-Compatible" content="IE=edge">
+
+    <meta name="generator" content="mkdocs-1.1.2, mkdocs-gitbook-1.0.7">
+
+    <link rel="shortcut icon" href="./images/favicon.ico" type="image/x-icon">
+    <meta name="HandheldFriendly" content="true"/>
+    <meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no">
+    <meta name="apple-mobile-web-app-capable" content="yes">
+    <meta name="apple-mobile-web-app-status-bar-style" content="black">
+    <meta rel="next" href="" />
+    <link href="./css/style.min.css" rel="stylesheet">
+  </head>
+
+  <body>
+    <div class="book">
+      <div class="book-summary">
+
+        <nav role="navigation">
+          <ul class="summary">
+            <li>
+              <a href="." target="_blank" class="custom-link">Mkdocs - GitBook Theme</a>
+            </li>
+            <li class="divider"></li>
+            <li class="chapter active" data-path="">
+              <a href=".">Mkdocs - GitBook Theme</a>
+              <li class="header">Post</li>
+
+              <li>
+                <a href="post/2015-10-30/" class="">Oldest Post</a>
+              </li>
+
+              <li>
+                <a href="post/2018-12-31/" class="">Older Post</a>
+              </li>
+
+              <li>
+                <a href="post/2019-01-02/" class="">Latest Post</a>
+              </li>
+
+              <li class="divider"></li>
+
+
+
+              <li><a href="http://www.mkdocs.org">
+                  Published with MkDocs
+                </a></li>
+
+                <li><a href="https://github.com/GitbookIO/theme-default">
+                    Theme by GitBook
+                  </a></li>
+          </ul>
+
+        </nav>
+
+      </div> <!-- end of book-summary -->
+
+      <div class="book-body">
+        <div class="body-inner">
+          <div class="book-header" role="navigation">
+
+            <!-- Title -->
+            <h1>
+              <i class="fa fa-circle-o-notch fa-spin"></i>
+              <a href="." ></a>
+            </h1>
+
+          </div> <!-- end of book-header -->
+
+          <div class="page-wrapper" tabindex="-1" role="main">
+            <div class="page-inner">
+
+              <section class="normal markdown-section">
+
+                <h1 id="mkdocs-gitbook-theme">Mkdocs - GitBook Theme</h1>
+                <p><a href="LICENSE"><img alt="Apache 2.0 License" src="https://img.shields.io/badge/license-Apache--2.0-blue.svg?style=flat-square" /></a>
+                  <a href="https://pypi.python.org/pypi/mkdocs-gitbook"><img alt="PyPI" src="https://img.shields.io/pypi/v/mkdocs-gitbook.svg?style=flat-square" /></a></p>
+                <h2 id="installation">Installation</h2>
+                <p>First, install the package via PyPI:</p>
+                <pre><code class="sh">pip install mkdocs-gitbook
+                </code></pre>
+
+                <p>Then include the theme in your <code>mkdocs.yml</code> file:</p>
+                <pre><code class="yaml">theme:
+  name: gitbook
+                </code></pre>
+
+                <h2 id="motivation">Motivation</h2>
+                <p>Gitbook was a static-site generator written in JavaScript.</p>
+                <p>Mkdocs is a static-site generator written in Python.</p>
+                <p><strong>Gitbook is <a href="https://docs.gitbook.com/v2-changes/important-differences#cli-toolchain">no longer a static-site generator</a>, <a href="https://docs.gitbook.com/v2-changes/important-differences#git-hosting-and-integration">nor does it use git</a>, nor is it <a href="https://www.gnu.org/philosophy/free-sw.html">free</a> or <a href="https://opensource.org/osd">open source</a>!</strong></p>
+                <h2 id="screenshot">Screenshot</h2>
+                <p><a href="https://gitlab.com/lramage/mkdocs-gitbook-theme"><img src="img/screenshot.png" alt="Default theme for GitBook for Mkdocs"></a></p>
+                <h2 id="license">License</h2>
+                <p>SPDX-License-Identifier: <a href="https://spdx.org/licenses/Apache-2.0">Apache-2.0</a></p>
+
+
+              </section>
+
+            </div> <!-- end of page-inner -->
+          </div> <!-- end of page-wrapper -->
+
+        </div> <!-- end of body-inner -->
+
+      </div> <!-- end of book-body -->
+      <script src="./js/main.js"></script>
+      <script src="./js/gitbook.min.js"></script>
+      <script src="./js/theme.min.js"></script>
+  </body>
+</html>