Commit fe2f79c

Search: index generic doctype (#9322)
Closes #9307
1 parent 0a0af81

8 files changed: +132 -60 lines changed

docs/user/build-customization.rst

+18-1
@@ -12,6 +12,8 @@ and also how to override the build process completely:
 `Override the build process`_
     If you want full control over your build. This option supports any tool that generates HTML as part of the build.
 
+.. contents:: Table of contents
+   :local:
 
 Extend the build process
 ------------------------
@@ -245,7 +247,7 @@ Override the build process
 .. warning::
 
     This feature is in a *beta phase* and could suffer incompatible changes or even be removed completely in the near future.
-    It does not yet support some of the Read the Docs' features like the :term:`flyout menu`, search and ads.
+    It does not yet support some of the Read the Docs' features like the :term:`flyout menu`, and ads.
     However, we do plan to support these features in the future.
     Use this feature at your own risk.
 
@@ -273,3 +275,18 @@ your project could use the following configuration file:
 As Read the Docs does not have control over the build process,
 you are responsible for running all the commands required to install requirements and build the documentation properly.
 Once the build process finishes, the ``_readthedocs/html/`` folder will be hosted.
+
+Search support
+++++++++++++++
+
+Read the Docs will automatically index the content of all your HTML files,
+respecting the :ref:`search <config-file/v2:search>` options from your config file.
+
+You can access the search results from the :guilabel:`Search` tab of your project,
+or by using the :ref:`search API <server-side-search:api>`.
+
+.. note::
+
+   In order for Read the Docs to index your HTML files correctly,
+   they should follow some of the conventions described
+   at :doc:`rtd-dev:search-integration`.
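
For reference, a minimal sketch of querying the search API from Python, assuming the documented ``/api/v2/search/`` endpoint and a hypothetical project named ``example`` (the result fields shown are inferred from the serializer changes below, not guaranteed):

    import requests

    # Server-side search for one project/version pair.
    response = requests.get(
        "https://readthedocs.org/api/v2/search/",
        params={"project": "example", "version": "latest", "q": "install"},
    )
    for result in response.json()["results"]:
        # "title" and "full_path" are assumptions based on
        # readthedocs/search/serializers.py in this commit.
        print(result.get("title"), result.get("full_path"))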

readthedocs/builds/models.py

+6
@@ -74,6 +74,8 @@
     GITLAB_MERGE_REQUEST_COMMIT_URL,
     GITLAB_URL,
     MEDIA_TYPES,
+    MKDOCS,
+    MKDOCS_HTML,
     PRIVACY_CHOICES,
     PRIVATE,
     SPHINX,
@@ -379,6 +381,10 @@ def supports_wipe(self):
     def is_sphinx_type(self):
         return self.documentation_type in {SPHINX, SPHINX_HTMLDIR, SPHINX_SINGLEHTML}
 
+    @property
+    def is_mkdocs_type(self):
+        return self.documentation_type in {MKDOCS, MKDOCS_HTML}
+
     def get_subdomain_url(self):
         external = self.type == EXTERNAL
         return self.project.get_docs_url(
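
``is_mkdocs_type`` mirrors the existing ``is_sphinx_type`` property: both just test set membership on the stored doctype string. A standalone illustration (the constant values here are assumptions; the real ones live in ``readthedocs/projects/constants.py``):

    MKDOCS = "mkdocs"
    MKDOCS_HTML = "mkdocs_html"

    def is_mkdocs_type(documentation_type):
        # True for both MkDocs doctypes, mirroring Version.is_mkdocs_type.
        return documentation_type in {MKDOCS, MKDOCS_HTML}

    assert is_mkdocs_type("mkdocs_html")
    assert not is_mkdocs_type("sphinx")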

readthedocs/projects/models.py

+18-4
@@ -46,7 +46,7 @@
     validate_repository_url,
 )
 from readthedocs.projects.version_handling import determine_stable_version
-from readthedocs.search.parsers import MkDocsParser, SphinxParser
+from readthedocs.search.parsers import GenericParser, MkDocsParser, SphinxParser
 from readthedocs.storage import build_media_storage
 from readthedocs.vcs_support.backends import backend_cls
 
@@ -1430,9 +1430,23 @@ class Meta:
         objects = HTMLFileManager()
 
     def get_processed_json(self):
-        parser_class = (
-            SphinxParser if self.version.is_sphinx_type else MkDocsParser
-        )
+        if (
+            self.version.documentation_type == constants.GENERIC
+            or self.project.has_feature(Feature.INDEX_FROM_HTML_FILES)
+        ):
+            parser_class = GenericParser
+        elif self.version.is_sphinx_type:
+            parser_class = SphinxParser
+        elif self.version.is_mkdocs_type:
+            parser_class = MkDocsParser
+        else:
+            log.warning(
+                "Invalid documentation type",
+                documentation_type=self.version.documentation_type,
+                version_slug=self.version.slug,
+                project_slug=self.project.slug,
+            )
+            return {}
         parser = parser_class(self.version)
         return parser.parse(self.path)
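
The new branching in ``get_processed_json`` reads as a small dispatch table: the ``GENERIC`` doctype and the ``INDEX_FROM_HTML_FILES`` feature flag both route to ``GenericParser``, while Sphinx and MkDocs versions keep their specialized parsers. A minimal sketch of the same decision, with the model lookups replaced by plain arguments (the function name is hypothetical, and it assumes ``constants.GENERIC == "generic"``):

    def select_parser_class(doctype, index_from_html, is_sphinx, is_mkdocs):
        if doctype == "generic" or index_from_html:
            return GenericParser
        if is_sphinx:
            return SphinxParser
        if is_mkdocs:
            return MkDocsParser
        # Unknown doctype: the model logs a warning and returns an empty dict.
        return None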

readthedocs/search/parsers.py

+42-51
@@ -14,7 +14,7 @@
 log = structlog.get_logger(__name__)
 
 
-class BaseParser:
+class GenericParser:
 
     # Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
     max_inner_documents = 10000
@@ -73,6 +73,7 @@ def _get_main_node(self, html):
         - Try the first ``h1`` node and return its parent
           Usually all sections are neighbors,
           so they are children of the same parent node.
+        - Return the body element itself if all checks above fail.
         """
         body = html.body
         main_node = body.css_first('[role=main]')
@@ -85,11 +86,11 @@ def _get_main_node(self, html):
 
         # TODO: this could be done in smarter way,
         # checking for common parents between all h nodes.
-        first_header = body.css_first('h1')
+        first_header = body.css_first("h1")
         if first_header:
             return first_header.parent
 
-        return None
+        return body
 
     def _parse_content(self, content):
         """Converts all new line characters and multiple spaces to a single space."""
@@ -248,7 +249,7 @@ def _parse_section_content(self, tag, *, depth=0):
             contents.append(content)
             next_tag = next_tag.next
 
-        return self._parse_content(''.join(contents)), section_found
+        return self._parse_content("".join(contents)), section_found
 
     def _is_code_section(self, tag):
         """
@@ -307,10 +308,42 @@ def parse(self, page):
             'domain_data': {},
         }
         """
-        raise NotImplementedError
+        try:
+            content = self._get_page_content(page)
+            if content:
+                return self._process_content(page, content)
+        except Exception:
+            log.info("Failed to index page.", path=page, exc_info=True)
+        return {
+            "path": page,
+            "title": "",
+            "sections": [],
+            "domain_data": {},
+        }
 
+    def _process_content(self, page, content):
+        """Parses the content into a structured dict."""
+        html = HTMLParser(content)
+        body = self._get_main_node(html)
+        title = ""
+        sections = []
+        if body:
+            title = self._get_page_title(body, html) or page
+            sections = self._get_sections(title=title, body=body)
+        else:
+            log.info(
+                "Page doesn't look like it has valid content, skipping.",
+                page=page,
+            )
+        return {
+            "path": page,
+            "title": title,
+            "sections": sections,
+            "domain_data": {},
+        }
 
-class SphinxParser(BaseParser):
+
+class SphinxParser(GenericParser):
 
     """
     Parser for Sphinx generated html pages.
@@ -384,7 +417,7 @@ def _process_fjson(self, fjson_path):
 
         if 'body' in data:
             try:
-                body = HTMLParser(data['body'])
+                body = HTMLParser(data["body"])
                 sections = self._get_sections(title=title, body=body.body)
             except Exception:
                 log.info('Unable to index sections.', path=fjson_path)
@@ -506,57 +539,15 @@ def _parse_domain_tag(self, tag):
         return docstring
 
 
-class MkDocsParser(BaseParser):
+class MkDocsParser(GenericParser):
 
     """
     MkDocs parser.
 
-    Index from the json index file or directly from the html content.
+    Index using the json index file instead of the html content.
     """
 
     def parse(self, page):
-        # Avoid circular import
-        from readthedocs.projects.models import Feature
-        if self.project.has_feature(Feature.INDEX_FROM_HTML_FILES):
-            return self.parse_from_html(page)
-        return self.parse_from_index_file(page)
-
-    def parse_from_html(self, page):
-        try:
-            content = self._get_page_content(page)
-            if content:
-                return self._process_content(page, content)
-        except Exception as e:
-            log.info('Failed to index page.', path=page, exception=str(e))
-        return {
-            'path': page,
-            'title': '',
-            'sections': [],
-            'domain_data': {},
-        }
-
-    def _process_content(self, page, content):
-        """Parses the content into a structured dict."""
-        html = HTMLParser(content)
-        body = self._get_main_node(html)
-        title = ""
-        sections = []
-        if body:
-            title = self._get_page_title(body, html) or page
-            sections = self._get_sections(title=title, body=body)
-        else:
-            log.info(
-                "Page doesn't look like it has valid content, skipping.",
-                page=page,
-            )
-        return {
-            'path': page,
-            'title': title,
-            'sections': sections,
-            'domain_data': {},
-        }
-
-    def parse_from_index_file(self, page):
         storage_path = self.project.get_storage_path(
             type_='html',
             version_slug=self.version.slug,
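
With the move into ``GenericParser``, the key behavioral change is in ``_get_main_node``: instead of returning ``None`` when no ``[role=main]`` landmark or ``h1`` is found, it now falls back to the ``<body>`` element, so any HTML page yields indexable content. A minimal sketch of that fallback chain using selectolax (the HTML parser already imported in ``parsers.py``; the sample markup is made up):

    from selectolax.parser import HTMLParser

    html = HTMLParser("<html><body><h1>Title</h1><p>Content.</p></body></html>")
    body = html.body

    # 1. Prefer an explicit main landmark, used by several doc generators.
    main_node = body.css_first("[role=main]")
    # 2. Otherwise assume the first h1's parent wraps all the sections.
    if main_node is None:
        first_header = body.css_first("h1")
        main_node = first_header.parent if first_header else None
    # 3. New in this commit: fall back to the whole body instead of None.
    if main_node is None:
        main_node = body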

readthedocs/search/serializers.py

+3-2
@@ -14,7 +14,7 @@
 
 from rest_framework import serializers
 
-from readthedocs.projects.constants import MKDOCS, SPHINX_HTMLDIR
+from readthedocs.projects.constants import GENERIC, MKDOCS, SPHINX_HTMLDIR
 from readthedocs.projects.models import Project
 
 # Structures used for storing cached data of a version mostly.
@@ -134,7 +134,8 @@ def _get_full_path(self, obj):
 
         # Generate an appropriate link for the doctypes that use htmldir,
         # and always end it with / so it goes directly to proxito.
-        if obj.doctype in {SPHINX_HTMLDIR, MKDOCS}:
+        # For a generic doctype we just strip the index.html part if it exists.
+        if obj.doctype in {SPHINX_HTMLDIR, MKDOCS, GENERIC}:
             path = re.sub('(^|/)index.html$', '/', path)
 
         return docs_url.rstrip('/') + '/' + path.lstrip('/')
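
A quick illustration of the path normalization that ``GENERIC`` doctypes now share (plain ``re`` usage; the sample paths are made up):

    import re

    for path in ("index.html", "guides/index.html", "guides/intro.html"):
        # index.html at the root or at the end of a subpath collapses to "/",
        # so search result links point at the directory URL.
        print(re.sub("(^|/)index.html$", "/", path))
    # -> "/", "guides/", "guides/intro.html"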
readthedocs/search/tests/data/generic/in/basic.html

+10

@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>Title of the page</title>
+</head>
+<body>
+    Content of the body.
+</body>
+</html>
readthedocs/search/tests/data/generic/out/basic.json

+14

@@ -0,0 +1,14 @@
+[
+    {
+        "title": "Title of the page",
+        "path": "basic.html",
+        "sections": [
+            {
+                "id": "",
+                "title": "Title of the page",
+                "content": "Content of the body."
+            }
+        ],
+        "domain_data": {}
+    }
+]

readthedocs/search/tests/test_parsers.py

+21-2
@@ -7,8 +7,8 @@
 from django_dynamic_fixture import get
 
 from readthedocs.builds.storage import BuildMediaFileSystemStorage
-from readthedocs.projects.constants import MKDOCS, SPHINX
-from readthedocs.projects.models import HTMLFile, Project, Feature
+from readthedocs.projects.constants import GENERIC, MKDOCS, SPHINX
+from readthedocs.projects.models import Feature, HTMLFile, Project
 
 data_path = Path(__file__).parent.resolve() / 'data'
 
@@ -284,3 +284,22 @@ def test_sphinx_page_without_title(self, storage_open, storage_exists):
         parsed_json = page_file.processed_json
         expected_json = json.load(open(data_path / 'sphinx/out/no-title.json'))
         assert parsed_json == expected_json
+
+    @mock.patch.object(BuildMediaFileSystemStorage, "exists")
+    @mock.patch.object(BuildMediaFileSystemStorage, "open")
+    def test_generic_simple_page(self, storage_open, storage_exists):
+        file = data_path / "generic/in/basic.html"
+        storage_exists.return_value = True
+        self.version.documentation_type = GENERIC
+        self.version.save()
+
+        storage_open.side_effect = self._mock_open(file.open().read())
+        file = get(
+            HTMLFile,
+            project=self.project,
+            version=self.version,
+            path="basic.html",
+        )
+        parsed_json = [file.processed_json]
+        expected_json = json.load(open(data_path / "generic/out/basic.json"))
+        assert parsed_json == expected_json
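
The test drives the whole pipeline: saving the version with the ``GENERIC`` doctype makes ``HTMLFile.processed_json`` select ``GenericParser``, which turns the ``basic.html`` fixture into the ``basic.json`` structure above. A hypothetical direct use of the parser, following the call pattern from ``readthedocs/projects/models.py`` in this commit:

    # GenericParser is constructed with a Version and parses one page path.
    parser = GenericParser(version)
    parsed = parser.parse("basic.html")
    assert parsed["title"] == "Title of the page"
    assert parsed["sections"][0]["content"] == "Content of the body."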
