
Commit d987801

Search: index generic doctype
Closes #9307
1 parent 3e70b81 commit d987801

File tree

3 files changed: +53 -64 lines changed

readthedocs/projects/models.py (+10 -4)
@@ -46,7 +46,7 @@
     validate_repository_url,
 )
 from readthedocs.projects.version_handling import determine_stable_version
-from readthedocs.search.parsers import MkDocsParser, SphinxParser
+from readthedocs.search.parsers import GenericParser, MkDocsParser, SphinxParser
 from readthedocs.storage import build_media_storage
 from readthedocs.vcs_support.backends import backend_cls
 
@@ -1430,9 +1430,15 @@ class Meta:
     objects = HTMLFileManager()
 
     def get_processed_json(self):
-        parser_class = (
-            SphinxParser if self.version.is_sphinx_type else MkDocsParser
-        )
+        if (
+            self.version.documentation_type == constants.GENERIC
+            or self.project.has_feature(Feature.INDEX_FROM_HTML_FILES)
+        ):
+            parser_class = GenericParser
+        elif self.version.is_sphinx_type:
+            parser_class = SphinxParser
+        else:
+            parser_class = MkDocsParser
         parser = parser_class(self.version)
         return parser.parse(self.path)
 
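For reference, the new dispatch reduces to a three-way choice. Below is a minimal standalone sketch of that selection logic; the helper name select_parser_class is hypothetical, but the constants, feature flag, and parser classes are the ones imported in the hunk above:

from readthedocs.projects import constants
from readthedocs.projects.models import Feature
from readthedocs.search.parsers import GenericParser, MkDocsParser, SphinxParser


def select_parser_class(version, project):
    """Hypothetical standalone mirror of the dispatch in get_processed_json."""
    # Generic doctype builds, and projects opted in to HTML-based
    # indexing, now always go through GenericParser.
    if version.documentation_type == constants.GENERIC or project.has_feature(
        Feature.INDEX_FROM_HTML_FILES
    ):
        return GenericParser
    # Sphinx builds keep the fjson-based SphinxParser.
    if version.is_sphinx_type:
        return SphinxParser
    # Everything else falls back to the MkDocs index-file parser.
    return MkDocsParser

A caller then instantiates the selected class with the version and parses a path, exactly as the last two context lines of the hunk do.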

readthedocs/search/parsers.py (+40 -58)
@@ -14,7 +14,7 @@
 log = structlog.get_logger(__name__)
 
 
-class BaseParser:
+class GenericParser:
 
     # Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
     max_inner_documents = 10000
@@ -83,13 +83,7 @@ def _get_main_node(self, html):
         if main_node:
             return main_node
 
-        # TODO: this could be done in smarter way,
-        # checking for common parents between all h nodes.
-        first_header = body.css_first('h1')
-        if first_header:
-            return first_header.parent
-
-        return None
+        return body
 
     def _parse_content(self, content):
         """Converts all new line characters and multiple spaces to a single space."""
@@ -109,8 +103,6 @@ def _parse_sections(self, title, body):
         We can have pages that have content before the first title or that don't have a title,
         we index that content first under the title of the original page.
         """
-        body = self._clean_body(body)
-
         # Index content for pages that don't start with a title.
         # We check for sections till 3 levels to avoid indexing all the content
        # in this step.
@@ -248,7 +240,7 @@ def _parse_section_content(self, tag, *, depth=0):
             contents.append(content)
             next_tag = next_tag.next
 
-        return self._parse_content(''.join(contents)), section_found
+        return self._parse_content("".join(contents)), section_found
 
     def _is_code_section(self, tag):
         """
@@ -307,10 +299,42 @@ def parse(self, page):
             'domain_data': {},
         }
         """
-        raise NotImplementedError
+        try:
+            content = self._get_page_content(page)
+            if content:
+                return self._process_content(page, content)
+        except Exception as e:
+            log.info("Failed to index page.", path=page, exception=str(e))
+        return {
+            "path": page,
+            "title": "",
+            "sections": [],
+            "domain_data": {},
+        }
+
+    def _process_content(self, page, content):
+        """Parses the content into a structured dict."""
+        html = self._clean_body(HTMLParser(content))
+        body = self._get_main_node(html)
+        title = ""
+        sections = []
+        if body:
+            title = self._get_page_title(body, html) or page
+            sections = self._get_sections(title=title, body=body)
+        else:
+            log.info(
+                "Page doesn't look like it has valid content, skipping.",
+                page=page,
+            )
+        return {
+            "path": page,
+            "title": title,
+            "sections": sections,
+            "domain_data": {},
+        }
 
 
-class SphinxParser(BaseParser):
+class SphinxParser(GenericParser):
 
     """
     Parser for Sphinx generated html pages.
@@ -384,7 +408,7 @@ def _process_fjson(self, fjson_path):
 
         if 'body' in data:
             try:
-                body = HTMLParser(data['body'])
+                body = self._clean_body(HTMLParser(data["body"]))
                 sections = self._get_sections(title=title, body=body.body)
             except Exception:
                 log.info('Unable to index sections.', path=fjson_path)
@@ -506,57 +530,15 @@ def _parse_domain_tag(self, tag):
         return docstring
 
 
-class MkDocsParser(BaseParser):
+class MkDocsParser(GenericParser):
 
     """
     MkDocs parser.
 
-    Index from the json index file or directly from the html content.
+    Index using the json index file instead of the html content.
     """
 
     def parse(self, page):
-        # Avoid circular import
-        from readthedocs.projects.models import Feature
-        if self.project.has_feature(Feature.INDEX_FROM_HTML_FILES):
-            return self.parse_from_html(page)
-        return self.parse_from_index_file(page)
-
-    def parse_from_html(self, page):
-        try:
-            content = self._get_page_content(page)
-            if content:
-                return self._process_content(page, content)
-        except Exception as e:
-            log.info('Failed to index page.', path=page, exception=str(e))
-        return {
-            'path': page,
-            'title': '',
-            'sections': [],
-            'domain_data': {},
-        }
-
-    def _process_content(self, page, content):
-        """Parses the content into a structured dict."""
-        html = HTMLParser(content)
-        body = self._get_main_node(html)
-        title = ""
-        sections = []
-        if body:
-            title = self._get_page_title(body, html) or page
-            sections = self._get_sections(title=title, body=body)
-        else:
-            log.info(
-                "Page doesn't look like it has valid content, skipping.",
-                page=page,
-            )
-        return {
-            'path': page,
-            'title': title,
-            'sections': sections,
-            'domain_data': {},
-        }
-
-    def parse_from_index_file(self, page):
         storage_path = self.project.get_storage_path(
             type_='html',
             version_slug=self.version.slug,
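The upshot is that parse() now lives once on GenericParser; SphinxParser and MkDocsParser inherit from it and keep only their format-specific index-file handling. A rough usage sketch, assuming a version model instance and an illustrative page path:

from readthedocs.search.parsers import GenericParser

parser = GenericParser(version)  # version: a Read the Docs Version instance
result = parser.parse("guides/installation.html")  # illustrative path

# On success, `result` follows the structure shown in parse()'s docstring:
# {"path": "guides/installation.html", "title": "...",
#  "sections": [...], "domain_data": {}}
#
# On any exception, parse() logs "Failed to index page." and returns the
# same shape with an empty title and no sections, so a bad page degrades
# gracefully instead of failing the whole indexing run.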

readthedocs/search/serializers.py (+3 -2)
@@ -14,7 +14,7 @@
 
 from rest_framework import serializers
 
-from readthedocs.projects.constants import MKDOCS, SPHINX_HTMLDIR
+from readthedocs.projects.constants import GENERIC, MKDOCS, SPHINX_HTMLDIR
 from readthedocs.projects.models import Project
 
 # Structures used for storing cached data of a version mostly.
@@ -134,7 +134,8 @@ def _get_full_path(self, obj):
 
         # Generate an appropriate link for the doctypes that use htmldir,
         # and always end it with / so it goes directly to proxito.
-        if obj.doctype in {SPHINX_HTMLDIR, MKDOCS}:
+        # For a generic doctype we just strip the index.html part if it exists.
+        if obj.doctype in {SPHINX_HTMLDIR, MKDOCS, GENERIC}:
             path = re.sub('(^|/)index.html$', '/', path)
 
         return docs_url.rstrip('/') + '/' + path.lstrip('/')
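The substitution now applies to generic doctypes as well. It only rewrites a whole trailing index.html segment, which a quick standalone check illustrates (the paths are made up):

import re

for path in ["index.html", "guides/index.html", "guides/page.html", "myindex.html"]:
    print(f"{path!r} -> {re.sub('(^|/)index.html$', '/', path)!r}")

# 'index.html'        -> '/'
# 'guides/index.html' -> 'guides/'
# 'guides/page.html'  -> 'guides/page.html'  (unchanged)
# 'myindex.html'      -> 'myindex.html'      (unchanged: not a full segment)

_get_full_path then joins the result onto docs_url, so a generic project's search result for index.html links to the directory URL rather than the file.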
