
Commit a35cea8

Search: index generic doctype

Closes #9307
Parent: 0849a23

4 files changed: +67, -59 lines

readthedocs/builds/models.py (+6)

@@ -74,6 +74,8 @@
     GITLAB_MERGE_REQUEST_COMMIT_URL,
     GITLAB_URL,
     MEDIA_TYPES,
+    MKDOCS,
+    MKDOCS_HTML,
     PRIVACY_CHOICES,
     PRIVATE,
     SPHINX,
@@ -379,6 +381,10 @@ def supports_wipe(self):
     def is_sphinx_type(self):
         return self.documentation_type in {SPHINX, SPHINX_HTMLDIR, SPHINX_SINGLEHTML}

+    @property
+    def is_mkdocs_type(self):
+        return self.documentation_type in {MKDOCS, MKDOCS_HTML}
+
     def get_subdomain_url(self):
         external = self.type == EXTERNAL
         return self.project.get_docs_url(
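
The new property mirrors is_sphinx_type, so callers can test for either MkDocs doctype without comparing strings. A minimal standalone sketch of the check; the constant values are illustrative stand-ins for readthedocs.projects.constants, not taken from this commit:

MKDOCS = "mkdocs"            # assumed values; the real constants live in
MKDOCS_HTML = "mkdocs_html"  # readthedocs.projects.constants

class Version:
    def __init__(self, documentation_type):
        self.documentation_type = documentation_type

    @property
    def is_mkdocs_type(self):
        # Same membership test added to builds/models.py above.
        return self.documentation_type in {MKDOCS, MKDOCS_HTML}

assert Version("mkdocs").is_mkdocs_type
assert Version("mkdocs_html").is_mkdocs_type
assert not Version("sphinx").is_mkdocs_type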

readthedocs/projects/models.py (+16, -4)

@@ -46,7 +46,7 @@
     validate_repository_url,
 )
 from readthedocs.projects.version_handling import determine_stable_version
-from readthedocs.search.parsers import MkDocsParser, SphinxParser
+from readthedocs.search.parsers import GenericParser, MkDocsParser, SphinxParser
 from readthedocs.storage import build_media_storage
 from readthedocs.vcs_support.backends import backend_cls

@@ -1430,9 +1430,21 @@ class Meta:
     objects = HTMLFileManager()

     def get_processed_json(self):
-        parser_class = (
-            SphinxParser if self.version.is_sphinx_type else MkDocsParser
-        )
+        if (
+            self.version.documentation_type == constants.GENERIC
+            or self.project.has_feature(Feature.INDEX_FROM_HTML_FILES)
+        ):
+            parser_class = GenericParser
+        elif self.version.is_sphinx_type:
+            parser_class = SphinxParser
+        elif self.version.is_mkdocs_type:
+            parser_class = MkDocsParser
+        else:
+            log.warning(
+                "Invalid documentation type",
+                documentation_type=self.version.documentation_type,
+            )
+            return {}
         parser = parser_class(self.version)
         return parser.parse(self.path)

readthedocs/search/parsers.py (+42, -53)

@@ -14,7 +14,7 @@
 log = structlog.get_logger(__name__)


-class BaseParser:
+class GenericParser:

     # Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
     max_inner_documents = 10000
@@ -73,6 +73,7 @@ def _get_main_node(self, html):
         - Try the first ``h1`` node and return its parent
           Usually all sections are neighbors,
           so they are children of the same parent node.
+        - Return the body element itself if all checks above fail.
         """
         body = html.body
         main_node = body.css_first('[role=main]')
@@ -85,11 +86,11 @@ def _get_main_node(self, html):

         # TODO: this could be done in smarter way,
         # checking for common parents between all h nodes.
-        first_header = body.css_first('h1')
+        first_header = body.css_first("h1")
         if first_header:
             return first_header.parent

-        return None
+        return body

     def _parse_content(self, content):
         """Converts all new line characters and multiple spaces to a single space."""
@@ -109,8 +110,6 @@ def _parse_sections(self, title, body):
         We can have pages that have content before the first title or that don't have a title,
         we index that content first under the title of the original page.
         """
-        body = self._clean_body(body)
-
         # Index content for pages that don't start with a title.
         # We check for sections till 3 levels to avoid indexing all the content
         # in this step.
@@ -248,7 +247,7 @@ def _parse_section_content(self, tag, *, depth=0):
             contents.append(content)
             next_tag = next_tag.next

-        return self._parse_content(''.join(contents)), section_found
+        return self._parse_content("".join(contents)), section_found

     def _is_code_section(self, tag):
         """
@@ -307,10 +306,42 @@ def parse(self, page):
             'domain_data': {},
         }
         """
-        raise NotImplementedError
+        try:
+            content = self._get_page_content(page)
+            if content:
+                return self._process_content(page, content)
+        except Exception as e:
+            log.info("Failed to index page.", path=page, exception=str(e))
+        return {
+            "path": page,
+            "title": "",
+            "sections": [],
+            "domain_data": {},
+        }

+    def _process_content(self, page, content):
+        """Parses the content into a structured dict."""
+        html = self._clean_body(HTMLParser(content))
+        body = self._get_main_node(html)
+        title = ""
+        sections = []
+        if body:
+            title = self._get_page_title(body, html) or page
+            sections = self._get_sections(title=title, body=body)
+        else:
+            log.info(
+                "Page doesn't look like it has valid content, skipping.",
+                page=page,
+            )
+        return {
+            "path": page,
+            "title": title,
+            "sections": sections,
+            "domain_data": {},
+        }

-class SphinxParser(BaseParser):
+
+class SphinxParser(GenericParser):

     """
     Parser for Sphinx generated html pages.
@@ -384,7 +415,7 @@ def _process_fjson(self, fjson_path):

         if 'body' in data:
             try:
-                body = HTMLParser(data['body'])
+                body = self._clean_body(HTMLParser(data["body"]))
                 sections = self._get_sections(title=title, body=body.body)
             except Exception:
                 log.info('Unable to index sections.', path=fjson_path)
@@ -506,57 +537,15 @@ def _parse_domain_tag(self, tag):
         return docstring


-class MkDocsParser(BaseParser):
+class MkDocsParser(GenericParser):

     """
     MkDocs parser.

-    Index from the json index file or directly from the html content.
+    Index using the json index file instead of the html content.
     """

     def parse(self, page):
-        # Avoid circular import
-        from readthedocs.projects.models import Feature
-        if self.project.has_feature(Feature.INDEX_FROM_HTML_FILES):
-            return self.parse_from_html(page)
-        return self.parse_from_index_file(page)
-
-    def parse_from_html(self, page):
-        try:
-            content = self._get_page_content(page)
-            if content:
-                return self._process_content(page, content)
-        except Exception as e:
-            log.info('Failed to index page.', path=page, exception=str(e))
-        return {
-            'path': page,
-            'title': '',
-            'sections': [],
-            'domain_data': {},
-        }
-
-    def _process_content(self, page, content):
-        """Parses the content into a structured dict."""
-        html = HTMLParser(content)
-        body = self._get_main_node(html)
-        title = ""
-        sections = []
-        if body:
-            title = self._get_page_title(body, html) or page
-            sections = self._get_sections(title=title, body=body)
-        else:
-            log.info(
-                "Page doesn't look like it has valid content, skipping.",
-                page=page,
-            )
-        return {
-            'path': page,
-            'title': title,
-            'sections': sections,
-            'domain_data': {},
-        }
-
-    def parse_from_index_file(self, page):
         storage_path = self.project.get_storage_path(
             type_='html',
             version_slug=self.version.slug,
readthedocs/search/serializers.py (+3, -2)

@@ -14,7 +14,7 @@

 from rest_framework import serializers

-from readthedocs.projects.constants import MKDOCS, SPHINX_HTMLDIR
+from readthedocs.projects.constants import GENERIC, MKDOCS, SPHINX_HTMLDIR
 from readthedocs.projects.models import Project

 # Structures used for storing cached data of a version mostly.
@@ -134,7 +134,8 @@ def _get_full_path(self, obj):

         # Generate an appropriate link for the doctypes that use htmldir,
         # and always end it with / so it goes directly to proxito.
-        if obj.doctype in {SPHINX_HTMLDIR, MKDOCS}:
+        # For a generic doctype we just strip the index.html part if it exists.
+        if obj.doctype in {SPHINX_HTMLDIR, MKDOCS, GENERIC}:
             path = re.sub('(^|/)index.html$', '/', path)

         return docs_url.rstrip('/') + '/' + path.lstrip('/')
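
The substitution only rewrites a trailing index.html (at the start of the path or right after a slash), so links to other pages keep their full filenames. A quick standalone check of the same regex:

import re

for path in ("index.html", "guides/index.html", "guides/install.html"):
    print(re.sub("(^|/)index.html$", "/", path))
# -> "/", then "guides/", then "guides/install.html" (unchanged)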
