
Commit d987801

Search: index generic doctype
Closes #9307
1 parent 3e70b81 commit d987801

File tree

3 files changed: +53 -64 lines changed

readthedocs/projects/models.py (+10 -4)
@@ -46,7 +46,7 @@
     validate_repository_url,
 )
 from readthedocs.projects.version_handling import determine_stable_version
-from readthedocs.search.parsers import MkDocsParser, SphinxParser
+from readthedocs.search.parsers import GenericParser, MkDocsParser, SphinxParser
 from readthedocs.storage import build_media_storage
 from readthedocs.vcs_support.backends import backend_cls
 
@@ -1430,9 +1430,15 @@ class Meta:
     objects = HTMLFileManager()
 
     def get_processed_json(self):
-        parser_class = (
-            SphinxParser if self.version.is_sphinx_type else MkDocsParser
-        )
+        if (
+            self.version.documentation_type == constants.GENERIC
+            or self.project.has_feature(Feature.INDEX_FROM_HTML_FILES)
+        ):
+            parser_class = GenericParser
+        elif self.version.is_sphinx_type:
+            parser_class = SphinxParser
+        else:
+            parser_class = MkDocsParser
         parser = parser_class(self.version)
         return parser.parse(self.path)
 
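For reference, the new dispatch reduces to a three-way choice. Below is a minimal standalone sketch of that selection logic; the helper name select_parser_class is hypothetical, but the constants, feature flag, and parser classes are the ones imported in the hunk above:

from readthedocs.projects import constants
from readthedocs.projects.models import Feature
from readthedocs.search.parsers import GenericParser, MkDocsParser, SphinxParser


def select_parser_class(version, project):
    """Hypothetical standalone mirror of the dispatch in get_processed_json."""
    # Generic doctype builds, and projects opted in to HTML-based
    # indexing, now always go through GenericParser.
    if version.documentation_type == constants.GENERIC or project.has_feature(
        Feature.INDEX_FROM_HTML_FILES
    ):
        return GenericParser
    # Sphinx builds keep the fjson-based SphinxParser.
    if version.is_sphinx_type:
        return SphinxParser
    # Everything else falls back to the MkDocs index-file parser.
    return MkDocsParser

A caller then instantiates the selected class with the version and parses a path, exactly as the last two context lines of the hunk do.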

readthedocs/search/parsers.py (+40 -58)
@@ -14,7 +14,7 @@
 log = structlog.get_logger(__name__)
 
 
-class BaseParser:
+class GenericParser:
 
     # Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
     max_inner_documents = 10000
@@ -83,13 +83,7 @@ def _get_main_node(self, html):
         if main_node:
             return main_node
 
-        # TODO: this could be done in smarter way,
-        # checking for common parents between all h nodes.
-        first_header = body.css_first('h1')
-        if first_header:
-            return first_header.parent
-
-        return None
+        return body
 
     def _parse_content(self, content):
         """Converts all new line characters and multiple spaces to a single space."""
@@ -109,8 +103,6 @@ def _parse_sections(self, title, body):
         We can have pages that have content before the first title or that don't have a title,
         we index that content first under the title of the original page.
         """
-        body = self._clean_body(body)
-
         # Index content for pages that don't start with a title.
         # We check for sections till 3 levels to avoid indexing all the content
        # in this step.
@@ -248,7 +240,7 @@ def _parse_section_content(self, tag, *, depth=0):
             contents.append(content)
             next_tag = next_tag.next
 
-        return self._parse_content(''.join(contents)), section_found
+        return self._parse_content("".join(contents)), section_found
 
     def _is_code_section(self, tag):
         """
@@ -307,10 +299,42 @@ def parse(self, page):
             'domain_data': {},
         }
         """
-        raise NotImplementedError
+        try:
+            content = self._get_page_content(page)
+            if content:
+                return self._process_content(page, content)
+        except Exception as e:
+            log.info("Failed to index page.", path=page, exception=str(e))
+        return {
+            "path": page,
+            "title": "",
+            "sections": [],
+            "domain_data": {},
+        }
+
+    def _process_content(self, page, content):
+        """Parses the content into a structured dict."""
+        html = self._clean_body(HTMLParser(content))
+        body = self._get_main_node(html)
+        title = ""
+        sections = []
+        if body:
+            title = self._get_page_title(body, html) or page
+            sections = self._get_sections(title=title, body=body)
+        else:
+            log.info(
+                "Page doesn't look like it has valid content, skipping.",
+                page=page,
+            )
+        return {
+            "path": page,
+            "title": title,
+            "sections": sections,
+            "domain_data": {},
+        }
 
 
-class SphinxParser(BaseParser):
+class SphinxParser(GenericParser):
 
     """
     Parser for Sphinx generated html pages.
@@ -384,7 +408,7 @@ def _process_fjson(self, fjson_path):
 
         if 'body' in data:
             try:
-                body = HTMLParser(data['body'])
+                body = self._clean_body(HTMLParser(data["body"]))
                 sections = self._get_sections(title=title, body=body.body)
             except Exception:
                 log.info('Unable to index sections.', path=fjson_path)
@@ -506,57 +530,15 @@ def _parse_domain_tag(self, tag):
         return docstring
 
 
-class MkDocsParser(BaseParser):
+class MkDocsParser(GenericParser):
 
     """
     MkDocs parser.
 
-    Index from the json index file or directly from the html content.
+    Index using the json index file instead of the html content.
     """
 
     def parse(self, page):
-        # Avoid circular import
-        from readthedocs.projects.models import Feature
-        if self.project.has_feature(Feature.INDEX_FROM_HTML_FILES):
-            return self.parse_from_html(page)
-        return self.parse_from_index_file(page)
-
-    def parse_from_html(self, page):
-        try:
-            content = self._get_page_content(page)
-            if content:
-                return self._process_content(page, content)
-        except Exception as e:
-            log.info('Failed to index page.', path=page, exception=str(e))
-        return {
-            'path': page,
-            'title': '',
-            'sections': [],
-            'domain_data': {},
-        }
-
-    def _process_content(self, page, content):
-        """Parses the content into a structured dict."""
-        html = HTMLParser(content)
-        body = self._get_main_node(html)
-        title = ""
-        sections = []
-        if body:
-            title = self._get_page_title(body, html) or page
-            sections = self._get_sections(title=title, body=body)
-        else:
-            log.info(
-                "Page doesn't look like it has valid content, skipping.",
-                page=page,
-            )
-        return {
-            'path': page,
-            'title': title,
-            'sections': sections,
-            'domain_data': {},
-        }
-
-    def parse_from_index_file(self, page):
         storage_path = self.project.get_storage_path(
             type_='html',
             version_slug=self.version.slug,
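The upshot is that parse() now lives once on GenericParser; SphinxParser and MkDocsParser inherit from it and keep only their format-specific index-file handling. A rough usage sketch, assuming a version model instance and an illustrative page path:

from readthedocs.search.parsers import GenericParser

parser = GenericParser(version)  # version: a Read the Docs Version instance
result = parser.parse("guides/installation.html")  # illustrative path

# On success, `result` follows the structure shown in parse()'s docstring:
# {"path": "guides/installation.html", "title": "...",
#  "sections": [...], "domain_data": {}}
#
# On any exception, parse() logs "Failed to index page." and returns the
# same shape with an empty title and no sections, so a bad page degrades
# gracefully instead of failing the whole indexing run.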

readthedocs/search/serializers.py (+3 -2)
@@ -14,7 +14,7 @@
 
 from rest_framework import serializers
 
-from readthedocs.projects.constants import MKDOCS, SPHINX_HTMLDIR
+from readthedocs.projects.constants import GENERIC, MKDOCS, SPHINX_HTMLDIR
 from readthedocs.projects.models import Project
 
 # Structures used for storing cached data of a version mostly.
@@ -134,7 +134,8 @@ def _get_full_path(self, obj):
 
         # Generate an appropriate link for the doctypes that use htmldir,
         # and always end it with / so it goes directly to proxito.
-        if obj.doctype in {SPHINX_HTMLDIR, MKDOCS}:
+        # For a generic doctype we just strip the index.html part if it exists.
+        if obj.doctype in {SPHINX_HTMLDIR, MKDOCS, GENERIC}:
             path = re.sub('(^|/)index.html$', '/', path)
 
         return docs_url.rstrip('/') + '/' + path.lstrip('/')
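The substitution now applies to generic doctypes as well. It only rewrites a whole trailing index.html segment, which a quick standalone check illustrates (the paths are made up):

import re

for path in ["index.html", "guides/index.html", "guides/page.html", "myindex.html"]:
    print(f"{path!r} -> {re.sub('(^|/)index.html$', '/', path)!r}")

# 'index.html'        -> '/'
# 'guides/index.html' -> 'guides/'
# 'guides/page.html'  -> 'guides/page.html'  (unchanged)
# 'myindex.html'      -> 'myindex.html'      (unchanged: not a full segment)

_get_full_path then joins the result onto docs_url, so a generic project's search result for index.html links to the directory URL rather than the file.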
