Commit fe2f79c

Search: index generic doctype (#9322)
Closes #9307
1 parent 0a0af81

8 files changed: +132 -60 lines changed

docs/user/build-customization.rst

+18-1
@@ -12,6 +12,8 @@ and also how to override the build process completely:
 `Override the build process`_
     If you want full control over your build. This option supports any tool that generates HTML as part of the build.
 
+.. contents:: Table of contents
+   :local:
 
 Extend the build process
 ------------------------
@@ -245,7 +247,7 @@ Override the build process
 .. warning::
 
     This feature is in a *beta phase* and could suffer incompatible changes or even be removed completely in the near future.
-    It does not yet support some of the Read the Docs' features like the :term:`flyout menu`, search and ads.
+    It does not yet support some of the Read the Docs' features like the :term:`flyout menu`, and ads.
     However, we do plan to support these features in the future.
     Use this feature at your own risk.
 
@@ -273,3 +275,18 @@ your project could use the following configuration file:
 As Read the Docs does not have control over the build process,
 you are responsible for running all the commands required to install requirements and build the documentation properly.
 Once the build process finishes, the ``_readthedocs/html/`` folder will be hosted.
+
+Search support
+++++++++++++++
+
+Read the Docs will automatically index the content of all your HTML files,
+respecting the :ref:`search <config-file/v2:search>` options from your config file.
+
+You can access the search results from the :guilabel:`Search` tab of your project,
+or by using the :ref:`search API <server-side-search:api>`.
+
+.. note::
+
+   In order for Read the Docs to index your HTML files correctly,
+   they should follow some of the conventions described
+   at :doc:`rtd-dev:search-integration`.
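
For reference, a minimal sketch of querying the search API from Python, assuming the documented ``/api/v2/search/`` endpoint and a hypothetical project named ``example`` (the result fields shown are inferred from the serializer changes below, not guaranteed):

    import requests

    # Server-side search for one project/version pair.
    response = requests.get(
        "https://readthedocs.org/api/v2/search/",
        params={"project": "example", "version": "latest", "q": "install"},
    )
    for result in response.json()["results"]:
        # "title" and "full_path" are assumptions based on
        # readthedocs/search/serializers.py in this commit.
        print(result.get("title"), result.get("full_path"))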

readthedocs/builds/models.py

+6
@@ -74,6 +74,8 @@
     GITLAB_MERGE_REQUEST_COMMIT_URL,
     GITLAB_URL,
     MEDIA_TYPES,
+    MKDOCS,
+    MKDOCS_HTML,
     PRIVACY_CHOICES,
     PRIVATE,
     SPHINX,
@@ -379,6 +381,10 @@ def supports_wipe(self):
     def is_sphinx_type(self):
         return self.documentation_type in {SPHINX, SPHINX_HTMLDIR, SPHINX_SINGLEHTML}
 
+    @property
+    def is_mkdocs_type(self):
+        return self.documentation_type in {MKDOCS, MKDOCS_HTML}
+
     def get_subdomain_url(self):
         external = self.type == EXTERNAL
         return self.project.get_docs_url(
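
``is_mkdocs_type`` mirrors the existing ``is_sphinx_type`` property: both just test set membership on the stored doctype string. A standalone illustration (the constant values here are assumptions; the real ones live in ``readthedocs/projects/constants.py``):

    MKDOCS = "mkdocs"
    MKDOCS_HTML = "mkdocs_html"

    def is_mkdocs_type(documentation_type):
        # True for both MkDocs doctypes, mirroring Version.is_mkdocs_type.
        return documentation_type in {MKDOCS, MKDOCS_HTML}

    assert is_mkdocs_type("mkdocs_html")
    assert not is_mkdocs_type("sphinx")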

readthedocs/projects/models.py

+18-4
@@ -46,7 +46,7 @@
     validate_repository_url,
 )
 from readthedocs.projects.version_handling import determine_stable_version
-from readthedocs.search.parsers import MkDocsParser, SphinxParser
+from readthedocs.search.parsers import GenericParser, MkDocsParser, SphinxParser
 from readthedocs.storage import build_media_storage
 from readthedocs.vcs_support.backends import backend_cls
 
@@ -1430,9 +1430,23 @@ class Meta:
         objects = HTMLFileManager()
 
     def get_processed_json(self):
-        parser_class = (
-            SphinxParser if self.version.is_sphinx_type else MkDocsParser
-        )
+        if (
+            self.version.documentation_type == constants.GENERIC
+            or self.project.has_feature(Feature.INDEX_FROM_HTML_FILES)
+        ):
+            parser_class = GenericParser
+        elif self.version.is_sphinx_type:
+            parser_class = SphinxParser
+        elif self.version.is_mkdocs_type:
+            parser_class = MkDocsParser
+        else:
+            log.warning(
+                "Invalid documentation type",
+                documentation_type=self.version.documentation_type,
+                version_slug=self.version.slug,
+                project_slug=self.project.slug,
+            )
+            return {}
         parser = parser_class(self.version)
         return parser.parse(self.path)
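
The new branching in ``get_processed_json`` reads as a small dispatch table: the ``GENERIC`` doctype and the ``INDEX_FROM_HTML_FILES`` feature flag both route to ``GenericParser``, while Sphinx and MkDocs versions keep their specialized parsers. A minimal sketch of the same decision, with the model lookups replaced by plain arguments (the function name is hypothetical, and it assumes ``constants.GENERIC == "generic"``):

    def select_parser_class(doctype, index_from_html, is_sphinx, is_mkdocs):
        if doctype == "generic" or index_from_html:
            return GenericParser
        if is_sphinx:
            return SphinxParser
        if is_mkdocs:
            return MkDocsParser
        # Unknown doctype: the model logs a warning and returns an empty dict.
        return None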

readthedocs/search/parsers.py

+42-51
@@ -14,7 +14,7 @@
 log = structlog.get_logger(__name__)
 
 
-class BaseParser:
+class GenericParser:
 
     # Limit that matches the ``index.mapping.nested_objects.limit`` ES setting.
     max_inner_documents = 10000
@@ -73,6 +73,7 @@ def _get_main_node(self, html):
         - Try the first ``h1`` node and return its parent
           Usually all sections are neighbors,
           so they are children of the same parent node.
+        - Return the body element itself if all checks above fail.
         """
         body = html.body
         main_node = body.css_first('[role=main]')
@@ -85,11 +86,11 @@ def _get_main_node(self, html):
 
         # TODO: this could be done in smarter way,
         # checking for common parents between all h nodes.
-        first_header = body.css_first('h1')
+        first_header = body.css_first("h1")
         if first_header:
             return first_header.parent
 
-        return None
+        return body
 
     def _parse_content(self, content):
         """Converts all new line characters and multiple spaces to a single space."""
@@ -248,7 +249,7 @@ def _parse_section_content(self, tag, *, depth=0):
             contents.append(content)
             next_tag = next_tag.next
 
-        return self._parse_content(''.join(contents)), section_found
+        return self._parse_content("".join(contents)), section_found
 
     def _is_code_section(self, tag):
         """
@@ -307,10 +308,42 @@ def parse(self, page):
             'domain_data': {},
         }
         """
-        raise NotImplementedError
+        try:
+            content = self._get_page_content(page)
+            if content:
+                return self._process_content(page, content)
+        except Exception:
+            log.info("Failed to index page.", path=page, exc_info=True)
+        return {
+            "path": page,
+            "title": "",
+            "sections": [],
+            "domain_data": {},
+        }
 
+    def _process_content(self, page, content):
+        """Parses the content into a structured dict."""
+        html = HTMLParser(content)
+        body = self._get_main_node(html)
+        title = ""
+        sections = []
+        if body:
+            title = self._get_page_title(body, html) or page
+            sections = self._get_sections(title=title, body=body)
+        else:
+            log.info(
+                "Page doesn't look like it has valid content, skipping.",
+                page=page,
+            )
+        return {
+            "path": page,
+            "title": title,
+            "sections": sections,
+            "domain_data": {},
+        }
 
-class SphinxParser(BaseParser):
+
+class SphinxParser(GenericParser):
 
     """
     Parser for Sphinx generated html pages.
@@ -384,7 +417,7 @@ def _process_fjson(self, fjson_path):
 
         if 'body' in data:
             try:
-                body = HTMLParser(data['body'])
+                body = HTMLParser(data["body"])
                 sections = self._get_sections(title=title, body=body.body)
             except Exception:
                 log.info('Unable to index sections.', path=fjson_path)
@@ -506,57 +539,15 @@ def _parse_domain_tag(self, tag):
         return docstring
 
 
-class MkDocsParser(BaseParser):
+class MkDocsParser(GenericParser):
 
     """
     MkDocs parser.
 
-    Index from the json index file or directly from the html content.
+    Index using the json index file instead of the html content.
     """
 
     def parse(self, page):
-        # Avoid circular import
-        from readthedocs.projects.models import Feature
-        if self.project.has_feature(Feature.INDEX_FROM_HTML_FILES):
-            return self.parse_from_html(page)
-        return self.parse_from_index_file(page)
-
-    def parse_from_html(self, page):
-        try:
-            content = self._get_page_content(page)
-            if content:
-                return self._process_content(page, content)
-        except Exception as e:
-            log.info('Failed to index page.', path=page, exception=str(e))
-        return {
-            'path': page,
-            'title': '',
-            'sections': [],
-            'domain_data': {},
-        }
-
-    def _process_content(self, page, content):
-        """Parses the content into a structured dict."""
-        html = HTMLParser(content)
-        body = self._get_main_node(html)
-        title = ""
-        sections = []
-        if body:
-            title = self._get_page_title(body, html) or page
-            sections = self._get_sections(title=title, body=body)
-        else:
-            log.info(
-                "Page doesn't look like it has valid content, skipping.",
-                page=page,
-            )
-        return {
-            'path': page,
-            'title': title,
-            'sections': sections,
-            'domain_data': {},
-        }
-
-    def parse_from_index_file(self, page):
         storage_path = self.project.get_storage_path(
             type_='html',
             version_slug=self.version.slug,
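
With the move into ``GenericParser``, the key behavioral change is in ``_get_main_node``: instead of returning ``None`` when no ``[role=main]`` landmark or ``h1`` is found, it now falls back to the ``<body>`` element, so any HTML page yields indexable content. A minimal sketch of that fallback chain using selectolax (the HTML parser already imported in ``parsers.py``; the sample markup is made up):

    from selectolax.parser import HTMLParser

    html = HTMLParser("<html><body><h1>Title</h1><p>Content.</p></body></html>")
    body = html.body

    # 1. Prefer an explicit main landmark, used by several doc generators.
    main_node = body.css_first("[role=main]")
    # 2. Otherwise assume the first h1's parent wraps all the sections.
    if main_node is None:
        first_header = body.css_first("h1")
        main_node = first_header.parent if first_header else None
    # 3. New in this commit: fall back to the whole body instead of None.
    if main_node is None:
        main_node = body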

readthedocs/search/serializers.py

+3-2
@@ -14,7 +14,7 @@
 
 from rest_framework import serializers
 
-from readthedocs.projects.constants import MKDOCS, SPHINX_HTMLDIR
+from readthedocs.projects.constants import GENERIC, MKDOCS, SPHINX_HTMLDIR
 from readthedocs.projects.models import Project
 
 # Structures used for storing cached data of a version mostly.
@@ -134,7 +134,8 @@ def _get_full_path(self, obj):
 
         # Generate an appropriate link for the doctypes that use htmldir,
         # and always end it with / so it goes directly to proxito.
-        if obj.doctype in {SPHINX_HTMLDIR, MKDOCS}:
+        # For a generic doctype we just strip the index.html part if it exists.
+        if obj.doctype in {SPHINX_HTMLDIR, MKDOCS, GENERIC}:
             path = re.sub('(^|/)index.html$', '/', path)
 
         return docs_url.rstrip('/') + '/' + path.lstrip('/')
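
A quick illustration of the path normalization that ``GENERIC`` doctypes now share (plain ``re`` usage; the sample paths are made up):

    import re

    for path in ("index.html", "guides/index.html", "guides/intro.html"):
        # index.html at the root or at the end of a subpath collapses to "/",
        # so search result links point at the directory URL.
        print(re.sub("(^|/)index.html$", "/", path))
    # -> "/", "guides/", "guides/intro.html"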
readthedocs/search/tests/data/generic/in/basic.html

+10

@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>Title of the page</title>
+</head>
+<body>
+    Content of the body.
+</body>
+</html>
readthedocs/search/tests/data/generic/out/basic.json

+14

@@ -0,0 +1,14 @@
+[
+    {
+        "title": "Title of the page",
+        "path": "basic.html",
+        "sections": [
+            {
+                "id": "",
+                "title": "Title of the page",
+                "content": "Content of the body."
+            }
+        ],
+        "domain_data": {}
+    }
+]

readthedocs/search/tests/test_parsers.py

+21-2
@@ -7,8 +7,8 @@
 from django_dynamic_fixture import get
 
 from readthedocs.builds.storage import BuildMediaFileSystemStorage
-from readthedocs.projects.constants import MKDOCS, SPHINX
-from readthedocs.projects.models import HTMLFile, Project, Feature
+from readthedocs.projects.constants import GENERIC, MKDOCS, SPHINX
+from readthedocs.projects.models import Feature, HTMLFile, Project
 
 data_path = Path(__file__).parent.resolve() / 'data'
 
@@ -284,3 +284,22 @@ def test_sphinx_page_without_title(self, storage_open, storage_exists):
         parsed_json = page_file.processed_json
         expected_json = json.load(open(data_path / 'sphinx/out/no-title.json'))
         assert parsed_json == expected_json
+
+    @mock.patch.object(BuildMediaFileSystemStorage, "exists")
+    @mock.patch.object(BuildMediaFileSystemStorage, "open")
+    def test_generic_simple_page(self, storage_open, storage_exists):
+        file = data_path / "generic/in/basic.html"
+        storage_exists.return_value = True
+        self.version.documentation_type = GENERIC
+        self.version.save()
+
+        storage_open.side_effect = self._mock_open(file.open().read())
+        file = get(
+            HTMLFile,
+            project=self.project,
+            version=self.version,
+            path="basic.html",
+        )
+        parsed_json = [file.processed_json]
+        expected_json = json.load(open(data_path / "generic/out/basic.json"))
+        assert parsed_json == expected_json
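
The test drives the whole pipeline: saving the version with the ``GENERIC`` doctype makes ``HTMLFile.processed_json`` select ``GenericParser``, which turns the ``basic.html`` fixture into the ``basic.json`` structure above. A hypothetical direct use of the parser, following the call pattern from ``readthedocs/projects/models.py`` in this commit:

    # GenericParser is constructed with a Version and parses one page path.
    parser = GenericParser(version)
    parsed = parser.parse("basic.html")
    assert parsed["title"] == "Title of the page"
    assert parsed["sections"][0]["content"] == "Content of the body."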
