Skip to content

Commit 9802ad0

Browse files
authored
Merge pull request #6937 from readthedocs/mkdocs-search
Add support for Mkdocs search
2 parents 69f3eba + da184f5 commit 9802ad0

17 files changed

+330
-30
lines changed

readthedocs/builds/models.py

+6
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@
7878
MEDIA_TYPES,
7979
PRIVACY_CHOICES,
8080
SPHINX,
81+
SPHINX_HTMLDIR,
82+
SPHINX_SINGLEHTML,
8183
)
8284
from readthedocs.projects.models import APIProject, Project
8385
from readthedocs.projects.version_handling import determine_stable_version
@@ -368,6 +370,10 @@ def supports_wipe(self):
368370
"""Return True if version is not external."""
369371
return not self.type == EXTERNAL
370372

373+
@property
def is_sphinx_type(self):
    """Whether ``documentation_type`` is SPHINX, SPHINX_HTMLDIR or SPHINX_SINGLEHTML."""
    sphinx_doctypes = {SPHINX, SPHINX_HTMLDIR, SPHINX_SINGLEHTML}
    return self.documentation_type in sphinx_doctypes
376+
371377
def get_subdomain_url(self):
372378
external = self.type == EXTERNAL
373379
return self.project.get_docs_url(

readthedocs/doc_builder/backends/mkdocs.py

+2-7
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@
1111
import yaml
1212
from django.conf import settings
1313
from django.template import loader as template_loader
14-
from readthedocs.projects.constants import MKDOCS_HTML, MKDOCS
1514

1615
from readthedocs.doc_builder.base import BaseBuilder
1716
from readthedocs.doc_builder.exceptions import MkDocsYAMLParseError
17+
from readthedocs.projects.constants import MKDOCS, MKDOCS_HTML
1818
from readthedocs.projects.models import Feature
1919

2020

@@ -314,17 +314,12 @@ def get_theme_name(self, mkdocs_config):
314314

315315

316316
class MkdocsHTML(BaseMkdocs):
317+
317318
type = 'mkdocs'
318319
builder = 'build'
319320
build_dir = '_build/html'
320321

321322

322-
class MkdocsJSON(BaseMkdocs):
323-
type = 'mkdocs_json'
324-
builder = 'json'
325-
build_dir = '_build/json'
326-
327-
328323
class SafeLoaderIgnoreUnknown(yaml.SafeLoader): # pylint: disable=too-many-ancestors
329324

330325
"""

readthedocs/doc_builder/loader.py

-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
# -*- coding: utf-8 -*-
2-
31
"""Lookup tables for builders and backends."""
42
from importlib import import_module
53

@@ -21,7 +19,6 @@
2119
'sphinx_singlehtmllocalmedia': sphinx.LocalMediaBuilder,
2220
# Other markup
2321
'mkdocs': mkdocs.MkdocsHTML,
24-
'mkdocs_json': mkdocs.MkdocsJSON,
2522
}
2623

2724

readthedocs/projects/models.py

+48-2
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
validate_repository_url,
4141
)
4242
from readthedocs.projects.version_handling import determine_stable_version
43-
from readthedocs.search.parse_json import process_file
43+
from readthedocs.search.parse_json import process_file, process_mkdocs_index_file
4444
from readthedocs.vcs_support.backends import backend_cls
4545
from readthedocs.vcs_support.utils import Lock, NonBlockingLock
4646

@@ -1330,7 +1330,7 @@ class Meta:
13301330

13311331
objects = HTMLFileManager.from_queryset(HTMLFileQuerySet)()
13321332

1333-
def get_processed_json(self):
1333+
def get_processed_json_sphinx(self):
13341334
"""
13351335
Get the parsed JSON for search indexing.
13361336
@@ -1374,6 +1374,52 @@ def get_processed_json(self):
13741374
'domain_data': {},
13751375
}
13761376

1377+
def get_processed_json_mkdocs(self):
    """
    Get the parsed JSON for search indexing from the MkDocs search index.

    Looks for ``search/search_index.json`` under this version's ``html``
    storage path and extracts the entries that belong to ``self.path``.

    This is best-effort: when the index file does not exist, holds no data
    for this file, or processing fails, the result has an empty title and
    no sections.

    :returns: dict with the keys ``path``, ``title``, ``sections``
        and ``domain_data``.
    """
    log.debug('Processing mkdocs index')
    # Fallback values; also used to fill any key the index parser left out,
    # so the returned structure is always complete.
    processed = {
        'path': self.path,
        'title': '',
        'sections': [],
        'domain_data': {},
    }
    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
    storage_path = self.project.get_storage_path(
        type_='html', version_slug=self.version.slug, include_file=False
    )
    # Bound before the ``try`` so the handler below can always log it,
    # even when building the path is the step that failed.
    file_path = None
    try:
        file_path = storage.join(storage_path, 'search/search_index.json')
        if storage.exists(file_path):
            index_data = process_mkdocs_index_file(file_path, page=self.path)
            if index_data:
                processed.update(index_data)
    except Exception:
        # One broken index must not abort indexing, but keep the traceback
        # so the failure can be diagnosed.
        log.warning(
            'Unhandled exception during search processing file: %s',
            file_path,
            exc_info=True,
        )
    return processed
1400+
1401+
def get_processed_json(self):
    """
    Get the parsed JSON for search indexing.

    Dispatches to the Sphinx parser when ``self.version.is_sphinx_type`` is
    true, and to the MkDocs parser otherwise.

    Returns a dictionary with the following structure.
    {
        'path': 'file path',
        'title': 'Title',
        'sections': [
            {
                'id': 'section-anchor',
                'title': 'Section title',
                'content': 'Section content',
            },
        ],
        'domain_data': {},
    }
    """
    parse = (
        self.get_processed_json_sphinx
        if self.version.is_sphinx_type
        else self.get_processed_json_mkdocs
    )
    return parse()
1422+
13771423
@cached_property
13781424
def processed_json(self):
13791425
return self.get_processed_json()

readthedocs/projects/tasks.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -1234,12 +1234,14 @@ def get_final_doctype(self):
12341234
return html_builder.get_final_doctype()
12351235

12361236
def build_docs_search(self):
1237-
"""Build search data."""
1238-
# Search is always run in sphinx using the rtd-sphinx-extension.
1239-
# Mkdocs has no search currently.
1240-
if self.is_type_sphinx() and self.version.type != EXTERNAL:
1241-
return True
1242-
return False
1237+
"""
1238+
Build search data.
1239+
1240+
.. note::
1241+
For MkDocs, search is indexed from its ``html`` artifacts.
1242+
For Sphinx, search is run using the rtd-sphinx-extension.
1243+
"""
1244+
return self.is_type_sphinx() and self.version.type != EXTERNAL
12431245

12441246
def build_docs_localmedia(self):
12451247
"""Get local media files with separate build."""
@@ -1593,6 +1595,9 @@ def _create_intersphinx_data(version, commit, build):
15931595
:param commit: Commit that updated path
15941596
:param build: Build id
15951597
"""
1598+
if not version.is_sphinx_type:
1599+
return
1600+
15961601
storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
15971602

15981603
html_storage_path = version.project.get_storage_path(

readthedocs/search/documents.py

+5-6
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,10 @@ class Meta:
109109

110110
def prepare_domains(self, html_file):
111111
"""Prepares and returns the values for domains field."""
112-
all_domains = []
112+
if not html_file.version.is_sphinx_type:
113+
return []
113114

115+
all_domains = []
114116
try:
115117
domains_qs = html_file.sphinx_domains.exclude(
116118
domain='std',
@@ -172,11 +174,8 @@ def get_queryset(self):
172174
"""Overwrite default queryset to filter certain files to index."""
173175
queryset = super().get_queryset()
174176

175-
# Do not index files that belong to non sphinx project
176-
# Also do not index certain files
177-
queryset = queryset.internal().filter(
178-
project__documentation_type__contains='sphinx'
179-
)
177+
# Do not index files from external versions
178+
queryset = queryset.internal().all()
180179

181180
# TODO: Make this smarter
182181
# This was causing issues excluding some valid user documentation pages

readthedocs/search/parse_json.py

+53-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Functions related to converting content into dict/JSON structures."""
22

33
import logging
4+
from urllib.parse import urlparse
45
import orjson as json
56

67
from django.conf import settings
@@ -195,5 +196,56 @@ def parse_content(content, remove_first_line=False):
195196
content = content[1:]
196197

197198
# converting newlines to ". "
198-
content = ' '.join([text.strip() for text in content if text])
199+
content = ' '.join(text.strip() for text in content if text)
199200
return content
201+
202+
203+
def process_mkdocs_index_file(json_storage_path, page):
    """
    Read the MkDocs JSON search index and extract the entries of one page.

    Each item of the index's ``docs`` list has a ``location``, a ``title``
    and a ``text``. An item whose location has no URL fragment provides the
    page title; an item with a fragment becomes a section whose ``id`` is
    that fragment.

    :param json_storage_path: path of the index file in the build media storage.
    :param page: path of the HTML file to extract, e.g. ``versions/index.html``.
    :returns: ``{}`` when the index has no entry for ``page``; otherwise a dict
        with the keys ``path``, ``title``, ``sections`` and ``domain_data``.
    :raises OSError: if the index file cannot be read (logged, then re-raised).
    """
    log.debug('Processing JSON index file: %s', json_storage_path)

    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
    try:
        with storage.open(json_storage_path, mode='r') as f:
            file_contents = f.read()
    except OSError:
        log.info('Unable to read file: %s', json_storage_path)
        raise

    data = json.loads(file_contents)

    found = False
    page_title = ''
    sections = []

    for entry in data.get('docs', []):
        parsed_location = urlparse(entry.get('location', ''))

        # Some old versions of mkdocs
        # index the pages as ``/page.html`` instead of ``page.html``.
        path = parsed_location.path.lstrip('/')

        # Directory-style locations ('' or 'versions/') map to their index.html.
        if path == '' or path.endswith('/'):
            path += 'index.html'

        if path != page:
            continue

        found = True
        title = _mkdocs_html_to_text(entry.get('title'))
        fragment = parsed_location.fragment

        if not fragment:
            # Page-level entry: only its title is kept.
            page_title = title
            continue

        sections.append({
            'id': fragment,
            'title': title,
            'content': parse_content(_mkdocs_html_to_text(entry.get('text'))),
        })

    if not found:
        # Falsy on purpose: callers use this to fall back to an empty result.
        return {}

    # Always return the complete structure, even when the page has no
    # anchored sections or no page-level entry.
    return {
        'path': page,
        'title': page_title,
        'sections': sections,
        'domain_data': {},
    }


def _mkdocs_html_to_text(raw):
    """Return the text content of an HTML snippet, or '' if it is missing/empty."""
    if not raw:
        return ''
    return HTMLParser(raw).text()

readthedocs/search/tests/conftest.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from .dummy_data import ALL_PROJECTS, PROJECT_DATA_FILES
1515

1616

17-
@pytest.fixture()
17+
@pytest.fixture
1818
def es_index():
1919
call_command('search_index', '--delete', '-f')
2020
call_command('search_index', '--create')
@@ -23,7 +23,7 @@ def es_index():
2323
call_command('search_index', '--delete', '-f')
2424

2525

26-
@pytest.fixture(autouse=True)
26+
@pytest.fixture
2727
def all_projects(es_index, mock_processed_json, db, settings):
2828
settings.ELASTICSEARCH_DSL_AUTOSYNC = True
2929
projects_list = []
@@ -95,7 +95,7 @@ def get_dummy_processed_json(instance):
9595
return json.load(f)
9696

9797

98-
@pytest.fixture(autouse=True)
98+
@pytest.fixture
9999
def mock_processed_json(mocker):
100100
mocked_function = mocker.patch.object(HTMLFile, 'get_processed_json', autospec=True)
101101
mocked_function.side_effect = get_dummy_processed_json
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"config": {
3+
"lang": [
4+
"en"
5+
],
6+
"prebuild_index": false,
7+
"separator": "[\\s\\-]+"
8+
},
9+
"docs": [
10+
{
11+
"location": "",
12+
"text": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs.",
13+
"title": "Read the Docs MkDocs Test Project"
14+
},
15+
{
16+
"location": "#read-the-docs-mkdocs-test-project",
17+
"text": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs.",
18+
"title": "Read the Docs MkDocs Test Project"
19+
},
20+
{
21+
"location": "versions/",
22+
"text": "Versions & Themes There are a number of versions and themes for mkdocs.",
23+
"title": "Versions & Themes"
24+
},
25+
{
26+
"location": "versions/#versions-themes",
27+
"text": "Versions & Themes There are a number of versions and themes for mkdocs.",
28+
"title": "Versions & Themes"
29+
}
30+
]
31+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
{
2+
"docs": [
3+
{
4+
"location": "/",
5+
"text": "Read the Docs MkDocs Test Project\n\n\nThis is a test of \nMkDocs\n as it appears on \nRead the Docs\n.",
6+
"title": "Read the Docs MkDocs Test Project"
7+
},
8+
{
9+
"location": "/#read-the-docs-mkdocs-test-project",
10+
"text": "Read the Docs MkDocs Test Project\n\n\nThis is a test of \nMkDocs\n as it appears on \nRead the Docs\n.",
11+
"title": "Read the Docs MkDocs Test Project"
12+
},
13+
{
14+
"location": "/versions/",
15+
"text": "Versions & Themes\n\n\nThere are a number of versions and themes for mkdocs.",
16+
"title": "Versions & Themes"
17+
},
18+
{
19+
"location": "/versions/#versions-themes",
20+
"text": "Versions & Themes\n\n\nThere are a number of versions and themes for mkdocs.",
21+
"title": "Versions & Themes"
22+
}
23+
]
24+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
[
2+
{
3+
"title": "Read the Docs MkDocs Test Project",
4+
"path": "index.html",
5+
"sections": [
6+
{
7+
"id": "read-the-docs-mkdocs-test-project",
8+
"title": "Read the Docs MkDocs Test Project",
9+
"content": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs."
10+
}
11+
],
12+
"domain_data": {}
13+
},
14+
{
15+
"title": "Versions & Themes",
16+
"path": "versions/index.html",
17+
"sections": [
18+
{
19+
"id": "versions-themes",
20+
"title": "Versions & Themes",
21+
"content": "Versions & Themes There are a number of versions and themes for mkdocs."
22+
}
23+
],
24+
"domain_data": {}
25+
}
26+
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
[
2+
{
3+
"title": "Read the Docs MkDocs Test Project",
4+
"path": "index.html",
5+
"sections": [
6+
{
7+
"id": "read-the-docs-mkdocs-test-project",
8+
"title": "Read the Docs MkDocs Test Project",
9+
"content": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs ."
10+
}
11+
],
12+
"domain_data": {}
13+
},
14+
{
15+
"title": "Versions & Themes",
16+
"path": "versions/index.html",
17+
"sections": [
18+
{
19+
"id": "versions-themes",
20+
"title": "Versions & Themes",
21+
"content": "Versions & Themes There are a number of versions and themes for mkdocs."
22+
}
23+
],
24+
"domain_data": {}
25+
}
26+
]

0 commit comments

Comments
 (0)