Skip to content

Commit c9f44fa

Browse files
authored
Search: use generic parser for MkDocs projects (#10516)
We aren't doing anything special for MkDocs projects, and we don't override their search, so they are an easy target to start testing the generic parser more broadly.
1 parent 84f889a commit c9f44fa

File tree

7 files changed

+2
-296
lines changed

7 files changed

+2
-296
lines changed

readthedocs/projects/models.py

+2 -3
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,7 @@
5151
validate_repository_url,
5252
)
5353
from readthedocs.projects.version_handling import determine_stable_version
54-
from readthedocs.search.parsers import GenericParser, MkDocsParser, SphinxParser
54+
from readthedocs.search.parsers import GenericParser, SphinxParser
5555
from readthedocs.storage import build_media_storage
5656
from readthedocs.vcs_support.backends import backend_cls
5757

@@ -1527,13 +1527,12 @@ class Meta:
15271527
def get_processed_json(self):
15281528
if (
15291529
self.version.documentation_type == constants.GENERIC
1530+
or self.version.is_mkdocs_type
15301531
or self.project.has_feature(Feature.INDEX_FROM_HTML_FILES)
15311532
):
15321533
parser_class = GenericParser
15331534
elif self.version.is_sphinx_type:
15341535
parser_class = SphinxParser
1535-
elif self.version.is_mkdocs_type:
1536-
parser_class = MkDocsParser
15371536
else:
15381537
log.warning(
15391538
"Invalid documentation type",

readthedocs/search/parsers.py

-85
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33
import itertools
44
import os
55
import re
6-
from urllib.parse import urlparse
76

87
import orjson as json
98
import structlog
@@ -543,87 +542,3 @@ def _clean_body(self, body):
543542
node.decompose()
544543

545544
return body
546-
547-
548-
class MkDocsParser(GenericParser):
549-
550-
"""
551-
MkDocs parser.
552-
553-
Index using the json index file instead of the html content.
554-
"""
555-
556-
def parse(self, page):
557-
storage_path = self.project.get_storage_path(
558-
type_='html',
559-
version_slug=self.version.slug,
560-
include_file=False,
561-
)
562-
try:
563-
file_path = self.storage.join(storage_path, 'search/search_index.json')
564-
if self.storage.exists(file_path):
565-
index_data = self._process_index_file(file_path, page=page)
566-
if index_data:
567-
return index_data
568-
except Exception:
569-
log.warning(
570-
'Unhandled exception during search processing file.',
571-
page=page,
572-
)
573-
return {
574-
'path': page,
575-
'title': '',
576-
'sections': [],
577-
}
578-
579-
def _process_index_file(self, json_path, page):
580-
"""Reads the json index file and parses it into a structured dict."""
581-
try:
582-
with self.storage.open(json_path, mode='r') as f:
583-
file_contents = f.read()
584-
except IOError:
585-
log.info('Unable to read file.', path=json_path)
586-
raise
587-
588-
data = json.loads(file_contents)
589-
page_data = {}
590-
591-
for section in data.get('docs', []):
592-
parsed_path = urlparse(section.get('location', ''))
593-
fragment = parsed_path.fragment
594-
path = parsed_path.path
595-
596-
# Some old versions of mkdocs
597-
# index the pages as ``/page.html`` instead of ``page.html``.
598-
path = path.lstrip('/')
599-
600-
if path == '' or path.endswith('/'):
601-
path += 'index.html'
602-
603-
if page != path:
604-
continue
605-
606-
title = self._parse_content(
607-
HTMLParser(section.get('title')).text()
608-
)
609-
content = self._parse_content(
610-
HTMLParser(section.get('text')).text()
611-
)
612-
613-
# If it doesn't have a fragment,
614-
# it means is the page itself.
615-
if not fragment:
616-
page_data.update({
617-
'path': path,
618-
'title': title,
619-
})
620-
# Content without a fragment need to be indexed as well,
621-
# this happens when the page doesn't start with a header,
622-
# or if it doesn't contain any headers at all.
623-
page_data.setdefault('sections', []).append({
624-
'id': fragment,
625-
'title': title,
626-
'content': content,
627-
})
628-
629-
return page_data

readthedocs/search/tests/data/mkdocs/in/search_index.json

-36
This file was deleted.

readthedocs/search/tests/data/mkdocs/in/search_index_old.json

-24
This file was deleted.

readthedocs/search/tests/data/mkdocs/out/search_index.json

-45
This file was deleted.

readthedocs/search/tests/data/mkdocs/out/search_index_old.json

-34
This file was deleted.

readthedocs/search/tests/test_parsers.py

-69
Original file line number | Diff line number | Diff line change
@@ -37,44 +37,6 @@ def f(*args, **kwargs):
3737
yield read_mock
3838
return f
3939

40-
@mock.patch.object(BuildMediaFileSystemStorage, 'exists')
41-
@mock.patch.object(BuildMediaFileSystemStorage, 'open')
42-
def test_mkdocs(self, storage_open, storage_exists):
43-
json_file = data_path / 'mkdocs/in/search_index.json'
44-
storage_open.side_effect = self._mock_open(
45-
json_file.open().read()
46-
)
47-
storage_exists.return_value = True
48-
49-
self.version.documentation_type = MKDOCS
50-
self.version.save()
51-
52-
index_file = get(
53-
HTMLFile,
54-
project=self.project,
55-
version=self.version,
56-
path='index.html',
57-
)
58-
versions_file = get(
59-
HTMLFile,
60-
project=self.project,
61-
version=self.version,
62-
path='versions/index.html',
63-
)
64-
no_title_file = get(
65-
HTMLFile,
66-
project=self.project,
67-
version=self.version,
68-
path='no-title/index.html',
69-
)
70-
71-
parsed_json = [
72-
index_file.processed_json,
73-
versions_file.processed_json,
74-
no_title_file.processed_json,
75-
]
76-
expected_json = json.load(open(data_path / 'mkdocs/out/search_index.json'))
77-
assert parsed_json == expected_json
7840

7941
@mock.patch.object(BuildMediaFileSystemStorage, 'exists')
8042
@mock.patch.object(BuildMediaFileSystemStorage, 'open')
@@ -199,37 +161,6 @@ def test_mkdocs_readthedocs_theme(self, storage_open, storage_exists):
199161
expected_json = json.load(open(data_path / 'mkdocs/out/readthedocs-1.1.json'))
200162
assert parsed_json == expected_json
201163

202-
@mock.patch.object(BuildMediaFileSystemStorage, 'exists')
203-
@mock.patch.object(BuildMediaFileSystemStorage, 'open')
204-
def test_mkdocs_old_version(self, storage_open, storage_exists):
205-
json_file = data_path / 'mkdocs/in/search_index_old.json'
206-
storage_open.side_effect = self._mock_open(
207-
json_file.open().read()
208-
)
209-
storage_exists.return_value = True
210-
211-
self.version.documentation_type = MKDOCS
212-
self.version.save()
213-
214-
index_file = get(
215-
HTMLFile,
216-
project=self.project,
217-
version=self.version,
218-
path='index.html',
219-
)
220-
versions_file = get(
221-
HTMLFile,
222-
project=self.project,
223-
version=self.version,
224-
path='versions/index.html',
225-
)
226-
227-
parsed_json = [
228-
index_file.processed_json,
229-
versions_file.processed_json,
230-
]
231-
expected_json = json.load(open(data_path / 'mkdocs/out/search_index_old.json'))
232-
assert parsed_json == expected_json
233164

234165
@mock.patch.object(BuildMediaFileSystemStorage, 'exists')
235166
@mock.patch.object(BuildMediaFileSystemStorage, 'open')

0 commit comments

Comments
 (0)