Skip to content

Search: use generic parser for MkDocs projects #10516

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
validate_repository_url,
)
from readthedocs.projects.version_handling import determine_stable_version
from readthedocs.search.parsers import GenericParser, MkDocsParser, SphinxParser
from readthedocs.search.parsers import GenericParser, SphinxParser
from readthedocs.storage import build_media_storage
from readthedocs.vcs_support.backends import backend_cls

Expand Down Expand Up @@ -1527,13 +1527,12 @@ class Meta:
def get_processed_json(self):
if (
self.version.documentation_type == constants.GENERIC
or self.version.is_mkdocs_type
or self.project.has_feature(Feature.INDEX_FROM_HTML_FILES)
):
parser_class = GenericParser
elif self.version.is_sphinx_type:
parser_class = SphinxParser
elif self.version.is_mkdocs_type:
parser_class = MkDocsParser
else:
log.warning(
"Invalid documentation type",
Expand Down
85 changes: 0 additions & 85 deletions readthedocs/search/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import itertools
import os
import re
from urllib.parse import urlparse

import orjson as json
import structlog
Expand Down Expand Up @@ -543,87 +542,3 @@ def _clean_body(self, body):
node.decompose()

return body


class MkDocsParser(GenericParser):

    """
    Parser for MkDocs projects.

    Page data is built from the ``search/search_index.json`` file
    found in storage, not from the HTML content of the page.
    """

    def parse(self, page):
        """
        Return the indexed data for ``page``.

        :param page: path of the page, compared against the (normalized)
            ``location`` of each entry in the index file.
        :returns: the dict produced by ``_process_index_file`` when the index
            file exists and yields data for the page; otherwise a dict with
            the page path, an empty title and no sections.
        """
        html_root = self.project.get_storage_path(
            type_='html',
            version_slug=self.version.slug,
            include_file=False,
        )
        try:
            index_path = self.storage.join(html_root, 'search/search_index.json')
            if self.storage.exists(index_path):
                parsed = self._process_index_file(index_path, page=page)
                if parsed:
                    return parsed
        except Exception:
            # Best effort: any failure is logged and the empty
            # fallback below is returned instead.
            log.warning(
                'Unhandled exception during search processing file.',
                page=page,
            )
        return {
            'path': page,
            'title': '',
            'sections': [],
        }

    def _process_index_file(self, json_path, page):
        """
        Read the json index file and collect the entries that belong to ``page``.

        :param json_path: storage path of the index file.
        :param page: path of the page to extract from the index.
        :returns: a dict with ``sections`` (and ``path``/``title`` when an
            entry without fragment is found), or an empty dict when no entry
            of the index matches ``page``.
        :raises IOError: re-raised (after logging) if the file can't be read.
        """
        try:
            with self.storage.open(json_path, mode='r') as index_file:
                raw_index = index_file.read()
        except IOError:
            log.info('Unable to read file.', path=json_path)
            raise

        def as_text(markup):
            # Strip the HTML from the value before cleaning its content.
            return self._parse_content(HTMLParser(markup).text())

        index = json.loads(raw_index)
        result = {}

        for entry in index.get('docs', []):
            location = urlparse(entry.get('location', ''))
            anchor = location.fragment

            # Some old versions of mkdocs
            # index the pages as ``/page.html`` instead of ``page.html``.
            doc_path = location.path.lstrip('/')

            # Directory-like locations point to their index page.
            if not doc_path or doc_path.endswith('/'):
                doc_path += 'index.html'

            # Only entries of the requested page are of interest.
            if doc_path != page:
                continue

            heading = as_text(entry.get('title'))
            body = as_text(entry.get('text'))

            # An entry without a fragment is the page itself.
            if not anchor:
                result['path'] = doc_path
                result['title'] = heading

            # Entries without a fragment are indexed as sections too:
            # this covers pages that don't start with a header,
            # or that don't contain any headers at all.
            result.setdefault('sections', []).append({
                'id': anchor,
                'title': heading,
                'content': body,
            })

        return result
36 changes: 0 additions & 36 deletions readthedocs/search/tests/data/mkdocs/in/search_index.json

This file was deleted.

24 changes: 0 additions & 24 deletions readthedocs/search/tests/data/mkdocs/in/search_index_old.json

This file was deleted.

45 changes: 0 additions & 45 deletions readthedocs/search/tests/data/mkdocs/out/search_index.json

This file was deleted.

34 changes: 0 additions & 34 deletions readthedocs/search/tests/data/mkdocs/out/search_index_old.json

This file was deleted.

69 changes: 0 additions & 69 deletions readthedocs/search/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,44 +37,6 @@ def f(*args, **kwargs):
yield read_mock
return f

@mock.patch.object(BuildMediaFileSystemStorage, 'exists')
@mock.patch.object(BuildMediaFileSystemStorage, 'open')
def test_mkdocs(self, storage_open, storage_exists):
    """
    Check the processed json of pages from a MkDocs version.

    The storage is mocked so that opening a file returns the content of the
    ``mkdocs/in/search_index.json`` fixture, and the processed json of three
    pages is compared with ``mkdocs/out/search_index.json``.
    """
    json_file = data_path / 'mkdocs/in/search_index.json'
    # ``read_text`` closes the fixture file once it has been read,
    # instead of leaving an open handle behind.
    storage_open.side_effect = self._mock_open(json_file.read_text())
    storage_exists.return_value = True

    self.version.documentation_type = MKDOCS
    self.version.save()

    index_file = get(
        HTMLFile,
        project=self.project,
        version=self.version,
        path='index.html',
    )
    versions_file = get(
        HTMLFile,
        project=self.project,
        version=self.version,
        path='versions/index.html',
    )
    no_title_file = get(
        HTMLFile,
        project=self.project,
        version=self.version,
        path='no-title/index.html',
    )

    parsed_json = [
        index_file.processed_json,
        versions_file.processed_json,
        no_title_file.processed_json,
    ]
    # Load the expected output with a context manager so the file is closed.
    with open(data_path / 'mkdocs/out/search_index.json') as expected_file:
        expected_json = json.load(expected_file)
    assert parsed_json == expected_json

@mock.patch.object(BuildMediaFileSystemStorage, 'exists')
@mock.patch.object(BuildMediaFileSystemStorage, 'open')
Expand Down Expand Up @@ -199,37 +161,6 @@ def test_mkdocs_readthedocs_theme(self, storage_open, storage_exists):
expected_json = json.load(open(data_path / 'mkdocs/out/readthedocs-1.1.json'))
assert parsed_json == expected_json

@mock.patch.object(BuildMediaFileSystemStorage, 'exists')
@mock.patch.object(BuildMediaFileSystemStorage, 'open')
def test_mkdocs_old_version(self, storage_open, storage_exists):
    """
    Check the processed json of pages using the old index fixture.

    The storage is mocked so that opening a file returns the content of the
    ``mkdocs/in/search_index_old.json`` fixture, and the processed json of
    two pages is compared with ``mkdocs/out/search_index_old.json``.
    """
    json_file = data_path / 'mkdocs/in/search_index_old.json'
    # ``read_text`` closes the fixture file once it has been read,
    # instead of leaving an open handle behind.
    storage_open.side_effect = self._mock_open(json_file.read_text())
    storage_exists.return_value = True

    self.version.documentation_type = MKDOCS
    self.version.save()

    index_file = get(
        HTMLFile,
        project=self.project,
        version=self.version,
        path='index.html',
    )
    versions_file = get(
        HTMLFile,
        project=self.project,
        version=self.version,
        path='versions/index.html',
    )

    parsed_json = [
        index_file.processed_json,
        versions_file.processed_json,
    ]
    # Load the expected output with a context manager so the file is closed.
    with open(data_path / 'mkdocs/out/search_index_old.json') as expected_file:
        expected_json = json.load(expected_file)
    assert parsed_json == expected_json

@mock.patch.object(BuildMediaFileSystemStorage, 'exists')
@mock.patch.object(BuildMediaFileSystemStorage, 'open')
Expand Down