From af4cedbdd994a0a23bc1aaf699d9f7ee65f760a3 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 6 Jul 2023 11:40:27 -0500 Subject: [PATCH 1/2] Search: use generic parser for MkDocs projects We aren't doing anything special for MkDocs projects, and we don't override their search, so they are an easy target to start testing the generic parser more broadly. --- readthedocs/projects/models.py | 5 +- readthedocs/search/parsers.py | 84 ------------------- .../tests/data/mkdocs/in/search_index.json | 36 -------- .../data/mkdocs/in/search_index_old.json | 24 ------ .../tests/data/mkdocs/out/search_index.json | 45 ---------- .../data/mkdocs/out/search_index_old.json | 34 -------- readthedocs/search/tests/test_parsers.py | 69 --------------- 7 files changed, 2 insertions(+), 295 deletions(-) delete mode 100644 readthedocs/search/tests/data/mkdocs/in/search_index.json delete mode 100644 readthedocs/search/tests/data/mkdocs/in/search_index_old.json delete mode 100644 readthedocs/search/tests/data/mkdocs/out/search_index.json delete mode 100644 readthedocs/search/tests/data/mkdocs/out/search_index_old.json diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index 76dd2bfd4a0..8c1c338283f 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -51,7 +51,7 @@ validate_repository_url, ) from readthedocs.projects.version_handling import determine_stable_version -from readthedocs.search.parsers import GenericParser, MkDocsParser, SphinxParser +from readthedocs.search.parsers import GenericParser, SphinxParser from readthedocs.storage import build_media_storage from readthedocs.vcs_support.backends import backend_cls @@ -1527,13 +1527,12 @@ class Meta: def get_processed_json(self): if ( self.version.documentation_type == constants.GENERIC + or self.version.is_mkdocs_type or self.project.has_feature(Feature.INDEX_FROM_HTML_FILES) ): parser_class = GenericParser elif self.version.is_sphinx_type: parser_class = SphinxParser - elif self.version.is_mkdocs_type: - parser_class = MkDocsParser else: log.warning( "Invalid documentation type", diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py index 199054ac240..c9ced42e227 100644 --- a/readthedocs/search/parsers.py +++ b/readthedocs/search/parsers.py @@ -543,87 +543,3 @@ def _clean_body(self, body): node.decompose() return body - - -class MkDocsParser(GenericParser): - - """ - MkDocs parser. - - Index using the json index file instead of the html content. - """ - - def parse(self, page): - storage_path = self.project.get_storage_path( - type_='html', - version_slug=self.version.slug, - include_file=False, - ) - try: - file_path = self.storage.join(storage_path, 'search/search_index.json') - if self.storage.exists(file_path): - index_data = self._process_index_file(file_path, page=page) - if index_data: - return index_data - except Exception: - log.warning( - 'Unhandled exception during search processing file.', - page=page, - ) - return { - 'path': page, - 'title': '', - 'sections': [], - } - - def _process_index_file(self, json_path, page): - """Reads the json index file and parses it into a structured dict.""" - try: - with self.storage.open(json_path, mode='r') as f: - file_contents = f.read() - except IOError: - log.info('Unable to read file.', path=json_path) - raise - - data = json.loads(file_contents) - page_data = {} - - for section in data.get('docs', []): - parsed_path = urlparse(section.get('location', '')) - fragment = parsed_path.fragment - path = parsed_path.path - - # Some old versions of mkdocs - # index the pages as ``/page.html`` instead of ``page.html``. - path = path.lstrip('/') - - if path == '' or path.endswith('/'): - path += 'index.html' - - if page != path: - continue - - title = self._parse_content( - HTMLParser(section.get('title')).text() - ) - content = self._parse_content( - HTMLParser(section.get('text')).text() - ) - - # If it doesn't have a fragment, - # it means is the page itself. - if not fragment: - page_data.update({ - 'path': path, - 'title': title, - }) - # Content without a fragment need to be indexed as well, - # this happens when the page doesn't start with a header, - # or if it doesn't contain any headers at all. - page_data.setdefault('sections', []).append({ - 'id': fragment, - 'title': title, - 'content': content, - }) - - return page_data diff --git a/readthedocs/search/tests/data/mkdocs/in/search_index.json b/readthedocs/search/tests/data/mkdocs/in/search_index.json deleted file mode 100644 index ef148f1e8ad..00000000000 --- a/readthedocs/search/tests/data/mkdocs/in/search_index.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "config": { - "lang": [ - "en" - ], - "prebuild_index": false, - "separator": "[\\s\\-]+" - }, - "docs": [ - { - "location": "", - "text": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs.", - "title": "Read the Docs MkDocs Test Project" - }, - { - "location": "#read-the-docs-mkdocs-test-project", - "text": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs.", - "title": "Read the Docs MkDocs Test Project" - }, - { - "location": "versions/", - "text": "Versions & Themes There are a number of versions and themes for mkdocs.", - "title": "Versions & Themes" - }, - { - "location": "versions/#versions-themes", - "text": "Versions & Themes There are a number of versions and themes for mkdocs.", - "title": "Versions & Themes" - }, - { - "location": "no-title/", - "text": "No title", - "title": "no-title" - } - ] -} diff --git a/readthedocs/search/tests/data/mkdocs/in/search_index_old.json b/readthedocs/search/tests/data/mkdocs/in/search_index_old.json deleted file mode 100644 index 29a3b63811b..00000000000 --- a/readthedocs/search/tests/data/mkdocs/in/search_index_old.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "docs": [ - { - "location": "/", - "text": "Read the Docs MkDocs Test Project\n\n\nThis is a test of \nMkDocs\n as it appears on \nRead the Docs\n.", - "title": "Read the Docs MkDocs Test Project" - }, - { - "location": "/#read-the-docs-mkdocs-test-project", - "text": "Read the Docs MkDocs Test Project\n\n\nThis is a test of \nMkDocs\n as it appears on \nRead the Docs\n.", - "title": "Read the Docs MkDocs Test Project" - }, - { - "location": "/versions/", - "text": "Versions & Themes\n\n\nThere are a number of versions and themes for mkdocs.", - "title": "Versions & Themes" - }, - { - "location": "/versions/#versions-themes", - "text": "Versions & Themes\n\n\nThere are a number of versions and themes for mkdocs.", - "title": "Versions & Themes" - } - ] -} diff --git a/readthedocs/search/tests/data/mkdocs/out/search_index.json b/readthedocs/search/tests/data/mkdocs/out/search_index.json deleted file mode 100644 index c69e91033f2..00000000000 --- a/readthedocs/search/tests/data/mkdocs/out/search_index.json +++ /dev/null @@ -1,45 +0,0 @@ -[ - { - "title": "Read the Docs MkDocs Test Project", - "path": "index.html", - "sections": [ - { - "id": "", - "title": "Read the Docs MkDocs Test Project", - "content": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs." - }, - { - "id": "read-the-docs-mkdocs-test-project", - "title": "Read the Docs MkDocs Test Project", - "content": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs." - } - ] - }, - { - "title": "Versions & Themes", - "path": "versions/index.html", - "sections": [ - { - "id": "", - "title": "Versions & Themes", - "content": "Versions & Themes There are a number of versions and themes for mkdocs." - }, - { - "id": "versions-themes", - "title": "Versions & Themes", - "content": "Versions & Themes There are a number of versions and themes for mkdocs." - } - ] - }, - { - "title": "no-title", - "path": "no-title/index.html", - "sections": [ - { - "id": "", - "title": "no-title", - "content": "No title" - } - ] - } -] diff --git a/readthedocs/search/tests/data/mkdocs/out/search_index_old.json b/readthedocs/search/tests/data/mkdocs/out/search_index_old.json deleted file mode 100644 index 0c0c6f39aa5..00000000000 --- a/readthedocs/search/tests/data/mkdocs/out/search_index_old.json +++ /dev/null @@ -1,34 +0,0 @@ -[ - { - "title": "Read the Docs MkDocs Test Project", - "path": "index.html", - "sections": [ - { - "id": "", - "title": "Read the Docs MkDocs Test Project", - "content": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs ." - }, - { - "id": "read-the-docs-mkdocs-test-project", - "title": "Read the Docs MkDocs Test Project", - "content": "Read the Docs MkDocs Test Project This is a test of MkDocs as it appears on Read the Docs ." - } - ] - }, - { - "title": "Versions & Themes", - "path": "versions/index.html", - "sections": [ - { - "id": "", - "title": "Versions & Themes", - "content": "Versions & Themes There are a number of versions and themes for mkdocs." - }, - { - "id": "versions-themes", - "title": "Versions & Themes", - "content": "Versions & Themes There are a number of versions and themes for mkdocs." - } - ] - } -] diff --git a/readthedocs/search/tests/test_parsers.py b/readthedocs/search/tests/test_parsers.py index 59e7fcb5e69..1e129f752e7 100644 --- a/readthedocs/search/tests/test_parsers.py +++ b/readthedocs/search/tests/test_parsers.py @@ -37,44 +37,6 @@ def f(*args, **kwargs): yield read_mock return f - @mock.patch.object(BuildMediaFileSystemStorage, 'exists') - @mock.patch.object(BuildMediaFileSystemStorage, 'open') - def test_mkdocs(self, storage_open, storage_exists): - json_file = data_path / 'mkdocs/in/search_index.json' - storage_open.side_effect = self._mock_open( - json_file.open().read() - ) - storage_exists.return_value = True - - self.version.documentation_type = MKDOCS - self.version.save() - - index_file = get( - HTMLFile, - project=self.project, - version=self.version, - path='index.html', - ) - versions_file = get( - HTMLFile, - project=self.project, - version=self.version, - path='versions/index.html', - ) - no_title_file = get( - HTMLFile, - project=self.project, - version=self.version, - path='no-title/index.html', - ) - - parsed_json = [ - index_file.processed_json, - versions_file.processed_json, - no_title_file.processed_json, - ] - expected_json = json.load(open(data_path / 'mkdocs/out/search_index.json')) - assert parsed_json == expected_json @mock.patch.object(BuildMediaFileSystemStorage, 'exists') @mock.patch.object(BuildMediaFileSystemStorage, 'open') @@ -199,37 +161,6 @@ def test_mkdocs_readthedocs_theme(self, storage_open, storage_exists): expected_json = json.load(open(data_path / 'mkdocs/out/readthedocs-1.1.json')) assert parsed_json == expected_json - @mock.patch.object(BuildMediaFileSystemStorage, 'exists') - @mock.patch.object(BuildMediaFileSystemStorage, 'open') - def test_mkdocs_old_version(self, storage_open, storage_exists): - json_file = data_path / 'mkdocs/in/search_index_old.json' - storage_open.side_effect = self._mock_open( - json_file.open().read() - ) - storage_exists.return_value = True - - self.version.documentation_type = MKDOCS - self.version.save() - - index_file = get( - HTMLFile, - project=self.project, - version=self.version, - path='index.html', - ) - versions_file = get( - HTMLFile, - project=self.project, - version=self.version, - path='versions/index.html', - ) - - parsed_json = [ - index_file.processed_json, - versions_file.processed_json, - ] - expected_json = json.load(open(data_path / 'mkdocs/out/search_index_old.json')) - assert parsed_json == expected_json @mock.patch.object(BuildMediaFileSystemStorage, 'exists') @mock.patch.object(BuildMediaFileSystemStorage, 'open') From a03bb802442777a76d73617df9ab9be3e5043506 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 6 Jul 2023 12:04:00 -0500 Subject: [PATCH 2/2] Linter --- readthedocs/search/parsers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py index c9ced42e227..28d6108815f 100644 --- a/readthedocs/search/parsers.py +++ b/readthedocs/search/parsers.py @@ -3,7 +3,6 @@ import itertools import os import re -from urllib.parse import urlparse import orjson as json import structlog