diff --git a/readthedocs/builds/models.py b/readthedocs/builds/models.py index f4514bee815..ffc34333781 100644 --- a/readthedocs/builds/models.py +++ b/readthedocs/builds/models.py @@ -299,6 +299,17 @@ def last_build(self): def latest_build(self): return self.builds.order_by("-date").first() + @property + def latest_successful_build(self): + return ( + self.builds.filter( + state=BUILD_STATE_FINISHED, + success=True, + ) + .order_by("-date") + .first() + ) + @property def config(self): """ diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py new file mode 100644 index 00000000000..65c3fd22ee1 --- /dev/null +++ b/readthedocs/filetreediff/__init__.py @@ -0,0 +1,111 @@ +""" +Module for the file tree diff feature (FTD). + +This feature is used to compare the files of two versions of a project. + +The process is as follows: + +- A build is triggered for a version. +- A task is triggered after the build has succeeded + to generate a manifest of the files of the version. + Currently, we only consider the latest version and pull request previews. +- The manifest contains the hash of the main content of each file. + Only HTML files are considered for now. +- The manifest is stored in the diff media storage. +- Then our application can compare the manifest to get a list of added, + deleted, and modified files between two versions. +""" + +import json + +from readthedocs.builds.models import Version +from readthedocs.filetreediff.dataclasses import FileTreeDiff, FileTreeDiffManifest +from readthedocs.projects.constants import MEDIA_TYPE_DIFF +from readthedocs.storage import build_media_storage + +MANIFEST_FILE_NAME = "manifest.json" + + +def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: + """ + Get the file tree diff between two versions. + + If any of the versions don't have a manifest, return None. + If the latest build of any of the versions is different from the manifest build, + the diff is marked as outdated. The client is responsible for deciding + how to handle this case. + + Set operations are used to calculate the added, deleted, and modified files. + To get the modified files, we compare the main content hash of each common file. + If there are no changes between the versions, all lists will be empty. + """ + outdated = False + manifests: list[FileTreeDiffManifest] = [] + for version in (version_a, version_b): + manifest = get_manifest(version) + if not manifest: + return None + + latest_build = version.latest_successful_build + if not latest_build: + return None + + if latest_build.id != manifest.build.id: + outdated = True + + manifests.append(manifest) + + # pylint: disable=unbalanced-tuple-unpacking + version_a_manifest, version_b_manifest = manifests + files_a = set(version_a_manifest.files.keys()) + files_b = set(version_b_manifest.files.keys()) + + files_added = list(files_a - files_b) + files_deleted = list(files_b - files_a) + files_modified = [] + for file_path in files_a & files_b: + file_a = version_a_manifest.files[file_path] + file_b = version_b_manifest.files[file_path] + if file_a.main_content_hash != file_b.main_content_hash: + files_modified.append(file_path) + + return FileTreeDiff( + added=files_added, + deleted=files_deleted, + modified=files_modified, + outdated=outdated, + ) + + +def get_manifest(version: Version) -> FileTreeDiffManifest | None: + """ + Get the file manifest for a version. + + If the manifest file does not exist, return None. + """ + storage_path = version.project.get_storage_path( + type_=MEDIA_TYPE_DIFF, + version_slug=version.slug, + include_file=False, + version_type=version.type, + ) + manifest_path = build_media_storage.join(storage_path, MANIFEST_FILE_NAME) + try: + with build_media_storage.open(manifest_path) as manifest_file: + manifest = json.load(manifest_file) + except FileNotFoundError: + return None + + return FileTreeDiffManifest.from_dict(manifest) + + +def write_manifest(version: Version, manifest: FileTreeDiffManifest): + storage_path = version.project.get_storage_path( + type_=MEDIA_TYPE_DIFF, + version_slug=version.slug, + include_file=False, + version_type=version.type, + ) + manifest_path = build_media_storage.join(storage_path, MANIFEST_FILE_NAME) + with build_media_storage.open(manifest_path, "w") as f: + json.dump(manifest.as_dict(), f) diff --git a/readthedocs/filetreediff/dataclasses.py b/readthedocs/filetreediff/dataclasses.py new file mode 100644 index 00000000000..d3e5186b79a --- /dev/null +++ b/readthedocs/filetreediff/dataclasses.py @@ -0,0 +1,64 @@ +from dataclasses import asdict, dataclass + + +@dataclass(slots=True) +class FileTreeDiffBuild: + + """The build associated with a file tree manifest.""" + + id: int + + +@dataclass(slots=True) +class FileTreeDiffFile: + + """A file in a file tree manifest.""" + + path: str + main_content_hash: str + + +@dataclass(slots=True) +class FileTreeDiffManifest: + + """A list of files and the build associated with them.""" + + files: dict[str, FileTreeDiffFile] + build: FileTreeDiffBuild + + def __init__(self, build_id: int, files: list[FileTreeDiffFile]): + self.build = FileTreeDiffBuild(id=build_id) + self.files = {file.path: file for file in files} + + @classmethod + def from_dict(cls, data: dict) -> "FileTreeDiffManifest": + """ + Create a FileTreeManifest from a dictionary. + + The dictionary should follow the same structure as the one returned by + converting the object to a dictionary using the `as_dict` method. + """ + build_id = data["build"]["id"] + files = [ + FileTreeDiffFile(path=path, main_content_hash=file["main_content_hash"]) + for path, file in data["files"].items() + ] + return cls(build_id, files) + + def as_dict(self) -> dict: + """Convert the object to a dictionary.""" + return asdict(self) + + +@dataclass +class FileTreeDiff: + + """Difference between two file tree manifests.""" + + added: list[str] + deleted: list[str] + modified: list[str] + outdated: bool = False + + def has_changes(self) -> bool: + return bool(self.added or self.deleted or self.modified) diff --git a/readthedocs/filetreediff/tests/test_filetreediff.py b/readthedocs/filetreediff/tests/test_filetreediff.py new file mode 100644 index 00000000000..e785063c1bb --- /dev/null +++ b/readthedocs/filetreediff/tests/test_filetreediff.py @@ -0,0 +1,134 @@ +import json +from contextlib import contextmanager +from unittest import mock + +from django.test import TestCase +from django_dynamic_fixture import get + +from readthedocs.builds.constants import BUILD_STATE_FINISHED, LATEST +from readthedocs.builds.models import Build, Version +from readthedocs.filetreediff import get_diff +from readthedocs.projects.models import Project +from readthedocs.rtd_tests.storage import BuildMediaFileSystemStorageTest + + +# We are overriding the storage class instead of using RTD_BUILD_MEDIA_STORAGE, +# since the setting is evaluated just once (first test to use the storage +# backend will set it for the whole test suite). +@mock.patch( + "readthedocs.filetreediff.build_media_storage", + new=BuildMediaFileSystemStorageTest(), +) +class TestsFileTreeDiff(TestCase): + def setUp(self): + self.project = get(Project) + self.version_a = self.project.versions.get(slug=LATEST) + self.build_a = get( + Build, + project=self.project, + version=self.version_a, + state=BUILD_STATE_FINISHED, + success=True, + ) + self.version_b = get( + Version, + project=self.project, + slug="v2", + active=True, + built=True, + ) + self.build_b = get( + Build, + project=self.project, + version=self.version_b, + state=BUILD_STATE_FINISHED, + success=True, + ) + + def _mock_open(self, content): + @contextmanager + def f(*args, **kwargs): + read_mock = mock.MagicMock() + read_mock.read.return_value = content + yield read_mock + + return f + + def _mock_manifest(self, build_id: int, files: dict[str, str]): + return self._mock_open( + json.dumps( + { + "build": {"id": build_id}, + "files": { + file_path: {"main_content_hash": main_content_hash} + for file_path, main_content_hash in files.items() + }, + } + ) + ) + + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + def test_diff_no_changes(self, storage_open): + files_a = { + "index.html": "hash1", + "tutorials/index.html": "hash2", + } + storage_open.side_effect = [ + self._mock_manifest(self.build_a.id, files_a)(), + self._mock_manifest(self.build_b.id, files_a)(), + ] + diff = get_diff(self.version_a, self.version_b) + assert diff.added == [] + assert diff.deleted == [] + assert diff.modified == [] + assert not diff.outdated + + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + def test_diff_changes(self, storage_open): + files_a = { + "index.html": "hash1", + "tutorials/index.html": "hash2", + "new-file.html": "hash-new", + } + files_b = { + "index.html": "hash1", + "tutorials/index.html": "hash-changed", + "deleted.html": "hash-deleted", + } + storage_open.side_effect = [ + self._mock_manifest(self.build_a.id, files_a)(), + self._mock_manifest(self.build_b.id, files_b)(), + ] + diff = get_diff(self.version_a, self.version_b) + assert diff.added == ["new-file.html"] + assert diff.deleted == ["deleted.html"] + assert diff.modified == ["tutorials/index.html"] + assert not diff.outdated + + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + def test_missing_manifest(self, storage_open): + storage_open.side_effect = FileNotFoundError + diff = get_diff(self.version_a, self.version_b) + assert diff is None + + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + def test_outdated_diff(self, storage_open): + files_a = { + "index.html": "hash1", + "tutorials/index.html": "hash2", + "new-file.html": "hash-new", + } + files_b = { + "index.html": "hash1", + "tutorials/index.html": "hash-changed", + "deleted.html": "hash-deleted", + } + storage_open.side_effect = [ + self._mock_manifest(self.build_a.id + 5, files_a)(), + self._mock_manifest(self.build_b.id + 5, files_b)(), + ] + diff = get_diff(self.version_a, self.version_b) + assert diff.added == ["new-file.html"] + assert diff.deleted == ["deleted.html"] + assert diff.modified == ["tutorials/index.html"] + assert diff.outdated diff --git a/readthedocs/projects/constants.py b/readthedocs/projects/constants.py index d1401c171e6..eae06bd9e8f 100644 --- a/readthedocs/projects/constants.py +++ b/readthedocs/projects/constants.py @@ -34,6 +34,7 @@ MEDIA_TYPE_EPUB = "epub" MEDIA_TYPE_HTMLZIP = "htmlzip" MEDIA_TYPE_JSON = "json" +MEDIA_TYPE_DIFF = "diff" DOWNLOADABLE_MEDIA_TYPES = ( MEDIA_TYPE_PDF, MEDIA_TYPE_EPUB, @@ -45,6 +46,7 @@ MEDIA_TYPE_EPUB, MEDIA_TYPE_HTMLZIP, MEDIA_TYPE_JSON, + MEDIA_TYPE_DIFF, ) BUILD_COMMANDS_OUTPUT_PATH = "_readthedocs/" diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index fbb74fa6342..7f69ce69d42 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -1895,6 +1895,7 @@ def add_features(sender, **kwargs): RESOLVE_PROJECT_FROM_HEADER = "resolve_project_from_header" USE_PROXIED_APIS_WITH_PREFIX = "use_proxied_apis_with_prefix" ALLOW_VERSION_WARNING_BANNER = "allow_version_warning_banner" + GENERATE_MANIFEST_FOR_FILE_TREE_DIFF = "generate_manifest_for_file_tree_diff" # Versions sync related features SKIP_SYNC_TAGS = "skip_sync_tags" @@ -1955,6 +1956,10 @@ def add_features(sender, **kwargs): ALLOW_VERSION_WARNING_BANNER, _("Dashboard: Allow project to use the version warning banner."), ), + ( + GENERATE_MANIFEST_FOR_FILE_TREE_DIFF, + _("Build: Generate a file manifest for file tree diff."), + ), # Versions sync related features ( SKIP_SYNC_BRANCHES, diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index 1a76195e8a3..d95ea854c0c 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -2,9 +2,11 @@ import structlog -from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL +from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL, LATEST from readthedocs.builds.models import Build, Version -from readthedocs.projects.models import HTMLFile, Project +from readthedocs.filetreediff import write_manifest +from readthedocs.filetreediff.dataclasses import FileTreeDiffFile, FileTreeDiffManifest +from readthedocs.projects.models import Feature, HTMLFile, Project from readthedocs.projects.signals import files_changed from readthedocs.search.documents import PageDocument from readthedocs.search.utils import index_objects, remove_indexed_files @@ -120,7 +122,32 @@ def collect(self, sync_id: int): self.version.imported_files.exclude(build=sync_id).delete() -def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=None): +class FileManifestIndexer(Indexer): + def __init__(self, version: Version, build: Build): + self.version = version + self.build = build + self._hashes = {} + + def process(self, html_file: HTMLFile, sync_id: int): + self._hashes[html_file.path] = html_file.processed_json["main_content_hash"] + + def collect(self, sync_id: int): + manifest = FileTreeDiffManifest( + build_id=self.build.id, + files=[ + FileTreeDiffFile(path=path, main_content_hash=hash) + for path, hash in self._hashes.items() + ], + ) + write_manifest(self.version, manifest) + + +def _get_indexers(*, version: Version, build: Build, search_index_name=None): + build_config = build.config or {} + search_config = build_config.get("search", {}) + search_ranking = search_config.get("ranking", {}) + search_ignore = search_config.get("ignore", []) + indexers = [] # NOTE: The search indexer must be before the index file indexer. # This is because saving the objects in the DB will give them an id, @@ -136,6 +163,20 @@ def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=N search_index_name=search_index_name, ) indexers.append(search_indexer) + + # File tree diff is under a feature flag for now, + # and we only allow to compare PR previous against the latest version. + has_feature = version.project.has_feature( + Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF + ) + create_manifest = has_feature and (version.is_external or version.slug == LATEST) + if create_manifest: + file_manifest_indexer = FileManifestIndexer( + version=version, + build=build, + ) + indexers.append(file_manifest_indexer) + index_file_indexer = IndexFileIndexer( project=version.project, version=version, @@ -230,18 +271,12 @@ def index_build(build_id): build_id=build.id, ) - build_config = build.config or {} - search_config = build_config.get("search", {}) - search_ranking = search_config.get("ranking", {}) - search_ignore = search_config.get("ignore", []) - try: indexers = _get_indexers( version=version, - search_ranking=search_ranking, - search_ignore=search_ignore, + build=build, ) - _process_files(version=version, indexers=indexers) + return _process_files(version=version, indexers=indexers) except Exception: log.exception("Failed to index build") @@ -280,17 +315,10 @@ def reindex_version(version_id, search_index_name=None): version_slug=version.slug, build_id=latest_successful_build.id, ) - - build_config = latest_successful_build.config or {} - search_config = build_config.get("search", {}) - search_ranking = search_config.get("ranking", {}) - search_ignore = search_config.get("ignore", []) - try: indexers = _get_indexers( version=version, - search_ranking=search_ranking, - search_ignore=search_ignore, + build=latest_successful_build, search_index_name=search_index_name, ) _process_files(version=version, indexers=indexers) diff --git a/readthedocs/proxito/tests/responses/v1.json b/readthedocs/proxito/tests/responses/v1.json index efd1e1107d6..27dc0c2131c 100644 --- a/readthedocs/proxito/tests/responses/v1.json +++ b/readthedocs/proxito/tests/responses/v1.json @@ -150,6 +150,9 @@ "base_host": "", "base_page": "" }, + "filetreediff": { + "enabled": false + }, "flyout": { "enabled": true }, diff --git a/readthedocs/proxito/tests/test_hosting.py b/readthedocs/proxito/tests/test_hosting.py index 70de2902d1e..ee1d53f6d83 100644 --- a/readthedocs/proxito/tests/test_hosting.py +++ b/readthedocs/proxito/tests/test_hosting.py @@ -2,6 +2,7 @@ import json from pathlib import Path +from unittest import mock import django_dynamic_fixture as fixture import pytest @@ -9,9 +10,11 @@ from django.test import TestCase, override_settings from django.urls import reverse from django.utils import timezone +from django_dynamic_fixture import get -from readthedocs.builds.constants import LATEST +from readthedocs.builds.constants import BUILD_STATE_FINISHED, EXTERNAL, LATEST from readthedocs.builds.models import Build, Version +from readthedocs.filetreediff.dataclasses import FileTreeDiffFile, FileTreeDiffManifest from readthedocs.projects.constants import ( ADDONS_FLYOUT_SORTING_ALPHABETICALLY, ADDONS_FLYOUT_SORTING_CALVER, @@ -21,7 +24,7 @@ PUBLIC, SINGLE_VERSION_WITHOUT_TRANSLATIONS, ) -from readthedocs.projects.models import AddonsConfig, Domain, Project +from readthedocs.projects.models import AddonsConfig, Domain, Feature, Project @override_settings( @@ -30,6 +33,7 @@ PUBLIC_DOMAIN_USES_HTTPS=True, GLOBAL_ANALYTICS_CODE=None, RTD_ALLOW_ORGANIZATIONS=False, + RTD_EXTERNAL_VERSION_DOMAIN="dev.readthedocs.build", ) @pytest.mark.proxito class TestReadTheDocsConfigJson(TestCase): @@ -845,6 +849,90 @@ def test_number_of_queries_url_translations(self): ) assert r.status_code == 200 + @mock.patch("readthedocs.filetreediff.get_manifest") + def test_file_tree_diff(self, get_manifest): + get( + Feature, + projects=[self.project], + feature_id=Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF, + ) + pr_version = get( + Version, + project=self.project, + slug="123", + active=True, + built=True, + privacy_level=PUBLIC, + type=EXTERNAL, + ) + pr_build = get( + Build, + project=self.project, + version=pr_version, + commit="a1b2c3", + state=BUILD_STATE_FINISHED, + success=True, + ) + get_manifest.side_effect = [ + FileTreeDiffManifest( + build_id=pr_build.id, + files=[ + FileTreeDiffFile( + path="index.html", + main_content_hash="hash1", + ), + FileTreeDiffFile( + path="tutorial/index.html", + main_content_hash="hash1", + ), + FileTreeDiffFile( + path="new-file.html", + main_content_hash="hash1", + ), + ], + ), + FileTreeDiffManifest( + build_id=self.build.id, + files=[ + FileTreeDiffFile( + path="index.html", + main_content_hash="hash1", + ), + FileTreeDiffFile( + path="tutorial/index.html", + main_content_hash="hash-changed", + ), + FileTreeDiffFile( + path="deleted.html", + main_content_hash="hash-deleted", + ), + ], + ), + ] + r = self.client.get( + reverse("proxito_readthedocs_docs_addons"), + { + "url": "https://project--123.dev.readthedocs.build/en/123/", + "client-version": "0.6.0", + "api-version": "1.0.0", + }, + secure=True, + headers={ + "host": "project--123.dev.readthedocs.build", + }, + ) + assert r.status_code == 200 + filetreediff_response = r.json()["addons"]["filetreediff"] + assert filetreediff_response == { + "enabled": True, + "outdated": False, + "diff": { + "added": [{"file": "new-file.html"}], + "deleted": [{"file": "deleted.html"}], + "modified": [{"file": "tutorial/index.html"}], + }, + } + def test_version_ordering(self): for slug in ["1.0", "1.2", "1.12", "2.0", "2020.01.05", "a-slug", "z-slug"]: fixture.get( diff --git a/readthedocs/proxito/views/hosting.py b/readthedocs/proxito/views/hosting.py index 1204b7f9c9c..370e2e4a0c8 100644 --- a/readthedocs/proxito/views/hosting.py +++ b/readthedocs/proxito/views/hosting.py @@ -23,13 +23,14 @@ from readthedocs.core.resolver import Resolver from readthedocs.core.unresolver import UnresolverError, unresolver from readthedocs.core.utils.extend import SettingsOverrideObject +from readthedocs.filetreediff import get_diff from readthedocs.projects.constants import ( ADDONS_FLYOUT_SORTING_CALVER, ADDONS_FLYOUT_SORTING_CUSTOM_PATTERN, ADDONS_FLYOUT_SORTING_PYTHON_PACKAGING, ADDONS_FLYOUT_SORTING_SEMVER_READTHEDOCS_COMPATIBLE, ) -from readthedocs.projects.models import AddonsConfig, Project +from readthedocs.projects.models import AddonsConfig, Feature, Project from readthedocs.projects.version_handling import ( comparable_version, sort_versions_calver, @@ -311,6 +312,15 @@ def _get_versions(self, request, project): include_hidden=False, ) + def _has_permission(self, request, version): + """ + Check if user from the request is authorized to access `version`. + + This is mainly to be overridden in .com to make use of + the auth backends in the proxied API. + """ + return True + def _v1(self, project, version, build, filename, url, request): """ Initial JSON data structure consumed by the JavaScript client. @@ -510,11 +520,20 @@ def _v1(self, project, version, build, filename, url, request): "trigger": "Slash", # Could be something like "Ctrl + D" }, }, + "filetreediff": { + "enabled": False, + }, }, } - # Show the subprojects filter on the parent project and subproject if version: + response = self._get_filetreediff_response( + request=request, project=project, version=version + ) + if response: + data["addons"]["filetreediff"].update(response) + + # Show the subprojects filter on the parent project and subproject # TODO: Remove these queries and try to find a way to get this data # from the resolver, which has already done these queries. # TODO: Replace this fixed filters with the work proposed in @@ -594,6 +613,39 @@ def _v1(self, project, version, build, filename, url, request): return data + def _get_filetreediff_response(self, *, request, project, version): + """ + Get the file tree diff response for the given version. + + This response is only enabled for external versions, + we do the comparison between the current version and the latest version. + """ + if not version.is_external: + return None + + if not project.has_feature(Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF): + return None + + latest_version = project.get_latest_version() + if not latest_version or not self._has_permission( + request=request, version=latest_version + ): + return None + + diff = get_diff(version_a=version, version_b=latest_version) + if not diff: + return None + + return { + "enabled": True, + "outdated": diff.outdated, + "diff": { + "added": [{"file": file} for file in diff.added], + "deleted": [{"file": file} for file in diff.deleted], + "modified": [{"file": file} for file in diff.modified], + }, + } + def _v2(self, project, version, build, filename, url, user): return { "api_version": "2", diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index 77ce20b8ada..fb1a9474253 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -1,14 +1,18 @@ import os +from unittest import mock import pytest from django.conf import settings from django.core.files.storage import get_storage_class from django.test import TestCase from django.test.utils import override_settings +from django_dynamic_fixture import get -from readthedocs.builds.constants import EXTERNAL -from readthedocs.projects.models import HTMLFile, ImportedFile, Project -from readthedocs.projects.tasks.search import _get_indexers, _process_files +from readthedocs.builds.constants import BUILD_STATE_FINISHED, EXTERNAL, LATEST +from readthedocs.builds.models import Build, Version +from readthedocs.filetreediff.dataclasses import FileTreeDiffFile, FileTreeDiffManifest +from readthedocs.projects.models import Feature, HTMLFile, ImportedFile, Project +from readthedocs.projects.tasks.search import index_build from readthedocs.search.documents import PageDocument base_dir = os.path.dirname(os.path.dirname(__file__)) @@ -17,17 +21,22 @@ @pytest.mark.search @override_settings(ELASTICSEARCH_DSL_AUTOSYNC=True) class ImportedFileTests(TestCase): - fixtures = ["eric", "test_data"] - storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)() def setUp(self): - self.project = Project.objects.get(slug="pip") - self.version = self.project.versions.first() + self.project = get(Project) + self.version = self.project.versions.get(slug=LATEST) + self.build = get( + Build, + project=self.project, + version=self.version, + state=BUILD_STATE_FINISHED, + success=True, + ) self.test_dir = os.path.join(base_dir, "files") with override_settings(DOCROOT=self.test_dir): - self._copy_storage_dir() + self._copy_storage_dir(self.version) self._create_index() @@ -42,26 +51,15 @@ def tearDown(self): # Delete index PageDocument._index.delete(ignore=404) - def _manage_imported_files(self, version, search_ranking=None, search_ignore=None): - """Helper function for the tests to create and sync ImportedFiles.""" - search_ranking = search_ranking or {} - search_ignore = search_ignore or [] - indexers = _get_indexers( - version=version, - search_ranking=search_ranking, - search_ignore=search_ignore, - ) - return _process_files(version=version, indexers=indexers) - - def _copy_storage_dir(self): + def _copy_storage_dir(self, version): """Copy the test directory (rtd_tests/files) to storage""" self.storage.copy_directory( self.test_dir, self.project.get_storage_path( type_="html", - version_slug=self.version.slug, + version_slug=version.slug, include_file=False, - version_type=self.version.type, + version_type=version.type, ), ) @@ -78,7 +76,7 @@ def test_properly_created(self): """ self.assertEqual(ImportedFile.objects.count(), 0) - sync_id = self._manage_imported_files(version=self.version) + sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) self.assertEqual( set(HTMLFile.objects.all().values_list("path", flat=True)), @@ -91,7 +89,7 @@ def test_properly_created(self): {"index.html", "404.html", "test.html", "api/index.html"}, ) - sync_id = self._manage_imported_files(version=self.version) + sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) self.assertEqual( set(HTMLFile.objects.all().values_list("path", flat=True)), @@ -110,9 +108,9 @@ def test_index_external_version(self): self.version.save() with override_settings(DOCROOT=self.test_dir): - self._copy_storage_dir() + self._copy_storage_dir(self.version) - sync_id = self._manage_imported_files(version=self.version) + sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) self.assertEqual( set(HTMLFile.objects.all().values_list("path", flat=True)), @@ -122,7 +120,7 @@ def test_index_external_version(self): results = PageDocument().search().filter("term", build=sync_id).execute() self.assertEqual(len(results), 0) - sync_id = self._manage_imported_files(version=self.version) + sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) self.assertEqual( set(HTMLFile.objects.all().values_list("path", flat=True)), @@ -133,7 +131,7 @@ def test_index_external_version(self): def test_update_build(self): self.assertEqual(ImportedFile.objects.count(), 0) - sync_id = self._manage_imported_files(self.version) + sync_id = index_build(self.build.pk) for obj in ImportedFile.objects.all(): self.assertEqual(obj.build, sync_id) @@ -142,7 +140,7 @@ def test_update_build(self): for result in results: self.assertEqual(result.build, sync_id) - sync_id = self._manage_imported_files(self.version) + sync_id = index_build(self.build.pk) for obj in ImportedFile.objects.all(): self.assertEqual(obj.build, sync_id) @@ -152,11 +150,8 @@ def test_update_build(self): self.assertEqual(len(results), 4) def test_page_default_rank(self): - search_ranking = {} self.assertEqual(HTMLFile.objects.count(), 0) - sync_id = self._manage_imported_files( - self.version, search_ranking=search_ranking - ) + sync_id = index_build(self.build.pk) results = ( PageDocument() @@ -173,12 +168,15 @@ def test_page_default_rank(self): self.assertEqual(result.rank, 0) def test_page_custom_rank_glob(self): - search_ranking = { - "*index.html": 5, + self.build.config = { + "search": { + "ranking": { + "*index.html": 5, + } + } } - sync_id = self._manage_imported_files( - self.version, search_ranking=search_ranking - ) + self.build.save() + sync_id = index_build(self.build.pk) results = ( PageDocument() @@ -198,13 +196,16 @@ def test_page_custom_rank_glob(self): self.assertEqual(result.rank, 0, result.path) def test_page_custom_rank_several(self): - search_ranking = { - "test.html": 5, - "api/index.html": 2, + self.build.config = { + "search": { + "ranking": { + "test.html": 5, + "api/index.html": 2, + } + } } - sync_id = self._manage_imported_files( - self.version, search_ranking=search_ranking - ) + self.build.save() + sync_id = index_build(self.build.pk) results = ( PageDocument() @@ -226,13 +227,16 @@ def test_page_custom_rank_several(self): self.assertEqual(result.rank, 0) def test_page_custom_rank_precedence(self): - search_ranking = { - "*.html": 5, - "api/index.html": 2, + self.build.config = { + "search": { + "ranking": { + "*.html": 5, + "api/index.html": 2, + } + } } - sync_id = self._manage_imported_files( - self.version, search_ranking=search_ranking - ) + self.build.save() + sync_id = index_build(self.build.pk) results = ( PageDocument() @@ -252,13 +256,16 @@ def test_page_custom_rank_precedence(self): self.assertEqual(result.rank, 5, result.path) def test_page_custom_rank_precedence_inverted(self): - search_ranking = { - "api/index.html": 2, - "*.html": 5, + self.build.config = { + "search": { + "ranking": { + "api/index.html": 2, + "*.html": 5, + } + } } - sync_id = self._manage_imported_files( - self.version, search_ranking=search_ranking - ) + self.build.save() + sync_id = index_build(self.build.pk) results = ( PageDocument() @@ -275,11 +282,13 @@ def test_page_custom_rank_precedence_inverted(self): self.assertEqual(result.rank, 5) def test_search_page_ignore(self): - search_ignore = ["api/index.html"] - self._manage_imported_files( - self.version, - search_ignore=search_ignore, - ) + self.build.config = { + "search": { + "ignore": ["api/index.html"], + } + } + self.build.save() + index_build(self.build.pk) self.assertEqual( set(HTMLFile.objects.all().values_list("path", flat=True)), @@ -305,9 +314,9 @@ def test_update_content(self): f.write("Woo") with override_settings(DOCROOT=self.test_dir): - self._copy_storage_dir() + self._copy_storage_dir(self.version) - sync_id = self._manage_imported_files(self.version) + sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) document = ( PageDocument() @@ -324,9 +333,9 @@ def test_update_content(self): f.write("Something Else") with override_settings(DOCROOT=self.test_dir): - self._copy_storage_dir() + self._copy_storage_dir(self.version) - sync_id = self._manage_imported_files(self.version) + sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) document = ( PageDocument() @@ -338,3 +347,59 @@ def test_update_content(self): .execute()[0] ) self.assertEqual(document.sections[0].content, "Something Else") + + @mock.patch("readthedocs.projects.tasks.search.write_manifest") + def test_create_file_tree_manifest(self, write_manifest): + assert self.version.slug == LATEST + index_build(self.build.pk) + # Feature flag is not enabled. + write_manifest.assert_not_called() + + get( + Feature, + feature_id=Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF, + projects=[self.project], + ) + index_build(self.build.pk) + manifest = FileTreeDiffManifest( + build_id=self.build.pk, + files=[ + FileTreeDiffFile( + path="index.html", + main_content_hash=mock.ANY, + ), + FileTreeDiffFile( + path="404.html", + main_content_hash=mock.ANY, + ), + FileTreeDiffFile( + path="test.html", + main_content_hash=mock.ANY, + ), + FileTreeDiffFile( + path="api/index.html", + main_content_hash=mock.ANY, + ), + ], + ) + write_manifest.assert_called_once_with(self.version, manifest) + + # The version is not the latest nor a PR. + new_version = get( + Version, + project=self.project, + slug="new-version", + ) + self.build.version = new_version + self.build.save() + write_manifest.reset_mock() + index_build(self.build.pk) + write_manifest.assert_not_called() + + # Now it is a PR. + new_version.type = EXTERNAL + new_version.save() + with override_settings(DOCROOT=self.test_dir): + self._copy_storage_dir(new_version) + index_build(self.build.pk) + write_manifest.assert_called_once() diff --git a/readthedocs/search/api/v2/views.py b/readthedocs/search/api/v2/views.py index 9f8c5dad34f..d1cfcb2bb2b 100644 --- a/readthedocs/search/api/v2/views.py +++ b/readthedocs/search/api/v2/views.py @@ -81,7 +81,7 @@ def _get_projects_to_search(self): main_version = self._get_version() main_project = self._get_project() - if not self._has_permission(self.request.user, main_version): + if not self._has_permission(self.request, main_version): return [] projects_to_search = [(main_project, main_version)] @@ -101,7 +101,7 @@ def _get_projects_to_search(self): include_hidden=False, ) - if version and self._has_permission(self.request.user, version): + if version and self._has_permission(self.request, version): projects_to_search.append((subproject, version)) return projects_to_search @@ -125,7 +125,7 @@ def _get_project_version(self, project, version_slug, include_hidden=True): .first() ) - def _has_permission(self, user, version): + def _has_permission(self, request, version): """ Check if `user` is authorized to access `version`. diff --git a/readthedocs/search/api/v3/executor.py b/readthedocs/search/api/v3/executor.py index f5a7ff67123..e529c108493 100644 --- a/readthedocs/search/api/v3/executor.py +++ b/readthedocs/search/api/v3/executor.py @@ -93,14 +93,15 @@ def _get_projects_to_search(self): for value in self.parser.arguments["project"]: project, version = self._get_project_and_version(value) - if version and self._has_permission(self.request.user, version): + if version and self._has_permission(self.request, version): yield project, version + # TODO: respect the language of the project when including subprojects. for value in self.parser.arguments["subprojects"]: project, version = self._get_project_and_version(value) # Add the project itself. - if version and self._has_permission(self.request.user, version): + if version and self._has_permission(self.request, version): yield project, version if project: @@ -124,7 +125,7 @@ def _get_projects_from_user(self): version_slug=project.default_version, include_hidden=False, ) - if version and self._has_permission(self.request.user, version): + if version and self._has_permission(self.request, version): yield project, version def _get_subprojects(self, project, version_slug=None): @@ -153,10 +154,10 @@ def _get_subprojects(self, project, version_slug=None): include_hidden=False, ) - if version and self._has_permission(self.request.user, version): + if version and self._has_permission(self.request, version): yield subproject, version - def _has_permission(self, user, version): + def _has_permission(self, request, version): """ Check if `user` is authorized to access `version`. diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py index 7031217d56f..e6dfbca9038 100644 --- a/readthedocs/search/parsers.py +++ b/readthedocs/search/parsers.py @@ -1,5 +1,5 @@ """JSON/HTML parsers for search indexing.""" - +import hashlib import itertools import re @@ -73,7 +73,7 @@ def _get_page_content(self, page): content = f.read() except Exception: log.warning( - "Unhandled exception during search processing file.", + "Failed to get page content.", page=page, ) return content @@ -449,6 +449,7 @@ def parse(self, page): "path": page, "title": "", "sections": [], + "main_content_hash": None, } def _process_content(self, page, content): @@ -457,7 +458,9 @@ def _process_content(self, page, content): body = self._get_main_node(html) title = "" sections = [] + main_content_hash = None if body: + main_content_hash = hashlib.md5(body.html.encode()).hexdigest() body = self._clean_body(body) title = self._get_page_title(body, html) or page sections = self._get_sections(title=title, body=body) @@ -470,4 +473,5 @@ def _process_content(self, page, content): "path": page, "title": title, "sections": sections, + "main_content_hash": main_content_hash, } diff --git a/readthedocs/search/tests/data/generic/out/basic.json b/readthedocs/search/tests/data/generic/out/basic.json index 188ddeeab10..7076a8bd838 100644 --- a/readthedocs/search/tests/data/generic/out/basic.json +++ b/readthedocs/search/tests/data/generic/out/basic.json @@ -2,6 +2,7 @@ { "path": "basic.html", "title": "Title of the page", + "main_content_hash": "006cb661925c211be260be17d64662b5", "sections": [ { "id": "love", diff --git a/readthedocs/search/tests/data/mkdocs/out/gitbook.json b/readthedocs/search/tests/data/mkdocs/out/gitbook.json index 54a8216cf23..3dcd789797b 100644 --- a/readthedocs/search/tests/data/mkdocs/out/gitbook.json +++ b/readthedocs/search/tests/data/mkdocs/out/gitbook.json @@ -2,6 +2,7 @@ { "path": "index.html", "title": "Mkdocs - GitBook Theme", + "main_content_hash": "a2050380ef001ef57754fb35b41d477d", "sections": [ { "id": "mkdocs-gitbook-theme", diff --git a/readthedocs/search/tests/data/mkdocs/out/material.json b/readthedocs/search/tests/data/mkdocs/out/material.json index 293ae38acb3..50331b2d6c8 100644 --- a/readthedocs/search/tests/data/mkdocs/out/material.json +++ b/readthedocs/search/tests/data/mkdocs/out/material.json @@ -2,6 +2,7 @@ { "path": "index.html", "title": "Overview", + "main_content_hash": "448eb15bfef60bc49a4059a34ebeaa4b", "sections": [ { "id": "", diff --git a/readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json b/readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json index 7e5eec85312..57a530d4fd5 100644 --- a/readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json +++ b/readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json @@ -2,6 +2,7 @@ { "path": "index.html", "title": "MkDocs", + "main_content_hash": "aca1f625e559fe49718abe801532b3ef", "sections": [ { "id": "mkdocs", @@ -38,6 +39,7 @@ { "path": "404.html", "title": "404", + "main_content_hash": "2f582057ff73758c152d6925eb149099", "sections": [ { "id": "404-page-not-found", @@ -49,6 +51,7 @@ { "path": "configuration.html", "title": "Configuration", + "main_content_hash": "6a5526f41cfef3da4c541f0465ac42e8", "sections": [ { "id": "configuration", @@ -90,6 +93,7 @@ { "path": "no-title.html", "title": "No title - Read the Docs MkDocs Test", + "main_content_hash": "c02e4800b9da5e96ed35841f11072777", "sections": [ { "id": "", @@ -101,6 +105,7 @@ { "path": "no-main-header.html", "title": "I'm the header", + "main_content_hash": "43010c91ae89c036b157c2a3947d63b8", "sections": [ { "id": "", diff --git a/readthedocs/search/tests/data/mkdocs/out/readthedocs-1.1.json b/readthedocs/search/tests/data/mkdocs/out/readthedocs-1.1.json index 55a06731178..063ab3bfa87 100644 --- a/readthedocs/search/tests/data/mkdocs/out/readthedocs-1.1.json +++ b/readthedocs/search/tests/data/mkdocs/out/readthedocs-1.1.json @@ -2,6 +2,7 @@ { "path": "index.html", "title": "Read the Docs MkDocs Test Project", + "main_content_hash": "25c48df9b31b31eaee235206a48239a5", "sections": [ { "id": "read-the-docs-mkdocs-test-project", @@ -28,6 +29,7 @@ { "path": "404.html", "title": "404", + "main_content_hash": "5f6d687421dad7e1409dcaa3ccd2d34d", "sections": [ { "id": "404-page-not-found", @@ -39,6 +41,7 @@ { "path": "versions.html", "title": "Versions & Themes", + "main_content_hash": "68a6609034e0a45adb135a60b98c12b8", "sections": [ { "id": "versions-themes", diff --git a/readthedocs/search/tests/data/mkdocs/out/windmill.json b/readthedocs/search/tests/data/mkdocs/out/windmill.json index ccc0d3f92d7..8ea6f7aad5f 100644 --- a/readthedocs/search/tests/data/mkdocs/out/windmill.json +++ b/readthedocs/search/tests/data/mkdocs/out/windmill.json @@ -2,6 +2,7 @@ { "path": "index.html", "title": "Windmill theme", + "main_content_hash": "68542e8759d648c4304f0e1ca85a0547", "sections": [ { "id": "windmill-theme", diff --git a/readthedocs/search/tests/data/pelican/out/default.json b/readthedocs/search/tests/data/pelican/out/default.json index 699aa53502e..d829e628d07 100644 --- a/readthedocs/search/tests/data/pelican/out/default.json +++ b/readthedocs/search/tests/data/pelican/out/default.json @@ -2,6 +2,7 @@ { "path": "index.html", "title": "Pelican Development Blog", + "main_content_hash": "ab49e3b7d72db4a92506ef085770e323", "sections": [ { "id": "banner", diff --git a/readthedocs/search/tests/data/sphinx/out/autodoc.json b/readthedocs/search/tests/data/sphinx/out/autodoc.json index a43bcbaca18..64d424f60fc 100644 --- a/readthedocs/search/tests/data/sphinx/out/autodoc.json +++ b/readthedocs/search/tests/data/sphinx/out/autodoc.json @@ -1,6 +1,7 @@ { "path": "autodoc.html", "title": "sphinx.ext.autodoc – Include documentation from docstrings", + "main_content_hash": "1761ca2f532958a9dda6967544cde167", "sections": [ { "id": "directive-automodule", diff --git a/readthedocs/search/tests/data/sphinx/out/httpdomain.json b/readthedocs/search/tests/data/sphinx/out/httpdomain.json index 9bede917427..7cb59fe5add 100644 --- a/readthedocs/search/tests/data/sphinx/out/httpdomain.json +++ b/readthedocs/search/tests/data/sphinx/out/httpdomain.json @@ -1,6 +1,7 @@ { "path": "httpdomain.html", "title": "Here is a nice reference", + "main_content_hash": "b0d8f8cbfe35ca79213ac9122704706f", "sections": [ { "id": "get--api-v3-projects-", diff --git a/readthedocs/search/tests/data/sphinx/out/local-toc.json b/readthedocs/search/tests/data/sphinx/out/local-toc.json index c6604b64cb3..5a848e37664 100644 --- a/readthedocs/search/tests/data/sphinx/out/local-toc.json +++ b/readthedocs/search/tests/data/sphinx/out/local-toc.json @@ -1,6 +1,7 @@ { "path": "local-toc.html", "title": "Security reports", + "main_content_hash": "71a1ef5892c6b2181e70cc18171ae5ee", "sections": [ { "id": "security-reports", diff --git a/readthedocs/search/tests/data/sphinx/out/no-title.json b/readthedocs/search/tests/data/sphinx/out/no-title.json index ed8410429bc..882bde7d8f7 100644 --- a/readthedocs/search/tests/data/sphinx/out/no-title.json +++ b/readthedocs/search/tests/data/sphinx/out/no-title.json @@ -1,6 +1,7 @@ { "title": "", "path": "no-title.html", + "main_content_hash": "e821445de7eb6b324ef37a9780392c85", "sections": [ { "id": "", diff --git a/readthedocs/search/tests/data/sphinx/out/page.json b/readthedocs/search/tests/data/sphinx/out/page.json index 4266b476b37..2270e9d8fae 100644 --- a/readthedocs/search/tests/data/sphinx/out/page.json +++ b/readthedocs/search/tests/data/sphinx/out/page.json @@ -1,6 +1,7 @@ { "path": "page.html", "title": "I Need Secrets (or Environment Variables) in my Build", + "main_content_hash": "95daf5cde2bee38d4bb23edaa1d5c0f3", "sections": [ { "id": "test_py_module.test.Foo", diff --git a/readthedocs/search/tests/data/sphinx/out/requests.json b/readthedocs/search/tests/data/sphinx/out/requests.json index 429b695bebd..74396bcb9ac 100644 --- a/readthedocs/search/tests/data/sphinx/out/requests.json +++ b/readthedocs/search/tests/data/sphinx/out/requests.json @@ -1,6 +1,7 @@ { "path": "requests.html", "title": "Developer Interface", + "main_content_hash": "fd572599b94b2319f973a73b25421d91", "sections": [ { "id": "requests.request", diff --git a/readthedocs/search/tests/data/sphinx/out/toctree.json b/readthedocs/search/tests/data/sphinx/out/toctree.json index f7b0ab63653..bc1d69b4cb3 100644 --- a/readthedocs/search/tests/data/sphinx/out/toctree.json +++ b/readthedocs/search/tests/data/sphinx/out/toctree.json @@ -1,6 +1,7 @@ { "path": "toctree.html", "title": "Public API", + "main_content_hash": "3d2e43fdc8b58752538ebd934a34b6ea", "sections": [ { "id": "public-api",