From a1828995fbbc4c826f33c0b242f5b18665c3d1a9 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Wed, 2 Oct 2024 16:57:12 -0500 Subject: [PATCH 01/20] Build: refactor search indexing process Currently, we walk the entire project directory to apply two operations: index files in ES, and keep track of index/404 files. These two operations are independent, but in our code they are kind of mixed together in order to avoid walking the project directory twice. I have abstracted the processing of the files with a "Indexer" class, which is responsible for doing an operation on a file, and at the end it can collect the results. --- readthedocs/projects/tasks/search.py | 323 +++++++++++------- .../rtd_tests/tests/test_imported_file.py | 5 +- 2 files changed, 196 insertions(+), 132 deletions(-) diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index 9964aeb9d70..1a76195e8a3 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -14,6 +14,193 @@ log = structlog.get_logger(__name__) +class Indexer: + + """ + Base class for doing operations over each file from a build. + + The process method should be implemented to apply the operation + over each file, and the collect method should be implemented + to collect the results of the operation after processing all files. + + `sync_id` is used to differentiate the files from the current sync from the previous one. + """ + + def process(self, html_file: HTMLFile, sync_id: int): + raise NotImplementedError + + def collect(self, sync_id: int): + raise NotImplementedError + + +class SearchIndexer(Indexer): + + """ + Index HTML files in ElasticSearch. + + We respect the search ranking and ignore patterns defined in the project's search configuration. + + If search_index_name is provided, it will be used as the search index name, + otherwise the default one will be used. + """ + + def __init__( + self, + project: Project, + version: Version, + search_ranking: dict[str, int], + search_ignore: list[str], + search_index_name: str | None = None, + ): + self.project = project + self.version = version + self.search_ranking = search_ranking + self.search_ignore = search_ignore + self._reversed_search_ranking = list(reversed(search_ranking.items())) + self.search_index_name = search_index_name + self._html_files_to_index = [] + + def process(self, html_file: HTMLFile, sync_id: int): + for pattern in self.search_ignore: + if fnmatch(html_file.path, pattern): + return + + for pattern, rank in self._reversed_search_ranking: + if fnmatch(html_file.path, pattern): + html_file.rank = rank + break + + self._html_files_to_index.append(html_file) + + def collect(self, sync_id: int): + # Index new files in ElasticSearch. + if self._html_files_to_index: + index_objects( + document=PageDocument, + objects=self._html_files_to_index, + index_name=self.search_index_name, + # Pages are indexed in small chunks to avoid a + # large payload that will probably timeout ES. + chunk_size=100, + ) + + # Remove old HTMLFiles from ElasticSearch. + remove_indexed_files( + project_slug=self.project.slug, + version_slug=self.version.slug, + sync_id=sync_id, + index_name=self.search_index_name, + ) + + +class IndexFileIndexer(Indexer): + + """ + Create imported files of interest in the DB. + + We only save the top-level 404 file and index files, + we don't need to keep track of all files. + These files are queried by proxito instead of checking S3 (slow). + """ + + def __init__(self, project: Project, version: Version): + self.project = project + self.version = version + self._html_files_to_save = [] + + def process(self, html_file: HTMLFile, sync_id: int): + if html_file.path == "404.html" or html_file.name == "index.html": + self._html_files_to_save.append(html_file) + + def collect(self, sync_id: int): + if self._html_files_to_save: + HTMLFile.objects.bulk_create(self._html_files_to_save) + + # Delete imported files from the previous build of the version. + self.version.imported_files.exclude(build=sync_id).delete() + + +def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=None): + indexers = [] + # NOTE: The search indexer must be before the index file indexer. + # This is because saving the objects in the DB will give them an id, + # and we neeed this id to be `None` when indexing the objects in ES. + # ES will generate a unique id for each document. + # NOTE: If the version is external, we don't create a search index for it. + if not version.is_external: + search_indexer = SearchIndexer( + project=version.project, + version=version, + search_ranking=search_ranking, + search_ignore=search_ignore, + search_index_name=search_index_name, + ) + indexers.append(search_indexer) + index_file_indexer = IndexFileIndexer( + project=version.project, + version=version, + ) + indexers.append(index_file_indexer) + return indexers + + +def _process_files(*, version: Version, indexers: list[Indexer]): + storage_path = version.project.get_storage_path( + type_="html", + version_slug=version.slug, + include_file=False, + version_type=version.type, + ) + # A sync ID is a number different than the current `build` attribute (pending rename), + # it's used to differentiate the files from the current sync from the previous one. + # This is useful to easily delete the previous files from the DB and ES. + # See https://github.com/readthedocs/readthedocs.org/issues/10734. + imported_file_build_id = version.imported_files.values_list( + "build", flat=True + ).first() + sync_id = imported_file_build_id + 1 if imported_file_build_id else 1 + + log.debug( + "Using sync ID for search indexing", + sync_id=sync_id, + ) + + for root, __, filenames in build_media_storage.walk(storage_path): + for filename in filenames: + # We don't care about non-HTML files (for now?). + if not filename.endswith(".html"): + continue + + full_path = build_media_storage.join(root, filename) + # Generate a relative path for storage similar to os.path.relpath + relpath = full_path.removeprefix(storage_path).lstrip("/") + + html_file = HTMLFile( + project=version.project, + version=version, + path=relpath, + name=filename, + # TODO: We are setting the commit field since it's required, + # but it isn't used, and will be removed in the future + # together with other fields. + commit="unknown", + build=sync_id, + ) + for indexer in indexers: + indexer.process(html_file, sync_id) + + for indexer in indexers: + indexer.collect(sync_id) + + # This signal is used for purging the CDN. + files_changed.send( + sender=Project, + project=version.project, + version=version, + ) + return sync_id + + @app.task(queue="reindex") def index_build(build_id): """Create imported files and search index for the build.""" @@ -49,13 +236,14 @@ def index_build(build_id): search_ignore = search_config.get("ignore", []) try: - _create_imported_files_and_search_index( + indexers = _get_indexers( version=version, search_ranking=search_ranking, search_ignore=search_ignore, ) + _process_files(version=version, indexers=indexers) except Exception: - log.exception("Failed during creation of new files") + log.exception("Failed to index build") @app.task(queue="reindex") @@ -99,14 +287,15 @@ def reindex_version(version_id, search_index_name=None): search_ignore = search_config.get("ignore", []) try: - _create_imported_files_and_search_index( + indexers = _get_indexers( version=version, search_ranking=search_ranking, search_ignore=search_ignore, search_index_name=search_index_name, ) + _process_files(version=version, indexers=indexers) except Exception: - log.exception("Failed during creation of new files") + log.exception("Failed to re-index version") @app.task(queue="reindex") @@ -141,129 +330,3 @@ def remove_search_indexes(project_slug, version_slug=None): project_slug=project_slug, version_slug=version_slug, ) - - -def _create_imported_files_and_search_index( - *, version, search_ranking, search_ignore, search_index_name=None -): - """ - Create imported files and search index for the build of the version. - - If the version is external, we don't create a search index for it, only imported files. - After the process is completed, we delete the files and search index that - don't belong to the current build id. - - :param search_index: If provided, it will be used as the search index name, - otherwise the default one will be used. - """ - storage_path = version.project.get_storage_path( - type_="html", - version_slug=version.slug, - include_file=False, - version_type=version.type, - ) - # A sync ID is a number different than the current `build` attribute (pending rename), - # it's used to differentiate the files from the current sync from the previous one. - # This is useful to easily delete the previous files from the DB and ES. - # See https://github.com/readthedocs/readthedocs.org/issues/10734. - imported_file_build_id = version.imported_files.values_list( - "build", flat=True - ).first() - sync_id = imported_file_build_id + 1 if imported_file_build_id else 1 - - log.debug( - "Using sync ID for search indexing", - sync_id=sync_id, - ) - - html_files_to_index = [] - html_files_to_save = [] - reverse_rankings = list(reversed(search_ranking.items())) - for root, __, filenames in build_media_storage.walk(storage_path): - for filename in filenames: - # We don't care about non-HTML files - if not filename.endswith(".html"): - continue - - full_path = build_media_storage.join(root, filename) - - # Generate a relative path for storage similar to os.path.relpath - relpath = full_path.replace(storage_path, "", 1).lstrip("/") - - skip_search_index = False - if version.is_external: - # Never index files from external versions. - skip_search_index = True - else: - for pattern in search_ignore: - if fnmatch(relpath, pattern): - skip_search_index = True - break - - page_rank = 0 - # If the file is ignored, we don't need to check for its ranking. - if not skip_search_index: - # Last pattern to match takes precedence - for pattern, rank in reverse_rankings: - if fnmatch(relpath, pattern): - page_rank = rank - break - - html_file = HTMLFile( - project=version.project, - version=version, - path=relpath, - name=filename, - rank=page_rank, - # TODO: We are setting the commit field since it's required, - # but it isn't used, and will be removed in the future - # together with other fields. - commit="unknown", - build=sync_id, - ignore=skip_search_index, - ) - - if not skip_search_index: - html_files_to_index.append(html_file) - - # Create the imported file only if it's a top-level 404 file, - # or if it's an index file. We don't need to keep track of all files. - tryfiles = ["index.html"] - if relpath == "404.html" or filename in tryfiles: - html_files_to_save.append(html_file) - - # We first index the files in ES, and then save the objects in the DB. - # This is because saving the objects in the DB will give them an id, - # and we neeed this id to be `None` when indexing the objects in ES. - # ES will generate a unique id for each document. - if html_files_to_index: - index_objects( - document=PageDocument, - objects=html_files_to_index, - index_name=search_index_name, - # Pages are indexed in small chunks to avoid a - # large payload that will probably timeout ES. - chunk_size=100, - ) - - # Remove old HTMLFiles from ElasticSearch - remove_indexed_files( - project_slug=version.project.slug, - version_slug=version.slug, - sync_id=sync_id, - index_name=search_index_name, - ) - - if html_files_to_save: - HTMLFile.objects.bulk_create(html_files_to_save) - - # Delete imported files from the previous build of the version. - version.imported_files.exclude(build=sync_id).delete() - - # This signal is used for purging the CDN. - files_changed.send( - sender=Project, - project=version.project, - version=version, - ) - return sync_id diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index 86b60c4907e..77ce20b8ada 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -8,7 +8,7 @@ from readthedocs.builds.constants import EXTERNAL from readthedocs.projects.models import HTMLFile, ImportedFile, Project -from readthedocs.projects.tasks.search import _create_imported_files_and_search_index +from readthedocs.projects.tasks.search import _get_indexers, _process_files from readthedocs.search.documents import PageDocument base_dir = os.path.dirname(os.path.dirname(__file__)) @@ -46,11 +46,12 @@ def _manage_imported_files(self, version, search_ranking=None, search_ignore=Non """Helper function for the tests to create and sync ImportedFiles.""" search_ranking = search_ranking or {} search_ignore = search_ignore or [] - return _create_imported_files_and_search_index( + indexers = _get_indexers( version=version, search_ranking=search_ranking, search_ignore=search_ignore, ) + return _process_files(version=version, indexers=indexers) def _copy_storage_dir(self): """Copy the test directory (rtd_tests/files) to storage""" From 2555ae60ab8cef83248b5f022d4e69b0ad03920a Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 3 Oct 2024 12:39:12 -0500 Subject: [PATCH 02/20] File tree diff Closes https://github.com/readthedocs/readthedocs.org/issues/11319 Ref https://github.com/readthedocs/readthedocs.org/pull/11507 --- readthedocs/filetreediff/__init__.py | 84 ++++++++++++++++++++++++++++ readthedocs/projects/constants.py | 2 + readthedocs/projects/models.py | 16 +++++- readthedocs/projects/tasks/search.py | 72 ++++++++++++++++++------ readthedocs/proxito/views/hosting.py | 20 +++++++ readthedocs/search/parsers.py | 15 ++++- 6 files changed, 187 insertions(+), 22 deletions(-) create mode 100644 readthedocs/filetreediff/__init__.py diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py new file mode 100644 index 00000000000..3698a2e08a7 --- /dev/null +++ b/readthedocs/filetreediff/__init__.py @@ -0,0 +1,84 @@ +import json +from dataclasses import dataclass + +from readthedocs.builds.constants import BUILD_STATE_FINISHED +from readthedocs.builds.models import Version +from readthedocs.projects.constants import MEDIA_TYPE_METADATA +from readthedocs.storage import build_media_storage + + +@dataclass +class FileTreeDiff: + added: list[str] + removed: list[str] + modified: list[str] + + +def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: + version_a_manifest = get_manifest(version_a) + version_b_manifest = get_manifest(version_b) + + if not version_a_manifest or not version_b_manifest: + return None + + files_a = set(version_a_manifest.get("files", {}).keys()) + files_b = set(version_b_manifest.get("files", {}).keys()) + + files_added = list(files_a - files_b) + files_removed = list(files_b - files_a) + files_modified = [] + for file_path in files_a & files_b: + file_a = version_a_manifest["files"][file_path] + file_b = version_b_manifest["files"][file_path] + + if file_a["hash"] != file_b["hash"]: + files_modified.append(file_path) + + return FileTreeDiff( + added=files_added, + removed=files_removed, + modified=files_modified, + ) + + +def get_manifest(version: Version): + storage_path = version.project.get_storage_path( + type_=MEDIA_TYPE_METADATA, + version_slug=version.slug, + include_file=False, + version_type=version.type, + ) + manifest_path = build_media_storage.join(storage_path, "manifest.json") + try: + with build_media_storage.open(manifest_path) as manifest_file: + manifest = json.load(manifest_file) + except FileNotFoundError: + return None + + latest_successful_build = version.builds.filter( + state=BUILD_STATE_FINISHED, + success=True, + ).first() + if not latest_successful_build: + return None + + build_id_from_manifest = manifest.get("build", {}).get("id") + if latest_successful_build.id != build_id_from_manifest: + # The manifest is outdated, + # do we want to still use it? do we care? + # Should the caller be responsible to handle this? + return None + + return manifest + + +def write_manifest(version: Version, manifest: dict): + storage_path = version.project.get_storage_path( + type_=MEDIA_TYPE_METADATA, + version_slug=version.slug, + include_file=False, + version_type=version.type, + ) + manifest_path = build_media_storage.join(storage_path, "manifest.json") + with build_media_storage.open(manifest_path, "w") as f: + json.dump(manifest, f) diff --git a/readthedocs/projects/constants.py b/readthedocs/projects/constants.py index d1401c171e6..05b3b3d15ae 100644 --- a/readthedocs/projects/constants.py +++ b/readthedocs/projects/constants.py @@ -34,6 +34,7 @@ MEDIA_TYPE_EPUB = "epub" MEDIA_TYPE_HTMLZIP = "htmlzip" MEDIA_TYPE_JSON = "json" +MEDIA_TYPE_METADATA = "metadata" DOWNLOADABLE_MEDIA_TYPES = ( MEDIA_TYPE_PDF, MEDIA_TYPE_EPUB, @@ -45,6 +46,7 @@ MEDIA_TYPE_EPUB, MEDIA_TYPE_HTMLZIP, MEDIA_TYPE_JSON, + MEDIA_TYPE_METADATA, ) BUILD_COMMANDS_OUTPUT_PATH = "_readthedocs/" diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index b3f9a9b9aba..d22d90be0e9 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -1524,13 +1524,20 @@ class Meta: objects = HTMLFileManager() def get_processed_json(self): - parser = GenericParser(self.version) - return parser.parse(self.path) + return self._parser.parse(self.path) + + @cached_property + def _parser(self): + return GenericParser(self.version) @cached_property def processed_json(self): return self.get_processed_json() + @property + def main_content(self): + return self._parser.get_main_content(self.path) + class Notification(TimeStampedModel): @@ -1887,6 +1894,7 @@ def add_features(sender, **kwargs): RESOLVE_PROJECT_FROM_HEADER = "resolve_project_from_header" USE_PROXIED_APIS_WITH_PREFIX = "use_proxied_apis_with_prefix" ALLOW_VERSION_WARNING_BANNER = "allow_version_warning_banner" + GENERATE_MANIFEST_FOR_FILE_TREE_DIFF = "generate_manifest_for_file_tree_diff" # Versions sync related features SKIP_SYNC_TAGS = "skip_sync_tags" @@ -1947,6 +1955,10 @@ def add_features(sender, **kwargs): ALLOW_VERSION_WARNING_BANNER, _("Dashboard: Allow project to use the version warning banner."), ), + ( + GENERATE_MANIFEST_FOR_FILE_TREE_DIFF, + _("Build: Generate a file manifest for file tree diff."), + ), # Versions sync related features ( SKIP_SYNC_BRANCHES, diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index 1a76195e8a3..bc4cdfe6dc6 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -1,10 +1,12 @@ +import hashlib from fnmatch import fnmatch import structlog -from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL +from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL, LATEST from readthedocs.builds.models import Build, Version -from readthedocs.projects.models import HTMLFile, Project +from readthedocs.filetreediff import write_manifest +from readthedocs.projects.models import Feature, HTMLFile, Project from readthedocs.projects.signals import files_changed from readthedocs.search.documents import PageDocument from readthedocs.search.utils import index_objects, remove_indexed_files @@ -120,7 +122,38 @@ def collect(self, sync_id: int): self.version.imported_files.exclude(build=sync_id).delete() -def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=None): +class FileManifestIndexer(Indexer): + def __init__(self, version: Version, build: Build): + self.version = version + self.build = build + self._hashes = {} + + def process(self, html_file: HTMLFile, sync_id: int): + self._hashes[html_file.path] = hashlib.md5( + html_file.main_content.encode() + ).hexdigest() + + def collect(self, sync_id: int): + manifest = { + "build": { + "id": self.build.id, + }, + "files": { + path: { + "hash": hash, + } + for path, hash in self._hashes.items() + }, + } + write_manifest(self.version, manifest) + + +def _get_indexers(*, version: Version, build: Build, search_index_name=None): + build_config = build.config or {} + search_config = build_config.get("search", {}) + search_ranking = search_config.get("ranking", {}) + search_ignore = search_config.get("ignore", []) + indexers = [] # NOTE: The search indexer must be before the index file indexer. # This is because saving the objects in the DB will give them an id, @@ -136,6 +169,22 @@ def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=N search_index_name=search_index_name, ) indexers.append(search_indexer) + + # File tree diff is under a feature flag for now, + # and we only allow to compare PR previous against the latest version. + has_feature = version.project.has_feature( + Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF + ) + create_manifest = has_feature and ( + version.is_external or version == version.slug == LATEST + ) + if create_manifest: + file_manifest_indexer = FileManifestIndexer( + version=version, + build=build, + ) + indexers.append(file_manifest_indexer) + index_file_indexer = IndexFileIndexer( project=version.project, version=version, @@ -230,16 +279,10 @@ def index_build(build_id): build_id=build.id, ) - build_config = build.config or {} - search_config = build_config.get("search", {}) - search_ranking = search_config.get("ranking", {}) - search_ignore = search_config.get("ignore", []) - try: indexers = _get_indexers( version=version, - search_ranking=search_ranking, - search_ignore=search_ignore, + build=build, ) _process_files(version=version, indexers=indexers) except Exception: @@ -280,17 +323,10 @@ def reindex_version(version_id, search_index_name=None): version_slug=version.slug, build_id=latest_successful_build.id, ) - - build_config = latest_successful_build.config or {} - search_config = build_config.get("search", {}) - search_ranking = search_config.get("ranking", {}) - search_ignore = search_config.get("ignore", []) - try: indexers = _get_indexers( version=version, - search_ranking=search_ranking, - search_ignore=search_ignore, + build=latest_successful_build, search_index_name=search_index_name, ) _process_files(version=version, indexers=indexers) diff --git a/readthedocs/proxito/views/hosting.py b/readthedocs/proxito/views/hosting.py index 51174fbab55..04971480c55 100644 --- a/readthedocs/proxito/views/hosting.py +++ b/readthedocs/proxito/views/hosting.py @@ -23,6 +23,7 @@ from readthedocs.core.resolver import Resolver from readthedocs.core.unresolver import UnresolverError, unresolver from readthedocs.core.utils.extend import SettingsOverrideObject +from readthedocs.filetreediff import get_diff from readthedocs.projects.constants import ( ADDONS_FLYOUT_SORTING_CALVER, ADDONS_FLYOUT_SORTING_CUSTOM_PATTERN, @@ -501,9 +502,28 @@ def _v1(self, project, version, build, filename, url, request): "trigger": "Slash", # Could be something like "Ctrl + D" }, }, + "filetreediff": { + "enabled": False, + }, }, } + if version.is_external: + latest_version = project.get_latest_version() + diff = get_diff(version_a=version, version_b=latest_version) + if diff: + diff_result = { + "added": [{"file": file} for file in diff.added], + "removed": [{"file": file} for file in diff.removed], + "modified": [{"file": file} for file in diff.modified], + } + data["addons"]["filetreediff"].update( + { + "enabled": True, + "diff": diff_result, + } + ) + # DocDiff depends on `url=` GET attribute. # This attribute allows us to know the exact filename where the request was made. # If we don't know the filename, we cannot return the data required by DocDiff to work. diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py index f5a99294b79..30e92803ae4 100644 --- a/readthedocs/search/parsers.py +++ b/readthedocs/search/parsers.py @@ -1,5 +1,5 @@ """JSON/HTML parsers for search indexing.""" - +import functools import itertools import re @@ -20,6 +20,7 @@ def __init__(self, version): self.project = self.version.project self.storage = build_media_storage + @functools.cache def _get_page_content(self, page): """Gets the page content from storage.""" content = None @@ -34,7 +35,7 @@ def _get_page_content(self, page): content = f.read() except Exception: log.warning( - "Unhandled exception during search processing file.", + "Failed to get page content.", page=page, ) return content @@ -427,3 +428,13 @@ def _process_content(self, page, content): "title": title, "sections": sections, } + + def get_main_content(self, page): + try: + content = self._get_page_content(page) + html = HTMLParser(content) + body = self._get_main_node(html) + return body.html + except Exception: + log.info("Failed to get main content from page.", path=page, exc_info=True) + return "" From ed5377d9fe0b6b27269cbaec5c3d590226c6420b Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 3 Oct 2024 14:46:36 -0500 Subject: [PATCH 03/20] Hash when parsing content to avoid caching the whole content --- readthedocs/projects/models.py | 11 ++--------- readthedocs/projects/tasks/search.py | 5 +---- readthedocs/search/parsers.py | 5 +++++ 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index d22d90be0e9..20ee2b27a20 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -1524,20 +1524,13 @@ class Meta: objects = HTMLFileManager() def get_processed_json(self): - return self._parser.parse(self.path) - - @cached_property - def _parser(self): - return GenericParser(self.version) + parser = GenericParser(self.version) + return parser.parse(self.path) @cached_property def processed_json(self): return self.get_processed_json() - @property - def main_content(self): - return self._parser.get_main_content(self.path) - class Notification(TimeStampedModel): diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index bc4cdfe6dc6..0a39fd86f8e 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -1,4 +1,3 @@ -import hashlib from fnmatch import fnmatch import structlog @@ -129,9 +128,7 @@ def __init__(self, version: Version, build: Build): self._hashes = {} def process(self, html_file: HTMLFile, sync_id: int): - self._hashes[html_file.path] = hashlib.md5( - html_file.main_content.encode() - ).hexdigest() + self._hashes[html_file.path] = html_file.processed_json["main_content_hash"] def collect(self, sync_id: int): manifest = { diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py index 30e92803ae4..8352d7b9d53 100644 --- a/readthedocs/search/parsers.py +++ b/readthedocs/search/parsers.py @@ -1,5 +1,6 @@ """JSON/HTML parsers for search indexing.""" import functools +import hashlib import itertools import re @@ -406,6 +407,7 @@ def parse(self, page): "path": page, "title": "", "sections": [], + "main_content_hash": None, } def _process_content(self, page, content): @@ -414,7 +416,9 @@ def _process_content(self, page, content): body = self._get_main_node(html) title = "" sections = [] + main_content_hash = None if body: + main_content_hash = hashlib.md5(body.html.encode()).hexdigest() body = self._clean_body(body) title = self._get_page_title(body, html) or page sections = self._get_sections(title=title, body=body) @@ -427,6 +431,7 @@ def _process_content(self, page, content): "path": page, "title": title, "sections": sections, + "main_content_hash": main_content_hash, } def get_main_content(self, page): From 1d62412837c59a74c921e011ca70bd4cedc74ff0 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 3 Oct 2024 14:51:05 -0500 Subject: [PATCH 04/20] We don't need this anymore --- readthedocs/search/parsers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py index 8352d7b9d53..5f612d48a40 100644 --- a/readthedocs/search/parsers.py +++ b/readthedocs/search/parsers.py @@ -1,5 +1,4 @@ """JSON/HTML parsers for search indexing.""" -import functools import hashlib import itertools import re @@ -21,7 +20,6 @@ def __init__(self, version): self.project = self.version.project self.storage = build_media_storage - @functools.cache def _get_page_content(self, page): """Gets the page content from storage.""" content = None From a5d290582cd1020ba2650f7b14f8bcc66b4efe67 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 3 Oct 2024 14:51:31 -0500 Subject: [PATCH 05/20] More to delete --- readthedocs/search/parsers.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/readthedocs/search/parsers.py b/readthedocs/search/parsers.py index 5f612d48a40..960d4e53ae9 100644 --- a/readthedocs/search/parsers.py +++ b/readthedocs/search/parsers.py @@ -431,13 +431,3 @@ def _process_content(self, page, content): "sections": sections, "main_content_hash": main_content_hash, } - - def get_main_content(self, page): - try: - content = self._get_page_content(page) - html = HTMLParser(content) - body = self._get_main_node(html) - return body.html - except Exception: - log.info("Failed to get main content from page.", path=page, exc_info=True) - return "" From 59915474261ab92fa2b0d64c6e73d7d3c2e49c64 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 7 Oct 2024 11:02:45 -0500 Subject: [PATCH 06/20] todo --- readthedocs/proxito/views/hosting.py | 1 + 1 file changed, 1 insertion(+) diff --git a/readthedocs/proxito/views/hosting.py b/readthedocs/proxito/views/hosting.py index 04971480c55..4e6148d15aa 100644 --- a/readthedocs/proxito/views/hosting.py +++ b/readthedocs/proxito/views/hosting.py @@ -509,6 +509,7 @@ def _v1(self, project, version, build, filename, url, request): } if version.is_external: + # TODO: check if the user has access to this version. latest_version = project.get_latest_version() diff = get_diff(version_a=version, version_b=latest_version) if diff: From 538f9fa9b214c1abe5f392169a98ec61420849e5 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Wed, 16 Oct 2024 17:36:58 -0500 Subject: [PATCH 07/20] Less WIP, more diff --- readthedocs/builds/models.py | 11 ++++ readthedocs/filetreediff/__init__.py | 99 ++++++++++++++++++---------- readthedocs/projects/constants.py | 4 +- readthedocs/projects/tasks/search.py | 18 ++--- readthedocs/proxito/views/hosting.py | 54 ++++++++++----- 5 files changed, 122 insertions(+), 64 deletions(-) diff --git a/readthedocs/builds/models.py b/readthedocs/builds/models.py index f4514bee815..ffc34333781 100644 --- a/readthedocs/builds/models.py +++ b/readthedocs/builds/models.py @@ -299,6 +299,17 @@ def last_build(self): def latest_build(self): return self.builds.order_by("-date").first() + @property + def latest_successful_build(self): + return ( + self.builds.filter( + state=BUILD_STATE_FINISHED, + success=True, + ) + .order_by("-date") + .first() + ) + @property def config(self): """ diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py index 3698a2e08a7..ff505d8b763 100644 --- a/readthedocs/filetreediff/__init__.py +++ b/readthedocs/filetreediff/__init__.py @@ -1,84 +1,113 @@ import json -from dataclasses import dataclass +from dataclasses import asdict, dataclass -from readthedocs.builds.constants import BUILD_STATE_FINISHED from readthedocs.builds.models import Version -from readthedocs.projects.constants import MEDIA_TYPE_METADATA +from readthedocs.projects.constants import MEDIA_TYPE_DIFF from readthedocs.storage import build_media_storage +MANIFEST_FILE_NAME = "manifest.json" + + +@dataclass(slots=True) +class FileTreeBuild: + id: int + + +@dataclass(slots=True) +class FileTreeFile: + path: str + main_content_hash: str + + +@dataclass(slots=True) +class FileTreeManifest: + files: dict[str, FileTreeFile] + build: FileTreeBuild + + def __init__(self, build_id: int, files: list[FileTreeFile]): + self.build = FileTreeBuild(id=build_id) + self.files = {file.path: file for file in files} + + @classmethod + def from_dict(cls, data: dict) -> "FileTreeManifest": + build_id = data["build"]["id"] + files = [ + FileTreeFile(path=path, main_content_hash=file["main_content_hash"]) + for path, file in data["files"].items() + ] + return cls(build_id, files) + @dataclass class FileTreeDiff: added: list[str] removed: list[str] modified: list[str] + outdated: bool = False def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: - version_a_manifest = get_manifest(version_a) - version_b_manifest = get_manifest(version_b) + outdated = False + manifests: list[FileTreeManifest] = [] + for version in (version_a, version_b): + manifest = get_manifest(version) + if not manifest: + return None - if not version_a_manifest or not version_b_manifest: - return None + latest_build = version_a.latest_successful_build + if not latest_build: + return None + + if latest_build.id != manifest.build.id: + outdated = True - files_a = set(version_a_manifest.get("files", {}).keys()) - files_b = set(version_b_manifest.get("files", {}).keys()) + manifests.append(manifest) + + version_a_manifest, version_b_manifest = manifests + files_a = set(version_a_manifest.files.keys()) + files_b = set(version_b_manifest.files.keys()) files_added = list(files_a - files_b) files_removed = list(files_b - files_a) files_modified = [] for file_path in files_a & files_b: - file_a = version_a_manifest["files"][file_path] - file_b = version_b_manifest["files"][file_path] - - if file_a["hash"] != file_b["hash"]: + file_a = version_a_manifest.files[file_path] + file_b = version_b_manifest.files[file_path] + if file_a.main_content_hash != file_b.main_content_hash: files_modified.append(file_path) return FileTreeDiff( added=files_added, removed=files_removed, modified=files_modified, + outdated=outdated, ) -def get_manifest(version: Version): +def get_manifest(version: Version) -> FileTreeManifest | None: storage_path = version.project.get_storage_path( - type_=MEDIA_TYPE_METADATA, + type_=MEDIA_TYPE_DIFF, version_slug=version.slug, include_file=False, version_type=version.type, ) - manifest_path = build_media_storage.join(storage_path, "manifest.json") + manifest_path = build_media_storage.join(storage_path, MANIFEST_FILE_NAME) try: with build_media_storage.open(manifest_path) as manifest_file: manifest = json.load(manifest_file) except FileNotFoundError: return None - latest_successful_build = version.builds.filter( - state=BUILD_STATE_FINISHED, - success=True, - ).first() - if not latest_successful_build: - return None - - build_id_from_manifest = manifest.get("build", {}).get("id") - if latest_successful_build.id != build_id_from_manifest: - # The manifest is outdated, - # do we want to still use it? do we care? - # Should the caller be responsible to handle this? - return None - - return manifest + return FileTreeManifest.from_dict(manifest) -def write_manifest(version: Version, manifest: dict): +def write_manifest(version: Version, manifest: FileTreeManifest): storage_path = version.project.get_storage_path( - type_=MEDIA_TYPE_METADATA, + type_=MEDIA_TYPE_DIFF, version_slug=version.slug, include_file=False, version_type=version.type, ) - manifest_path = build_media_storage.join(storage_path, "manifest.json") + manifest_path = build_media_storage.join(storage_path, MANIFEST_FILE_NAME) with build_media_storage.open(manifest_path, "w") as f: - json.dump(manifest, f) + json.dump(asdict(manifest), f) diff --git a/readthedocs/projects/constants.py b/readthedocs/projects/constants.py index 05b3b3d15ae..eae06bd9e8f 100644 --- a/readthedocs/projects/constants.py +++ b/readthedocs/projects/constants.py @@ -34,7 +34,7 @@ MEDIA_TYPE_EPUB = "epub" MEDIA_TYPE_HTMLZIP = "htmlzip" MEDIA_TYPE_JSON = "json" -MEDIA_TYPE_METADATA = "metadata" +MEDIA_TYPE_DIFF = "diff" DOWNLOADABLE_MEDIA_TYPES = ( MEDIA_TYPE_PDF, MEDIA_TYPE_EPUB, @@ -46,7 +46,7 @@ MEDIA_TYPE_EPUB, MEDIA_TYPE_HTMLZIP, MEDIA_TYPE_JSON, - MEDIA_TYPE_METADATA, + MEDIA_TYPE_DIFF, ) BUILD_COMMANDS_OUTPUT_PATH = "_readthedocs/" diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index 0a39fd86f8e..7fb60bba2bc 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -4,7 +4,7 @@ from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL, LATEST from readthedocs.builds.models import Build, Version -from readthedocs.filetreediff import write_manifest +from readthedocs.filetreediff import FileTreeFile, FileTreeManifest, write_manifest from readthedocs.projects.models import Feature, HTMLFile, Project from readthedocs.projects.signals import files_changed from readthedocs.search.documents import PageDocument @@ -131,17 +131,13 @@ def process(self, html_file: HTMLFile, sync_id: int): self._hashes[html_file.path] = html_file.processed_json["main_content_hash"] def collect(self, sync_id: int): - manifest = { - "build": { - "id": self.build.id, - }, - "files": { - path: { - "hash": hash, - } + manifest = FileTreeManifest( + build_id=self.build.id, + files=[ + FileTreeFile(path=path, main_content_hash=hash) for path, hash in self._hashes.items() - }, - } + ], + ) write_manifest(self.version, manifest) diff --git a/readthedocs/proxito/views/hosting.py b/readthedocs/proxito/views/hosting.py index e5af06ddf11..fc8ec3fc4dc 100644 --- a/readthedocs/proxito/views/hosting.py +++ b/readthedocs/proxito/views/hosting.py @@ -312,6 +312,15 @@ def _get_versions(self, request, project): include_hidden=False, ) + def _has_permission(self, user, version): + """ + Check if `user` is authorized to access `version`. + + This is mainly to be overridden in .com to make use of + the auth backends in the proxied API. + """ + return True + def _v1(self, project, version, build, filename, url, request): """ Initial JSON data structure consumed by the JavaScript client. @@ -517,22 +526,11 @@ def _v1(self, project, version, build, filename, url, request): }, } - if version.is_external: - # TODO: check if the user has access to this version. - latest_version = project.get_latest_version() - diff = get_diff(version_a=version, version_b=latest_version) - if diff: - diff_result = { - "added": [{"file": file} for file in diff.added], - "removed": [{"file": file} for file in diff.removed], - "modified": [{"file": file} for file in diff.modified], - } - data["addons"]["filetreediff"].update( - { - "enabled": True, - "diff": diff_result, - } - ) + response = self._get_filetreediff_response( + user=user, project=project, version=version + ) + if response: + data["addons"]["filetreediff"].update(response) # Show the subprojects filter on the parent project and subproject if version: @@ -615,6 +613,30 @@ def _v1(self, project, version, build, filename, url, request): return data + def _get_filetreediff_response(self, *, user, project, version): + if not version or not version.is_external: + return None + + latest_version = project.get_latest_version() + if not latest_version or not self._has_permission( + user=user, version=latest_version + ): + return None + + diff = get_diff(version_a=version, version_b=latest_version) + if not diff: + return None + + return { + "enabled": True, + "outdated": diff.outdated, + "diff": { + "added": [{"file": file} for file in diff.added], + "removed": [{"file": file} for file in diff.removed], + "modified": [{"file": file} for file in diff.modified], + }, + } + def _v2(self, project, version, build, filename, url, user): return { "api_version": "2", From 6f154ada56b79d09da5b91090a20a7add08104a1 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Wed, 16 Oct 2024 17:48:17 -0500 Subject: [PATCH 08/20] Less bugs --- readthedocs/filetreediff/__init__.py | 2 +- readthedocs/proxito/views/hosting.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py index ff505d8b763..b72afde6afc 100644 --- a/readthedocs/filetreediff/__init__.py +++ b/readthedocs/filetreediff/__init__.py @@ -54,7 +54,7 @@ def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: if not manifest: return None - latest_build = version_a.latest_successful_build + latest_build = version.latest_successful_build if not latest_build: return None diff --git a/readthedocs/proxito/views/hosting.py b/readthedocs/proxito/views/hosting.py index fc8ec3fc4dc..fa3cf55156a 100644 --- a/readthedocs/proxito/views/hosting.py +++ b/readthedocs/proxito/views/hosting.py @@ -526,14 +526,14 @@ def _v1(self, project, version, build, filename, url, request): }, } - response = self._get_filetreediff_response( - user=user, project=project, version=version - ) - if response: - data["addons"]["filetreediff"].update(response) - - # Show the subprojects filter on the parent project and subproject if version: + response = self._get_filetreediff_response( + user=user, project=project, version=version + ) + if response: + data["addons"]["filetreediff"].update(response) + + # Show the subprojects filter on the parent project and subproject # TODO: Remove these queries and try to find a way to get this data # from the resolver, which has already done these queries. # TODO: Replace this fixed filters with the work proposed in @@ -614,7 +614,7 @@ def _v1(self, project, version, build, filename, url, request): return data def _get_filetreediff_response(self, *, user, project, version): - if not version or not version.is_external: + if not version.is_external: return None latest_version = project.get_latest_version() From 518ad624dabbbcd4c81a116627a0532744ca09ee Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 17 Oct 2024 10:17:19 -0500 Subject: [PATCH 09/20] Git uses deleted --- readthedocs/filetreediff/__init__.py | 6 +++--- readthedocs/proxito/views/hosting.py | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py index b72afde6afc..9ca243c4bd1 100644 --- a/readthedocs/filetreediff/__init__.py +++ b/readthedocs/filetreediff/__init__.py @@ -41,7 +41,7 @@ def from_dict(cls, data: dict) -> "FileTreeManifest": @dataclass class FileTreeDiff: added: list[str] - removed: list[str] + deleted: list[str] modified: list[str] outdated: bool = False @@ -68,7 +68,7 @@ def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: files_b = set(version_b_manifest.files.keys()) files_added = list(files_a - files_b) - files_removed = list(files_b - files_a) + files_deleted = list(files_b - files_a) files_modified = [] for file_path in files_a & files_b: file_a = version_a_manifest.files[file_path] @@ -78,7 +78,7 @@ def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: return FileTreeDiff( added=files_added, - removed=files_removed, + deleted=files_deleted, modified=files_modified, outdated=outdated, ) diff --git a/readthedocs/proxito/views/hosting.py b/readthedocs/proxito/views/hosting.py index fa3cf55156a..41cd512fed0 100644 --- a/readthedocs/proxito/views/hosting.py +++ b/readthedocs/proxito/views/hosting.py @@ -30,7 +30,7 @@ ADDONS_FLYOUT_SORTING_PYTHON_PACKAGING, ADDONS_FLYOUT_SORTING_SEMVER_READTHEDOCS_COMPATIBLE, ) -from readthedocs.projects.models import AddonsConfig, Project +from readthedocs.projects.models import AddonsConfig, Feature, Project from readthedocs.projects.version_handling import ( comparable_version, sort_versions_calver, @@ -617,6 +617,9 @@ def _get_filetreediff_response(self, *, user, project, version): if not version.is_external: return None + if not project.has_feature(Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF): + return None + latest_version = project.get_latest_version() if not latest_version or not self._has_permission( user=user, version=latest_version @@ -632,7 +635,7 @@ def _get_filetreediff_response(self, *, user, project, version): "outdated": diff.outdated, "diff": { "added": [{"file": file} for file in diff.added], - "removed": [{"file": file} for file in diff.removed], + "deleted": [{"file": file} for file in diff.deleted], "modified": [{"file": file} for file in diff.modified], }, } From 790a99b8a973b11ae9c0b5ce23e6ee6e2229c4c5 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 17 Oct 2024 10:21:47 -0500 Subject: [PATCH 10/20] Fix tests --- readthedocs/proxito/tests/responses/v1.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/readthedocs/proxito/tests/responses/v1.json b/readthedocs/proxito/tests/responses/v1.json index efd1e1107d6..27dc0c2131c 100644 --- a/readthedocs/proxito/tests/responses/v1.json +++ b/readthedocs/proxito/tests/responses/v1.json @@ -150,6 +150,9 @@ "base_host": "", "base_page": "" }, + "filetreediff": { + "enabled": false + }, "flyout": { "enabled": true }, From fd0db6310c64d8702a3a415f1b63292bdb862236 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 17 Oct 2024 11:39:41 -0500 Subject: [PATCH 11/20] More refactor --- readthedocs/filetreediff/__init__.py | 76 ++++++++++++------------- readthedocs/filetreediff/dataclasses.py | 56 ++++++++++++++++++ readthedocs/projects/tasks/search.py | 3 +- 3 files changed, 95 insertions(+), 40 deletions(-) create mode 100644 readthedocs/filetreediff/dataclasses.py diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py index 9ca243c4bd1..9db499974c3 100644 --- a/readthedocs/filetreediff/__init__.py +++ b/readthedocs/filetreediff/__init__.py @@ -1,52 +1,45 @@ +""" +Module for the file tree diff feature (FTD). + +This feature is used to compare the files of two versions of a project. + +The process is as follows: + +- A build is triggered for a version. +- A task is triggered after the build has succeeded + to generate a manifest of the files of the version. + Currently, we only consider the latest version and pull request previews. +- The manifest contains the hash of the main content of each file. + Only HTML files are considered for now. +- The manifest is stored in the diff media storage. +- Then our application can compare the manifest to get a list of added, + deleted, and modified files between two versions. +""" + import json -from dataclasses import asdict, dataclass +from dataclasses import asdict from readthedocs.builds.models import Version +from readthedocs.filetreediff.dataclasses import FileTreeDiff, FileTreeManifest from readthedocs.projects.constants import MEDIA_TYPE_DIFF from readthedocs.storage import build_media_storage MANIFEST_FILE_NAME = "manifest.json" -@dataclass(slots=True) -class FileTreeBuild: - id: int - - -@dataclass(slots=True) -class FileTreeFile: - path: str - main_content_hash: str - - -@dataclass(slots=True) -class FileTreeManifest: - files: dict[str, FileTreeFile] - build: FileTreeBuild - - def __init__(self, build_id: int, files: list[FileTreeFile]): - self.build = FileTreeBuild(id=build_id) - self.files = {file.path: file for file in files} - - @classmethod - def from_dict(cls, data: dict) -> "FileTreeManifest": - build_id = data["build"]["id"] - files = [ - FileTreeFile(path=path, main_content_hash=file["main_content_hash"]) - for path, file in data["files"].items() - ] - return cls(build_id, files) - - -@dataclass -class FileTreeDiff: - added: list[str] - deleted: list[str] - modified: list[str] - outdated: bool = False - - def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: + """ + Get the file tree diff between two versions. + + If any of the versions don't have a manifest, return None. + If the latest build of any of the versions is different from the manifest build, + the diff is marked as outdated. The client is responsible for deciding + how to handle this case. + + Set operations are used to calculate the added, deleted, and modified files. + To get the modified files, we compare the main content hash of each common file. + If there are no changes between the versions, all lists will be empty. + """ outdated = False manifests: list[FileTreeManifest] = [] for version in (version_a, version_b): @@ -85,6 +78,11 @@ def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: def get_manifest(version: Version) -> FileTreeManifest | None: + """ + Get the file manifest for a version. + + If the manifest file does not exist, return None. + """ storage_path = version.project.get_storage_path( type_=MEDIA_TYPE_DIFF, version_slug=version.slug, diff --git a/readthedocs/filetreediff/dataclasses.py b/readthedocs/filetreediff/dataclasses.py new file mode 100644 index 00000000000..928a7cf63d9 --- /dev/null +++ b/readthedocs/filetreediff/dataclasses.py @@ -0,0 +1,56 @@ +from dataclasses import dataclass + + +@dataclass(slots=True) +class FileTreeBuild: + """The build associated with a file tree manifest.""" + + id: int + + +@dataclass(slots=True) +class FileTreeFile: + """A file in a file tree manifest.""" + + path: str + main_content_hash: str + + +@dataclass(slots=True) +class FileTreeManifest: + """A list of files and the build associated with them.""" + + files: dict[str, FileTreeFile] + build: FileTreeBuild + + def __init__(self, build_id: int, files: list[FileTreeFile]): + self.build = FileTreeBuild(id=build_id) + self.files = {file.path: file for file in files} + + @classmethod + def from_dict(cls, data: dict) -> "FileTreeManifest": + """ + Create a FileTreeManifest from a dictionary. + + The dictionary should follow the same structure as the one returned by + converting the object to a dictionary using the `asdict` function. + """ + build_id = data["build"]["id"] + files = [ + FileTreeFile(path=path, main_content_hash=file["main_content_hash"]) + for path, file in data["files"].items() + ] + return cls(build_id, files) + + +@dataclass +class FileTreeDiff: + """Difference between two file tree manifests.""" + + added: list[str] + deleted: list[str] + modified: list[str] + outdated: bool = False + + def has_changes(self) -> bool: + return bool(self.added or self.deleted or self.modified) diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index 7fb60bba2bc..af4f0f93fe7 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -4,7 +4,8 @@ from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL, LATEST from readthedocs.builds.models import Build, Version -from readthedocs.filetreediff import FileTreeFile, FileTreeManifest, write_manifest +from readthedocs.filetreediff import write_manifest +from readthedocs.filetreediff.dataclasses import FileTreeFile, FileTreeManifest from readthedocs.projects.models import Feature, HTMLFile, Project from readthedocs.projects.signals import files_changed from readthedocs.search.documents import PageDocument From 5a824fce2b66c86f4d9dd86fc1d76bb96aefd178 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 17 Oct 2024 12:27:16 -0500 Subject: [PATCH 12/20] Fix search tests --- readthedocs/filetreediff/__init__.py | 2 ++ readthedocs/filetreediff/dataclasses.py | 4 ++++ readthedocs/proxito/views/hosting.py | 6 ++++++ .../rtd_tests/tests/test_imported_file.py | 17 +++++++++++++---- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py index 9db499974c3..533c7596c3f 100644 --- a/readthedocs/filetreediff/__init__.py +++ b/readthedocs/filetreediff/__init__.py @@ -56,6 +56,8 @@ def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: manifests.append(manifest) + # Just to make pylint happy + assert len(manifests) == 2 version_a_manifest, version_b_manifest = manifests files_a = set(version_a_manifest.files.keys()) files_b = set(version_b_manifest.files.keys()) diff --git a/readthedocs/filetreediff/dataclasses.py b/readthedocs/filetreediff/dataclasses.py index 928a7cf63d9..3818d6ab67d 100644 --- a/readthedocs/filetreediff/dataclasses.py +++ b/readthedocs/filetreediff/dataclasses.py @@ -3,6 +3,7 @@ @dataclass(slots=True) class FileTreeBuild: + """The build associated with a file tree manifest.""" id: int @@ -10,6 +11,7 @@ class FileTreeBuild: @dataclass(slots=True) class FileTreeFile: + """A file in a file tree manifest.""" path: str @@ -18,6 +20,7 @@ class FileTreeFile: @dataclass(slots=True) class FileTreeManifest: + """A list of files and the build associated with them.""" files: dict[str, FileTreeFile] @@ -45,6 +48,7 @@ def from_dict(cls, data: dict) -> "FileTreeManifest": @dataclass class FileTreeDiff: + """Difference between two file tree manifests.""" added: list[str] diff --git a/readthedocs/proxito/views/hosting.py b/readthedocs/proxito/views/hosting.py index 41cd512fed0..dae01d06dfc 100644 --- a/readthedocs/proxito/views/hosting.py +++ b/readthedocs/proxito/views/hosting.py @@ -614,6 +614,12 @@ def _v1(self, project, version, build, filename, url, request): return data def _get_filetreediff_response(self, *, user, project, version): + """ + Get the file tree diff response for the given version. + + This response is only enabled for external versions, + we do the comparison between the current version and the latest version. + """ if not version.is_external: return None diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index 77ce20b8ada..894ed926635 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -7,6 +7,7 @@ from django.test.utils import override_settings from readthedocs.builds.constants import EXTERNAL +from readthedocs.builds.models import Build from readthedocs.projects.models import HTMLFile, ImportedFile, Project from readthedocs.projects.tasks.search import _get_indexers, _process_files from readthedocs.search.documents import PageDocument @@ -44,12 +45,20 @@ def tearDown(self): def _manage_imported_files(self, version, search_ranking=None, search_ignore=None): """Helper function for the tests to create and sync ImportedFiles.""" - search_ranking = search_ranking or {} - search_ignore = search_ignore or [] + # Create a temporal build object just to pass the search configuration. + build = Build( + project=self.project, + version=version, + config={ + "search": { + "ranking": search_ranking or {}, + "ignore": search_ignore or [], + } + }, + ) indexers = _get_indexers( version=version, - search_ranking=search_ranking, - search_ignore=search_ignore, + build=build, ) return _process_files(version=version, indexers=indexers) From cb10dc6bf35fbf676afcd846d43d130709496bd7 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 17 Oct 2024 13:37:42 -0500 Subject: [PATCH 13/20] Fix search tests --- readthedocs/filetreediff/__init__.py | 3 +-- readthedocs/search/tests/data/generic/out/basic.json | 1 + readthedocs/search/tests/data/mkdocs/out/gitbook.json | 1 + readthedocs/search/tests/data/mkdocs/out/material.json | 1 + readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json | 5 +++++ .../search/tests/data/mkdocs/out/readthedocs-1.1.json | 3 +++ readthedocs/search/tests/data/mkdocs/out/windmill.json | 1 + readthedocs/search/tests/data/pelican/out/default.json | 1 + readthedocs/search/tests/data/sphinx/out/autodoc.json | 1 + readthedocs/search/tests/data/sphinx/out/httpdomain.json | 1 + readthedocs/search/tests/data/sphinx/out/local-toc.json | 1 + readthedocs/search/tests/data/sphinx/out/no-title.json | 1 + readthedocs/search/tests/data/sphinx/out/page.json | 1 + readthedocs/search/tests/data/sphinx/out/requests.json | 1 + readthedocs/search/tests/data/sphinx/out/toctree.json | 1 + 15 files changed, 21 insertions(+), 2 deletions(-) diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py index 533c7596c3f..775de261c32 100644 --- a/readthedocs/filetreediff/__init__.py +++ b/readthedocs/filetreediff/__init__.py @@ -56,8 +56,7 @@ def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: manifests.append(manifest) - # Just to make pylint happy - assert len(manifests) == 2 + # pylint: disable=unbalanced-tuple-unpacking version_a_manifest, version_b_manifest = manifests files_a = set(version_a_manifest.files.keys()) files_b = set(version_b_manifest.files.keys()) diff --git a/readthedocs/search/tests/data/generic/out/basic.json b/readthedocs/search/tests/data/generic/out/basic.json index 188ddeeab10..7076a8bd838 100644 --- a/readthedocs/search/tests/data/generic/out/basic.json +++ b/readthedocs/search/tests/data/generic/out/basic.json @@ -2,6 +2,7 @@ { "path": "basic.html", "title": "Title of the page", + "main_content_hash": "006cb661925c211be260be17d64662b5", "sections": [ { "id": "love", diff --git a/readthedocs/search/tests/data/mkdocs/out/gitbook.json b/readthedocs/search/tests/data/mkdocs/out/gitbook.json index 54a8216cf23..3dcd789797b 100644 --- a/readthedocs/search/tests/data/mkdocs/out/gitbook.json +++ b/readthedocs/search/tests/data/mkdocs/out/gitbook.json @@ -2,6 +2,7 @@ { "path": "index.html", "title": "Mkdocs - GitBook Theme", + "main_content_hash": "a2050380ef001ef57754fb35b41d477d", "sections": [ { "id": "mkdocs-gitbook-theme", diff --git a/readthedocs/search/tests/data/mkdocs/out/material.json b/readthedocs/search/tests/data/mkdocs/out/material.json index 293ae38acb3..50331b2d6c8 100644 --- a/readthedocs/search/tests/data/mkdocs/out/material.json +++ b/readthedocs/search/tests/data/mkdocs/out/material.json @@ -2,6 +2,7 @@ { "path": "index.html", "title": "Overview", + "main_content_hash": "448eb15bfef60bc49a4059a34ebeaa4b", "sections": [ { "id": "", diff --git a/readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json b/readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json index 7e5eec85312..57a530d4fd5 100644 --- a/readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json +++ b/readthedocs/search/tests/data/mkdocs/out/mkdocs-1.1.json @@ -2,6 +2,7 @@ { "path": "index.html", "title": "MkDocs", + "main_content_hash": "aca1f625e559fe49718abe801532b3ef", "sections": [ { "id": "mkdocs", @@ -38,6 +39,7 @@ { "path": "404.html", "title": "404", + "main_content_hash": "2f582057ff73758c152d6925eb149099", "sections": [ { "id": "404-page-not-found", @@ -49,6 +51,7 @@ { "path": "configuration.html", "title": "Configuration", + "main_content_hash": "6a5526f41cfef3da4c541f0465ac42e8", "sections": [ { "id": "configuration", @@ -90,6 +93,7 @@ { "path": "no-title.html", "title": "No title - Read the Docs MkDocs Test", + "main_content_hash": "c02e4800b9da5e96ed35841f11072777", "sections": [ { "id": "", @@ -101,6 +105,7 @@ { "path": "no-main-header.html", "title": "I'm the header", + "main_content_hash": "43010c91ae89c036b157c2a3947d63b8", "sections": [ { "id": "", diff --git a/readthedocs/search/tests/data/mkdocs/out/readthedocs-1.1.json b/readthedocs/search/tests/data/mkdocs/out/readthedocs-1.1.json index 55a06731178..063ab3bfa87 100644 --- a/readthedocs/search/tests/data/mkdocs/out/readthedocs-1.1.json +++ b/readthedocs/search/tests/data/mkdocs/out/readthedocs-1.1.json @@ -2,6 +2,7 @@ { "path": "index.html", "title": "Read the Docs MkDocs Test Project", + "main_content_hash": "25c48df9b31b31eaee235206a48239a5", "sections": [ { "id": "read-the-docs-mkdocs-test-project", @@ -28,6 +29,7 @@ { "path": "404.html", "title": "404", + "main_content_hash": "5f6d687421dad7e1409dcaa3ccd2d34d", "sections": [ { "id": "404-page-not-found", @@ -39,6 +41,7 @@ { "path": "versions.html", "title": "Versions & Themes", + "main_content_hash": "68a6609034e0a45adb135a60b98c12b8", "sections": [ { "id": "versions-themes", diff --git a/readthedocs/search/tests/data/mkdocs/out/windmill.json b/readthedocs/search/tests/data/mkdocs/out/windmill.json index ccc0d3f92d7..8ea6f7aad5f 100644 --- a/readthedocs/search/tests/data/mkdocs/out/windmill.json +++ b/readthedocs/search/tests/data/mkdocs/out/windmill.json @@ -2,6 +2,7 @@ { "path": "index.html", "title": "Windmill theme", + "main_content_hash": "68542e8759d648c4304f0e1ca85a0547", "sections": [ { "id": "windmill-theme", diff --git a/readthedocs/search/tests/data/pelican/out/default.json b/readthedocs/search/tests/data/pelican/out/default.json index 699aa53502e..d829e628d07 100644 --- a/readthedocs/search/tests/data/pelican/out/default.json +++ b/readthedocs/search/tests/data/pelican/out/default.json @@ -2,6 +2,7 @@ { "path": "index.html", "title": "Pelican Development Blog", + "main_content_hash": "ab49e3b7d72db4a92506ef085770e323", "sections": [ { "id": "banner", diff --git a/readthedocs/search/tests/data/sphinx/out/autodoc.json b/readthedocs/search/tests/data/sphinx/out/autodoc.json index a43bcbaca18..64d424f60fc 100644 --- a/readthedocs/search/tests/data/sphinx/out/autodoc.json +++ b/readthedocs/search/tests/data/sphinx/out/autodoc.json @@ -1,6 +1,7 @@ { "path": "autodoc.html", "title": "sphinx.ext.autodoc – Include documentation from docstrings", + "main_content_hash": "1761ca2f532958a9dda6967544cde167", "sections": [ { "id": "directive-automodule", diff --git a/readthedocs/search/tests/data/sphinx/out/httpdomain.json b/readthedocs/search/tests/data/sphinx/out/httpdomain.json index 9bede917427..7cb59fe5add 100644 --- a/readthedocs/search/tests/data/sphinx/out/httpdomain.json +++ b/readthedocs/search/tests/data/sphinx/out/httpdomain.json @@ -1,6 +1,7 @@ { "path": "httpdomain.html", "title": "Here is a nice reference", + "main_content_hash": "b0d8f8cbfe35ca79213ac9122704706f", "sections": [ { "id": "get--api-v3-projects-", diff --git a/readthedocs/search/tests/data/sphinx/out/local-toc.json b/readthedocs/search/tests/data/sphinx/out/local-toc.json index c6604b64cb3..5a848e37664 100644 --- a/readthedocs/search/tests/data/sphinx/out/local-toc.json +++ b/readthedocs/search/tests/data/sphinx/out/local-toc.json @@ -1,6 +1,7 @@ { "path": "local-toc.html", "title": "Security reports", + "main_content_hash": "71a1ef5892c6b2181e70cc18171ae5ee", "sections": [ { "id": "security-reports", diff --git a/readthedocs/search/tests/data/sphinx/out/no-title.json b/readthedocs/search/tests/data/sphinx/out/no-title.json index ed8410429bc..882bde7d8f7 100644 --- a/readthedocs/search/tests/data/sphinx/out/no-title.json +++ b/readthedocs/search/tests/data/sphinx/out/no-title.json @@ -1,6 +1,7 @@ { "title": "", "path": "no-title.html", + "main_content_hash": "e821445de7eb6b324ef37a9780392c85", "sections": [ { "id": "", diff --git a/readthedocs/search/tests/data/sphinx/out/page.json b/readthedocs/search/tests/data/sphinx/out/page.json index 4266b476b37..2270e9d8fae 100644 --- a/readthedocs/search/tests/data/sphinx/out/page.json +++ b/readthedocs/search/tests/data/sphinx/out/page.json @@ -1,6 +1,7 @@ { "path": "page.html", "title": "I Need Secrets (or Environment Variables) in my Build", + "main_content_hash": "95daf5cde2bee38d4bb23edaa1d5c0f3", "sections": [ { "id": "test_py_module.test.Foo", diff --git a/readthedocs/search/tests/data/sphinx/out/requests.json b/readthedocs/search/tests/data/sphinx/out/requests.json index 429b695bebd..74396bcb9ac 100644 --- a/readthedocs/search/tests/data/sphinx/out/requests.json +++ b/readthedocs/search/tests/data/sphinx/out/requests.json @@ -1,6 +1,7 @@ { "path": "requests.html", "title": "Developer Interface", + "main_content_hash": "fd572599b94b2319f973a73b25421d91", "sections": [ { "id": "requests.request", diff --git a/readthedocs/search/tests/data/sphinx/out/toctree.json b/readthedocs/search/tests/data/sphinx/out/toctree.json index f7b0ab63653..bc1d69b4cb3 100644 --- a/readthedocs/search/tests/data/sphinx/out/toctree.json +++ b/readthedocs/search/tests/data/sphinx/out/toctree.json @@ -1,6 +1,7 @@ { "path": "toctree.html", "title": "Public API", + "main_content_hash": "3d2e43fdc8b58752538ebd934a34b6ea", "sections": [ { "id": "public-api", From 1e55441bc458531472fa80ebe261cb8fa9055b38 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 17 Oct 2024 17:21:09 -0500 Subject: [PATCH 14/20] More tests --- readthedocs/filetreediff/__init__.py | 10 +- readthedocs/filetreediff/dataclasses.py | 18 +-- .../filetreediff/tests/test_filetreediff.py | 134 ++++++++++++++++++ readthedocs/projects/tasks/search.py | 6 +- 4 files changed, 151 insertions(+), 17 deletions(-) create mode 100644 readthedocs/filetreediff/tests/test_filetreediff.py diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py index 775de261c32..2d5de4043dd 100644 --- a/readthedocs/filetreediff/__init__.py +++ b/readthedocs/filetreediff/__init__.py @@ -20,7 +20,7 @@ from dataclasses import asdict from readthedocs.builds.models import Version -from readthedocs.filetreediff.dataclasses import FileTreeDiff, FileTreeManifest +from readthedocs.filetreediff.dataclasses import FileTreeDiff, FileTreeDiffManifest from readthedocs.projects.constants import MEDIA_TYPE_DIFF from readthedocs.storage import build_media_storage @@ -41,7 +41,7 @@ def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: If there are no changes between the versions, all lists will be empty. """ outdated = False - manifests: list[FileTreeManifest] = [] + manifests: list[FileTreeDiffManifest] = [] for version in (version_a, version_b): manifest = get_manifest(version) if not manifest: @@ -78,7 +78,7 @@ def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None: ) -def get_manifest(version: Version) -> FileTreeManifest | None: +def get_manifest(version: Version) -> FileTreeDiffManifest | None: """ Get the file manifest for a version. @@ -97,10 +97,10 @@ def get_manifest(version: Version) -> FileTreeManifest | None: except FileNotFoundError: return None - return FileTreeManifest.from_dict(manifest) + return FileTreeDiffManifest.from_dict(manifest) -def write_manifest(version: Version, manifest: FileTreeManifest): +def write_manifest(version: Version, manifest: FileTreeDiffManifest): storage_path = version.project.get_storage_path( type_=MEDIA_TYPE_DIFF, version_slug=version.slug, diff --git a/readthedocs/filetreediff/dataclasses.py b/readthedocs/filetreediff/dataclasses.py index 3818d6ab67d..abbcc04658f 100644 --- a/readthedocs/filetreediff/dataclasses.py +++ b/readthedocs/filetreediff/dataclasses.py @@ -2,7 +2,7 @@ @dataclass(slots=True) -class FileTreeBuild: +class FileTreeDiffBuild: """The build associated with a file tree manifest.""" @@ -10,7 +10,7 @@ class FileTreeBuild: @dataclass(slots=True) -class FileTreeFile: +class FileTreeDiffFile: """A file in a file tree manifest.""" @@ -19,19 +19,19 @@ class FileTreeFile: @dataclass(slots=True) -class FileTreeManifest: +class FileTreeDiffManifest: """A list of files and the build associated with them.""" - files: dict[str, FileTreeFile] - build: FileTreeBuild + files: dict[str, FileTreeDiffFile] + build: FileTreeDiffBuild - def __init__(self, build_id: int, files: list[FileTreeFile]): - self.build = FileTreeBuild(id=build_id) + def __init__(self, build_id: int, files: list[FileTreeDiffFile]): + self.build = FileTreeDiffBuild(id=build_id) self.files = {file.path: file for file in files} @classmethod - def from_dict(cls, data: dict) -> "FileTreeManifest": + def from_dict(cls, data: dict) -> "FileTreeDiffManifest": """ Create a FileTreeManifest from a dictionary. @@ -40,7 +40,7 @@ def from_dict(cls, data: dict) -> "FileTreeManifest": """ build_id = data["build"]["id"] files = [ - FileTreeFile(path=path, main_content_hash=file["main_content_hash"]) + FileTreeDiffFile(path=path, main_content_hash=file["main_content_hash"]) for path, file in data["files"].items() ] return cls(build_id, files) diff --git a/readthedocs/filetreediff/tests/test_filetreediff.py b/readthedocs/filetreediff/tests/test_filetreediff.py new file mode 100644 index 00000000000..e785063c1bb --- /dev/null +++ b/readthedocs/filetreediff/tests/test_filetreediff.py @@ -0,0 +1,134 @@ +import json +from contextlib import contextmanager +from unittest import mock + +from django.test import TestCase +from django_dynamic_fixture import get + +from readthedocs.builds.constants import BUILD_STATE_FINISHED, LATEST +from readthedocs.builds.models import Build, Version +from readthedocs.filetreediff import get_diff +from readthedocs.projects.models import Project +from readthedocs.rtd_tests.storage import BuildMediaFileSystemStorageTest + + +# We are overriding the storage class instead of using RTD_BUILD_MEDIA_STORAGE, +# since the setting is evaluated just once (first test to use the storage +# backend will set it for the whole test suite). +@mock.patch( + "readthedocs.filetreediff.build_media_storage", + new=BuildMediaFileSystemStorageTest(), +) +class TestsFileTreeDiff(TestCase): + def setUp(self): + self.project = get(Project) + self.version_a = self.project.versions.get(slug=LATEST) + self.build_a = get( + Build, + project=self.project, + version=self.version_a, + state=BUILD_STATE_FINISHED, + success=True, + ) + self.version_b = get( + Version, + project=self.project, + slug="v2", + active=True, + built=True, + ) + self.build_b = get( + Build, + project=self.project, + version=self.version_b, + state=BUILD_STATE_FINISHED, + success=True, + ) + + def _mock_open(self, content): + @contextmanager + def f(*args, **kwargs): + read_mock = mock.MagicMock() + read_mock.read.return_value = content + yield read_mock + + return f + + def _mock_manifest(self, build_id: int, files: dict[str, str]): + return self._mock_open( + json.dumps( + { + "build": {"id": build_id}, + "files": { + file_path: {"main_content_hash": main_content_hash} + for file_path, main_content_hash in files.items() + }, + } + ) + ) + + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + def test_diff_no_changes(self, storage_open): + files_a = { + "index.html": "hash1", + "tutorials/index.html": "hash2", + } + storage_open.side_effect = [ + self._mock_manifest(self.build_a.id, files_a)(), + self._mock_manifest(self.build_b.id, files_a)(), + ] + diff = get_diff(self.version_a, self.version_b) + assert diff.added == [] + assert diff.deleted == [] + assert diff.modified == [] + assert not diff.outdated + + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + def test_diff_changes(self, storage_open): + files_a = { + "index.html": "hash1", + "tutorials/index.html": "hash2", + "new-file.html": "hash-new", + } + files_b = { + "index.html": "hash1", + "tutorials/index.html": "hash-changed", + "deleted.html": "hash-deleted", + } + storage_open.side_effect = [ + self._mock_manifest(self.build_a.id, files_a)(), + self._mock_manifest(self.build_b.id, files_b)(), + ] + diff = get_diff(self.version_a, self.version_b) + assert diff.added == ["new-file.html"] + assert diff.deleted == ["deleted.html"] + assert diff.modified == ["tutorials/index.html"] + assert not diff.outdated + + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + def test_missing_manifest(self, storage_open): + storage_open.side_effect = FileNotFoundError + diff = get_diff(self.version_a, self.version_b) + assert diff is None + + @mock.patch.object(BuildMediaFileSystemStorageTest, "open") + def test_outdated_diff(self, storage_open): + files_a = { + "index.html": "hash1", + "tutorials/index.html": "hash2", + "new-file.html": "hash-new", + } + files_b = { + "index.html": "hash1", + "tutorials/index.html": "hash-changed", + "deleted.html": "hash-deleted", + } + storage_open.side_effect = [ + self._mock_manifest(self.build_a.id + 5, files_a)(), + self._mock_manifest(self.build_b.id + 5, files_b)(), + ] + diff = get_diff(self.version_a, self.version_b) + assert diff.added == ["new-file.html"] + assert diff.deleted == ["deleted.html"] + assert diff.modified == ["tutorials/index.html"] + assert diff.outdated diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index af4f0f93fe7..dff2637b291 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -5,7 +5,7 @@ from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL, LATEST from readthedocs.builds.models import Build, Version from readthedocs.filetreediff import write_manifest -from readthedocs.filetreediff.dataclasses import FileTreeFile, FileTreeManifest +from readthedocs.filetreediff.dataclasses import FileTreeDiffFile, FileTreeDiffManifest from readthedocs.projects.models import Feature, HTMLFile, Project from readthedocs.projects.signals import files_changed from readthedocs.search.documents import PageDocument @@ -132,10 +132,10 @@ def process(self, html_file: HTMLFile, sync_id: int): self._hashes[html_file.path] = html_file.processed_json["main_content_hash"] def collect(self, sync_id: int): - manifest = FileTreeManifest( + manifest = FileTreeDiffManifest( build_id=self.build.id, files=[ - FileTreeFile(path=path, main_content_hash=hash) + FileTreeDiffFile(path=path, main_content_hash=hash) for path, hash in self._hashes.items() ], ) From 6a0ebdd77618ca349eadaabff8c818326089451e Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 17 Oct 2024 17:47:08 -0500 Subject: [PATCH 15/20] More tests --- readthedocs/proxito/tests/test_hosting.py | 92 ++++++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/readthedocs/proxito/tests/test_hosting.py b/readthedocs/proxito/tests/test_hosting.py index 2ec44656332..460d3520b8c 100644 --- a/readthedocs/proxito/tests/test_hosting.py +++ b/readthedocs/proxito/tests/test_hosting.py @@ -2,6 +2,7 @@ import json from pathlib import Path +from unittest import mock import django_dynamic_fixture as fixture import pytest @@ -9,16 +10,18 @@ from django.test import TestCase, override_settings from django.urls import reverse from django.utils import timezone +from django_dynamic_fixture import get -from readthedocs.builds.constants import LATEST +from readthedocs.builds.constants import BUILD_STATE_FINISHED, EXTERNAL, LATEST from readthedocs.builds.models import Build, Version +from readthedocs.filetreediff.dataclasses import FileTreeDiffFile, FileTreeDiffManifest from readthedocs.projects.constants import ( MULTIPLE_VERSIONS_WITH_TRANSLATIONS, PRIVATE, PUBLIC, SINGLE_VERSION_WITHOUT_TRANSLATIONS, ) -from readthedocs.projects.models import AddonsConfig, Domain, Project +from readthedocs.projects.models import AddonsConfig, Domain, Feature, Project @override_settings( @@ -27,6 +30,7 @@ PUBLIC_DOMAIN_USES_HTTPS=True, GLOBAL_ANALYTICS_CODE=None, RTD_ALLOW_ORGANIZATIONS=False, + RTD_EXTERNAL_VERSION_DOMAIN="dev.readthedocs.build", ) @pytest.mark.proxito class TestReadTheDocsConfigJson(TestCase): @@ -841,3 +845,87 @@ def test_number_of_queries_url_translations(self): }, ) assert r.status_code == 200 + + @mock.patch("readthedocs.filetreediff.get_manifest") + def test_file_tree_diff(self, get_manifest): + get( + Feature, + projects=[self.project], + feature_id=Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF, + ) + pr_version = get( + Version, + project=self.project, + slug="123", + active=True, + built=True, + privacy_level=PUBLIC, + type=EXTERNAL, + ) + pr_build = get( + Build, + project=self.project, + version=pr_version, + commit="a1b2c3", + state=BUILD_STATE_FINISHED, + success=True, + ) + get_manifest.side_effect = [ + FileTreeDiffManifest( + build_id=pr_build.id, + files=[ + FileTreeDiffFile( + path="index.html", + main_content_hash="hash1", + ), + FileTreeDiffFile( + path="tutorial/index.html", + main_content_hash="hash1", + ), + FileTreeDiffFile( + path="new-file.html", + main_content_hash="hash1", + ), + ], + ), + FileTreeDiffManifest( + build_id=self.build.id, + files=[ + FileTreeDiffFile( + path="index.html", + main_content_hash="hash1", + ), + FileTreeDiffFile( + path="tutorial/index.html", + main_content_hash="hash-changed", + ), + FileTreeDiffFile( + path="deleted.html", + main_content_hash="hash-deleted", + ), + ], + ), + ] + r = self.client.get( + reverse("proxito_readthedocs_docs_addons"), + { + "url": "https://project--123.dev.readthedocs.build/en/123/", + "client-version": "0.6.0", + "api-version": "1.0.0", + }, + secure=True, + headers={ + "host": "project--123.dev.readthedocs.build", + }, + ) + assert r.status_code == 200 + filetreediff_response = r.json()["addons"]["filetreediff"] + assert filetreediff_response == { + "enabled": True, + "outdated": False, + "diff": { + "added": [{"file": "new-file.html"}], + "deleted": [{"file": "deleted.html"}], + "modified": [{"file": "tutorial/index.html"}], + }, + } From 4cbf9d62937d722f087e32f3649e770f69ff41b5 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 17 Oct 2024 17:51:54 -0500 Subject: [PATCH 16/20] Bye little bug, we still need tests --- readthedocs/projects/tasks/search.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index dff2637b291..b00d0388df9 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -169,9 +169,7 @@ def _get_indexers(*, version: Version, build: Build, search_index_name=None): has_feature = version.project.has_feature( Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF ) - create_manifest = has_feature and ( - version.is_external or version == version.slug == LATEST - ) + create_manifest = has_feature and (version.is_external or version.slug == LATEST) if create_manifest: file_manifest_indexer = FileManifestIndexer( version=version, From 10f01f989f1afc6c66caed3ed06f930e734bbd99 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 21 Oct 2024 12:56:45 -0500 Subject: [PATCH 17/20] Test task --- .../rtd_tests/tests/test_imported_file.py | 188 +++++++++++------- 1 file changed, 121 insertions(+), 67 deletions(-) diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index 894ed926635..8ec87a3901e 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -1,15 +1,18 @@ import os +from unittest import mock import pytest from django.conf import settings from django.core.files.storage import get_storage_class from django.test import TestCase from django.test.utils import override_settings +from django_dynamic_fixture import get -from readthedocs.builds.constants import EXTERNAL -from readthedocs.builds.models import Build -from readthedocs.projects.models import HTMLFile, ImportedFile, Project -from readthedocs.projects.tasks.search import _get_indexers, _process_files +from readthedocs.builds.constants import BUILD_STATE_FINISHED, EXTERNAL, LATEST +from readthedocs.builds.models import Build, Version +from readthedocs.filetreediff.dataclasses import FileTreeDiffFile, FileTreeDiffManifest +from readthedocs.projects.models import Feature, HTMLFile, ImportedFile, Project +from readthedocs.projects.tasks.search import index_build from readthedocs.search.documents import PageDocument base_dir = os.path.dirname(os.path.dirname(__file__)) @@ -18,13 +21,18 @@ @pytest.mark.search @override_settings(ELASTICSEARCH_DSL_AUTOSYNC=True) class ImportedFileTests(TestCase): - fixtures = ["eric", "test_data"] - storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)() def setUp(self): - self.project = Project.objects.get(slug="pip") - self.version = self.project.versions.first() + self.project = get(Project) + self.version = self.project.versions.get(slug=LATEST) + self.build = get( + Build, + project=self.project, + version=self.version, + state=BUILD_STATE_FINISHED, + success=True, + ) self.test_dir = os.path.join(base_dir, "files") with override_settings(DOCROOT=self.test_dir): @@ -43,25 +51,6 @@ def tearDown(self): # Delete index PageDocument._index.delete(ignore=404) - def _manage_imported_files(self, version, search_ranking=None, search_ignore=None): - """Helper function for the tests to create and sync ImportedFiles.""" - # Create a temporal build object just to pass the search configuration. - build = Build( - project=self.project, - version=version, - config={ - "search": { - "ranking": search_ranking or {}, - "ignore": search_ignore or [], - } - }, - ) - indexers = _get_indexers( - version=version, - build=build, - ) - return _process_files(version=version, indexers=indexers) - def _copy_storage_dir(self): """Copy the test directory (rtd_tests/files) to storage""" self.storage.copy_directory( @@ -87,7 +76,7 @@ def test_properly_created(self): """ self.assertEqual(ImportedFile.objects.count(), 0) - sync_id = self._manage_imported_files(version=self.version) + sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) self.assertEqual( set(HTMLFile.objects.all().values_list("path", flat=True)), @@ -100,7 +89,7 @@ def test_properly_created(self): {"index.html", "404.html", "test.html", "api/index.html"}, ) - sync_id = self._manage_imported_files(version=self.version) + sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) self.assertEqual( set(HTMLFile.objects.all().values_list("path", flat=True)), @@ -121,7 +110,7 @@ def test_index_external_version(self): with override_settings(DOCROOT=self.test_dir): self._copy_storage_dir() - sync_id = self._manage_imported_files(version=self.version) + sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) self.assertEqual( set(HTMLFile.objects.all().values_list("path", flat=True)), @@ -131,7 +120,7 @@ def test_index_external_version(self): results = PageDocument().search().filter("term", build=sync_id).execute() self.assertEqual(len(results), 0) - sync_id = self._manage_imported_files(version=self.version) + sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) self.assertEqual( set(HTMLFile.objects.all().values_list("path", flat=True)), @@ -142,7 +131,7 @@ def test_index_external_version(self): def test_update_build(self): self.assertEqual(ImportedFile.objects.count(), 0) - sync_id = self._manage_imported_files(self.version) + sync_id = index_build(self.build.pk) for obj in ImportedFile.objects.all(): self.assertEqual(obj.build, sync_id) @@ -151,7 +140,7 @@ def test_update_build(self): for result in results: self.assertEqual(result.build, sync_id) - sync_id = self._manage_imported_files(self.version) + sync_id = index_build(self.build.pk) for obj in ImportedFile.objects.all(): self.assertEqual(obj.build, sync_id) @@ -161,11 +150,8 @@ def test_update_build(self): self.assertEqual(len(results), 4) def test_page_default_rank(self): - search_ranking = {} self.assertEqual(HTMLFile.objects.count(), 0) - sync_id = self._manage_imported_files( - self.version, search_ranking=search_ranking - ) + sync_id = index_build(self.build.pk) results = ( PageDocument() @@ -182,12 +168,15 @@ def test_page_default_rank(self): self.assertEqual(result.rank, 0) def test_page_custom_rank_glob(self): - search_ranking = { - "*index.html": 5, + self.build.config = { + "search": { + "ranking": { + "*index.html": 5, + } + } } - sync_id = self._manage_imported_files( - self.version, search_ranking=search_ranking - ) + self.build.save() + sync_id = index_build(self.build.pk) results = ( PageDocument() @@ -207,13 +196,16 @@ def test_page_custom_rank_glob(self): self.assertEqual(result.rank, 0, result.path) def test_page_custom_rank_several(self): - search_ranking = { - "test.html": 5, - "api/index.html": 2, + self.build.config = { + "search": { + "ranking": { + "test.html": 5, + "api/index.html": 2, + } + } } - sync_id = self._manage_imported_files( - self.version, search_ranking=search_ranking - ) + self.build.save() + sync_id = index_build(self.build.pk) results = ( PageDocument() @@ -235,13 +227,16 @@ def test_page_custom_rank_several(self): self.assertEqual(result.rank, 0) def test_page_custom_rank_precedence(self): - search_ranking = { - "*.html": 5, - "api/index.html": 2, + self.build.config = { + "search": { + "ranking": { + "*.html": 5, + "api/index.html": 2, + } + } } - sync_id = self._manage_imported_files( - self.version, search_ranking=search_ranking - ) + self.build.save() + sync_id = index_build(self.build.pk) results = ( PageDocument() @@ -261,13 +256,16 @@ def test_page_custom_rank_precedence(self): self.assertEqual(result.rank, 5, result.path) def test_page_custom_rank_precedence_inverted(self): - search_ranking = { - "api/index.html": 2, - "*.html": 5, + self.build.config = { + "search": { + "ranking": { + "api/index.html": 2, + "*.html": 5, + } + } } - sync_id = self._manage_imported_files( - self.version, search_ranking=search_ranking - ) + self.build.save() + sync_id = index_build(self.build.pk) results = ( PageDocument() @@ -284,11 +282,13 @@ def test_page_custom_rank_precedence_inverted(self): self.assertEqual(result.rank, 5) def test_search_page_ignore(self): - search_ignore = ["api/index.html"] - self._manage_imported_files( - self.version, - search_ignore=search_ignore, - ) + self.build.config = { + "search": { + "ignore": ["api/index.html"], + } + } + self.build.save() + index_build(self.build.pk) self.assertEqual( set(HTMLFile.objects.all().values_list("path", flat=True)), @@ -316,7 +316,7 @@ def test_update_content(self): with override_settings(DOCROOT=self.test_dir): self._copy_storage_dir() - sync_id = self._manage_imported_files(self.version) + sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) document = ( PageDocument() @@ -335,7 +335,7 @@ def test_update_content(self): with override_settings(DOCROOT=self.test_dir): self._copy_storage_dir() - sync_id = self._manage_imported_files(self.version) + sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) document = ( PageDocument() @@ -347,3 +347,57 @@ def test_update_content(self): .execute()[0] ) self.assertEqual(document.sections[0].content, "Something Else") + + @mock.patch("readthedocs.projects.tasks.search.write_manifest") + def test_create_file_tree_manifest(self, write_manifest): + assert self.version.slug == LATEST + index_build(self.build.pk) + # Feature flag is not enabled. + write_manifest.assert_not_called() + + get( + Feature, + feature_id=Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF, + projects=[self.project], + ) + index_build(self.build.pk) + manifest = FileTreeDiffManifest( + build_id=self.build.pk, + files=[ + FileTreeDiffFile( + path="index.html", + main_content_hash=mock.ANY, + ), + FileTreeDiffFile( + path="404.html", + main_content_hash=mock.ANY, + ), + FileTreeDiffFile( + path="test.html", + main_content_hash=mock.ANY, + ), + FileTreeDiffFile( + path="api/index.html", + main_content_hash=mock.ANY, + ), + ], + ) + write_manifest.assert_called_once_with(self.version, manifest) + + # The version is not the latest nor a PR. + invalid_version = get( + Version, + project=self.project, + slug="invalid", + ) + self.build.version = invalid_version + self.build.save() + write_manifest.reset_mock() + index_build(self.build.pk) + write_manifest.assert_not_called() + + # Now it is a PR. + invalid_version.type = EXTERNAL + invalid_version.save() + index_build(self.build.pk) + write_manifest.assert_called_once() From dcea1d70f5f21db3b990a0ef3b44cf0a9443c2fe Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 21 Oct 2024 13:24:44 -0500 Subject: [PATCH 18/20] Adapt to changes on .com --- readthedocs/projects/tasks/search.py | 2 +- readthedocs/proxito/views/hosting.py | 10 +++++----- readthedocs/search/api/v2/views.py | 6 +++--- readthedocs/search/api/v3/executor.py | 11 ++++++----- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index b00d0388df9..d95ea854c0c 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -276,7 +276,7 @@ def index_build(build_id): version=version, build=build, ) - _process_files(version=version, indexers=indexers) + return _process_files(version=version, indexers=indexers) except Exception: log.exception("Failed to index build") diff --git a/readthedocs/proxito/views/hosting.py b/readthedocs/proxito/views/hosting.py index dae01d06dfc..062f0ab2619 100644 --- a/readthedocs/proxito/views/hosting.py +++ b/readthedocs/proxito/views/hosting.py @@ -312,9 +312,9 @@ def _get_versions(self, request, project): include_hidden=False, ) - def _has_permission(self, user, version): + def _has_permission(self, request, version): """ - Check if `user` is authorized to access `version`. + Check if user from the request is authorized to access `version`. This is mainly to be overridden in .com to make use of the auth backends in the proxied API. @@ -528,7 +528,7 @@ def _v1(self, project, version, build, filename, url, request): if version: response = self._get_filetreediff_response( - user=user, project=project, version=version + request=request, project=project, version=version ) if response: data["addons"]["filetreediff"].update(response) @@ -613,7 +613,7 @@ def _v1(self, project, version, build, filename, url, request): return data - def _get_filetreediff_response(self, *, user, project, version): + def _get_filetreediff_response(self, *, request, project, version): """ Get the file tree diff response for the given version. @@ -628,7 +628,7 @@ def _get_filetreediff_response(self, *, user, project, version): latest_version = project.get_latest_version() if not latest_version or not self._has_permission( - user=user, version=latest_version + request=request, version=latest_version ): return None diff --git a/readthedocs/search/api/v2/views.py b/readthedocs/search/api/v2/views.py index 9f8c5dad34f..d1cfcb2bb2b 100644 --- a/readthedocs/search/api/v2/views.py +++ b/readthedocs/search/api/v2/views.py @@ -81,7 +81,7 @@ def _get_projects_to_search(self): main_version = self._get_version() main_project = self._get_project() - if not self._has_permission(self.request.user, main_version): + if not self._has_permission(self.request, main_version): return [] projects_to_search = [(main_project, main_version)] @@ -101,7 +101,7 @@ def _get_projects_to_search(self): include_hidden=False, ) - if version and self._has_permission(self.request.user, version): + if version and self._has_permission(self.request, version): projects_to_search.append((subproject, version)) return projects_to_search @@ -125,7 +125,7 @@ def _get_project_version(self, project, version_slug, include_hidden=True): .first() ) - def _has_permission(self, user, version): + def _has_permission(self, request, version): """ Check if `user` is authorized to access `version`. diff --git a/readthedocs/search/api/v3/executor.py b/readthedocs/search/api/v3/executor.py index f5a7ff67123..e529c108493 100644 --- a/readthedocs/search/api/v3/executor.py +++ b/readthedocs/search/api/v3/executor.py @@ -93,14 +93,15 @@ def _get_projects_to_search(self): for value in self.parser.arguments["project"]: project, version = self._get_project_and_version(value) - if version and self._has_permission(self.request.user, version): + if version and self._has_permission(self.request, version): yield project, version + # TODO: respect the language of the project when including subprojects. for value in self.parser.arguments["subprojects"]: project, version = self._get_project_and_version(value) # Add the project itself. - if version and self._has_permission(self.request.user, version): + if version and self._has_permission(self.request, version): yield project, version if project: @@ -124,7 +125,7 @@ def _get_projects_from_user(self): version_slug=project.default_version, include_hidden=False, ) - if version and self._has_permission(self.request.user, version): + if version and self._has_permission(self.request, version): yield project, version def _get_subprojects(self, project, version_slug=None): @@ -153,10 +154,10 @@ def _get_subprojects(self, project, version_slug=None): include_hidden=False, ) - if version and self._has_permission(self.request.user, version): + if version and self._has_permission(self.request, version): yield subproject, version - def _has_permission(self, user, version): + def _has_permission(self, request, version): """ Check if `user` is authorized to access `version`. From 2dd5734670341bf9c9680ede16fae7572cd97b7e Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 21 Oct 2024 14:37:51 -0500 Subject: [PATCH 19/20] Fix tests --- .../rtd_tests/tests/test_imported_file.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index 8ec87a3901e..fb1a9474253 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -36,7 +36,7 @@ def setUp(self): self.test_dir = os.path.join(base_dir, "files") with override_settings(DOCROOT=self.test_dir): - self._copy_storage_dir() + self._copy_storage_dir(self.version) self._create_index() @@ -51,15 +51,15 @@ def tearDown(self): # Delete index PageDocument._index.delete(ignore=404) - def _copy_storage_dir(self): + def _copy_storage_dir(self, version): """Copy the test directory (rtd_tests/files) to storage""" self.storage.copy_directory( self.test_dir, self.project.get_storage_path( type_="html", - version_slug=self.version.slug, + version_slug=version.slug, include_file=False, - version_type=self.version.type, + version_type=version.type, ), ) @@ -108,7 +108,7 @@ def test_index_external_version(self): self.version.save() with override_settings(DOCROOT=self.test_dir): - self._copy_storage_dir() + self._copy_storage_dir(self.version) sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) @@ -314,7 +314,7 @@ def test_update_content(self): f.write("Woo") with override_settings(DOCROOT=self.test_dir): - self._copy_storage_dir() + self._copy_storage_dir(self.version) sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) @@ -333,7 +333,7 @@ def test_update_content(self): f.write("Something Else") with override_settings(DOCROOT=self.test_dir): - self._copy_storage_dir() + self._copy_storage_dir(self.version) sync_id = index_build(self.build.pk) self.assertEqual(ImportedFile.objects.count(), 3) @@ -385,19 +385,21 @@ def test_create_file_tree_manifest(self, write_manifest): write_manifest.assert_called_once_with(self.version, manifest) # The version is not the latest nor a PR. - invalid_version = get( + new_version = get( Version, project=self.project, - slug="invalid", + slug="new-version", ) - self.build.version = invalid_version + self.build.version = new_version self.build.save() write_manifest.reset_mock() index_build(self.build.pk) write_manifest.assert_not_called() # Now it is a PR. - invalid_version.type = EXTERNAL - invalid_version.save() + new_version.type = EXTERNAL + new_version.save() + with override_settings(DOCROOT=self.test_dir): + self._copy_storage_dir(new_version) index_build(self.build.pk) write_manifest.assert_called_once() From 7e27b270b3a643be067b89851e3126dfe6d3de58 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Tue, 22 Oct 2024 09:18:07 -0500 Subject: [PATCH 20/20] Updates from review --- readthedocs/filetreediff/__init__.py | 3 +-- readthedocs/filetreediff/dataclasses.py | 8 ++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/readthedocs/filetreediff/__init__.py b/readthedocs/filetreediff/__init__.py index 2d5de4043dd..65c3fd22ee1 100644 --- a/readthedocs/filetreediff/__init__.py +++ b/readthedocs/filetreediff/__init__.py @@ -17,7 +17,6 @@ """ import json -from dataclasses import asdict from readthedocs.builds.models import Version from readthedocs.filetreediff.dataclasses import FileTreeDiff, FileTreeDiffManifest @@ -109,4 +108,4 @@ def write_manifest(version: Version, manifest: FileTreeDiffManifest): ) manifest_path = build_media_storage.join(storage_path, MANIFEST_FILE_NAME) with build_media_storage.open(manifest_path, "w") as f: - json.dump(asdict(manifest), f) + json.dump(manifest.as_dict(), f) diff --git a/readthedocs/filetreediff/dataclasses.py b/readthedocs/filetreediff/dataclasses.py index abbcc04658f..d3e5186b79a 100644 --- a/readthedocs/filetreediff/dataclasses.py +++ b/readthedocs/filetreediff/dataclasses.py @@ -1,4 +1,4 @@ -from dataclasses import dataclass +from dataclasses import asdict, dataclass @dataclass(slots=True) @@ -36,7 +36,7 @@ def from_dict(cls, data: dict) -> "FileTreeDiffManifest": Create a FileTreeManifest from a dictionary. The dictionary should follow the same structure as the one returned by - converting the object to a dictionary using the `asdict` function. + converting the object to a dictionary using the `as_dict` method. """ build_id = data["build"]["id"] files = [ @@ -45,6 +45,10 @@ def from_dict(cls, data: dict) -> "FileTreeDiffManifest": ] return cls(build_id, files) + def as_dict(self) -> dict: + """Convert the object to a dictionary.""" + return asdict(self) + @dataclass class FileTreeDiff: