Skip to content

File tree diff #11646

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions readthedocs/builds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,17 @@ def last_build(self):
def latest_build(self):
return self.builds.order_by("-date").first()

@property
def latest_successful_build(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we try to cache this somehow? I know we've been hitting issues with calling stuff like this repeatedly... Maybe not important in this PR, but I'd love to have a general solution for this.

return (
self.builds.filter(
state=BUILD_STATE_FINISHED,
success=True,
)
.order_by("-date")
.first()
)

@property
def config(self):
"""
Expand Down
112 changes: 112 additions & 0 deletions readthedocs/filetreediff/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""
Module for the file tree diff feature (FTD).

This feature is used to compare the files of two versions of a project.

The process is as follows:

- A build is triggered for a version.
- A task is triggered after the build has succeeded
to generate a manifest of the files of the version.
Currently, we only consider the latest version and pull request previews.
- The manifest contains the hash of the main content of each file.
Only HTML files are considered for now.
- The manifest is stored in the diff media storage.
- Then our application can compare the manifest to get a list of added,
deleted, and modified files between two versions.
"""

import json
from dataclasses import asdict

from readthedocs.builds.models import Version
from readthedocs.filetreediff.dataclasses import FileTreeDiff, FileTreeDiffManifest
from readthedocs.projects.constants import MEDIA_TYPE_DIFF
from readthedocs.storage import build_media_storage

MANIFEST_FILE_NAME = "manifest.json"


def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None:
"""
Get the file tree diff between two versions.

If any of the versions don't have a manifest, return None.
If the latest build of any of the versions is different from the manifest build,
the diff is marked as outdated. The client is responsible for deciding
how to handle this case.

Set operations are used to calculate the added, deleted, and modified files.
To get the modified files, we compare the main content hash of each common file.
If there are no changes between the versions, all lists will be empty.
"""
outdated = False
manifests: list[FileTreeDiffManifest] = []
for version in (version_a, version_b):
manifest = get_manifest(version)
if not manifest:
return None

latest_build = version.latest_successful_build
if not latest_build:
return None

if latest_build.id != manifest.build.id:
outdated = True

manifests.append(manifest)

# pylint: disable=unbalanced-tuple-unpacking
version_a_manifest, version_b_manifest = manifests
files_a = set(version_a_manifest.files.keys())
files_b = set(version_b_manifest.files.keys())

files_added = list(files_a - files_b)
files_deleted = list(files_b - files_a)
files_modified = []
for file_path in files_a & files_b:
file_a = version_a_manifest.files[file_path]
file_b = version_b_manifest.files[file_path]
if file_a.main_content_hash != file_b.main_content_hash:
files_modified.append(file_path)

return FileTreeDiff(
added=files_added,
deleted=files_deleted,
modified=files_modified,
outdated=outdated,
)


def get_manifest(version: Version) -> FileTreeDiffManifest | None:
"""
Get the file manifest for a version.

If the manifest file does not exist, return None.
"""
storage_path = version.project.get_storage_path(
type_=MEDIA_TYPE_DIFF,
version_slug=version.slug,
include_file=False,
version_type=version.type,
)
manifest_path = build_media_storage.join(storage_path, MANIFEST_FILE_NAME)
try:
with build_media_storage.open(manifest_path) as manifest_file:
manifest = json.load(manifest_file)
except FileNotFoundError:
return None

return FileTreeDiffManifest.from_dict(manifest)


def write_manifest(version: Version, manifest: FileTreeDiffManifest):
storage_path = version.project.get_storage_path(
type_=MEDIA_TYPE_DIFF,
version_slug=version.slug,
include_file=False,
version_type=version.type,
)
manifest_path = build_media_storage.join(storage_path, MANIFEST_FILE_NAME)
with build_media_storage.open(manifest_path, "w") as f:
json.dump(asdict(manifest), f)
60 changes: 60 additions & 0 deletions readthedocs/filetreediff/dataclasses.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from dataclasses import dataclass


@dataclass(slots=True)
class FileTreeDiffBuild:

"""The build associated with a file tree manifest."""

id: int


@dataclass(slots=True)
class FileTreeDiffFile:

"""A file in a file tree manifest."""

path: str
main_content_hash: str


@dataclass(slots=True)
class FileTreeDiffManifest:

"""A list of files and the build associated with them."""

files: dict[str, FileTreeDiffFile]
build: FileTreeDiffBuild

def __init__(self, build_id: int, files: list[FileTreeDiffFile]):
self.build = FileTreeDiffBuild(id=build_id)
self.files = {file.path: file for file in files}

@classmethod
def from_dict(cls, data: dict) -> "FileTreeDiffManifest":
"""
Create a FileTreeManifest from a dictionary.

The dictionary should follow the same structure as the one returned by
converting the object to a dictionary using the `asdict` function.
"""
build_id = data["build"]["id"]
files = [
FileTreeDiffFile(path=path, main_content_hash=file["main_content_hash"])
for path, file in data["files"].items()
]
return cls(build_id, files)


@dataclass
class FileTreeDiff:

"""Difference between two file tree manifests."""

added: list[str]
deleted: list[str]
modified: list[str]
outdated: bool = False

def has_changes(self) -> bool:
return bool(self.added or self.deleted or self.modified)
134 changes: 134 additions & 0 deletions readthedocs/filetreediff/tests/test_filetreediff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import json
from contextlib import contextmanager
from unittest import mock

from django.test import TestCase
from django_dynamic_fixture import get

from readthedocs.builds.constants import BUILD_STATE_FINISHED, LATEST
from readthedocs.builds.models import Build, Version
from readthedocs.filetreediff import get_diff
from readthedocs.projects.models import Project
from readthedocs.rtd_tests.storage import BuildMediaFileSystemStorageTest


# We are overriding the storage class instead of using RTD_BUILD_MEDIA_STORAGE,
# since the setting is evaluated just once (first test to use the storage
# backend will set it for the whole test suite).
@mock.patch(
"readthedocs.filetreediff.build_media_storage",
new=BuildMediaFileSystemStorageTest(),
)
class TestsFileTreeDiff(TestCase):
def setUp(self):
self.project = get(Project)
self.version_a = self.project.versions.get(slug=LATEST)
self.build_a = get(
Build,
project=self.project,
version=self.version_a,
state=BUILD_STATE_FINISHED,
success=True,
)
self.version_b = get(
Version,
project=self.project,
slug="v2",
active=True,
built=True,
)
self.build_b = get(
Build,
project=self.project,
version=self.version_b,
state=BUILD_STATE_FINISHED,
success=True,
)

def _mock_open(self, content):
@contextmanager
def f(*args, **kwargs):
read_mock = mock.MagicMock()
read_mock.read.return_value = content
yield read_mock

return f

def _mock_manifest(self, build_id: int, files: dict[str, str]):
return self._mock_open(
json.dumps(
{
"build": {"id": build_id},
"files": {
file_path: {"main_content_hash": main_content_hash}
for file_path, main_content_hash in files.items()
},
}
)
)

@mock.patch.object(BuildMediaFileSystemStorageTest, "open")
def test_diff_no_changes(self, storage_open):
files_a = {
"index.html": "hash1",
"tutorials/index.html": "hash2",
}
storage_open.side_effect = [
self._mock_manifest(self.build_a.id, files_a)(),
self._mock_manifest(self.build_b.id, files_a)(),
]
diff = get_diff(self.version_a, self.version_b)
assert diff.added == []
assert diff.deleted == []
assert diff.modified == []
assert not diff.outdated

@mock.patch.object(BuildMediaFileSystemStorageTest, "open")
def test_diff_changes(self, storage_open):
files_a = {
"index.html": "hash1",
"tutorials/index.html": "hash2",
"new-file.html": "hash-new",
}
files_b = {
"index.html": "hash1",
"tutorials/index.html": "hash-changed",
"deleted.html": "hash-deleted",
}
storage_open.side_effect = [
self._mock_manifest(self.build_a.id, files_a)(),
self._mock_manifest(self.build_b.id, files_b)(),
]
diff = get_diff(self.version_a, self.version_b)
assert diff.added == ["new-file.html"]
assert diff.deleted == ["deleted.html"]
assert diff.modified == ["tutorials/index.html"]
assert not diff.outdated

@mock.patch.object(BuildMediaFileSystemStorageTest, "open")
def test_missing_manifest(self, storage_open):
storage_open.side_effect = FileNotFoundError
diff = get_diff(self.version_a, self.version_b)
assert diff is None

@mock.patch.object(BuildMediaFileSystemStorageTest, "open")
def test_outdated_diff(self, storage_open):
files_a = {
"index.html": "hash1",
"tutorials/index.html": "hash2",
"new-file.html": "hash-new",
}
files_b = {
"index.html": "hash1",
"tutorials/index.html": "hash-changed",
"deleted.html": "hash-deleted",
}
storage_open.side_effect = [
self._mock_manifest(self.build_a.id + 5, files_a)(),
self._mock_manifest(self.build_b.id + 5, files_b)(),
]
diff = get_diff(self.version_a, self.version_b)
assert diff.added == ["new-file.html"]
assert diff.deleted == ["deleted.html"]
assert diff.modified == ["tutorials/index.html"]
assert diff.outdated
2 changes: 2 additions & 0 deletions readthedocs/projects/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
MEDIA_TYPE_EPUB = "epub"
MEDIA_TYPE_HTMLZIP = "htmlzip"
MEDIA_TYPE_JSON = "json"
MEDIA_TYPE_DIFF = "diff"
DOWNLOADABLE_MEDIA_TYPES = (
MEDIA_TYPE_PDF,
MEDIA_TYPE_EPUB,
Expand All @@ -45,6 +46,7 @@
MEDIA_TYPE_EPUB,
MEDIA_TYPE_HTMLZIP,
MEDIA_TYPE_JSON,
MEDIA_TYPE_DIFF,
)

BUILD_COMMANDS_OUTPUT_PATH = "_readthedocs/"
Expand Down
5 changes: 5 additions & 0 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1895,6 +1895,7 @@ def add_features(sender, **kwargs):
RESOLVE_PROJECT_FROM_HEADER = "resolve_project_from_header"
USE_PROXIED_APIS_WITH_PREFIX = "use_proxied_apis_with_prefix"
ALLOW_VERSION_WARNING_BANNER = "allow_version_warning_banner"
GENERATE_MANIFEST_FOR_FILE_TREE_DIFF = "generate_manifest_for_file_tree_diff"

# Versions sync related features
SKIP_SYNC_TAGS = "skip_sync_tags"
Expand Down Expand Up @@ -1955,6 +1956,10 @@ def add_features(sender, **kwargs):
ALLOW_VERSION_WARNING_BANNER,
_("Dashboard: Allow project to use the version warning banner."),
),
(
GENERATE_MANIFEST_FOR_FILE_TREE_DIFF,
_("Build: Generate a file manifest for file tree diff."),
),
# Versions sync related features
(
SKIP_SYNC_BRANCHES,
Expand Down
Loading