Skip to content

Commit 5c8d47e

Browse files
authored
File tree diff (#11646)
Closes #11319 Ref #11507
1 parent 9451d9c commit 5c8d47e

28 files changed

+686
-98
lines changed

readthedocs/builds/models.py

+11
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,17 @@ def last_build(self):
299299
def latest_build(self):
300300
return self.builds.order_by("-date").first()
301301

302+
@property
def latest_successful_build(self):
    """
    Most recent build of this version that finished successfully.

    Only builds in the finished state with ``success=True`` are considered,
    newest first by ``date``. Evaluates to ``None`` when no build matches.
    """
    successful_builds = self.builds.filter(
        state=BUILD_STATE_FINISHED,
        success=True,
    )
    return successful_builds.order_by("-date").first()
312+
302313
@property
303314
def config(self):
304315
"""

readthedocs/filetreediff/__init__.py

+111
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
"""
2+
Module for the file tree diff feature (FTD).
3+
4+
This feature is used to compare the files of two versions of a project.
5+
6+
The process is as follows:
7+
8+
- A build is triggered for a version.
9+
- A task is triggered after the build has succeeded
10+
to generate a manifest of the files of the version.
11+
Currently, we only consider the latest version and pull request previews.
12+
- The manifest contains the hash of the main content of each file.
13+
Only HTML files are considered for now.
14+
- The manifest is stored in the diff media storage.
15+
- Then our application can compare the manifests of two versions to get a list of added,
16+
deleted, and modified files between two versions.
17+
"""
18+
19+
import json
20+
21+
from readthedocs.builds.models import Version
22+
from readthedocs.filetreediff.dataclasses import FileTreeDiff, FileTreeDiffManifest
23+
from readthedocs.projects.constants import MEDIA_TYPE_DIFF
24+
from readthedocs.storage import build_media_storage
25+
26+
MANIFEST_FILE_NAME = "manifest.json"
27+
28+
29+
def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None:
    """
    Get the file tree diff between two versions.

    The diff is expressed from the point of view of ``version_a``:
    files found only in ``version_a`` are reported as added, and files
    found only in ``version_b`` are reported as deleted.

    If either version doesn't have a manifest or a successful build, return None.
    If the latest successful build of either version is different from the
    build recorded in its manifest, the diff is marked as outdated.
    The client is responsible for deciding how to handle this case.

    Set operations are used to calculate the added, deleted, and modified files.
    To get the modified files, we compare the main content hash of each common file.
    If there are no changes between the versions, all lists will be empty.
    All lists are sorted by path, so the result is deterministic.
    """
    outdated = False
    manifests: list[FileTreeDiffManifest] = []
    for version in (version_a, version_b):
        manifest = get_manifest(version)
        if manifest is None:
            return None

        latest_build = version.latest_successful_build
        if latest_build is None:
            return None

        # The manifest was generated by an older build than the latest
        # successful one, so the diff may not reflect the current content.
        if latest_build.id != manifest.build.id:
            outdated = True

        manifests.append(manifest)

    # pylint: disable=unbalanced-tuple-unpacking
    version_a_manifest, version_b_manifest = manifests
    files_a = version_a_manifest.files
    files_b = version_b_manifest.files
    paths_a = set(files_a)
    paths_b = set(files_b)

    # Set iteration order is arbitrary, so sort every list to keep the
    # output stable across runs.
    files_added = sorted(paths_a - paths_b)
    files_deleted = sorted(paths_b - paths_a)
    files_modified = sorted(
        path
        for path in paths_a & paths_b
        if files_a[path].main_content_hash != files_b[path].main_content_hash
    )

    return FileTreeDiff(
        added=files_added,
        deleted=files_deleted,
        modified=files_modified,
        outdated=outdated,
    )
78+
79+
80+
def get_manifest(version: Version) -> FileTreeDiffManifest | None:
    """
    Load the file tree manifest stored for a version.

    The manifest is read as JSON from the version's diff media path in the
    build media storage. Returns None when the manifest file does not exist.
    """
    manifest_dir = version.project.get_storage_path(
        type_=MEDIA_TYPE_DIFF,
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    location = build_media_storage.join(manifest_dir, MANIFEST_FILE_NAME)
    try:
        with build_media_storage.open(location) as fd:
            payload = json.load(fd)
    except FileNotFoundError:
        return None
    else:
        return FileTreeDiffManifest.from_dict(payload)
100+
101+
102+
def write_manifest(version: Version, manifest: FileTreeDiffManifest):
    """
    Store the file tree manifest of a version.

    The manifest is serialized to JSON and written to the version's diff
    media path in the build media storage.
    """
    manifest_dir = version.project.get_storage_path(
        type_=MEDIA_TYPE_DIFF,
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    destination = build_media_storage.join(manifest_dir, MANIFEST_FILE_NAME)
    with build_media_storage.open(destination, "w") as manifest_file:
        json.dump(manifest.as_dict(), manifest_file)
+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
from dataclasses import asdict, dataclass
2+
3+
4+
@dataclass(slots=True)
5+
class FileTreeDiffBuild:
6+
7+
"""The build associated with a file tree manifest."""
8+
9+
id: int
10+
11+
12+
@dataclass(slots=True)
13+
class FileTreeDiffFile:
14+
15+
"""A file in a file tree manifest."""
16+
17+
path: str
18+
main_content_hash: str
19+
20+
21+
@dataclass(slots=True)
class FileTreeDiffManifest:
    """Files of a manifest, indexed by path, plus the build they belong to."""

    files: dict[str, FileTreeDiffFile]
    build: FileTreeDiffBuild

    def __init__(self, build_id: int, files: list[FileTreeDiffFile]):
        # Hand-written initializer: callers pass a build id and a flat list
        # of files, which are stored as a build object and a path-keyed dict.
        self.build = FileTreeDiffBuild(id=build_id)
        self.files = {}
        for entry in files:
            self.files[entry.path] = entry

    @classmethod
    def from_dict(cls, data: dict) -> "FileTreeDiffManifest":
        """
        Create a FileTreeDiffManifest from a dictionary.

        The dictionary is expected to have the structure produced by the
        `as_dict` method: a ``build`` mapping with an ``id``, and a ``files``
        mapping from path to a mapping containing ``main_content_hash``.
        """
        build_id = data["build"]["id"]
        entries = []
        for path, attrs in data["files"].items():
            entries.append(
                FileTreeDiffFile(
                    path=path,
                    main_content_hash=attrs["main_content_hash"],
                )
            )
        return cls(build_id, entries)

    def as_dict(self) -> dict:
        """Return the manifest, including nested objects, as plain dictionaries."""
        return asdict(self)
51+
52+
53+
@dataclass
class FileTreeDiff:
    """Result of comparing two file tree manifests."""

    # Paths in each category of change.
    added: list[str]
    deleted: list[str]
    modified: list[str]
    # Flag set by the producer of the diff when it may be stale.
    outdated: bool = False

    def has_changes(self) -> bool:
        """Tell whether at least one file was added, deleted, or modified."""
        return any((self.added, self.deleted, self.modified))
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
import json
2+
from contextlib import contextmanager
3+
from unittest import mock
4+
5+
from django.test import TestCase
6+
from django_dynamic_fixture import get
7+
8+
from readthedocs.builds.constants import BUILD_STATE_FINISHED, LATEST
9+
from readthedocs.builds.models import Build, Version
10+
from readthedocs.filetreediff import get_diff
11+
from readthedocs.projects.models import Project
12+
from readthedocs.rtd_tests.storage import BuildMediaFileSystemStorageTest
13+
14+
15+
# We are overriding the storage class instead of using RTD_BUILD_MEDIA_STORAGE,
16+
# since the setting is evaluated just once (first test to use the storage
17+
# backend will set it for the whole test suite).
18+
@mock.patch(
    "readthedocs.filetreediff.build_media_storage",
    new=BuildMediaFileSystemStorageTest(),
)
class TestsFileTreeDiff(TestCase):
    """Tests for ``get_diff`` with manifests served by a mocked storage ``open``."""

    def setUp(self):
        # One project with two versions, each having a finished, successful build.
        self.project = get(Project)
        self.version_a = self.project.versions.get(slug=LATEST)
        self.build_a = get(
            Build,
            project=self.project,
            version=self.version_a,
            state=BUILD_STATE_FINISHED,
            success=True,
        )
        self.version_b = get(
            Version,
            project=self.project,
            slug="v2",
            active=True,
            built=True,
        )
        self.build_b = get(
            Build,
            project=self.project,
            version=self.version_b,
            state=BUILD_STATE_FINISHED,
            success=True,
        )

    def _mock_open(self, content):
        # Build a factory whose result is a context manager yielding a
        # file-like mock; reading from that mock returns ``content``.
        @contextmanager
        def opener(*args, **kwargs):
            fake_file = mock.MagicMock()
            fake_file.read.return_value = content
            yield fake_file

        return opener

    def _mock_manifest(self, build_id: int, files: dict[str, str]):
        # Serialize a manifest with the same layout the application reads:
        # a build id plus a mapping of path -> main content hash.
        payload = {
            "build": {"id": build_id},
            "files": {
                path: {"main_content_hash": digest}
                for path, digest in files.items()
            },
        }
        return self._mock_open(json.dumps(payload))

    @mock.patch.object(BuildMediaFileSystemStorageTest, "open")
    def test_diff_no_changes(self, storage_open):
        # Both versions share exactly the same files and hashes.
        shared_files = {
            "index.html": "hash1",
            "tutorials/index.html": "hash2",
        }
        storage_open.side_effect = [
            self._mock_manifest(self.build_a.id, shared_files)(),
            self._mock_manifest(self.build_b.id, shared_files)(),
        ]
        result = get_diff(self.version_a, self.version_b)
        assert result.added == []
        assert result.deleted == []
        assert result.modified == []
        assert not result.outdated

    @mock.patch.object(BuildMediaFileSystemStorageTest, "open")
    def test_diff_changes(self, storage_open):
        # One file only in A, one only in B, and one common file whose hash differs.
        manifest_a_files = {
            "index.html": "hash1",
            "tutorials/index.html": "hash2",
            "new-file.html": "hash-new",
        }
        manifest_b_files = {
            "index.html": "hash1",
            "tutorials/index.html": "hash-changed",
            "deleted.html": "hash-deleted",
        }
        storage_open.side_effect = [
            self._mock_manifest(self.build_a.id, manifest_a_files)(),
            self._mock_manifest(self.build_b.id, manifest_b_files)(),
        ]
        result = get_diff(self.version_a, self.version_b)
        assert result.added == ["new-file.html"]
        assert result.deleted == ["deleted.html"]
        assert result.modified == ["tutorials/index.html"]
        assert not result.outdated

    @mock.patch.object(BuildMediaFileSystemStorageTest, "open")
    def test_missing_manifest(self, storage_open):
        # Opening the manifest fails, so no diff can be produced.
        storage_open.side_effect = FileNotFoundError
        result = get_diff(self.version_a, self.version_b)
        assert result is None

    @mock.patch.object(BuildMediaFileSystemStorageTest, "open")
    def test_outdated_diff(self, storage_open):
        # The manifests reference build ids other than the versions' builds.
        manifest_a_files = {
            "index.html": "hash1",
            "tutorials/index.html": "hash2",
            "new-file.html": "hash-new",
        }
        manifest_b_files = {
            "index.html": "hash1",
            "tutorials/index.html": "hash-changed",
            "deleted.html": "hash-deleted",
        }
        storage_open.side_effect = [
            self._mock_manifest(self.build_a.id + 5, manifest_a_files)(),
            self._mock_manifest(self.build_b.id + 5, manifest_b_files)(),
        ]
        result = get_diff(self.version_a, self.version_b)
        assert result.added == ["new-file.html"]
        assert result.deleted == ["deleted.html"]
        assert result.modified == ["tutorials/index.html"]
        assert result.outdated

readthedocs/projects/constants.py

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
MEDIA_TYPE_EPUB = "epub"
3535
MEDIA_TYPE_HTMLZIP = "htmlzip"
3636
MEDIA_TYPE_JSON = "json"
37+
MEDIA_TYPE_DIFF = "diff"
3738
DOWNLOADABLE_MEDIA_TYPES = (
3839
MEDIA_TYPE_PDF,
3940
MEDIA_TYPE_EPUB,
@@ -45,6 +46,7 @@
4546
MEDIA_TYPE_EPUB,
4647
MEDIA_TYPE_HTMLZIP,
4748
MEDIA_TYPE_JSON,
49+
MEDIA_TYPE_DIFF,
4850
)
4951

5052
BUILD_COMMANDS_OUTPUT_PATH = "_readthedocs/"

readthedocs/projects/models.py

+5
Original file line numberDiff line numberDiff line change
@@ -1895,6 +1895,7 @@ def add_features(sender, **kwargs):
18951895
RESOLVE_PROJECT_FROM_HEADER = "resolve_project_from_header"
18961896
USE_PROXIED_APIS_WITH_PREFIX = "use_proxied_apis_with_prefix"
18971897
ALLOW_VERSION_WARNING_BANNER = "allow_version_warning_banner"
1898+
GENERATE_MANIFEST_FOR_FILE_TREE_DIFF = "generate_manifest_for_file_tree_diff"
18981899

18991900
# Versions sync related features
19001901
SKIP_SYNC_TAGS = "skip_sync_tags"
@@ -1955,6 +1956,10 @@ def add_features(sender, **kwargs):
19551956
ALLOW_VERSION_WARNING_BANNER,
19561957
_("Dashboard: Allow project to use the version warning banner."),
19571958
),
1959+
(
1960+
GENERATE_MANIFEST_FOR_FILE_TREE_DIFF,
1961+
_("Build: Generate a file manifest for file tree diff."),
1962+
),
19581963
# Versions sync related features
19591964
(
19601965
SKIP_SYNC_BRANCHES,

0 commit comments

Comments
 (0)