Skip to content

Commit 2555ae6

Browse files
committed
File tree diff
Closes #11319 Ref #11507
1 parent a182899 commit 2555ae6

File tree

6 files changed

+187
-22
lines changed

6 files changed

+187
-22
lines changed

readthedocs/filetreediff/__init__.py

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import json
2+
from dataclasses import dataclass
3+
4+
from readthedocs.builds.constants import BUILD_STATE_FINISHED
5+
from readthedocs.builds.models import Version
6+
from readthedocs.projects.constants import MEDIA_TYPE_METADATA
7+
from readthedocs.storage import build_media_storage
8+
9+
10+
@dataclass
class FileTreeDiff:
    """Paths that differ between the file manifests of two versions."""

    # Paths listed only in the first version's manifest.
    added: list[str]
    # Paths listed only in the second version's manifest.
    removed: list[str]
    # Paths listed in both manifests whose recorded hashes differ.
    modified: list[str]
15+
16+
17+
def get_diff(version_a: Version, version_b: Version) -> FileTreeDiff | None:
    """
    Compute the file tree diff between two versions from their manifests.

    Files are matched by path; a path present in both manifests is reported
    as modified when the ``hash`` values recorded for it differ.

    :param version_a: Version whose files are treated as the new state
     (paths found only here are reported as added).
    :param version_b: Version used as the base of the comparison
     (paths found only here are reported as removed).
    :returns: A ``FileTreeDiff`` with each list sorted by path, or ``None``
     when ``get_manifest`` returns nothing usable for either version.
    :raises KeyError: If a file entry shared by both manifests has no ``hash``.
    """
    manifest_a = get_manifest(version_a)
    manifest_b = get_manifest(version_b)
    if not manifest_a or not manifest_b:
        return None

    files_a = manifest_a.get("files", {})
    files_b = manifest_b.get("files", {})
    paths_a = set(files_a)
    paths_b = set(files_b)

    # Set iteration order is arbitrary (and varies between processes for
    # strings), so sort every list to give callers a stable result.
    return FileTreeDiff(
        added=sorted(paths_a - paths_b),
        removed=sorted(paths_b - paths_a),
        modified=sorted(
            path
            for path in paths_a & paths_b
            if files_a[path]["hash"] != files_b[path]["hash"]
        ),
    )
42+
43+
44+
def get_manifest(version: Version):
    """
    Load the ``manifest.json`` stored under the metadata path of ``version``.

    :returns: The parsed manifest, or ``None`` when the file is not found in
     storage, when the version has no finished and successful build, or when
     the build id recorded in the manifest differs from that build's id.
    """
    metadata_dir = version.project.get_storage_path(
        type_=MEDIA_TYPE_METADATA,
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    path = build_media_storage.join(metadata_dir, "manifest.json")
    try:
        with build_media_storage.open(path) as fd:
            data = json.load(fd)
    except FileNotFoundError:
        return None

    # NOTE(review): ``.first()`` presumably yields the most recent build;
    # that depends on the default ordering of the builds queryset -- confirm.
    successful_builds = version.builds.filter(
        state=BUILD_STATE_FINISHED,
        success=True,
    )
    build = successful_builds.first()
    if not build:
        return None

    manifest_build_id = data.get("build", {}).get("id")
    if build.id == manifest_build_id:
        return data

    # The manifest is outdated,
    # do we want to still use it? do we care?
    # Should the caller be responsible to handle this?
    return None
73+
74+
75+
def write_manifest(version: Version, manifest: dict):
    """
    Store ``manifest`` as JSON in build media storage.

    The file is written as ``manifest.json`` under the metadata storage path
    of ``version``.
    """
    target = build_media_storage.join(
        version.project.get_storage_path(
            type_=MEDIA_TYPE_METADATA,
            version_slug=version.slug,
            include_file=False,
            version_type=version.type,
        ),
        "manifest.json",
    )
    with build_media_storage.open(target, "w") as manifest_file:
        json.dump(manifest, manifest_file)

readthedocs/projects/constants.py

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
MEDIA_TYPE_EPUB = "epub"
3535
MEDIA_TYPE_HTMLZIP = "htmlzip"
3636
MEDIA_TYPE_JSON = "json"
37+
MEDIA_TYPE_METADATA = "metadata"
3738
DOWNLOADABLE_MEDIA_TYPES = (
3839
MEDIA_TYPE_PDF,
3940
MEDIA_TYPE_EPUB,
@@ -45,6 +46,7 @@
4546
MEDIA_TYPE_EPUB,
4647
MEDIA_TYPE_HTMLZIP,
4748
MEDIA_TYPE_JSON,
49+
MEDIA_TYPE_METADATA,
4850
)
4951

5052
BUILD_COMMANDS_OUTPUT_PATH = "_readthedocs/"

readthedocs/projects/models.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -1524,13 +1524,20 @@ class Meta:
15241524
objects = HTMLFileManager()
15251525

15261526
def get_processed_json(self):
1527-
parser = GenericParser(self.version)
1528-
return parser.parse(self.path)
1527+
return self._parser.parse(self.path)
1528+
1529+
@cached_property
def _parser(self):
    """Parser bound to this file's version; created on first access and reused."""
    parser = GenericParser(self.version)
    return parser
15291532

15301533
@cached_property
def processed_json(self):
    """Result of ``get_processed_json()``, computed once per instance."""
    parsed = self.get_processed_json()
    return parsed
15331536

1537+
@property
def main_content(self):
    """Main content of this file, as returned by the parser for ``self.path``."""
    parser = self._parser
    return parser.get_main_content(self.path)
1540+
15341541

15351542
class Notification(TimeStampedModel):
15361543

@@ -1887,6 +1894,7 @@ def add_features(sender, **kwargs):
18871894
RESOLVE_PROJECT_FROM_HEADER = "resolve_project_from_header"
18881895
USE_PROXIED_APIS_WITH_PREFIX = "use_proxied_apis_with_prefix"
18891896
ALLOW_VERSION_WARNING_BANNER = "allow_version_warning_banner"
1897+
GENERATE_MANIFEST_FOR_FILE_TREE_DIFF = "generate_manifest_for_file_tree_diff"
18901898

18911899
# Versions sync related features
18921900
SKIP_SYNC_TAGS = "skip_sync_tags"
@@ -1947,6 +1955,10 @@ def add_features(sender, **kwargs):
19471955
ALLOW_VERSION_WARNING_BANNER,
19481956
_("Dashboard: Allow project to use the version warning banner."),
19491957
),
1958+
(
1959+
GENERATE_MANIFEST_FOR_FILE_TREE_DIFF,
1960+
_("Build: Generate a file manifest for file tree diff."),
1961+
),
19501962
# Versions sync related features
19511963
(
19521964
SKIP_SYNC_BRANCHES,

readthedocs/projects/tasks/search.py

+54-18
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1+
import hashlib
12
from fnmatch import fnmatch
23

34
import structlog
45

5-
from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL
6+
from readthedocs.builds.constants import BUILD_STATE_FINISHED, INTERNAL, LATEST
67
from readthedocs.builds.models import Build, Version
7-
from readthedocs.projects.models import HTMLFile, Project
8+
from readthedocs.filetreediff import write_manifest
9+
from readthedocs.projects.models import Feature, HTMLFile, Project
810
from readthedocs.projects.signals import files_changed
911
from readthedocs.search.documents import PageDocument
1012
from readthedocs.search.utils import index_objects, remove_indexed_files
@@ -120,7 +122,38 @@ def collect(self, sync_id: int):
120122
self.version.imported_files.exclude(build=sync_id).delete()
121123

122124

123-
def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=None):
125+
class FileManifestIndexer(Indexer):
    """
    Indexer that records a content hash per HTML file and stores a manifest.

    ``process`` hashes the main content of each file it receives, and
    ``collect`` writes the accumulated hashes, together with the build id,
    through ``write_manifest``.
    """

    def __init__(self, version: Version, build: Build):
        self.version = version
        self.build = build
        # Maps each processed file path to the hex digest of its main content.
        self._hashes = {}

    def process(self, html_file: HTMLFile, sync_id: int):
        """Record the MD5 hex digest of the main content of ``html_file``."""
        content = html_file.main_content.encode()
        # MD5 is only a change-detection fingerprint here, not a security
        # measure; saying so keeps it usable on FIPS-restricted builds.
        digest = hashlib.md5(content, usedforsecurity=False).hexdigest()
        self._hashes[html_file.path] = digest

    def collect(self, sync_id: int):
        """Write the manifest with the build id and every recorded file hash."""
        manifest = {
            "build": {
                "id": self.build.id,
            },
            "files": {
                path: {
                    "hash": digest,
                }
                for path, digest in self._hashes.items()
            },
        }
        write_manifest(self.version, manifest)
149+
150+
151+
def _get_indexers(*, version: Version, build: Build, search_index_name=None):
152+
build_config = build.config or {}
153+
search_config = build_config.get("search", {})
154+
search_ranking = search_config.get("ranking", {})
155+
search_ignore = search_config.get("ignore", [])
156+
124157
indexers = []
125158
# NOTE: The search indexer must be before the index file indexer.
126159
# This is because saving the objects in the DB will give them an id,
@@ -136,6 +169,22 @@ def _get_indexers(*, version, search_ranking, search_ignore, search_index_name=N
136169
search_index_name=search_index_name,
137170
)
138171
indexers.append(search_indexer)
172+
173+
# File tree diff is under a feature flag for now,
174+
# and we only allow comparing PR previews against the latest version.
175+
has_feature = version.project.has_feature(
176+
Feature.GENERATE_MANIFEST_FOR_FILE_TREE_DIFF
177+
)
178+
create_manifest = has_feature and (
179+
version.is_external or version == version.slug == LATEST
180+
)
181+
if create_manifest:
182+
file_manifest_indexer = FileManifestIndexer(
183+
version=version,
184+
build=build,
185+
)
186+
indexers.append(file_manifest_indexer)
187+
139188
index_file_indexer = IndexFileIndexer(
140189
project=version.project,
141190
version=version,
@@ -230,16 +279,10 @@ def index_build(build_id):
230279
build_id=build.id,
231280
)
232281

233-
build_config = build.config or {}
234-
search_config = build_config.get("search", {})
235-
search_ranking = search_config.get("ranking", {})
236-
search_ignore = search_config.get("ignore", [])
237-
238282
try:
239283
indexers = _get_indexers(
240284
version=version,
241-
search_ranking=search_ranking,
242-
search_ignore=search_ignore,
285+
build=build,
243286
)
244287
_process_files(version=version, indexers=indexers)
245288
except Exception:
@@ -280,17 +323,10 @@ def reindex_version(version_id, search_index_name=None):
280323
version_slug=version.slug,
281324
build_id=latest_successful_build.id,
282325
)
283-
284-
build_config = latest_successful_build.config or {}
285-
search_config = build_config.get("search", {})
286-
search_ranking = search_config.get("ranking", {})
287-
search_ignore = search_config.get("ignore", [])
288-
289326
try:
290327
indexers = _get_indexers(
291328
version=version,
292-
search_ranking=search_ranking,
293-
search_ignore=search_ignore,
329+
build=latest_successful_build,
294330
search_index_name=search_index_name,
295331
)
296332
_process_files(version=version, indexers=indexers)

readthedocs/proxito/views/hosting.py

+20
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from readthedocs.core.resolver import Resolver
2424
from readthedocs.core.unresolver import UnresolverError, unresolver
2525
from readthedocs.core.utils.extend import SettingsOverrideObject
26+
from readthedocs.filetreediff import get_diff
2627
from readthedocs.projects.constants import (
2728
ADDONS_FLYOUT_SORTING_CALVER,
2829
ADDONS_FLYOUT_SORTING_CUSTOM_PATTERN,
@@ -501,9 +502,28 @@ def _v1(self, project, version, build, filename, url, request):
501502
"trigger": "Slash", # Could be something like "Ctrl + D"
502503
},
503504
},
505+
"filetreediff": {
506+
"enabled": False,
507+
},
504508
},
505509
}
506510

511+
if version.is_external:
512+
latest_version = project.get_latest_version()
513+
diff = get_diff(version_a=version, version_b=latest_version)
514+
if diff:
515+
diff_result = {
516+
"added": [{"file": file} for file in diff.added],
517+
"removed": [{"file": file} for file in diff.removed],
518+
"modified": [{"file": file} for file in diff.modified],
519+
}
520+
data["addons"]["filetreediff"].update(
521+
{
522+
"enabled": True,
523+
"diff": diff_result,
524+
}
525+
)
526+
507527
# DocDiff depends on `url=` GET attribute.
508528
# This attribute allows us to know the exact filename where the request was made.
509529
# If we don't know the filename, we cannot return the data required by DocDiff to work.

readthedocs/search/parsers.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""JSON/HTML parsers for search indexing."""
2-
2+
import functools
33
import itertools
44
import re
55

@@ -20,6 +20,7 @@ def __init__(self, version):
2020
self.project = self.version.project
2121
self.storage = build_media_storage
2222

23+
@functools.cache
2324
def _get_page_content(self, page):
2425
"""Gets the page content from storage."""
2526
content = None
@@ -34,7 +35,7 @@ def _get_page_content(self, page):
3435
content = f.read()
3536
except Exception:
3637
log.warning(
37-
"Unhandled exception during search processing file.",
38+
"Failed to get page content.",
3839
page=page,
3940
)
4041
return content
@@ -427,3 +428,13 @@ def _process_content(self, page, content):
427428
"title": title,
428429
"sections": sections,
429430
}
431+
432+
def get_main_content(self, page):
    """
    Return the HTML of the main content node of ``page``.

    :param page: Path of the page, passed to ``_get_page_content``.
    :returns: The HTML of the node selected by ``_get_main_node``, or an
     empty string when the page content is unavailable or cannot be parsed.
    """
    try:
        content = self._get_page_content(page)
        if content is None:
            # The content could not be read from storage; don't hand None
            # to the HTML parser just to have the failure caught below.
            return ""
        html = HTMLParser(content)
        body = self._get_main_node(html)
        return body.html
    except Exception:
        # Best effort: callers get an empty string instead of an error.
        log.info("Failed to get main content from page.", path=page, exc_info=True)
        return ""

0 commit comments

Comments
 (0)