Skip to content

Commit 1302f94

Browse files
authored
Build: don't track changed files (#7874)
We aren't caching per file, and to do this we are tracking the md5 for each file. Which involves reading te file, calculate the md5, and query the db for each html file. We also don't need the HTMLFile model at all.. But, we can remove that in another PR, this should help to make indexing fast.
1 parent a6e26a3 commit 1302f94

File tree

3 files changed

+11
-60
lines changed

3 files changed

+11
-60
lines changed

readthedocs/projects/signals.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import django.dispatch
44

5-
65
before_vcs = django.dispatch.Signal(providing_args=['version', 'environmemt'])
76
after_vcs = django.dispatch.Signal(providing_args=['version'])
87

@@ -12,4 +11,4 @@
1211
project_import = django.dispatch.Signal(providing_args=['project'])
1312

1413
# Used to purge files from the CDN
15-
files_changed = django.dispatch.Signal(providing_args=['project', 'version', 'files'])
14+
files_changed = django.dispatch.Signal(providing_args=['project', 'version'])

readthedocs/projects/tasks.py

+7-49
Original file line numberDiff line numberDiff line change
@@ -1340,15 +1340,14 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore):
13401340
},
13411341
)
13421342
try:
1343-
changed_files = _create_imported_files(
1343+
_create_imported_files(
13441344
version=version,
13451345
commit=commit,
13461346
build=build,
13471347
search_ranking=search_ranking,
13481348
search_ignore=search_ignore,
13491349
)
13501350
except Exception:
1351-
changed_files = set()
13521351
log.exception('Failed during ImportedFile creation')
13531352

13541353
try:
@@ -1357,7 +1356,7 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore):
13571356
log.exception('Failed during SphinxDomain creation')
13581357

13591358
try:
1360-
_sync_imported_files(version, build, changed_files)
1359+
_sync_imported_files(version, build)
13611360
except Exception:
13621361
log.exception('Failed during ImportedFile syncing')
13631362

@@ -1530,57 +1529,22 @@ def _create_imported_files(*, version, commit, build, search_ranking, search_ign
15301529
:param version: Version instance
15311530
:param commit: Commit that updated path
15321531
:param build: Build id
1533-
:returns: paths of changed files
1534-
:rtype: set
15351532
"""
1536-
changed_files = set()
1537-
15381533
# Re-create all objects from the new build of the version
15391534
storage_path = version.project.get_storage_path(
15401535
type_='html', version_slug=version.slug, include_file=False
15411536
)
15421537
for root, __, filenames in build_media_storage.walk(storage_path):
15431538
for filename in filenames:
1544-
if filename.endswith('.html'):
1545-
model_class = HTMLFile
1546-
elif version.project.cdn_enabled:
1547-
# We need to track all files for CDN enabled projects so the files can be purged
1548-
model_class = ImportedFile
1549-
else:
1550-
# For projects not behind a CDN, we don't care about non-HTML
1539+
# We don't care about non-HTML files
1540+
if not filename.endswith('.html'):
15511541
continue
15521542

15531543
full_path = build_media_storage.join(root, filename)
15541544

15551545
# Generate a relative path for storage similar to os.path.relpath
15561546
relpath = full_path.replace(storage_path, '', 1).lstrip('/')
15571547

1558-
try:
1559-
md5 = hashlib.md5(build_media_storage.open(full_path, 'rb').read()).hexdigest()
1560-
except Exception:
1561-
log.exception(
1562-
'Error while generating md5 for %s:%s:%s. Don\'t stop.',
1563-
version.project.slug,
1564-
version.slug,
1565-
relpath,
1566-
)
1567-
md5 = ''
1568-
# Keep track of changed files to be purged in the CDN
1569-
obj = (
1570-
model_class.objects
1571-
.filter(project=version.project, version=version, path=relpath)
1572-
.order_by('-modified_date')
1573-
.first()
1574-
)
1575-
if obj and md5 and obj.md5 != md5:
1576-
changed_files.add(
1577-
resolve_path(
1578-
version.project,
1579-
filename=relpath,
1580-
version_slug=version.slug,
1581-
),
1582-
)
1583-
15841548
page_rank = 0
15851549
# Last pattern to match takes precedence
15861550
# XXX: see if we can implement another type of precedence,
@@ -1598,37 +1562,31 @@ def _create_imported_files(*, version, commit, build, search_ranking, search_ign
15981562
break
15991563

16001564
# Create imported files from new build
1601-
model_class.objects.create(
1565+
HTMLFile.objects.create(
16021566
project=version.project,
16031567
version=version,
16041568
path=relpath,
16051569
name=filename,
1606-
md5=md5,
16071570
rank=page_rank,
16081571
commit=commit,
16091572
build=build,
16101573
ignore=ignore,
16111574
)
16121575

1613-
# This signal is used for clearing the CDN,
1614-
# so send it as soon as we have the list of changed files
1576+
# This signal is used for purging the CDN.
16151577
files_changed.send(
16161578
sender=Project,
16171579
project=version.project,
16181580
version=version,
1619-
files=changed_files,
16201581
)
16211582

1622-
return changed_files
1623-
16241583

1625-
def _sync_imported_files(version, build, changed_files):
1584+
def _sync_imported_files(version, build):
16261585
"""
16271586
Sync/Update/Delete ImportedFiles objects of this version.
16281587
16291588
:param version: Version instance
16301589
:param build: Build id
1631-
:param changed_files: path of changed files
16321590
"""
16331591

16341592
# Index new HTMLFiles to ElasticSearch

readthedocs/rtd_tests/tests/test_imported_file.py

+3-9
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def _manage_imported_files(
4747
search_ranking=search_ranking,
4848
search_ignore=search_ignore,
4949
)
50-
_sync_imported_files(version, build, set())
50+
_sync_imported_files(version, build)
5151

5252
def _copy_storage_dir(self):
5353
"""Copy the test directory (rtd_tests/files) to storage"""
@@ -63,18 +63,14 @@ def _copy_storage_dir(self):
6363
def test_properly_created(self):
6464
# Only 2 files in the directory is HTML (test.html, api/index.html)
6565
self.assertEqual(ImportedFile.objects.count(), 0)
66-
self._manage_imported_files(self.version, 'commit01', 1)
66+
self._manage_imported_files(version=self.version, commit='commit01', build=1)
6767
self.assertEqual(ImportedFile.objects.count(), 2)
68-
self._manage_imported_files(self.version, 'commit01', 2)
68+
self._manage_imported_files(version=self.version, commit='commit01', build=2)
6969
self.assertEqual(ImportedFile.objects.count(), 2)
7070

7171
self.project.cdn_enabled = True
7272
self.project.save()
7373

74-
# CDN enabled projects => save all files
75-
self._manage_imported_files(self.version, 'commit01', 3)
76-
self.assertEqual(ImportedFile.objects.count(), 4)
77-
7874
def test_update_commit(self):
7975
self.assertEqual(ImportedFile.objects.count(), 0)
8076
self._manage_imported_files(self.version, 'commit01', 1)
@@ -168,7 +164,6 @@ def test_update_content(self):
168164
self._copy_storage_dir()
169165

170166
self._manage_imported_files(self.version, 'commit01', 1)
171-
self.assertEqual(ImportedFile.objects.get(name='test.html').md5, 'c7532f22a052d716f7b2310fb52ad981')
172167
self.assertEqual(ImportedFile.objects.count(), 2)
173168

174169
with open(os.path.join(test_dir, 'test.html'), 'w+') as f:
@@ -177,7 +172,6 @@ def test_update_content(self):
177172
self._copy_storage_dir()
178173

179174
self._manage_imported_files(self.version, 'commit02', 2)
180-
self.assertNotEqual(ImportedFile.objects.get(name='test.html').md5, 'c7532f22a052d716f7b2310fb52ad981')
181175
self.assertEqual(ImportedFile.objects.count(), 2)
182176

183177
@override_settings(PRODUCTION_DOMAIN='readthedocs.org')

0 commit comments

Comments
 (0)