Skip to content

Commit 871bfe2

Browse files
committed
Search: stop relying on the DB when indexing
- Closes #10623 - Closes #10690
1 parent 9c6ade2 commit 871bfe2

File tree

4 files changed

+94
-26
lines changed

4 files changed

+94
-26
lines changed

readthedocs/builds/models.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,7 @@ def config(self):
319319
:rtype: dict
320320
"""
321321
last_build = (
322-
self.builds(manager=INTERNAL).filter(
322+
self.builds.filter(
323323
state=BUILD_STATE_FINISHED,
324324
success=True,
325325
).order_by('-date')

readthedocs/projects/tasks/search.py

+79-24
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,12 @@
22

33
import structlog
44

5-
from readthedocs.builds.constants import EXTERNAL
5+
from readthedocs.builds.constants import BUILD_STATE_FINISHED, EXTERNAL
66
from readthedocs.builds.models import Version
77
from readthedocs.projects.models import HTMLFile, ImportedFile, Project
88
from readthedocs.projects.signals import files_changed
9-
from readthedocs.search.utils import index_new_files, remove_indexed_files
9+
from readthedocs.search.utils import remove_indexed_files
10+
from django_elasticsearch_dsl.registries import registry
1011
from readthedocs.storage import build_media_storage
1112
from readthedocs.worker import app
1213

@@ -43,7 +44,7 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore):
4344
_create_imported_files(
4445
version=version,
4546
commit=commit,
46-
build=build,
47+
build_id=build,
4748
search_ranking=search_ranking,
4849
search_ignore=search_ignore,
4950
)
@@ -65,9 +66,6 @@ def _sync_imported_files(version, build):
6566
"""
6667
project = version.project
6768

68-
# Index new HTMLFiles to ElasticSearch
69-
index_new_files(model=HTMLFile, version=version, build=build)
70-
7169
# Remove old HTMLFiles from ElasticSearch
7270
remove_indexed_files(
7371
model=HTMLFile,
@@ -95,7 +93,35 @@ def remove_search_indexes(project_slug, version_slug=None):
9593
)
9694

9795

98-
def _create_imported_files(*, version, commit, build, search_ranking, search_ignore):
96+
def reindex_version(version):
97+
"""
98+
Reindex all files of this version.
99+
"""
100+
latest_successful_build = version.builds.filter(
101+
state=BUILD_STATE_FINISHED, success=True
102+
).order_by("-date").first()
103+
# If the version doesn't have a successful
104+
# build, we don't have files to index.
105+
if not latest_successful_build:
106+
return
107+
108+
search_ranking = []
109+
search_ignore = []
110+
build_config = latest_successful_build.config
111+
if build_config:
112+
search_ranking = build_config.search.ranking
113+
search_ignore = build_config.search.ignore
114+
115+
_create_imported_files(
116+
version=version,
117+
commit=latest_successful_build.commit,
118+
build_id=latest_successful_build.id,
119+
search_ranking=search_ranking,
120+
search_ignore=search_ignore,
121+
)
122+
123+
124+
def _create_imported_files(*, version, commit, build_id, search_ranking, search_ignore):
99125
"""
100126
Create imported files for version.
101127
@@ -107,6 +133,9 @@ def _create_imported_files(*, version, commit, build, search_ranking, search_ign
107133
storage_path = version.project.get_storage_path(
108134
type_='html', version_slug=version.slug, include_file=False
109135
)
136+
html_files_to_index = []
137+
html_files_to_save = []
138+
reverse_rankings = reversed(list(search_ranking.items()))
110139
for root, __, filenames in build_media_storage.walk(storage_path):
111140
for filename in filenames:
112141
# We don't care about non-HTML files
@@ -118,34 +147,60 @@ def _create_imported_files(*, version, commit, build, search_ranking, search_ign
118147
# Generate a relative path for storage similar to os.path.relpath
119148
relpath = full_path.replace(storage_path, '', 1).lstrip('/')
120149

121-
page_rank = 0
122-
# Last pattern to match takes precedence
123-
# XXX: see if we can implement another type of precedence,
124-
# like the longest pattern.
125-
reverse_rankings = reversed(list(search_ranking.items()))
126-
for pattern, rank in reverse_rankings:
127-
if fnmatch(relpath, pattern):
128-
page_rank = rank
129-
break
130-
131150
ignore = False
132-
for pattern in search_ignore:
133-
if fnmatch(relpath, pattern):
134-
ignore = True
135-
break
151+
if version.is_external:
152+
# Never index files from external versions.
153+
ignore = True
154+
else:
155+
for pattern in search_ignore:
156+
if fnmatch(relpath, pattern):
157+
ignore = True
158+
break
136159

137-
# Create imported files from new build
138-
HTMLFile.objects.create(
160+
page_rank = 0
161+
# If the file is ignored, we don't need to check for its ranking.
162+
if not ignore:
163+
# Last pattern to match takes precedence
164+
# XXX: see if we can implement another type of precedence,
165+
# like the longest pattern.
166+
for pattern, rank in reverse_rankings:
167+
if fnmatch(relpath, pattern):
168+
page_rank = rank
169+
break
170+
171+
html_file = HTMLFile(
139172
project=version.project,
140173
version=version,
141174
path=relpath,
142175
name=filename,
143176
rank=page_rank,
144177
commit=commit,
145-
build=build,
178+
build=build_id,
146179
ignore=ignore,
147180
)
148181

182+
# Don't index files that are ignored.
183+
if not ignore:
184+
html_files_to_index.append(html_file)
185+
186+
# Create the imported file only if it's a top-level 404 file,
187+
# or if it's an index file. We don't need to keep track of all files.
188+
is_top_level_404_file = filename == "404.html" and root == storage_path
189+
is_index_file = filename in ["index.html", "README.html"]
190+
if is_top_level_404_file or is_index_file:
191+
html_files_to_save.append(html_file)
192+
193+
# We first index the files in ES, and then save the objects in the DB.
194+
# This is because saving the objects in the DB will give them an id,
195+
# and we need this id to be `None` when indexing the objects in ES.
196+
# ES will generate a unique id for each document.
197+
if html_files_to_index:
198+
document = list(registry.get_documents(models=[HTMLFile]))[0]
199+
document().update(html_files_to_index)
200+
201+
if html_files_to_save:
202+
HTMLFile.objects.bulk_create(html_files_to_save)
203+
149204
# This signal is used for purging the CDN.
150205
files_changed.send(
151206
sender=Project,

readthedocs/rtd_tests/tests/test_imported_file.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def _manage_imported_files(
3636
_create_imported_files(
3737
version=version,
3838
commit=commit,
39-
build=build,
39+
build_id=build,
4040
search_ranking=search_ranking,
4141
search_ignore=search_ignore,
4242
)

readthedocs/search/management/commands/reindex_elasticsearch.py

+13
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from django.conf import settings
99
from django.core.management import BaseCommand
1010
from django_elasticsearch_dsl.registries import registry
11+
from readthedocs.builds.constants import BUILD_STATE_FINISHED
12+
from projects.tasks.search import reindex_version
1113

1214
from readthedocs.builds.models import Version
1315
from readthedocs.projects.models import HTMLFile, Project
@@ -50,6 +52,17 @@ def _run_reindex_tasks(self, models, queue):
5052

5153
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
5254

55+
# TODO: move this to where it makes sense :D
56+
qs = (
57+
Version.objects
58+
.filter(built=True, builds__state=BUILD_STATE_FINISHED, builds_success=True)
59+
.exclude(project__delisted=True)
60+
.exclude(project__is_spam=True)
61+
.select_related("project")
62+
)
63+
for version in qs.iterator():
64+
reindex_version(version)
65+
5366
for doc in registry.get_documents(models):
5467
queryset = doc().get_queryset()
5568

0 commit comments

Comments
 (0)