Skip to content

Commit fa54900

Browse files
authored
Search: stop relying on the DB when indexing (#10696)
- Removed the "wipe" actions from the admin instead of porting them, since I'm not sure that we need an action in the admin just to delete the search index of a project. Re-index seems useful. - `fileify` was replaced by `index_build`, and it only requires the build id to be passed, any other information can be retrieved from the build/version object. - `fileify` isn't removed in this PR to avoid downtimes during deploy, it's safe to keep it around till next deploy. - New code is avoiding any deep connection to the django-elasticsearch-dsl package, since it doesn't make sense anymore to have it, and I'm planning on removing it. - We are no longer tracking all files in the DB, only the ones of interest. - Re-indexing a version will also re-evaluate the files from the DB, useful for old projects that are out of sync. - The reindex command now generates taks per-version rather than per-collection of files, since we no longer track all files in the DB. - Closes #10623 - Closes #10690 We don't need to do anything special during deploy, zero downtime out of the box. We can trigger a re-index for all versions if we want to delete the HTML files that we don't need from the DB, but that operation will also re-index their contents in ES, so probably better do that after we are all settled with any changes to ES.
1 parent 31f0da5 commit fa54900

File tree

16 files changed

+587
-435
lines changed

16 files changed

+587
-435
lines changed

docs/dev/server-side-search.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ You can fix this by deleting the page index and :ref:`re-indexing <server-side-s
6262
.. prompt:: bash
6363

6464
inv docker.manage 'search_index --delete'
65-
inv docker.manage reindex_elasticsearch
65+
inv docker.manage 'reindex_elasticsearch --queue web'
6666

6767
How we index documentations
6868
~~~~~~~~~~~~~~~~~~~~~~~~~~~

readthedocs/builds/admin.py

+4-35
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@
1212
)
1313
from readthedocs.core.utils import trigger_build
1414
from readthedocs.core.utils.admin import pretty_json_field
15-
from readthedocs.projects.models import HTMLFile
16-
from readthedocs.search.utils import _indexing_helper
15+
from readthedocs.projects.tasks.search import reindex_version
1716

1817

1918
class BuildCommandResultInline(admin.TabularInline):
@@ -89,7 +88,7 @@ class VersionAdmin(admin.ModelAdmin):
8988
list_filter = ("type", "privacy_level", "active", "built")
9089
search_fields = ("slug", "project__slug")
9190
raw_id_fields = ("project",)
92-
actions = ["build_version", "reindex_version", "wipe_version_indexes"]
91+
actions = ["build_version", "reindex_version"]
9392

9493
def project_slug(self, obj):
9594
return obj.project.slug
@@ -117,41 +116,11 @@ def build_version(self, request, queryset):
117116
@admin.action(description="Reindex version to ES")
118117
def reindex_version(self, request, queryset):
119118
"""Reindexes all selected versions to ES."""
120-
html_objs_qs = []
121-
for version in queryset.iterator():
122-
html_objs = HTMLFile.objects.filter(
123-
project=version.project, version=version
124-
)
125-
126-
if html_objs.exists():
127-
html_objs_qs.append(html_objs)
128-
129-
if html_objs_qs:
130-
_indexing_helper(html_objs_qs, wipe=False)
119+
for version_id in queryset.values_list("id", flat=True).iterator():
120+
reindex_version.delay(version_id)
131121

132122
self.message_user(request, "Task initiated successfully.", messages.SUCCESS)
133123

134-
@admin.action(description="Wipe version from ES")
135-
def wipe_version_indexes(self, request, queryset):
136-
"""Wipe selected versions from ES."""
137-
html_objs_qs = []
138-
for version in queryset.iterator():
139-
html_objs = HTMLFile.objects.filter(
140-
project=version.project, version=version
141-
)
142-
143-
if html_objs.exists():
144-
html_objs_qs.append(html_objs)
145-
146-
if html_objs_qs:
147-
_indexing_helper(html_objs_qs, wipe=True)
148-
149-
self.message_user(
150-
request,
151-
"Task initiated successfully",
152-
messages.SUCCESS,
153-
)
154-
155124

156125
@admin.register(RegexAutomationRule)
157126
class RegexAutomationRuleAdmin(PolymorphicChildModelAdmin, admin.ModelAdmin):

readthedocs/builds/models.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,7 @@ def config(self):
320320
:rtype: dict
321321
"""
322322
last_build = (
323-
self.builds(manager=INTERNAL).filter(
323+
self.builds.filter(
324324
state=BUILD_STATE_FINISHED,
325325
success=True,
326326
).order_by('-date')

readthedocs/builds/querysets.py

+22
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,28 @@ def public(
118118
def api(self, user=None):
119119
return self.public(user, only_active=False)
120120

121+
def for_reindex(self):
122+
"""
123+
Get all versions that can be reindexed.
124+
125+
A version can be reindexed if:
126+
127+
- It's active and has been built at least once successfully.
128+
Since that means that it has files to be indexed.
129+
- Its project is not delisted or marked as spam.
130+
"""
131+
return (
132+
self.filter(
133+
active=True,
134+
built=True,
135+
builds__state=BUILD_STATE_FINISHED,
136+
builds__success=True,
137+
)
138+
.exclude(project__delisted=True)
139+
.exclude(project__is_spam=True)
140+
.distinct()
141+
)
142+
121143

122144
class VersionQuerySet(SettingsOverrideObject):
123145
_default_class = VersionQuerySetBase

readthedocs/projects/admin.py

+8-49
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111
from readthedocs.core.history import ExtraSimpleHistoryAdmin, set_change_reason
1212
from readthedocs.core.utils import trigger_build
1313
from readthedocs.notifications.views import SendNotificationView
14+
from readthedocs.projects.tasks.search import reindex_version
1415
from readthedocs.redirects.models import Redirect
15-
from readthedocs.search.utils import _indexing_helper
1616

1717
from .forms import FeatureForm
1818
from .models import (
@@ -264,7 +264,6 @@ class ProjectAdmin(ExtraSimpleHistoryAdmin):
264264
"run_spam_rule_checks",
265265
"build_default_version",
266266
"reindex_active_versions",
267-
"wipe_all_versions",
268267
"import_tags_from_vcs",
269268
]
270269

@@ -362,66 +361,26 @@ def reindex_active_versions(self, request, queryset):
362361
"""Reindex all active versions of the selected projects to ES."""
363362
qs_iterator = queryset.iterator()
364363
for project in qs_iterator:
365-
version_qs = Version.internal.filter(project=project)
366-
active_versions = version_qs.filter(active=True)
364+
versions_id_to_reindex = project.versions.for_reindex().values_list(
365+
"pk", flat=True
366+
)
367367

368-
if not active_versions.exists():
368+
if not versions_id_to_reindex.exists():
369369
self.message_user(
370370
request,
371-
"No active versions of project {}".format(project),
371+
"No versions to be re-indexed for project {}".format(project),
372372
messages.ERROR,
373373
)
374374
else:
375-
html_objs_qs = []
376-
for version in active_versions.iterator():
377-
html_objs = HTMLFile.objects.filter(
378-
project=project, version=version
379-
)
380-
381-
if html_objs.exists():
382-
html_objs_qs.append(html_objs)
383-
384-
if html_objs_qs:
385-
_indexing_helper(html_objs_qs, wipe=False)
375+
for version_id in versions_id_to_reindex.iterator():
376+
reindex_version.delay(version_id)
386377

387378
self.message_user(
388379
request,
389380
"Task initiated successfully for {}".format(project),
390381
messages.SUCCESS,
391382
)
392383

393-
# TODO: rename method to mention "indexes" on its name
394-
@admin.action(description="Wipe all versions from ES")
395-
def wipe_all_versions(self, request, queryset):
396-
"""Wipe indexes of all versions of selected projects."""
397-
qs_iterator = queryset.iterator()
398-
for project in qs_iterator:
399-
version_qs = Version.internal.filter(project=project)
400-
if not version_qs.exists():
401-
self.message_user(
402-
request,
403-
"No active versions of project {}.".format(project),
404-
messages.ERROR,
405-
)
406-
else:
407-
html_objs_qs = []
408-
for version in version_qs.iterator():
409-
html_objs = HTMLFile.objects.filter(
410-
project=project, version=version
411-
)
412-
413-
if html_objs.exists():
414-
html_objs_qs.append(html_objs)
415-
416-
if html_objs_qs:
417-
_indexing_helper(html_objs_qs, wipe=True)
418-
419-
self.message_user(
420-
request,
421-
"Task initiated successfully for {}.".format(project),
422-
messages.SUCCESS,
423-
)
424-
425384
@admin.action(description="Import tags from the version control API")
426385
def import_tags_from_vcs(self, request, queryset):
427386
for project in queryset.iterator():

readthedocs/projects/tasks/builds.py

+2-8
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767
from ..models import APIProject, WebHookEvent
6868
from ..signals import before_vcs
6969
from .mixins import SyncRepositoryMixin
70-
from .search import fileify
70+
from .search import index_build
7171
from .utils import (
7272
BuildRequest,
7373
clean_build,
@@ -653,13 +653,7 @@ def on_success(self, retval, task_id, args, kwargs):
653653
)
654654

655655
# Index search data
656-
fileify.delay(
657-
version_pk=self.data.version.pk,
658-
commit=self.data.build['commit'],
659-
build=self.data.build['id'],
660-
search_ranking=self.data.config.search.ranking,
661-
search_ignore=self.data.config.search.ignore,
662-
)
656+
index_build.delay(build_id=self.data.build["id"])
663657

664658
if not self.data.project.has_valid_clone:
665659
self.set_valid_clone()

0 commit comments

Comments
 (0)