Search: stop relying on the DB when indexing

stsewd · stsewd · commit 871bfe2cd3a9 · 2023-08-31T18:45:12.000-05:00
- Closes #10623 - Closes #10690
diff --git a/readthedocs/builds/models.py b/readthedocs/builds/models.py
@@ -319,7 +319,7 @@ def config(self):
         :rtype: dict
         """
         last_build = (
-            self.builds(manager=INTERNAL).filter(
+            self.builds.filter(
                 state=BUILD_STATE_FINISHED,
                 success=True,
             ).order_by('-date')
diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
@@ -2,11 +2,12 @@
 
 import structlog
 
-from readthedocs.builds.constants import EXTERNAL
+from readthedocs.builds.constants import BUILD_STATE_FINISHED, EXTERNAL
 from readthedocs.builds.models import Version
 from readthedocs.projects.models import HTMLFile, ImportedFile, Project
 from readthedocs.projects.signals import files_changed
-from readthedocs.search.utils import index_new_files, remove_indexed_files
+from readthedocs.search.utils import remove_indexed_files
+from django_elasticsearch_dsl.registries import registry
 from readthedocs.storage import build_media_storage
 from readthedocs.worker import app
 
@@ -43,7 +44,7 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore):
         _create_imported_files(
             version=version,
             commit=commit,
-            build=build,
+            build_id=build,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
@@ -65,9 +66,6 @@ def _sync_imported_files(version, build):
     """
     project = version.project
 
-    # Index new HTMLFiles to ElasticSearch
-    index_new_files(model=HTMLFile, version=version, build=build)
-
     # Remove old HTMLFiles from ElasticSearch
     remove_indexed_files(
         model=HTMLFile,
@@ -95,7 +93,35 @@ def remove_search_indexes(project_slug, version_slug=None):
     )
 
 
-def _create_imported_files(*, version, commit, build, search_ranking, search_ignore):
+def reindex_version(version):
+    """
+    Reindex all files of this version.
+    """
+    latest_successful_build = version.builds.filter(
+        state=BUILD_STATE_FINISHED, success=True
+    ).order_by("-date").first()
+    # If the version doesn't have a successful
+    # build, we don't have files to index.
+    if not latest_successful_build:
+        return
+
+    search_ranking = []
+    search_ignore = []
+    build_config = latest_successful_build.config
+    if build_config:
+        search_ranking = build_config.search.ranking
+        search_ignore = build_config.search.ignore
+
+    _create_imported_files(
+        version=version,
+        commit=latest_successful_build.commit,
+        build_id=latest_successful_build.id,
+        search_ranking=search_ranking,
+        search_ignore=search_ignore,
+    )
+
+
+def _create_imported_files(*, version, commit, build_id, search_ranking, search_ignore):
     """
     Create imported files for version.
 
@@ -107,6 +133,9 @@ def _create_imported_files(*, version, commit, build, search_ranking, search_ign
     storage_path = version.project.get_storage_path(
         type_='html', version_slug=version.slug, include_file=False
     )
+    html_files_to_index = []
+    html_files_to_save = []
+    reverse_rankings = reversed(list(search_ranking.items()))
     for root, __, filenames in build_media_storage.walk(storage_path):
         for filename in filenames:
             # We don't care about non-HTML files
@@ -118,34 +147,60 @@ def _create_imported_files(*, version, commit, build, search_ranking, search_ign
             # Generate a relative path for storage similar to os.path.relpath
             relpath = full_path.replace(storage_path, '', 1).lstrip('/')
 
-            page_rank = 0
-            # Last pattern to match takes precedence
-            # XXX: see if we can implement another type of precedence,
-            # like the longest pattern.
-            reverse_rankings = reversed(list(search_ranking.items()))
-            for pattern, rank in reverse_rankings:
-                if fnmatch(relpath, pattern):
-                    page_rank = rank
-                    break
-
             ignore = False
-            for pattern in search_ignore:
-                if fnmatch(relpath, pattern):
-                    ignore = True
-                    break
+            if version.is_external:
+                # Never index files from external versions.
+                ignore = True
+            else:
+                for pattern in search_ignore:
+                    if fnmatch(relpath, pattern):
+                        ignore = True
+                        break
 
-            # Create imported files from new build
-            HTMLFile.objects.create(
+            page_rank = 0
+            # If the file is ignored, we don't need to check for its ranking.
+            if not ignore:
+                # Last pattern to match takes precedence
+                # XXX: see if we can implement another type of precedence,
+                # like the longest pattern.
+                for pattern, rank in reverse_rankings:
+                    if fnmatch(relpath, pattern):
+                        page_rank = rank
+                        break
+
+            html_file = HTMLFile(
                 project=version.project,
                 version=version,
                 path=relpath,
                 name=filename,
                 rank=page_rank,
                 commit=commit,
-                build=build,
+                build=build_id,
                 ignore=ignore,
             )
 
+            # Don't index files that are ignored.
+            if not ignore:
+                html_files_to_index.append(html_file)
+
+            # Create the imported file only if it's a top-level 404 file,
+            # or if it's an index file. We don't need to keep track of all files.
+            is_top_level_404_file = filename == "404.html" and root == storage_path
+            is_index_file = filename in ["index.html", "README.html"]
+            if is_top_level_404_file or is_index_file:
+                html_files_to_save.append(html_file)
+
+        # We first index the files in ES, and then save the objects in the DB.
+        # This is because saving the objects in the DB will give them an id,
+        # and we neeed this id to be `None` when indexing the objects in ES.
+        # ES will generate a unique id for each document.
+        if html_files_to_index:
+            document = list(registry.get_documents(models=[HTMLFile]))[0]
+            document().update(html_files_to_index)
+
+        if html_files_to_save:
+            HTMLFile.objects.bulk_create(html_files_to_save)
+
     # This signal is used for purging the CDN.
     files_changed.send(
         sender=Project,
diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py
@@ -36,7 +36,7 @@ def _manage_imported_files(
         _create_imported_files(
             version=version,
             commit=commit,
-            build=build,
+            build_id=build,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py
@@ -8,6 +8,8 @@
 from django.conf import settings
 from django.core.management import BaseCommand
 from django_elasticsearch_dsl.registries import registry
+from readthedocs.builds.constants import BUILD_STATE_FINISHED
+from projects.tasks.search import reindex_version
 
 from readthedocs.builds.models import Version
 from readthedocs.projects.models import HTMLFile, Project
@@ -50,6 +52,17 @@ def _run_reindex_tasks(self, models, queue):
 
         timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
 
+        # TODO: move this to where it makes sense :D
+        qs = (
+            Version.objects
+            .filter(built=True, builds__state=BUILD_STATE_FINISHED, builds_success=True)
+            .exclude(project__delisted=True)
+            .exclude(project__is_spam=True)
+            .select_related("project")
+        )
+        for version in qs.iterator():
+            reindex_version(version)
+
         for doc in registry.get_documents(models):
             queryset = doc().get_queryset()
 

Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@ def _manage_imported_files(`
`36`	`36`	`_create_imported_files(`
`37`	`37`	`version=version,`
`38`	`38`	`commit=commit,`
`39`		`- build=build,`
	`39`	`+ build_id=build,`
`40`	`40`	`search_ranking=search_ranking,`
`41`	`41`	`search_ignore=search_ignore,`
`42`	`42`	`)`