From 871bfe2cd3a9598206351f24dd3592d5dc6637b9 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 31 Aug 2023 18:45:12 -0500 Subject: [PATCH 01/12] Search: stop relying on the DB when indexing - Closes https://github.com/readthedocs/readthedocs.org/issues/10623 - Closes https://github.com/readthedocs/readthedocs.org/issues/10690 --- readthedocs/builds/models.py | 2 +- readthedocs/projects/tasks/search.py | 103 ++++++++++++++---- .../rtd_tests/tests/test_imported_file.py | 2 +- .../commands/reindex_elasticsearch.py | 13 +++ 4 files changed, 94 insertions(+), 26 deletions(-) diff --git a/readthedocs/builds/models.py b/readthedocs/builds/models.py index 03d91ca9142..4fcd24e16e4 100644 --- a/readthedocs/builds/models.py +++ b/readthedocs/builds/models.py @@ -319,7 +319,7 @@ def config(self): :rtype: dict """ last_build = ( - self.builds(manager=INTERNAL).filter( + self.builds.filter( state=BUILD_STATE_FINISHED, success=True, ).order_by('-date') diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index ae382e8dad2..cc70ca677b9 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -2,11 +2,12 @@ import structlog -from readthedocs.builds.constants import EXTERNAL +from readthedocs.builds.constants import BUILD_STATE_FINISHED, EXTERNAL from readthedocs.builds.models import Version from readthedocs.projects.models import HTMLFile, ImportedFile, Project from readthedocs.projects.signals import files_changed -from readthedocs.search.utils import index_new_files, remove_indexed_files +from readthedocs.search.utils import remove_indexed_files +from django_elasticsearch_dsl.registries import registry from readthedocs.storage import build_media_storage from readthedocs.worker import app @@ -43,7 +44,7 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore): _create_imported_files( version=version, commit=commit, - build=build, + build_id=build, search_ranking=search_ranking, search_ignore=search_ignore, ) @@ -65,9 +66,6 @@ def _sync_imported_files(version, build): """ project = version.project - # Index new HTMLFiles to ElasticSearch - index_new_files(model=HTMLFile, version=version, build=build) - # Remove old HTMLFiles from ElasticSearch remove_indexed_files( model=HTMLFile, @@ -95,7 +93,35 @@ def remove_search_indexes(project_slug, version_slug=None): ) -def _create_imported_files(*, version, commit, build, search_ranking, search_ignore): +def reindex_version(version): + """ + Reindex all files of this version. + """ + latest_successful_build = version.builds.filter( + state=BUILD_STATE_FINISHED, success=True + ).order_by("-date").first() + # If the version doesn't have a successful + # build, we don't have files to index. + if not latest_successful_build: + return + + search_ranking = [] + search_ignore = [] + build_config = latest_successful_build.config + if build_config: + search_ranking = build_config.search.ranking + search_ignore = build_config.search.ignore + + _create_imported_files( + version=version, + commit=latest_successful_build.commit, + build_id=latest_successful_build.id, + search_ranking=search_ranking, + search_ignore=search_ignore, + ) + + +def _create_imported_files(*, version, commit, build_id, search_ranking, search_ignore): """ Create imported files for version. 
@@ -107,6 +133,9 @@ def _create_imported_files(*, version, commit, build, search_ranking, search_ign
     storage_path = version.project.get_storage_path(
         type_='html', version_slug=version.slug, include_file=False
     )
+    html_files_to_index = []
+    html_files_to_save = []
+    reverse_rankings = reversed(list(search_ranking.items()))
     for root, __, filenames in build_media_storage.walk(storage_path):
         for filename in filenames:
             # We don't care about non-HTML files
@@ -118,34 +147,60 @@ def _create_imported_files(*, version, commit, build, search_ranking, search_ign
             # Generate a relative path for storage similar to os.path.relpath
             relpath = full_path.replace(storage_path, '', 1).lstrip('/')

-            page_rank = 0
-            # Last pattern to match takes precedence
-            # XXX: see if we can implement another type of precedence,
-            # like the longest pattern.
-            reverse_rankings = reversed(list(search_ranking.items()))
-            for pattern, rank in reverse_rankings:
-                if fnmatch(relpath, pattern):
-                    page_rank = rank
-                    break
-
             ignore = False
-            for pattern in search_ignore:
-                if fnmatch(relpath, pattern):
-                    ignore = True
-                    break
+            if version.is_external:
+                # Never index files from external versions.
+                ignore = True
+            else:
+                for pattern in search_ignore:
+                    if fnmatch(relpath, pattern):
+                        ignore = True
+                        break

-            # Create imported files from new build
-            HTMLFile.objects.create(
+            page_rank = 0
+            # If the file is ignored, we don't need to check for its ranking.
+            if not ignore:
+                # Last pattern to match takes precedence
+                # XXX: see if we can implement another type of precedence,
+                # like the longest pattern.
+                for pattern, rank in reverse_rankings:
+                    if fnmatch(relpath, pattern):
+                        page_rank = rank
+                        break
+
+            html_file = HTMLFile(
                 project=version.project,
                 version=version,
                 path=relpath,
                 name=filename,
                 rank=page_rank,
                 commit=commit,
-                build=build,
+                build=build_id,
                 ignore=ignore,
             )

+            # Don't index files that are ignored.
+            if not ignore:
+                html_files_to_index.append(html_file)
+
+            # Create the imported file only if it's a top-level 404 file,
+            # or if it's an index file. We don't need to keep track of all files.
+            is_top_level_404_file = filename == "404.html" and root == storage_path
+            is_index_file = filename in ["index.html", "README.html"]
+            if is_top_level_404_file or is_index_file:
+                html_files_to_save.append(html_file)
+
+    # We first index the files in ES, and then save the objects in the DB.
+    # This is because saving the objects in the DB will give them an id,
+    # and we need this id to be `None` when indexing the objects in ES.
+    # ES will generate a unique id for each document.
+    if html_files_to_index:
+        document = list(registry.get_documents(models=[HTMLFile]))[0]
+        document().update(html_files_to_index)
+
+    if html_files_to_save:
+        HTMLFile.objects.bulk_create(html_files_to_save)
+
     # This signal is used for purging the CDN.
files_changed.send(
        sender=Project,
diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py
index 4e1aea8ce9f..423d56d73e9 100644
--- a/readthedocs/rtd_tests/tests/test_imported_file.py
+++ b/readthedocs/rtd_tests/tests/test_imported_file.py
@@ -36,7 +36,7 @@ def _manage_imported_files(
         _create_imported_files(
             version=version,
             commit=commit,
-            build=build,
+            build_id=build,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py
index 50e412682d7..7a9a00eb0d2 100644
--- a/readthedocs/search/management/commands/reindex_elasticsearch.py
+++ b/readthedocs/search/management/commands/reindex_elasticsearch.py
@@ -8,6 +8,8 @@
 from django.conf import settings
 from django.core.management import BaseCommand
 from django_elasticsearch_dsl.registries import registry
+from readthedocs.builds.constants import BUILD_STATE_FINISHED
+from readthedocs.projects.tasks.search import reindex_version

 from readthedocs.builds.models import Version
 from readthedocs.projects.models import HTMLFile, Project
@@ -50,6 +52,17 @@ def _run_reindex_tasks(self, models, queue):

         timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

+        # TODO: move this to where it makes sense :D
+        qs = (
+            Version.objects
+            .filter(built=True, builds__state=BUILD_STATE_FINISHED, builds__success=True)
+            .exclude(project__delisted=True)
+            .exclude(project__is_spam=True)
+            .select_related("project")
+        )
+        for version in qs.iterator():
+            reindex_version(version)
+
         for doc in registry.get_documents(models):
             queryset = doc().get_queryset()

From c6df61167e8b9cc3f374b6a7560f87c77f208a74 Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Tue, 5 Sep 2023 18:53:11 -0500
Subject: [PATCH 02/12] Second draft

---
 docs/dev/server-side-search.rst               |   2 +-
 readthedocs/projects/tasks/builds.py          |  10 +-
 readthedocs/projects/tasks/search.py          | 220 +++++++++++-------
 .../commands/reindex_elasticsearch.py         | 201 ++++++++--------
 readthedocs/search/utils.py                   |  24 --
 5 files changed, 253 insertions(+), 204 deletions(-)

diff --git a/docs/dev/server-side-search.rst b/docs/dev/server-side-search.rst
index 56e1636e58d..18b7c82d612 100644
--- a/docs/dev/server-side-search.rst
+++ b/docs/dev/server-side-search.rst
@@ -62,7 +62,7 @@
 You can fix this by deleting the page index and :ref:`re-indexing
.` parameter. Otherwise, it will re-index all the models
         """
-        models = None
         if options["models"]:
             models = [apps.get_model(model_name) for model_name in options["models"]]
+        else:
+            models = [Project, HTMLFile]

         queue = options["queue"]
         change_index = options["change_index"]
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index daf6d47d790..8ef62345121 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -10,30 +10,6 @@
 log = structlog.get_logger(__name__)


-def index_new_files(model, version, build):
-    """Index new files from the version into the search index."""
-
-    log.bind(
-        project_slug=version.project.slug,
-        version_slug=version.slug,
-    )
-
-    if not DEDConfig.autosync_enabled():
-        log.info("Autosync disabled. 
Skipping indexing into the search index.")
-        return
-
-    try:
-        document = list(registry.get_documents(models=[model]))[0]
-        doc_obj = document()
-        queryset = doc_obj.get_queryset().filter(
-            project=version.project, version=version, build=build
-        )
-        log.info("Indexing new objecst into search index.")
-        doc_obj.update(queryset.iterator())
-    except Exception:
-        log.exception("Unable to index a subset of files. Continuing.")
-
-
 def remove_indexed_files(model, project_slug, version_slug=None, build_id=None):
     """
     Remove files from `version_slug` of `project_slug` from the search index.

From bc8e4ff13b8f7ef2f2e380a7b5c10f1c0ce85759 Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Tue, 12 Sep 2023 15:45:07 -0500
Subject: [PATCH 03/12] Fix tests

---
 readthedocs/projects/tasks/search.py          |   5 +-
 .../projects/tests/test_build_tasks.py        |  12 +-
 readthedocs/rtd_tests/files/404.html          |  11 +
 readthedocs/rtd_tests/files/index.html        |  11 +
 .../rtd_tests/tests/test_build_storage.py     |  30 ++-
 .../rtd_tests/tests/test_imported_file.py     | 254 ++++++++++++++----
 readthedocs/search/tests/test_api.py          |  59 ----
 7 files changed, 245 insertions(+), 137 deletions(-)
 create mode 100644 readthedocs/rtd_tests/files/404.html
 create mode 100644 readthedocs/rtd_tests/files/index.html

diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index 742c8f99f21..92fb2f3a819 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -159,7 +159,7 @@ def _create_imported_files_and_search_index(
     *, version, build_id, search_ranking, search_ignore, search_index_name=None
 ):
     """
-    Create imported files and search index for version.
+    Create imported files and search index for the build of the version.

     If the version is external, we don't create a search index for it, only imported files.
     After the process is completed, we delete the files and search index that
@@ -173,7 +173,7 @@ def _create_imported_files_and_search_index(
     )
     html_files_to_index = []
     html_files_to_save = []
-    reverse_rankings = reversed(list(search_ranking.items()))
+    reverse_rankings = list(reversed(search_ranking.items()))
     for root, __, filenames in build_media_storage.walk(storage_path):
         for filename in filenames:
             # We don't care about non-HTML files
@@ -234,7 +234,6 @@ def _create_imported_files_and_search_index(
     # and we need this id to be `None` when indexing the objects in ES.
     # ES will generate a unique id for each document.
if html_files_to_index: - # document = list(registry.get_documents(models=[HTMLFile]))[0] document = PageDocument # If a search index name is provided, we need to temporarily change diff --git a/readthedocs/projects/tests/test_build_tasks.py b/readthedocs/projects/tests/test_build_tasks.py index 37480500336..a1519bb707b 100644 --- a/readthedocs/projects/tests/test_build_tasks.py +++ b/readthedocs/projects/tests/test_build_tasks.py @@ -407,7 +407,7 @@ def test_get_env_vars(self, load_yaml_config, build_environment, config, externa expected_build_env_vars["PRIVATE_TOKEN"] = "a1b2c3" assert build_env_vars == expected_build_env_vars - @mock.patch("readthedocs.projects.tasks.builds.fileify") + @mock.patch("readthedocs.projects.tasks.builds.index_build") @mock.patch("readthedocs.projects.tasks.builds.build_complete") @mock.patch("readthedocs.projects.tasks.builds.send_external_build_status") @mock.patch("readthedocs.projects.tasks.builds.UpdateDocsTask.send_notifications") @@ -420,7 +420,7 @@ def test_successful_build( send_notifications, send_external_build_status, build_complete, - fileify, + index_build, ): load_yaml_config.return_value = self._config_file( { @@ -481,13 +481,7 @@ def test_successful_build( build=mock.ANY, ) - fileify.delay.assert_called_once_with( - version_pk=self.version.pk, - commit=self.build.commit, - build=self.build.pk, - search_ranking=mock.ANY, - search_ignore=mock.ANY, - ) + index_build.delay.assert_called_once_with(build_id=self.build.pk) # TODO: assert the verb and the path for each API call as well diff --git a/readthedocs/rtd_tests/files/404.html b/readthedocs/rtd_tests/files/404.html new file mode 100644 index 00000000000..a939588f6c8 --- /dev/null +++ b/readthedocs/rtd_tests/files/404.html @@ -0,0 +1,11 @@ + + + + + + 404 - Page not found + + + Page not found + + diff --git a/readthedocs/rtd_tests/files/index.html b/readthedocs/rtd_tests/files/index.html new file mode 100644 index 00000000000..03363c621ca --- /dev/null +++ b/readthedocs/rtd_tests/files/index.html @@ -0,0 +1,11 @@ + + + + + + Root index file + + + Root index file + + diff --git a/readthedocs/rtd_tests/tests/test_build_storage.py b/readthedocs/rtd_tests/tests/test_build_storage.py index 78e138e7b7e..0d44ed72595 100644 --- a/readthedocs/rtd_tests/tests/test_build_storage.py +++ b/readthedocs/rtd_tests/tests/test_build_storage.py @@ -58,9 +58,11 @@ def test_sync_directory(self): tree = [ ('api', ['index.html']), - 'api.fjson', - 'conf.py', - 'test.html', + "404.html", + "api.fjson", + "conf.py", + "index.html", + "test.html", ] with override_settings(DOCROOT=tmp_files_dir): self.storage.rclone_sync_directory(tmp_files_dir, storage_dir) @@ -68,7 +70,9 @@ def test_sync_directory(self): tree = [ ('api', ['index.html']), + "404.html", 'conf.py', + "index.html", 'test.html', ] os.remove(os.path.join(tmp_files_dir, 'api.fjson')) @@ -77,7 +81,9 @@ def test_sync_directory(self): self.assertFileTree(storage_dir, tree) tree = [ + "404.html", 'conf.py', + "index.html", 'test.html', ] shutil.rmtree(os.path.join(tmp_files_dir, 'api')) @@ -126,7 +132,9 @@ def test_delete_directory(self): self.storage.copy_directory(files_dir, "files") dirs, files = self.storage.listdir("files") self.assertEqual(dirs, ["api"]) - self.assertCountEqual(files, ["api.fjson", "conf.py", "test.html"]) + self.assertCountEqual( + files, ["404.html", "api.fjson", "conf.py", "index.html", "test.html"] + ) self.storage.delete_directory('files/') _, files = self.storage.listdir('files') @@ -147,9 +155,11 @@ def test_walk(self): 
self.assertEqual(len(output), 2) top, dirs, files = output[0] - self.assertEqual(top, 'files') - self.assertCountEqual(dirs, ['api']) - self.assertCountEqual(files, ['api.fjson', 'conf.py', 'test.html']) + self.assertEqual(top, "files") + self.assertCountEqual(dirs, ["api"]) + self.assertCountEqual( + files, ["404.html", "api.fjson", "conf.py", "index.html", "test.html"] + ) top, dirs, files = output[1] self.assertEqual(top, 'files/api') @@ -163,8 +173,10 @@ def test_rclone_sync(self): tree = [ ("api", ["index.html"]), + "404.html", "api.fjson", "conf.py", + "index.html", "test.html", ] with override_settings(DOCROOT=tmp_files_dir): @@ -173,7 +185,9 @@ def test_rclone_sync(self): tree = [ ("api", ["index.html"]), + "404.html", "conf.py", + "index.html", "test.html", ] (tmp_files_dir / "api.fjson").unlink() @@ -182,7 +196,9 @@ def test_rclone_sync(self): self.assertFileTree(storage_dir, tree) tree = [ + "404.html", "conf.py", + "index.html", "test.html", ] shutil.rmtree(tmp_files_dir / "api") diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index 423d56d73e9..9ea7515d9cd 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -6,14 +6,13 @@ from django.test.utils import override_settings from readthedocs.projects.models import HTMLFile, ImportedFile, Project -from readthedocs.projects.tasks.search import ( - _create_imported_files, - _sync_imported_files, -) +from readthedocs.projects.tasks.search import _create_imported_files_and_search_index +from readthedocs.search.documents import PageDocument base_dir = os.path.dirname(os.path.dirname(__file__)) +@override_settings(ELASTICSEARCH_DSL_AUTOSYNC=True) class ImportedFileTests(TestCase): fixtures = ["eric", "test_data"] @@ -27,20 +26,25 @@ def setUp(self): with override_settings(DOCROOT=self.test_dir): self._copy_storage_dir() + def tearDown(self): + try: + PageDocument().search().filter().delete() + except Exception: + # If there are no documents, the query fails. + pass + def _manage_imported_files( - self, version, commit, build, search_ranking=None, search_ignore=None + self, version, build_id, search_ranking=None, search_ignore=None ): """Helper function for the tests to create and sync ImportedFiles.""" search_ranking = search_ranking or {} search_ignore = search_ignore or [] - _create_imported_files( + _create_imported_files_and_search_index( version=version, - commit=commit, - build_id=build, + build_id=build_id, search_ranking=search_ranking, search_ignore=search_ignore, ) - _sync_imported_files(version, build) def _copy_storage_dir(self): """Copy the test directory (rtd_tests/files) to storage""" @@ -54,96 +58,208 @@ def _copy_storage_dir(self): ) def test_properly_created(self): - # Only 2 files in the directory is HTML (test.html, api/index.html) + """ + Only 4 files in the directory are HTML + + - index.html + - test.html + - api/index.html + - 404.html + + But we create imported files for index.html and 404.html pages only. 
+ """ self.assertEqual(ImportedFile.objects.count(), 0) - self._manage_imported_files(version=self.version, commit="commit01", build=1) - self.assertEqual(ImportedFile.objects.count(), 2) - self._manage_imported_files(version=self.version, commit="commit01", build=2) - self.assertEqual(ImportedFile.objects.count(), 2) - self.project.cdn_enabled = True - self.project.save() + self._manage_imported_files(version=self.version, build_id=1) + self.assertEqual(ImportedFile.objects.count(), 3) + self.assertEqual( + set(HTMLFile.objects.all().values_list("path", flat=True)), + {"index.html", "api/index.html", "404.html"}, + ) - def test_update_commit(self): + results = PageDocument().search().filter("term", build=1).execute() + self.assertEqual( + {result.path for result in results}, {"index.html", "404.html", "test.html"} + ) + + self._manage_imported_files(version=self.version, build_id=2) + self.assertEqual(ImportedFile.objects.count(), 3) + self.assertEqual( + set(HTMLFile.objects.all().values_list("path", flat=True)), + {"index.html", "api/index.html", "404.html"}, + ) + + results = PageDocument().search().filter("term", build=2).execute() + self.assertEqual( + {result.path for result in results}, {"index.html", "404.html", "test.html"} + ) + + def test_update_build(self): self.assertEqual(ImportedFile.objects.count(), 0) - self._manage_imported_files(self.version, "commit01", 1) - self.assertEqual(ImportedFile.objects.first().commit, "commit01") - self._manage_imported_files(self.version, "commit02", 2) - self.assertEqual(ImportedFile.objects.first().commit, "commit02") + self._manage_imported_files(self.version, build_id=1) + for obj in ImportedFile.objects.all(): + self.assertEqual(obj.build, 1) + + results = PageDocument().search().filter().execute() + for result in results: + self.assertEqual(result.build, 1) + + self._manage_imported_files(self.version, build_id=2) + for obj in ImportedFile.objects.all(): + self.assertEqual(obj.build, 2) + + # NOTE: we can't test the ES index here because the index is updated asynchronously, + # so we will get results for both builds. 
+ # results = PageDocument().search().filter().execute() + # for result in results: + # self.assertEqual(result.build, 2) def test_page_default_rank(self): search_ranking = {} self.assertEqual(HTMLFile.objects.count(), 0) - self._manage_imported_files(self.version, "commit01", 1, search_ranking) + self._manage_imported_files( + self.version, build_id=1, search_ranking=search_ranking + ) - self.assertEqual(HTMLFile.objects.count(), 2) - self.assertEqual(HTMLFile.objects.filter(rank=0).count(), 2) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 4) + for result in results: + self.assertEqual(result.project, self.project.slug) + self.assertEqual(result.version, self.version.slug) + self.assertEqual(result.build, 1) + self.assertEqual(result.rank, 0) def test_page_custom_rank_glob(self): search_ranking = { "*index.html": 5, } - self._manage_imported_files(self.version, "commit01", 1, search_ranking) + self._manage_imported_files( + self.version, build_id=1, search_ranking=search_ranking + ) - self.assertEqual(HTMLFile.objects.count(), 2) - file_api = HTMLFile.objects.get(path="api/index.html") - file_test = HTMLFile.objects.get(path="test.html") - self.assertEqual(file_api.rank, 5) - self.assertEqual(file_test.rank, 0) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 4) + for result in results: + self.assertEqual(result.project, self.project.slug) + self.assertEqual(result.version, self.version.slug) + self.assertEqual(result.build, 1) + if result.path.endswith("index.html"): + self.assertEqual(result.rank, 5, result.path) + else: + self.assertEqual(result.rank, 0, result.path) def test_page_custom_rank_several(self): search_ranking = { "test.html": 5, "api/index.html": 2, } - self._manage_imported_files(self.version, "commit01", 1, search_ranking) + self._manage_imported_files( + self.version, build_id=1, search_ranking=search_ranking + ) - self.assertEqual(HTMLFile.objects.count(), 2) - file_api = HTMLFile.objects.get(path="api/index.html") - file_test = HTMLFile.objects.get(path="test.html") - self.assertEqual(file_api.rank, 2) - self.assertEqual(file_test.rank, 5) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 4) + for result in results: + self.assertEqual(result.project, self.project.slug) + self.assertEqual(result.version, self.version.slug) + self.assertEqual(result.build, 1) + if result.path == "test.html": + self.assertEqual(result.rank, 5) + elif result.path == "api/index.html": + self.assertEqual(result.rank, 2) + else: + self.assertEqual(result.rank, 0) def test_page_custom_rank_precedence(self): search_ranking = { "*.html": 5, "api/index.html": 2, } - self._manage_imported_files(self.version, "commit01", 1, search_ranking) + self._manage_imported_files( + self.version, build_id=1, search_ranking=search_ranking + ) - self.assertEqual(HTMLFile.objects.count(), 2) - file_api = HTMLFile.objects.get(path="api/index.html") - file_test = HTMLFile.objects.get(path="test.html") - self.assertEqual(file_api.rank, 2) - self.assertEqual(file_test.rank, 5) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + 
.execute() + ) + self.assertEqual(len(results), 4) + for result in results: + self.assertEqual(result.project, self.project.slug) + self.assertEqual(result.version, self.version.slug) + self.assertEqual(result.build, 1) + if result.path == "api/index.html": + self.assertEqual(result.rank, 2, result.path) + else: + self.assertEqual(result.rank, 5, result.path) def test_page_custom_rank_precedence_inverted(self): search_ranking = { "api/index.html": 2, "*.html": 5, } - self._manage_imported_files(self.version, "commit01", 1, search_ranking) + self._manage_imported_files( + self.version, build_id=1, search_ranking=search_ranking + ) - self.assertEqual(HTMLFile.objects.count(), 2) - file_api = HTMLFile.objects.get(path="api/index.html") - file_test = HTMLFile.objects.get(path="test.html") - self.assertEqual(file_api.rank, 5) - self.assertEqual(file_test.rank, 5) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 4) + for result in results: + self.assertEqual(result.project, self.project.slug) + self.assertEqual(result.version, self.version.slug) + self.assertEqual(result.build, 1) + self.assertEqual(result.rank, 5) def test_search_page_ignore(self): search_ignore = ["api/index.html"] self._manage_imported_files( self.version, - "commit01", - 1, + build_id=1, search_ignore=search_ignore, ) - self.assertEqual(HTMLFile.objects.count(), 2) - file_api = HTMLFile.objects.get(path="api/index.html") - file_test = HTMLFile.objects.get(path="test.html") - self.assertTrue(file_api.ignore) - self.assertFalse(file_test.ignore) + self.assertEqual( + set(HTMLFile.objects.all().values_list("path", flat=True)), + {"index.html", "api/index.html", "404.html"}, + ) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 3) + self.assertEqual( + {result.path for result in results}, {"index.html", "404.html", "test.html"} + ) def test_update_content(self): test_dir = os.path.join(base_dir, "files") @@ -155,8 +271,18 @@ def test_update_content(self): with override_settings(DOCROOT=self.test_dir): self._copy_storage_dir() - self._manage_imported_files(self.version, "commit01", 1) - self.assertEqual(ImportedFile.objects.count(), 2) + self._manage_imported_files(self.version, build_id=1) + self.assertEqual(ImportedFile.objects.count(), 3) + document = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .filter("term", path="test.html") + .filter("term", build=1) + .execute()[0] + ) + self.assertEqual(document.sections[0].content, "Woo") with open(os.path.join(test_dir, "test.html"), "w+") as f: f.write("Something Else") @@ -164,5 +290,15 @@ def test_update_content(self): with override_settings(DOCROOT=self.test_dir): self._copy_storage_dir() - self._manage_imported_files(self.version, "commit02", 2) - self.assertEqual(ImportedFile.objects.count(), 2) + self._manage_imported_files(self.version, build_id=2) + self.assertEqual(ImportedFile.objects.count(), 3) + document = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .filter("term", path="test.html") + .filter("term", build=2) + .execute()[0] + ) + self.assertEqual(document.sections[0].content, "Something Else") diff --git a/readthedocs/search/tests/test_api.py 
b/readthedocs/search/tests/test_api.py index 123b0bbd652..3f4a868666f 100644 --- a/readthedocs/search/tests/test_api.py +++ b/readthedocs/search/tests/test_api.py @@ -21,7 +21,6 @@ SECTION_FIELDS, get_search_query_from_project_file, ) -from readthedocs.search.utils import index_new_files, remove_indexed_files @pytest.mark.django_db @@ -726,64 +725,6 @@ def test_search_custom_ranking(self, api_client): assert results[0]["path"] == "/en/latest/index.html" assert results[1]["path"] == "/en/latest/guides/index.html" - def test_search_ignore(self, api_client): - project = Project.objects.get(slug="docs") - version = project.versions.all().first() - - page_index = HTMLFile.objects.get( - version=version, - path="index.html", - ) - page_guides = HTMLFile.objects.get( - version=version, - path="guides/index.html", - ) - - search_params = { - "project": project.slug, - "version": version.slug, - "q": '"content from"', - } - - # Query with files not ignored. - assert page_index.ignore is None - assert page_guides.ignore is None - - resp = self.get_search(api_client, search_params) - assert resp.status_code == 200 - - results = resp.data["results"] - assert len(results) == 2 - assert results[0]["path"] == "/en/latest/index.html" - assert results[1]["path"] == "/en/latest/guides/index.html" - - # Query with guides/index.html ignored. - page_guides.ignore = True - page_guides.save() - - remove_indexed_files(HTMLFile, project.slug, version.slug) - index_new_files(HTMLFile, version, page_index.build) - - resp = self.get_search(api_client, search_params) - assert resp.status_code == 200 - - results = resp.data["results"] - assert len(results) == 1 - assert results[0]["path"] == "/en/latest/index.html" - - # Query with index.html and guides/index.html ignored. - page_index.ignore = True - page_index.save() - - remove_indexed_files(HTMLFile, project.slug, version.slug) - index_new_files(HTMLFile, version, page_index.build) - - resp = self.get_search(api_client, search_params) - assert resp.status_code == 200 - - results = resp.data["results"] - assert len(results) == 0 - class TestDocumentSearch(BaseTestDocumentSearch): pass From 3e2fd317ae12a9a3b00fab570c708b199b3c64f8 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Tue, 12 Sep 2023 19:34:50 -0500 Subject: [PATCH 04/12] Remove more code --- readthedocs/builds/admin.py | 39 ++----------- readthedocs/builds/querysets.py | 22 +++++++ readthedocs/projects/admin.py | 57 +++---------------- readthedocs/projects/tasks/search.py | 11 ++-- .../rtd_tests/tests/test_imported_file.py | 5 +- .../commands/reindex_elasticsearch.py | 17 +----- readthedocs/search/utils.py | 34 ----------- 7 files changed, 45 insertions(+), 140 deletions(-) diff --git a/readthedocs/builds/admin.py b/readthedocs/builds/admin.py index 9d08d3ed671..fe19cedff8c 100644 --- a/readthedocs/builds/admin.py +++ b/readthedocs/builds/admin.py @@ -12,8 +12,7 @@ ) from readthedocs.core.utils import trigger_build from readthedocs.core.utils.admin import pretty_json_field -from readthedocs.projects.models import HTMLFile -from readthedocs.search.utils import _indexing_helper +from readthedocs.projects.tasks.search import reindex_version class BuildCommandResultInline(admin.TabularInline): @@ -89,7 +88,7 @@ class VersionAdmin(admin.ModelAdmin): list_filter = ("type", "privacy_level", "active", "built") search_fields = ("slug", "project__slug") raw_id_fields = ("project",) - actions = ["build_version", "reindex_version", "wipe_version_indexes"] + actions = ["build_version", "reindex_version"] def 
project_slug(self, obj):
        return obj.project.slug

@@ -117,41 +116,11 @@ def build_version(self, request, queryset):
     @admin.action(description="Reindex version to ES")
     def reindex_version(self, request, queryset):
         """Reindexes all selected versions to ES."""
-        html_objs_qs = []
-        for version in queryset.iterator():
-            html_objs = HTMLFile.objects.filter(
-                project=version.project, version=version
-            )
-
-            if html_objs.exists():
-                html_objs_qs.append(html_objs)
-
-        if html_objs_qs:
-            _indexing_helper(html_objs_qs, wipe=False)
+        for version_id in queryset.values_list("id", flat=True).iterator():
+            reindex_version.delay(version_id)

         self.message_user(request, "Task initiated successfully.", messages.SUCCESS)

-    @admin.action(description="Wipe version from ES")
-    def wipe_version_indexes(self, request, queryset):
-        """Wipe selected versions from ES."""
-        html_objs_qs = []
-        for version in queryset.iterator():
-            html_objs = HTMLFile.objects.filter(
-                project=version.project, version=version
-            )
-
-            if html_objs.exists():
-                html_objs_qs.append(html_objs)
-
-        if html_objs_qs:
-            _indexing_helper(html_objs_qs, wipe=True)
-
-        self.message_user(
-            request,
-            "Task initiated successfully",
-            messages.SUCCESS,
-        )
-

 @admin.register(RegexAutomationRule)
 class RegexAutomationRuleAdmin(PolymorphicChildModelAdmin, admin.ModelAdmin):
diff --git a/readthedocs/builds/querysets.py b/readthedocs/builds/querysets.py
index bf2ebcc4963..3ab46436171 100644
--- a/readthedocs/builds/querysets.py
+++ b/readthedocs/builds/querysets.py
@@ -118,6 +118,28 @@ def public(
     def api(self, user=None):
         return self.public(user, only_active=False)

+    def for_reindex(self):
+        """
+        Get all versions that can be reindexed.
+
+        A version can be reindexed if:
+
+        - It's active and has been built at least once successfully,
+          since that means it has files to be indexed.
+        - Its project is not delisted or marked as spam. 
+ """ + return ( + self.filter( + active=True, + built=True, + builds__state=BUILD_STATE_FINISHED, + builds__success=True, + ) + .exclude(project__delisted=True) + .exclude(project__is_spam=True) + .distinct() + ) + class VersionQuerySet(SettingsOverrideObject): _default_class = VersionQuerySetBase diff --git a/readthedocs/projects/admin.py b/readthedocs/projects/admin.py index 534c19ae202..ff5a25ebb59 100644 --- a/readthedocs/projects/admin.py +++ b/readthedocs/projects/admin.py @@ -11,8 +11,8 @@ from readthedocs.core.history import ExtraSimpleHistoryAdmin, set_change_reason from readthedocs.core.utils import trigger_build from readthedocs.notifications.views import SendNotificationView +from readthedocs.projects.tasks.search import reindex_version from readthedocs.redirects.models import Redirect -from readthedocs.search.utils import _indexing_helper from .forms import FeatureForm from .models import ( @@ -264,7 +264,6 @@ class ProjectAdmin(ExtraSimpleHistoryAdmin): "run_spam_rule_checks", "build_default_version", "reindex_active_versions", - "wipe_all_versions", "import_tags_from_vcs", ] @@ -362,27 +361,19 @@ def reindex_active_versions(self, request, queryset): """Reindex all active versions of the selected projects to ES.""" qs_iterator = queryset.iterator() for project in qs_iterator: - version_qs = Version.internal.filter(project=project) - active_versions = version_qs.filter(active=True) + versions_id_to_reindex = project.versions.for_reindex().values_list( + "pk", flat=True + ) - if not active_versions.exists(): + if not versions_id_to_reindex.exists(): self.message_user( request, - "No active versions of project {}".format(project), + "No versions to be re-indexed for project {}".format(project), messages.ERROR, ) else: - html_objs_qs = [] - for version in active_versions.iterator(): - html_objs = HTMLFile.objects.filter( - project=project, version=version - ) - - if html_objs.exists(): - html_objs_qs.append(html_objs) - - if html_objs_qs: - _indexing_helper(html_objs_qs, wipe=False) + for version_id in versions_id_to_reindex.iterator(): + reindex_version.delay(version_id) self.message_user( request, @@ -390,38 +381,6 @@ def reindex_active_versions(self, request, queryset): messages.SUCCESS, ) - # TODO: rename method to mention "indexes" on its name - @admin.action(description="Wipe all versions from ES") - def wipe_all_versions(self, request, queryset): - """Wipe indexes of all versions of selected projects.""" - qs_iterator = queryset.iterator() - for project in qs_iterator: - version_qs = Version.internal.filter(project=project) - if not version_qs.exists(): - self.message_user( - request, - "No active versions of project {}.".format(project), - messages.ERROR, - ) - else: - html_objs_qs = [] - for version in version_qs.iterator(): - html_objs = HTMLFile.objects.filter( - project=project, version=version - ) - - if html_objs.exists(): - html_objs_qs.append(html_objs) - - if html_objs_qs: - _indexing_helper(html_objs_qs, wipe=True) - - self.message_user( - request, - "Task initiated successfully for {}.".format(project), - messages.SUCCESS, - ) - @admin.action(description="Import tags from the version control API") def import_tags_from_vcs(self, request, queryset): for project in queryset.iterator(): diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index 92fb2f3a819..fb41272458e 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -101,7 +101,7 @@ def reindex_version(version_id, 
search_index_name=None): The latest successful build is used for the re-creation. """ version = Version.objects.filter(pk=version_id).select_related("project").first() - if not version: + if not version or not version.built: return latest_successful_build = ( @@ -129,9 +129,9 @@ def reindex_version(version_id, search_index_name=None): search_ignore = search_config.get("ignore", []) # We need to use a build id that is different from the current one, - # since, otherwise we will end up with duplicated files. - # After the process is complete, we delete the files and search index - # that don't belong to the current build id. + # otherwise we will end up with duplicated files. After the process + # is complete, we delete the files and search index that don't belong + # to the current build id. The build id isn't used for anything else. build_id = latest_successful_build.id + 1 try: _create_imported_files_and_search_index( @@ -213,7 +213,8 @@ def _create_imported_files_and_search_index( name=filename, rank=page_rank, # TODO: We are setting this field since it's required, - # but it will be removed in the future together with other fields. + # but it isn't used, and will be removed in the future + # together with other fields. commit="unknown", build=build_id, ignore=skip_search_index, diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index 9ea7515d9cd..d97d897b6bb 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -66,7 +66,7 @@ def test_properly_created(self): - api/index.html - 404.html - But we create imported files for index.html and 404.html pages only. + But we create imported files for index.html and 404.html files only. 
""" self.assertEqual(ImportedFile.objects.count(), 0) @@ -79,7 +79,8 @@ def test_properly_created(self): results = PageDocument().search().filter("term", build=1).execute() self.assertEqual( - {result.path for result in results}, {"index.html", "404.html", "test.html"} + {result.path for result in results}, + {"index.html", "404.html", "test.html", "api/index.html"}, ) self._manage_imported_files(version=self.version, build_id=2) diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py index e9334b4f4a8..31c2a626dc3 100644 --- a/readthedocs/search/management/commands/reindex_elasticsearch.py +++ b/readthedocs/search/management/commands/reindex_elasticsearch.py @@ -9,7 +9,6 @@ from django.core.management import BaseCommand from django_elasticsearch_dsl.registries import registry -from readthedocs.builds.constants import BUILD_STATE_FINISHED from readthedocs.builds.models import Version from readthedocs.projects.models import HTMLFile, Project from readthedocs.projects.tasks.search import reindex_version @@ -162,18 +161,6 @@ def _reindex_projects_from(self, days_ago, queue): items=queryset.count(), ) - def _get_versions_to_reindex_queryset(self): - return ( - Version.objects.filter( - built=True, - builds__state=BUILD_STATE_FINISHED, - builds__success=True, - ) - .exclude(project__delisted=True) - .exclude(project__is_spam=True) - .distinct() - ) - def _reindex_files(self, queue, timestamp): document = PageDocument app_label = HTMLFile._meta.app_label @@ -188,7 +175,7 @@ def _reindex_files(self, queue, timestamp): ) log.info("Temporal index created.", index_name=new_index_name) - queryset = self._get_versions_to_reindex_queryset().values_list("pk", flat=True) + queryset = Version.objects.for_reindex().values_list("pk", flat=True) for version_id in queryset.iterator(): reindex_version.apply_async( kwargs={ @@ -207,7 +194,7 @@ def _reindex_files_from(self, days_ago, queue): """Reindex HTML files from versions with recent builds.""" since = datetime.now() - timedelta(days=days_ago) queryset = ( - self._get_versions_to_reindex_queryset() + Version.objects.for_reindex() .filter(builds__date__gte=since) .values_list("pk", flat=True) ) diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py index 8ef62345121..7f25e1991c2 100644 --- a/readthedocs/search/utils.py +++ b/readthedocs/search/utils.py @@ -5,8 +5,6 @@ from django_elasticsearch_dsl.apps import DEDConfig from django_elasticsearch_dsl.registries import registry -from readthedocs.projects.models import HTMLFile - log = structlog.get_logger(__name__) @@ -71,38 +69,6 @@ def _get_document(model, document_class): return document -def _indexing_helper(html_objs_qs, wipe=False): - """ - Helper function for reindexing and wiping indexes of projects and versions. - - If ``wipe`` is set to False, html_objs are deleted from the ES index, - else, html_objs are indexed. - """ - from readthedocs.search.documents import PageDocument - from readthedocs.search.tasks import delete_objects_in_es, index_objects_to_es - - if html_objs_qs: - obj_ids = [] - for html_objs in html_objs_qs: - obj_ids.extend([obj.id for obj in html_objs]) - - # removing redundant ids if exists. 
-        obj_ids = list(set(obj_ids))
-
-        if obj_ids:
-            kwargs = {
-                "app_label": HTMLFile._meta.app_label,
-                "model_name": HTMLFile.__name__,
-                "document_class": str(PageDocument),
-                "objects_id": obj_ids,
-            }
-
-            if not wipe:
-                index_objects_to_es.delay(**kwargs)
-            else:
-                delete_objects_in_es.delay(**kwargs)
-
-
 def _last_30_days_iter():
     """Returns iterator for previous 30 days (including today)."""
     thirty_days_ago = timezone.now().date() - timezone.timedelta(days=30)

From c5ec0806ab379d7ff6686977bc4b3653a21aef82 Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Tue, 12 Sep 2023 20:07:50 -0500
Subject: [PATCH 05/12] Simplify more things

---
 readthedocs/projects/tasks/search.py   | 22 ++++----------------
 readthedocs/search/tests/test_utils.py |  7 +------
 readthedocs/search/utils.py            | 20 ++++++++++++++++--
 3 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index fb41272458e..4b45acc6569 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -7,7 +7,7 @@
 from readthedocs.projects.models import HTMLFile, Project
 from readthedocs.projects.signals import files_changed
 from readthedocs.search.documents import PageDocument
-from readthedocs.search.utils import remove_indexed_files
+from readthedocs.search.utils import index_objects, remove_indexed_files
 from readthedocs.storage import build_media_storage
 from readthedocs.worker import app

@@ -149,7 +149,6 @@ def reindex_version(version_id, search_index_name=None):
 def remove_search_indexes(project_slug, version_slug=None):
     """Wrapper around ``remove_indexed_files`` to make it a task."""
     remove_indexed_files(
-        model=HTMLFile,
         project_slug=project_slug,
         version_slug=version_slug,
     )
@@ -235,23 +234,14 @@ def _create_imported_files_and_search_index(
     # and we need this id to be `None` when indexing the objects in ES.
     # ES will generate a unique id for each document.
     if html_files_to_index:
-        document = PageDocument
-
-        # If a search index name is provided, we need to temporarily change
-        # the index name of the document.
-        old_index_name = document._index._name
-        if search_index_name:
-            document._index._name = search_index_name
-
-        document().update(html_files_to_index)
-
-        # Restore the old index name. 
- if search_index_name: - document._index._name = old_index_name + index_objects( + document=PageDocument, + objects=html_files_to_index, + index_name=search_index_name, + ) # Remove old HTMLFiles from ElasticSearch remove_indexed_files( - model=HTMLFile, project_slug=version.project.slug, version_slug=version.slug, build_id=build_id, diff --git a/readthedocs/search/tests/test_utils.py b/readthedocs/search/tests/test_utils.py index 21583603534..8d7429f8b5f 100644 --- a/readthedocs/search/tests/test_utils.py +++ b/readthedocs/search/tests/test_utils.py @@ -4,7 +4,6 @@ from django.urls import reverse from readthedocs.builds.constants import LATEST, STABLE -from readthedocs.projects.models import HTMLFile from readthedocs.search import utils from readthedocs.search.tests.utils import get_search_query_from_project_file @@ -33,10 +32,7 @@ def test_remove_only_one_project_index(self, api_client, all_projects): assert self.has_results(api_client, project, LATEST) assert self.has_results(api_client, project, STABLE) - utils.remove_indexed_files( - HTMLFile, - project_slug=project, - ) + utils.remove_indexed_files(project_slug=project) # Deletion of indices from ES happens async, # so we need to wait a little before checking for results. time.sleep(1) @@ -55,7 +51,6 @@ def test_remove_only_one_version_index(self, api_client, all_projects): assert self.has_results(api_client, project, STABLE) utils.remove_indexed_files( - HTMLFile, project_slug=project, version_slug=LATEST, ) diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py index 7f25e1991c2..f473b4d0425 100644 --- a/readthedocs/search/utils.py +++ b/readthedocs/search/utils.py @@ -5,10 +5,26 @@ from django_elasticsearch_dsl.apps import DEDConfig from django_elasticsearch_dsl.registries import registry +from readthedocs.search.documents import PageDocument + log = structlog.get_logger(__name__) -def remove_indexed_files(model, project_slug, version_slug=None, build_id=None): +def index_objects(document, objects, index_name=None): + # If a search index name is provided, we need to temporarily change + # the index name of the document. + old_index_name = document._index._name + if index_name: + document._index._name = index_name + + document().update(objects) + + # Restore the old index name. + if index_name: + document._index._name = old_index_name + + +def remove_indexed_files(project_slug, version_slug=None, build_id=None): """ Remove files from `version_slug` of `project_slug` from the search index. 
@@ -29,7 +45,7 @@ def remove_indexed_files(model, project_slug, version_slug=None, build_id=None): return try: - document = list(registry.get_documents(models=[model]))[0] + document = PageDocument log.info("Deleting old files from search index.") documents = document().search().filter("term", project=project_slug) if version_slug: From d57c481a8e37cc7d1c9e76a5eb7ba9fb4831cabf Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Tue, 12 Sep 2023 20:09:41 -0500 Subject: [PATCH 06/12] Fix test --- readthedocs/rtd_tests/tests/test_imported_file.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index d97d897b6bb..1f61e5f51b0 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -92,7 +92,8 @@ def test_properly_created(self): results = PageDocument().search().filter("term", build=2).execute() self.assertEqual( - {result.path for result in results}, {"index.html", "404.html", "test.html"} + {result.path for result in results}, + {"index.html", "404.html", "test.html", "api/index.html"}, ) def test_update_build(self): From 022e7e92dd00d3dd013d0ef16cb941d4489301a1 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Tue, 12 Sep 2023 20:27:48 -0500 Subject: [PATCH 07/12] Small updates --- readthedocs/projects/tasks/search.py | 8 +++----- .../search/management/commands/reindex_elasticsearch.py | 2 -- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index 4b45acc6569..9ad13e69d13 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -15,8 +15,8 @@ # TODO: remove after deploy. This is kept, so builds keep -# working while the deploy the new task below. -@app.task(queue='reindex') +# working while we deploy the task below. +@app.task(queue="reindex") def fileify(version_pk, commit, build, search_ranking, search_ignore): """ Create ImportedFile objects for all of a version's files. @@ -224,9 +224,7 @@ def _create_imported_files_and_search_index( # Create the imported file only if it's a top-level 404 file, # or if it's an index file. We don't need to keep track of all files. - is_top_level_404_file = filename == "404.html" and root == storage_path - is_index_file = filename in ["index.html", "README.html"] - if is_top_level_404_file or is_index_file: + if relpath == "404.html" or filename in ["index.html", "README.html"]: html_files_to_save.append(html_file) # We first index the files in ES, and then save the objects in the DB. diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py index 31c2a626dc3..27585c12334 100644 --- a/readthedocs/search/management/commands/reindex_elasticsearch.py +++ b/readthedocs/search/management/commands/reindex_elasticsearch.py @@ -99,8 +99,6 @@ def _reindex_from(self, days_ago, models, queue): functions[model](days_ago=days_ago, queue=queue) def _reindex_projects(self, queue, timestamp): - # TODO: can we just improt the project document? 
-        # document = list(registry.get_documents(models=[Project]))[0]
         document = ProjectDocument
         app_label = Project._meta.app_label
         model_name = Project.__name__

From 19d7c5c19febcb8a5d8f1c34a384169b040ba571 Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Tue, 12 Sep 2023 20:31:49 -0500
Subject: [PATCH 08/12] Update test

---
 readthedocs/rtd_tests/tests/test_imported_file.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py
index 1f61e5f51b0..c59b2a795e8 100644
--- a/readthedocs/rtd_tests/tests/test_imported_file.py
+++ b/readthedocs/rtd_tests/tests/test_imported_file.py
@@ -103,6 +103,7 @@ def test_update_build(self):
             self.assertEqual(obj.build, 1)

         results = PageDocument().search().filter().execute()
+        self.assertEqual(len(results), 4)
         for result in results:
             self.assertEqual(result.build, 1)

@@ -110,11 +111,10 @@ def test_update_build(self):
         for obj in ImportedFile.objects.all():
             self.assertEqual(obj.build, 2)

-        # NOTE: we can't test the ES index here because the index is updated asynchronously,
-        # so we will get results for both builds.
-        # results = PageDocument().search().filter().execute()
-        # for result in results:
-        #     self.assertEqual(result.build, 2)
+        # NOTE: we can't test that the files from the previous build
+        # were deleted, since deletion happens asynchronously.
+        results = PageDocument().search().filter("term", build=2).execute()
+        self.assertEqual(len(results), 4)

From 79e6141794d62e7380352734f60fcf599c38979c Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Wed, 13 Sep 2023 13:06:38 -0500
Subject: [PATCH 09/12] Updates from review and fixes

---
 readthedocs/projects/tasks/search.py          | 44 +++++++++----------
 .../commands/reindex_elasticsearch.py         |  5 +--
 readthedocs/search/utils.py                   | 18 +++++++-
 3 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index 9ad13e69d13..57c195e6fb1 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -61,11 +61,13 @@ def index_build(build_id):
         .first()
     )
     if not build:
+        log.debug("Skipping search indexing. Build object doesn't exist.", build_id=build_id)
         return

     # The version may have been deleted.
     version = build.version
     if not version:
+        log.debug("Skipping search indexing. 
Build doesn't have a version attached to it.", build_id=build_id)
         return

     log.bind(
@@ -74,13 +76,10 @@ def index_build(build_id):
         build_id=build.id,
     )

-    search_ranking = []
-    search_ignore = []
-    build_config = build.config
-    if build_config:
-        search_config = build_config.get("search", {})
-        search_ranking = search_config.get("ranking", [])
-        search_ignore = search_config.get("ignore", [])
+    build_config = build.config or {}
+    search_config = build_config.get("search", {})
+    search_ranking = search_config.get("ranking", [])
+    search_ignore = search_config.get("ignore", [])

     try:
         _create_imported_files_and_search_index(
@@ -102,6 +101,7 @@ def reindex_version(version_id, search_index_name=None):
     """
     version = Version.objects.filter(pk=version_id).select_related("project").first()
     if not version or not version.built:
+        log.debug("Skipping search indexing. Version doesn't exist or is not built.", version_id=version_id)
         return

     latest_successful_build = (
@@ -112,6 +112,7 @@ def reindex_version(version_id, search_index_name=None):
     # If the version doesn't have a successful
     # build, we don't have files to index.
     if not latest_successful_build:
+        log.debug("Skipping search indexing. Version doesn't have a successful build.", version_id=version_id)
         return

     log.bind(
@@ -120,18 +121,16 @@ def reindex_version(version_id, search_index_name=None):
         build_id=latest_successful_build.id,
     )

-    search_ranking = []
-    search_ignore = []
-    build_config = latest_successful_build.config
-    if build_config:
-        search_config = build_config.get("search", {})
-        search_ranking = search_config.get("ranking", [])
-        search_ignore = search_config.get("ignore", [])
+    build_config = latest_successful_build.config or {}
+    search_config = build_config.get("search", {})
+    search_ranking = search_config.get("ranking", [])
+    search_ignore = search_config.get("ignore", [])

-    # We need to use a build id that is different from the current one,
-    # otherwise we will end up with duplicated files. After the process
-    # is complete, we delete the files and search index that don't belong
-    # to the current build id. The build id isn't used for anything else.
+    # We need to use a build ID that is different from the current one,
+    # this ID is to differentiate the new files being indexed from the
+    # current ones, so we can delete the old ones easily.
+    # If we re-use the same build ID, we will end up with duplicated files.
+    # The build id isn't used for anything else.
     build_id = latest_successful_build.id + 1
     try:
         _create_imported_files_and_search_index(
@@ -213,7 +210,7 @@ def _create_imported_files_and_search_index(
             path=relpath,
             name=filename,
             rank=page_rank,
-            # TODO: We are setting this field since it's required,
+            # TODO: We are setting the commit field since it's required,
             # but it isn't used, and will be removed in the future
             # together with other fields.
commit="unknown",
             build=build_id,
             ignore=skip_search_index,
         )

diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py
index 27585c12334..514c6a39299 100644
--- a/readthedocs/search/management/commands/reindex_elasticsearch.py
+++ b/readthedocs/search/management/commands/reindex_elasticsearch.py
@@ -86,10 +86,9 @@ def _change_index(self, models, timestamp):

     def _reindex_from(self, days_ago, models, queue):
         functions = {
-            apps.get_model("projects.HTMLFile"): self._reindex_files_from,
-            apps.get_model("projects.Project"): self._reindex_projects_from,
+            HTMLFile: self._reindex_files_from,
+            Project: self._reindex_projects_from,
         }
-        models = models or functions.keys()
         for model in models:
             if model not in functions:
                 log.warning(
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index f473b4d0425..e82ce75d3a6 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -11,6 +11,10 @@


 def index_objects(document, objects, index_name=None):
+    if not DEDConfig.autosync_enabled():
+        log.info("Autosync disabled, skipping search indexing.")
+        return
+
     # If a search index name is provided, we need to temporarily change
     # the index name of the document.
     old_index_name = document._index._name
@@ -24,7 +28,7 @@ def index_objects(document, objects, index_name=None):
     document._index._name = old_index_name


-def remove_indexed_files(project_slug, version_slug=None, build_id=None):
+def remove_indexed_files(project_slug, version_slug=None, build_id=None, index_name=None):
     """
     Remove files from `version_slug` of `project_slug` from the search index.

@@ -44,8 +48,14 @@ def remove_indexed_files(project_slug, version_slug=None, build_id=None):
         log.info("Autosync disabled, skipping removal from the search index.")
         return

+    # If a search index name is provided, we need to temporarily change
+    # the index name of the document.
+    document = PageDocument
+    old_index_name = document._index._name
+    if index_name:
+        document._index._name = index_name
+
     try:
-        document = PageDocument
         log.info("Deleting old files from search index.")
         documents = document().search().filter("term", project=project_slug)
         if version_slug:
@@ -56,6 +66,10 @@ def remove_indexed_files(
     except Exception:
         log.exception("Unable to delete a subset of files. Continuing.")

+    # Restore the old index name.
+    if index_name:
+        document._index._name = old_index_name
+

From 2ccf0e83c7cae036461efe6e73f8791e8d5b92cf Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Wed, 13 Sep 2023 13:10:55 -0500
Subject: [PATCH 10/12] Format

---
 readthedocs/projects/tasks/search.py | 19 +++++++++++++++----
 readthedocs/search/utils.py          |  4 +++-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index 57c195e6fb1..c29cc11b599 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -61,13 +61,18 @@ def index_build(build_id):
         .first()
     )
     if not build:
-        log.debug("Skipping search indexing. Build object doesn't exist.", build_id=build_id)
+        log.debug(
+            "Skipping search indexing. Build object doesn't exist.", build_id=build_id
+        )
         return

     # The version may have been deleted.
     version = build.version
     if not version:
-        log.debug("Skipping search indexing. Build doesn't have a version attached to it.", build_id=build_id)
+        log.debug(
+            "Skipping search indexing. Build doesn't have a version attached to it.",
+            build_id=build_id,
+        )
         return

     log.bind(
@@ -101,7 +106,10 @@ def reindex_version(version_id, search_index_name=None):
     """
     version = Version.objects.filter(pk=version_id).select_related("project").first()
     if not version or not version.built:
-        log.debug("Skipping search indexing. Version doesn't exist or is not built.", version_id=version_id)
+        log.debug(
+            "Skipping search indexing. Version doesn't exist or is not built.",
+            version_id=version_id,
+        )
         return

     latest_successful_build = (
@@ -112,7 +120,10 @@
     # If the version doesn't have a successful
     # build, we don't have files to index.
     if not latest_successful_build:
-        log.debug("Skipping search indexing. Version doesn't have a successful build.", version_id=version_id)
+        log.debug(
+            "Skipping search indexing. Version doesn't have a successful build.",
+            version_id=version_id,
+        )
         return

     log.bind(
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index e82ce75d3a6..3c2ed5d5ad6 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -28,7 +28,9 @@ def index_objects(document, objects, index_name=None):
     document._index._name = old_index_name


-def remove_indexed_files(project_slug, version_slug=None, build_id=None, index_name=None):
+def remove_indexed_files(
+    project_slug, version_slug=None, build_id=None, index_name=None
+):
     """
     Remove files from `version_slug` of `project_slug` from the search index.

From 8fdbc0681f05c5706332f65d2e27986dc60df13b Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Wed, 13 Sep 2023 14:03:19 -0500
Subject: [PATCH 11/12] Rename build -> sync_id

---
 readthedocs/projects/tasks/search.py          | 29 ++++----
 .../rtd_tests/tests/test_imported_file.py     | 66 +++++++++----------
 readthedocs/search/utils.py                   |  6 +-
 3 files changed, 50 insertions(+), 51 deletions(-)

diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index c29cc11b599..bd67f206b59 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -44,7 +44,6 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore):
     try:
         _create_imported_files_and_search_index(
             version=version,
-            build_id=build,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
@@ -89,7 +88,6 @@ def index_build(build_id):
     try:
         _create_imported_files_and_search_index(
             version=version,
-            build_id=build.id,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
@@ -137,16 +135,9 @@ def reindex_version(version_id, search_index_name=None):
     search_ranking = search_config.get("ranking", [])
     search_ignore = search_config.get("ignore", [])

-    # We need to use a build ID that is different from the current one;
-    # this ID differentiates the new files being indexed from the
-    # current ones, so we can easily delete the old ones.
-    # If we re-use the same build ID, we will end up with duplicated files.
-    # The build ID isn't used for anything else.
-    build_id = latest_successful_build.id + 1
     try:
         _create_imported_files_and_search_index(
             version=version,
-            build_id=build_id,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
             search_index_name=search_index_name,
@@ -165,7 +156,7 @@ def remove_search_indexes(project_slug, version_slug=None):


 def _create_imported_files_and_search_index(
-    *, version, build_id, search_ranking, search_ignore, search_index_name=None
+    *, version, search_ranking, search_ignore, search_index_name=None
 ):
     """
     Create imported files and search index for the build of the version.
@@ -180,6 +171,17 @@ def _create_imported_files_and_search_index(
     storage_path = version.project.get_storage_path(
         type_="html", version_slug=version.slug, include_file=False
     )
+    # A sync ID is a number different from the current `build` attribute (pending rename);
+    # it distinguishes the files of the current sync from those of the previous one.
+    # This makes it easy to delete the previous files from the DB and ES.
+    imported_file = version.imported_files.first()
+    sync_id = imported_file.build + 1 if imported_file else 1
+
+    log.debug(
+        "Using sync ID for search indexing",
+        sync_id=sync_id,
+    )
+
     html_files_to_index = []
     html_files_to_save = []
     reverse_rankings = list(reversed(search_ranking.items()))
@@ -225,7 +227,7 @@ def _create_imported_files_and_search_index(
             # but it isn't used, and will be removed in the future
             # together with other fields.
             commit="unknown",
-            build=build_id,
+            build=sync_id,
             ignore=skip_search_index,
         )

@@ -252,7 +254,7 @@ def _create_imported_files_and_search_index(
         remove_indexed_files(
             project_slug=version.project.slug,
             version_slug=version.slug,
-            build_id=build_id,
+            sync_id=sync_id,
             index_name=search_index_name,
         )

@@ -260,7 +262,7 @@ def _create_imported_files_and_search_index(
         HTMLFile.objects.bulk_create(html_files_to_save)

     # Delete imported files from the previous build of the version.
-    version.imported_files.exclude(build=build_id).delete()
+    version.imported_files.exclude(build=sync_id).delete()

     # This signal is used for purging the CDN.
     files_changed.send(
@@ -268,3 +270,4 @@ def _create_imported_files_and_search_index(
         project=version.project,
         version=version,
     )
+    return sync_id
diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py
index c59b2a795e8..82e25f98b88 100644
--- a/readthedocs/rtd_tests/tests/test_imported_file.py
+++ b/readthedocs/rtd_tests/tests/test_imported_file.py
@@ -33,15 +33,12 @@ def tearDown(self):
             # If there are no documents, the query fails.
             pass

-    def _manage_imported_files(
-        self, version, build_id, search_ranking=None, search_ignore=None
-    ):
+    def _manage_imported_files(self, version, search_ranking=None, search_ignore=None):
         """Helper function for the tests to create and sync ImportedFiles."""
         search_ranking = search_ranking or {}
         search_ignore = search_ignore or []
-        _create_imported_files_and_search_index(
+        return _create_imported_files_and_search_index(
             version=version,
-            build_id=build_id,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
@@ -70,27 +67,27 @@ def test_properly_created(self):
         """
         self.assertEqual(ImportedFile.objects.count(), 0)

-        self._manage_imported_files(version=self.version, build_id=1)
+        sync_id = self._manage_imported_files(version=self.version)
         self.assertEqual(ImportedFile.objects.count(), 3)
         self.assertEqual(
             set(HTMLFile.objects.all().values_list("path", flat=True)),
             {"index.html", "api/index.html", "404.html"},
         )

-        results = PageDocument().search().filter("term", build=1).execute()
+        results = PageDocument().search().filter("term", build=sync_id).execute()
         self.assertEqual(
             {result.path for result in results},
             {"index.html", "404.html", "test.html", "api/index.html"},
         )

-        self._manage_imported_files(version=self.version, build_id=2)
+        sync_id = self._manage_imported_files(version=self.version)
         self.assertEqual(ImportedFile.objects.count(), 3)
         self.assertEqual(
             set(HTMLFile.objects.all().values_list("path", flat=True)),
             {"index.html", "api/index.html", "404.html"},
         )

-        results = PageDocument().search().filter("term", build=2).execute()
+        results = PageDocument().search().filter("term", build=sync_id).execute()
         self.assertEqual(
             {result.path for result in results},
             {"index.html", "404.html", "test.html", "api/index.html"},
         )

@@ -98,29 +95,29 @@ def test_update_build(self):
         self.assertEqual(ImportedFile.objects.count(), 0)
-        self._manage_imported_files(self.version, build_id=1)
+        sync_id = self._manage_imported_files(self.version)
         for obj in ImportedFile.objects.all():
-            self.assertEqual(obj.build, 1)
+            self.assertEqual(obj.build, sync_id)

         results = PageDocument().search().filter().execute()
         self.assertEqual(len(results), 4)
         for result in results:
-            self.assertEqual(result.build, 1)
+            self.assertEqual(result.build, sync_id)

-        self._manage_imported_files(self.version, build_id=2)
+        sync_id = self._manage_imported_files(self.version)
         for obj in ImportedFile.objects.all():
-            self.assertEqual(obj.build, 2)
+            self.assertEqual(obj.build, sync_id)

         # NOTE: we can't test that the files from the previous build
         # were deleted, since deletion happens asynchronously.
-        results = PageDocument().search().filter("term", build=2).execute()
+        results = PageDocument().search().filter("term", build=sync_id).execute()
         self.assertEqual(len(results), 4)

     def test_page_default_rank(self):
         search_ranking = {}
         self.assertEqual(HTMLFile.objects.count(), 0)
-        self._manage_imported_files(
-            self.version, build_id=1, search_ranking=search_ranking
-        )
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )

         results = (
@@ -134,15 +131,15 @@ def test_page_default_rank(self):
         for result in results:
             self.assertEqual(result.project, self.project.slug)
             self.assertEqual(result.version, self.version.slug)
-            self.assertEqual(result.build, 1)
+            self.assertEqual(result.build, sync_id)
             self.assertEqual(result.rank, 0)

     def test_page_custom_rank_glob(self):
         search_ranking = {
             "*index.html": 5,
         }
-        self._manage_imported_files(
-            self.version, build_id=1, search_ranking=search_ranking
-        )
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )

         results = (
@@ -156,7 +153,7 @@ def test_page_custom_rank_glob(self):
         for result in results:
             self.assertEqual(result.project, self.project.slug)
             self.assertEqual(result.version, self.version.slug)
-            self.assertEqual(result.build, 1)
+            self.assertEqual(result.build, sync_id)
             if result.path.endswith("index.html"):
                 self.assertEqual(result.rank, 5, result.path)
             else:
@@ -167,8 +164,8 @@ def test_page_custom_rank_several(self):
             "test.html": 5,
             "api/index.html": 2,
         }
-        self._manage_imported_files(
-            self.version, build_id=1, search_ranking=search_ranking
-        )
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )

         results = (
@@ -182,7 +179,7 @@ def test_page_custom_rank_several(self):
         for result in results:
             self.assertEqual(result.project, self.project.slug)
             self.assertEqual(result.version, self.version.slug)
-            self.assertEqual(result.build, 1)
+            self.assertEqual(result.build, sync_id)
             if result.path == "test.html":
                 self.assertEqual(result.rank, 5)
             elif result.path == "api/index.html":
@@ -195,8 +192,8 @@ def test_page_custom_rank_precedence(self):
             "*.html": 5,
             "api/index.html": 2,
         }
-        self._manage_imported_files(
-            self.version, build_id=1, search_ranking=search_ranking
-        )
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )

         results = (
@@ -210,7 +207,7 @@ def test_page_custom_rank_precedence(self):
         for result in results:
             self.assertEqual(result.project, self.project.slug)
             self.assertEqual(result.version, self.version.slug)
-            self.assertEqual(result.build, 1)
+            self.assertEqual(result.build, sync_id)
             if result.path == "api/index.html":
                 self.assertEqual(result.rank, 2, result.path)
             else:
@@ -221,8 +218,8 @@ def test_page_custom_rank_precedence_inverted(self):
             "api/index.html": 2,
             "*.html": 5,
         }
-        self._manage_imported_files(
-            self.version, build_id=1, search_ranking=search_ranking
-        )
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )

         results = (
@@ -236,14 +233,13 @@ def test_page_custom_rank_precedence_inverted(self):
         for result in results:
             self.assertEqual(result.project, self.project.slug)
             self.assertEqual(result.version, self.version.slug)
-            self.assertEqual(result.build, 1)
+            self.assertEqual(result.build, sync_id)
             self.assertEqual(result.rank, 5)

     def test_search_page_ignore(self):
         search_ignore = ["api/index.html"]
         self._manage_imported_files(
             self.version,
-            build_id=1,
             search_ignore=search_ignore,
         )

@@ -273,7 +269,7 @@ def test_update_content(self):
         with override_settings(DOCROOT=self.test_dir):
             self._copy_storage_dir()

-        self._manage_imported_files(self.version, build_id=1)
+        sync_id = self._manage_imported_files(self.version)
         self.assertEqual(ImportedFile.objects.count(), 3)
         document = (
             PageDocument()
             .search()
             .filter("term", project=self.project.slug)
             .filter("term", version=self.version.slug)
             .filter("term", path="test.html")
-            .filter("term", build=1)
+            .filter("term", build=sync_id)
             .execute()[0]
         )
         self.assertEqual(document.sections[0].content, "Woo")
@@ -292,7 +288,7 @@ def test_update_content(self):
         with override_settings(DOCROOT=self.test_dir):
             self._copy_storage_dir()

-        self._manage_imported_files(self.version, build_id=2)
+        sync_id = self._manage_imported_files(self.version)
         self.assertEqual(ImportedFile.objects.count(), 3)
         document = (
             PageDocument()
             .search()
             .filter("term", project=self.project.slug)
             .filter("term", version=self.version.slug)
             .filter("term", path="test.html")
-            .filter("term", build=2)
+            .filter("term", build=sync_id)
             .execute()[0]
         )
         self.assertEqual(document.sections[0].content, "Something Else")
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index 3c2ed5d5ad6..4dd10b49e39 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -29,7 +29,7 @@ def index_objects(document, objects, index_name=None):


 def remove_indexed_files(
-    project_slug, version_slug=None, build_id=None, index_name=None
+    project_slug, version_slug=None, sync_id=None, index_name=None
 ):
     """
     Remove files from `version_slug` of `project_slug` from the search index.

@@ -62,8 +62,8 @@
         documents = document().search().filter("term", project=project_slug)
         if version_slug:
             documents = documents.filter("term", version=version_slug)
-        if build_id:
-            documents = documents.exclude("term", build=build_id)
+        if sync_id:
+            documents = documents.exclude("term", build=sync_id)
         documents.delete()
     except Exception:
         log.exception("Unable to delete a subset of files. Continuing.")

From dd9e226de2760bb6c6a5efd3ba7113549d050661 Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Thu, 14 Sep 2023 10:34:32 -0500
Subject: [PATCH 12/12] Updates from review

---
 readthedocs/projects/tasks/search.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index bd67f206b59..6b7127507e9 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -174,6 +174,7 @@ def _create_imported_files_and_search_index(
     # A sync ID is a number different from the current `build` attribute (pending rename);
     # it distinguishes the files of the current sync from those of the previous one.
     # This makes it easy to delete the previous files from the DB and ES.
+    # See https://github.com/readthedocs/readthedocs.org/issues/10734.
     imported_file = version.imported_files.first()
     sync_id = imported_file.build + 1 if imported_file else 1

@@ -210,8 +211,6 @@ def _create_imported_files_and_search_index(
         # If the file is ignored, we don't need to check for its ranking.
         if not skip_search_index:
             # Last pattern to match takes precedence
-            # XXX: see if we can implement another type of precedence,
-            # like the longest pattern.
             for pattern, rank in reverse_rankings:
                 if fnmatch(relpath, pattern):
                     page_rank = rank
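
A note on the final design. The sync-ID bookkeeping that PATCH 11 introduces and PATCH 12 documents boils down to two steps: tag every newly indexed file with a fresh sync ID, then delete everything that does not carry that ID. The sketch below is illustrative only: it is plain Python with in-memory lists standing in for the DB and Elasticsearch, and `sync_files` and `next_sync_id` are hypothetical helper names, not functions from the patches.

    # In-memory stand-ins (hypothetical): "database" plays the role of the
    # ImportedFile table, "search_index" the role of the Elasticsearch index.
    # Every entry is tagged with the sync ID it was created under.

    def next_sync_id(database):
        # Mirrors `imported_file.build + 1 if imported_file else 1` from PATCH 11.
        builds = [row["build"] for row in database]
        return max(builds) + 1 if builds else 1

    def sync_files(database, search_index, paths):
        sync_id = next_sync_id(database)

        # 1. Index all new files first, tagged with the fresh sync ID.
        for path in paths:
            search_index.append({"path": path, "build": sync_id})

        # 2. Persist only the subset the DB cares about (index and 404 pages).
        for path in paths:
            if path == "404.html" or path.endswith("index.html"):
                database.append({"path": path, "build": sync_id})

        # 3. Delete everything that does NOT carry the current sync ID;
        #    this drops the previous sync, plus any leftovers from failed runs.
        search_index[:] = [doc for doc in search_index if doc["build"] == sync_id]
        database[:] = [row for row in database if row["build"] == sync_id]

        return sync_id

    database, search_index = [], []
    assert sync_files(database, search_index, ["index.html", "404.html"]) == 1
    assert sync_files(database, search_index, ["index.html"]) == 2
    assert {doc["build"] for doc in search_index} == {2}

The design choice worth noting is that cleanup is expressed as "exclude the current sync ID" (see the `documents.exclude("term", build=sync_id)` call in PATCH 11) rather than "delete the previous ID": stale entries from interrupted or repeated syncs are removed as well, since they can never carry the newest ID.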