Skip to content

Commit cc1a02d

Browse files
authored
Search: improve re-index management command (#7904)
1 parent 61738e1 commit cc1a02d

File tree

2 files changed

+191
-64
lines changed

2 files changed

+191
-64
lines changed
Lines changed: 190 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
1-
import datetime
1+
import itertools
22
import logging
3+
import textwrap
4+
from datetime import datetime, timedelta
35

4-
from celery import chain, chord
56
from django.apps import apps
67
from django.conf import settings
78
from django.core.management import BaseCommand
8-
from django.utils import timezone
99
from django_elasticsearch_dsl.registries import registry
1010

11-
from ...tasks import (
11+
from readthedocs.builds.models import Version
12+
from readthedocs.projects.models import HTMLFile, Project
13+
from readthedocs.search.tasks import (
1214
create_new_es_index,
1315
index_missing_objects,
1416
index_objects_to_es,
@@ -23,7 +25,7 @@ class Command(BaseCommand):
2325
@staticmethod
def _get_indexing_tasks(app_label, model_name, index_name, queryset, document_class):
    """Yield ``index_objects_to_es`` task signatures, one per chunk of PKs.

    The queryset is consumed lazily through ``values_list('pk').iterator()``
    in chunks of ``settings.ES_TASK_CHUNK_SIZE``, so the full PK list is
    never materialized in memory at once.

    :param app_label: Django app label of the model being indexed.
    :param model_name: name of the model being indexed.
    :param index_name: Elasticsearch index the objects are written to.
    :param queryset: queryset of objects to index.
    :param document_class: string representation of the ES document class.
    """
    chunk_size = settings.ES_TASK_CHUNK_SIZE
    qs_iterator = queryset.values_list('pk', flat=True).iterator()

    data = {
        'app_label': app_label,
        'model_name': model_name,
        'document_class': document_class,
        'index_name': index_name,
    }
    current = 0
    while True:
        # islice returns an empty list once the iterator is exhausted,
        # which terminates the loop (no sentinel flag needed).
        objects_id = list(itertools.islice(qs_iterator, chunk_size))
        if not objects_id:
            break
        current += len(objects_id)
        log.info('Total: %s', current)
        data['objects_id'] = objects_id
        yield index_objects_to_es.si(**data)
5246

@@ -58,56 +52,141 @@ def _run_reindex_tasks(self, models, queue):
5852
else:
5953
log.info('Adding indexing tasks to default queue')
6054

61-
index_time = timezone.now()
62-
timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
55+
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
6356

6457
for doc in registry.get_documents(models):
6558
queryset = doc().get_queryset()
66-
# Get latest object from the queryset
6759

6860
app_label = queryset.model._meta.app_label
6961
model_name = queryset.model.__name__
7062

7163
index_name = doc._index._name
7264
new_index_name = "{}_{}".format(index_name, timestamp)
73-
# Set index temporarily for indexing,
74-
# this will only get set during the running of this command
65+
66+
# Set and create a temporal index for indexing.
67+
create_new_es_index(
68+
app_label=app_label,
69+
model_name=model_name,
70+
index_name=index_name,
71+
new_index_name=new_index_name,
72+
)
7573
doc._index._name = new_index_name
74+
log.info('Temporal index created: %s', new_index_name)
75+
76+
indexing_tasks = self._get_indexing_tasks(
77+
app_label=app_label,
78+
model_name=model_name,
79+
queryset=queryset,
80+
index_name=new_index_name,
81+
document_class=str(doc),
82+
)
83+
for task in indexing_tasks:
84+
task.apply_async(**apply_async_kwargs)
85+
86+
log.info(
87+
"Tasks issued successfully. model=%s.%s items=%s",
88+
app_label, model_name, str(queryset.count())
89+
)
90+
return timestamp
91+
92+
def _change_index(self, models, timestamp):
    """Switch each document's index to the timestamped one built by a re-index.

    For every registered document of ``models``, point the live index name
    at ``<index_name>_<timestamp>`` via the ``switch_es_index`` task helper.
    """
    for document in registry.get_documents(models):
        model = document().get_queryset().model
        current_name = document._index._name
        timestamped_name = "{}_{}".format(current_name, timestamp)
        switch_es_index(
            app_label=model._meta.app_label,
            model_name=model.__name__,
            index_name=current_name,
            new_index_name=timestamped_name,
        )
        log.info(
            "Index name changed. model=%s.%s from=%s to=%s",
            model._meta.app_label, model.__name__, timestamped_name, current_name,
        )
109+
110+
def _reindex_from(self, days_ago, models, queue=None):
    """Partially re-index recently changed objects for each supported model.

    Dispatches to the model-specific ``_reindex_*_from`` helper; models
    without a helper are logged and skipped.
    """
    handlers = {
        apps.get_model('projects.HTMLFile'): self._reindex_files_from,
        apps.get_model('projects.Project'): self._reindex_projects_from,
    }
    for model in (models or handlers.keys()):
        handler = handlers.get(model)
        if handler is None:
            log.warning("Re-index from not available for model %s", model.__name__)
            continue
        handler(days_ago=days_ago, queue=queue)
121+
122+
def _reindex_projects_from(self, days_ago, queue=None):
    """Reindex projects with recent changes."""
    cutoff = datetime.now() - timedelta(days=days_ago)
    projects = Project.objects.filter(modified_date__gte=cutoff).distinct()
    app_label = Project._meta.app_label
    model_name = Project.__name__

    # Route tasks to the given celery queue when one was requested.
    async_kwargs = {'queue': queue} if queue else {}

    for document in registry.get_documents(models=[Project]):
        chunked_tasks = self._get_indexing_tasks(
            app_label=app_label,
            model_name=model_name,
            queryset=projects,
            index_name=document._index._name,
            document_class=str(document),
        )
        for task in chunked_tasks:
            task.apply_async(**async_kwargs)
        log.info(
            "Tasks issued successfully. model=%s.%s items=%s",
            app_label, model_name, str(projects.count())
        )
146+
147+
def _reindex_files_from(self, days_ago, queue=None):
    """Reindex HTML files from versions with recent builds."""
    chunk_size = settings.ES_TASK_CHUNK_SIZE
    cutoff = datetime.now() - timedelta(days=days_ago)
    recent_versions = Version.objects.filter(builds__date__gte=cutoff).distinct()
    app_label = HTMLFile._meta.app_label
    model_name = HTMLFile.__name__

    # Shared kwargs for every index_objects_to_es task; the per-chunk
    # fields (document_class, objects_id) are filled in below.
    apply_async_kwargs = {
        'kwargs': {
            'app_label': app_label,
            'model_name': model_name,
        },
    }
    if queue:
        apply_async_kwargs['queue'] = queue

    for document in registry.get_documents(models=[HTMLFile]):
        apply_async_kwargs['kwargs']['document_class'] = str(document)
        for version in recent_versions.iterator():
            project = version.project
            file_pks = (
                HTMLFile.objects
                .filter(version=version)
                .values_list('pk', flat=True)
                .iterator()
            )
            indexed = 0
            while True:
                # islice drains the PK iterator one chunk at a time;
                # an empty chunk means we are done with this version.
                chunk = list(itertools.islice(file_pks, chunk_size))
                if not chunk:
                    break
                indexed += len(chunk)
                log.info(
                    'Re-indexing files. version=%s:%s total=%s',
                    project.slug, version.slug, indexed,
                )
                apply_async_kwargs['kwargs']['objects_id'] = chunk
                index_objects_to_es.apply_async(**apply_async_kwargs)

            # NOTE(review): placement inside the per-version loop inferred
            # from the variables this message reads — confirm against upstream.
            log.info(
                "Tasks issued successfully. version=%s:%s items=%s",
                project.slug, version.slug, str(indexed),
            )
111190

112191
def add_arguments(self, parser):
    """Register the command-line options for the re-index command."""
    parser.add_argument(
        '--queue',
        dest='queue',
        action='store',
        help="Set the celery queue name for the task."
    )
    parser.add_argument(
        '--change-index',
        dest='change_index',
        action='store',
        help=(
            "Change the index to the new one using the given timestamp and delete the old one. "
            "**This should be run after a re-index is completed**."
        ),
    )
    parser.add_argument(
        '--update-from',
        dest='update_from',
        type=int,
        action='store',
        help=(
            "Re-index the models from the given days. "
            "This should be run after a re-index."
        ),
    )
    parser.add_argument(
        '--models',
        dest='models',
        type=str,
        nargs='*',
        help=(
            "Specify the model to be updated in elasticsearch. "
            "The format is <app_label>.<model_name>"
        ),
    )
127227

128228
def handle(self, *args, **options):
@@ -131,7 +231,7 @@ def handle(self, *args, **options):
131231
132232
You can specify model to get indexed by passing
133233
`--model <app_label>.<model_name>` parameter.
134-
Otherwise, it will reindex all the models
234+
Otherwise, it will re-index all the models
135235
"""
136236
models = None
137237
if options['models']:
@@ -141,4 +241,31 @@ def handle(self, *args, **options):
141241
if options.get('queue'):
142242
queue = options['queue']
143243

144-
self._run_reindex_tasks(models=models, queue=queue)
244+
change_index = options['change_index']
245+
update_from = options['update_from']
246+
if change_index:
247+
timestamp = change_index
248+
self._change_index(models=models, timestamp=timestamp)
249+
print(textwrap.dedent(
250+
"""
251+
Indexes had been changed.
252+
253+
Remember to re-index changed projects and versions with the
254+
`--update-from n` argument,
255+
where `n` is the number of days since the re-index.
256+
"""
257+
))
258+
elif update_from:
259+
self._reindex_from(days_ago=update_from, models=models, queue=queue)
260+
else:
261+
timestamp = self._run_reindex_tasks(models=models, queue=queue)
262+
print(textwrap.dedent(
263+
f"""
264+
Re-indexing tasks have been created.
265+
Timestamp: {timestamp}
266+
267+
Please monitor the tasks.
268+
After they are completed run the same command with the
269+
`--change-index {timestamp}` argument.
270+
"""
271+
))

readthedocs/settings/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -610,7 +610,7 @@ def DOCKER_LIMITS(self):
610610
},
611611
}
612612
# Chunk size for elasticsearch reindex celery tasks
613-
ES_TASK_CHUNK_SIZE = 100
613+
ES_TASK_CHUNK_SIZE = 500
614614

615615
# Info from Honza about this:
616616
# The key to determine shard number is actually usually not the node count,

0 commit comments

Comments
 (0)