Skip to content

Commit 143ce7f

Browse files
committed
Optimizing reindexing management command
1 parent baf8421 commit 143ce7f

File tree

4 files changed

+31
-23
lines changed

4 files changed

+31
-23
lines changed

readthedocs/projects/models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from django.contrib.auth.models import User
1414
from django.core.urlresolvers import NoReverseMatch, reverse
1515
from django.db import models
16+
from django.utils import timezone
1617
from django.utils.encoding import python_2_unicode_compatible
1718
from django.utils.functional import cached_property
1819
from django.utils.translation import ugettext_lazy as _
@@ -911,6 +912,7 @@ class ImportedFile(models.Model):
911912
path = models.CharField(_('Path'), max_length=255)
912913
md5 = models.CharField(_('MD5 checksum'), max_length=255)
913914
commit = models.CharField(_('Commit'), max_length=255)
915+
modified_date = models.DateTimeField(_('Modified date'), auto_now=True)
914916

915917
def get_absolute_url(self):
916918
return resolve(project=self.project, version_slug=self.version.slug, filename=self.path)

readthedocs/search/management/commands/reindex_elasticsearch.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,34 +9,37 @@
99

1010
from ...tasks import (index_objects_to_es, switch_es_index, create_new_es_index,
1111
index_missing_objects)
12-
from ...utils import chunks
12+
from ...utils import chunk_queryset
1313

1414
log = logging.getLogger(__name__)
1515

1616

1717
class Command(BaseCommand):
1818

1919
@staticmethod
20-
def _get_indexing_tasks(app_label, model_name, instance_ids, document_class, index_name):
21-
chunk_objects = chunks(instance_ids, settings.ES_TASK_CHUNK_SIZE)
20+
def _get_indexing_tasks(app_label, model_name, queryset, document_class, index_name):
21+
queryset = queryset.values_list('id', flat=True)
22+
chunked_queryset = chunk_queryset(queryset, settings.ES_TASK_CHUNK_SIZE)
2223

23-
for chunk in chunk_objects:
24+
for chunk in chunked_queryset:
2425
data = {
2526
'app_label': app_label,
2627
'model_name': model_name,
2728
'document_class': document_class,
2829
'index_name': index_name,
29-
'objects_id': chunk
30+
'objects_id': list(chunk)
3031
}
3132
yield index_objects_to_es.si(**data)
3233

3334
def _run_reindex_tasks(self, models):
3435
for doc in registry.get_documents(models):
35-
qs = doc().get_queryset()
36-
instance_ids = list(qs.values_list('id', flat=True))
36+
queryset = doc().get_queryset()
37+
# Get latest object from the queryset
38+
latest_object = queryset.latest('modified_date')
39+
latest_object_time = latest_object.modified_date
3740

38-
app_label = qs.model._meta.app_label
39-
model_name = qs.model.__name__
41+
app_label = queryset.model._meta.app_label
42+
model_name = queryset.model.__name__
4043

4144
index_name = doc._doc_type.index
4245
timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
@@ -48,7 +51,7 @@ def _run_reindex_tasks(self, models):
4851
new_index_name=new_index_name)
4952

5053
indexing_tasks = self._get_indexing_tasks(app_label=app_label, model_name=model_name,
51-
instance_ids=instance_ids,
54+
queryset=queryset,
5255
document_class=str(doc),
5356
index_name=new_index_name)
5457

@@ -58,18 +61,19 @@ def _run_reindex_tasks(self, models):
5861

5962
# Task to run in order to add the objects
6063
# that have been inserted into the database while indexing_tasks was running
64+
# We pass the creation time of the latest object, so it's possible to index later items
6165
missed_index_task = index_missing_objects.si(app_label=app_label,
6266
model_name=model_name,
6367
document_class=str(doc),
64-
indexed_instance_ids=instance_ids)
68+
latest_indexed=latest_object_time)
6569

6670
# http://celery.readthedocs.io/en/latest/userguide/canvas.html#chords
6771
chord_tasks = chord(header=indexing_tasks, body=post_index_task)
6872
# http://celery.readthedocs.io/en/latest/userguide/canvas.html#chain
6973
chain(pre_index_task, chord_tasks, missed_index_task).apply_async()
7074

7175
message = ("Successfully issued tasks for {}.{}, total {} items"
72-
.format(app_label, model_name, len(instance_ids)))
76+
.format(app_label, model_name, queryset.count()))
7377
log.info(message)
7478

7579
def add_arguments(self, parser):

readthedocs/search/tasks.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,18 +77,19 @@ def index_objects_to_es(app_label, model_name, document_class, index_name, objec
7777

7878

7979
@app.task(queue='web')
80-
def index_missing_objects(app_label, model_name, document_class, indexed_instance_ids):
80+
def index_missing_objects(app_label, model_name, document_class, latest_indexed):
8181
"""
8282
Task to ensure that none of the objects are missed from indexing.
8383
84-
The object ids are sent to task for indexing.
85-
But in the meantime, new objects can be created/deleted in database
86-
and they will not be in the tasks.
87-
This task will index all the objects excluding the ones which have got indexed already
84+
The object IDs are sent to the `index_objects_to_es` task for indexing.
85+
While the task is running, new objects can be created/deleted in database
86+
and they will not be in the tasks for indexing into ES.
87+
This task will index all the objects that were added to the DB after the `latest_indexed` timestamp
88+
to ensure that everything is in ES index.
8889
"""
8990
model = apps.get_model(app_label, model_name)
9091
document = _get_document(model=model, document_class=document_class)
91-
queryset = document().get_queryset().exclude(id__in=indexed_instance_ids)
92+
queryset = document().get_queryset().exclude(modified_date__lte=latest_indexed)
9293
document().update(queryset.iterator())
9394

9495
log.info("Indexed {} missing objects from model: {}'".format(queryset.count(), model.__name__))

readthedocs/search/utils.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -323,9 +323,10 @@ def get_project_list_or_404(project_slug, user):
323323
return project_list
324324

325325

326-
def chunks(elements, chunk_size):
327-
"""Yield successive `chunk_size` chunks from elements."""
328-
# Taken from https://stackoverflow.com/a/312464
326+
def chunk_queryset(queryset, chunk_size):
327+
"""Yield successive `chunk_size` chunks of queryset."""
328+
# Based on https://stackoverflow.com/a/312464
329329
# licensed under cc by-sa 3.0
330-
for i in range(0, len(elements), chunk_size):
331-
yield elements[i:i + chunk_size]
330+
total = queryset.count()
331+
for i in range(0, total, chunk_size):
332+
yield queryset[i:i + chunk_size]

0 commit comments

Comments
 (0)