Skip to content

Commit 9a07177

Browse files
committed
[Fix readthedocs#4333] Implement asynchronous search reindex functionality using celery
1 parent d6638b9 commit 9a07177

File tree

9 files changed

+255
-78
lines changed

9 files changed

+255
-78
lines changed

readthedocs/core/management/commands/reindex_elasticsearch.py

Lines changed: 0 additions & 56 deletions
This file was deleted.

readthedocs/search/documents.py

Lines changed: 4 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
from elasticsearch_dsl.query import SimpleQueryString, Bool
44

55
from readthedocs.projects.models import Project, HTMLFile
6-
from .conf import SEARCH_EXCLUDED_FILE
7-
86
from readthedocs.search.faceted_search import ProjectSearch, FileSearch
7+
from .conf import SEARCH_EXCLUDED_FILE
8+
from .mixins import RTDDocTypeMixin
99

1010
project_conf = settings.ES_INDEXES['project']
1111
project_index = Index(project_conf['name'])
@@ -17,7 +17,7 @@
1717

1818

1919
@project_index.doc_type
20-
class ProjectDocument(DocType):
20+
class ProjectDocument(RTDDocTypeMixin, DocType):
2121

2222
class Meta(object):
2323
model = Project
@@ -47,7 +47,7 @@ def faceted_search(cls, query, language=None, using=None, index=None):
4747

4848

4949
@page_index.doc_type
50-
class PageDocument(DocType):
50+
class PageDocument(RTDDocTypeMixin, DocType):
5151

5252
class Meta(object):
5353
model = HTMLFile
@@ -121,21 +121,3 @@ def get_queryset(self):
121121
queryset = (queryset.filter(project__documentation_type='sphinx')
122122
.exclude(name__in=SEARCH_EXCLUDED_FILE))
123123
return queryset
124-
125-
def update(self, thing, refresh=None, action='index', **kwargs):
126-
"""Overwrite in order to index only certain files"""
127-
# Object not exist in the provided queryset should not be indexed
128-
# TODO: remove this overwrite when the issue has been fixed
129-
# See below link for more information
130-
# https://github.com/sabricot/django-elasticsearch-dsl/issues/111
131-
# Moreover, do not need to check if its a delete action
132-
# Because during a delete action, the object has already been removed from the database
133-
if isinstance(thing, HTMLFile) and action != 'delete':
134-
# Its a model instance.
135-
queryset = self.get_queryset()
136-
obj = queryset.filter(pk=thing.pk)
137-
if not obj.exists():
138-
return None
139-
140-
return super(PageDocument, self).update(thing=thing, refresh=refresh,
141-
action=action, **kwargs)

readthedocs/search/management/__init__.py

Whitespace-only changes.

readthedocs/search/management/commands/__init__.py

Whitespace-only changes.
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
import datetime
2+
import logging
3+
4+
from celery import chord, chain
5+
from django.apps import apps
6+
from django.conf import settings
7+
from django.core.management import BaseCommand
8+
from django_elasticsearch_dsl.registries import registry
9+
10+
from ...tasks import (index_objects_to_es_task, switch_es_index_task, create_new_es_index_task,
11+
index_missing_objects_task)
12+
from ...utils import chunks
13+
14+
log = logging.getLogger(__name__)
15+
16+
17+
class Command(BaseCommand):

    """Rebuild elasticsearch indexes asynchronously through celery tasks."""

    @staticmethod
    def _get_indexing_tasks(app_label, model_name, instance_ids, document_class, index_name):
        """Yield one indexing task signature per chunk of ``instance_ids``."""
        for id_chunk in chunks(instance_ids, settings.ES_TASK_CHUNK_SIZE):
            yield index_objects_to_es_task.si(
                app_label=app_label,
                model_name=model_name,
                document_class=document_class,
                index_name=index_name,
                objects_id=id_chunk,
            )

    def _run_reindex_tasks(self, models):
        """Build and dispatch the reindex workflow for every registered document."""
        for doc in registry.get_documents(models):
            queryset = doc().get_queryset()
            instance_ids = list(queryset.values_list('id', flat=True))

            app_label = queryset.model._meta.app_label
            model_name = queryset.model.__name__

            old_index_name = doc._doc_type.index
            timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
            new_index_name = 'temp-{}-{}'.format(timestamp, old_index_name)

            pre_index_task = create_new_es_index_task.si(
                app_label=app_label,
                model_name=model_name,
                old_index_name=old_index_name,
                new_index_name=new_index_name,
            )

            indexing_tasks = self._get_indexing_tasks(
                app_label=app_label,
                model_name=model_name,
                instance_ids=instance_ids,
                document_class=str(doc),
                index_name=new_index_name,
            )

            post_index_task = switch_es_index_task.si(
                app_label=app_label,
                model_name=model_name,
                old_index_name=old_index_name,
                new_index_name=new_index_name,
            )

            # Picks up the objects that were inserted into the database
            # while the indexing tasks were running.
            missed_index_task = index_missing_objects_task.si(
                app_label=app_label,
                model_name=model_name,
                document_class=str(doc),
                indexed_instance_ids=instance_ids,
            )

            # http://celery.readthedocs.io/en/latest/userguide/canvas.html#chords
            # http://celery.readthedocs.io/en/latest/userguide/canvas.html#chain
            chord_tasks = chord(header=indexing_tasks, body=post_index_task)
            chain(pre_index_task, chord_tasks, missed_index_task).apply_async()

            message = "Successfully issued tasks for {}.{}".format(app_label, model_name)
            log.info(message)

    @staticmethod
    def _get_models(args):
        """Yield model classes for ``<app_label>.<model_name>`` strings."""
        for model_name in args:
            yield apps.get_model(model_name)

    def add_arguments(self, parser):
        parser.add_argument(
            '--models',
            dest='models',
            type=str,
            nargs='*',
            help=("Specify the model to be updated in elasticsearch."
                  "The format is <app_label>.<model_name>")
        )

    def handle(self, *args, **options):
        # ``None`` means "all registered models" for the registry lookup.
        models = self._get_models(options['models']) if options['models'] else None
        self._run_reindex_tasks(models=models)

readthedocs/search/mixins.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
from django.db import models
2+
from django.core.paginator import Paginator
3+
4+
5+
class RTDDocTypeMixin(object):

    """
    Customizations on top of django-elasticsearch-dsl's ``DocType``.

    Differences from the upstream implementation:

    - ``update`` skips model instances that are not part of ``get_queryset()``
    - ``update`` takes an additional ``index_name`` argument so documents can
      be written to a specific (e.g. temporary) index

    Related issue:
    - https://github.com/sabricot/django-elasticsearch-dsl/issues/111
    """

    def _prepare_action(self, object_instance, action, index_name=None):
        # A delete action needs no document body.
        source = None if action == 'delete' else self.prepare(object_instance)
        return {
            '_op_type': action,
            '_index': index_name or str(self._doc_type.index),
            '_type': self._doc_type.mapping.doc_type,
            '_id': object_instance.pk,
            '_source': source,
        }

    def _get_actions(self, object_list, action, index_name=None):
        page_size = self._doc_type.queryset_pagination
        if page_size is None:
            for instance in object_list:
                yield self._prepare_action(instance, action, index_name)
        else:
            # Paginate large querysets to keep memory usage bounded.
            paginator = Paginator(object_list, page_size)
            for page_number in paginator.page_range:
                for instance in paginator.page(page_number).object_list:
                    yield self._prepare_action(instance, action, index_name)

    def update(self, thing, refresh=None, action='index', index_name=None, **kwargs):
        """
        Update each document in ES for a model, iterable of models or queryset
        """
        if refresh is True or (
            refresh is None and self._doc_type.auto_refresh
        ):
            kwargs['refresh'] = True

        if isinstance(thing, models.Model) and action != 'delete':
            # A single model instance: do not index it if it falls outside
            # the queryset this document is built from.
            # TODO: remove this overwrite when the issue has been fixed
            # https://github.com/sabricot/django-elasticsearch-dsl/issues/111
            # No such check is needed for delete actions, because by then the
            # object has already been removed from the database.
            if not self.get_queryset().filter(pk=thing.pk).exists():
                return None
            object_list = [thing]
        else:
            object_list = thing

        return self.bulk(
            self._get_actions(object_list, action, index_name=index_name),
            **kwargs
        )

readthedocs/search/tasks.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import logging
2+
3+
from django.apps import apps
4+
from django_elasticsearch_dsl.registries import registry
5+
6+
from readthedocs.worker import app
7+
8+
log = logging.getLogger(__name__)
9+
10+
11+
def _get_index(indices, index_name):
12+
for index in indices:
13+
if str(index) == index_name:
14+
return index
15+
16+
17+
def _get_document(model, document_class):
    """Return the registered document for ``model`` matching ``document_class``.

    ``document_class`` is the string form of the document class; returns
    ``None`` when no registered document matches.
    """
    documents = registry.get_documents(models=[model])
    matches = (doc for doc in documents if str(doc) == document_class)
    return next(matches, None)
24+
25+
@app.task(queue='web')
def create_new_es_index_task(app_label, model_name, old_index_name, new_index_name):
    """Create ``new_index_name`` in ES, cloned from ``old_index_name``'s settings."""
    model = apps.get_model(app_label, model_name)
    old_index = _get_index(
        indices=registry.get_indices(models=[model]),
        index_name=old_index_name,
    )
    old_index.clone(name=new_index_name).create()
32+
33+
34+
@app.task(queue='web')
def switch_es_index_task(app_label, model_name, old_index_name, new_index_name):
    """
    Make ``old_index_name`` an alias for the freshly built ``new_index_name``.

    Deletes the previous physical index (if one exists) and then adds
    ``old_index_name`` as an alias of ``new_index_name``, so searches keep
    using the same name while the data is served from the new index.
    """
    model = apps.get_model(app_label, model_name)
    indices = registry.get_indices(models=[model])
    old_index = _get_index(indices=indices, index_name=old_index_name)

    new_index = old_index.clone(name=new_index_name)

    if old_index.exists():
        # Alias can not be used to delete an index, so resolve the actual
        # (physical) index name first.
        # https://www.elastic.co/guide/en/elasticsearch/reference/6.0/indices-delete-index.html
        old_index_info = old_index.get()
        # ``get()`` returns a dict keyed by the actual index name.
        # ``dict.keys()`` is a view on Python 3, so it must be materialized
        # before indexing (``keys()[0]`` raises TypeError on Python 3).
        # Use a separate variable so ``old_index_name`` is not shadowed.
        actual_index_name = list(old_index_info.keys())[0]
        old_index.connection.indices.delete(index=actual_index_name)

    # Alias the original name to the new index. Using the resolved physical
    # name here would be wrong whenever the old index was itself reached
    # through an alias.
    new_index.put_alias(name=old_index_name)
52+
53+
54+
@app.task(queue='web')
def index_objects_to_es_task(app_label, model_name, document_class, index_name, objects_id):
    """Index the objects with ids ``objects_id`` into ``index_name``."""
    model = apps.get_model(app_label, model_name)
    document = _get_document(model=model, document_class=document_class)

    log.info("Indexing model: {}, id:'{}'".format(model.__name__, objects_id))
    # Query by id on the plain model manager; the ids were collected from the
    # document queryset when the reindex command was kicked off.
    queryset = model.objects.all().filter(id__in=objects_id).iterator()
    document().update(queryset, index_name=index_name)
63+
64+
65+
@app.task(queue='web')
def index_missing_objects_task(app_label, model_name, document_class, indexed_instance_ids):
    """
    Task to ensure that no object is missed from indexing.

    The object ids are sent to the indexing tasks up front, but in the
    meantime new objects can be created/deleted in the database and those
    will not be covered by the tasks.
    This task indexes all the objects of the document's queryset excluding
    the ones which have already been handed to the indexing tasks.
    """
    model = apps.get_model(app_label, model_name)
    document = _get_document(model=model, document_class=document_class)
    queryset = document().get_queryset().exclude(id__in=indexed_instance_ids)
    document().update(queryset.iterator())

    # Stray trailing quote removed from the original message; lazy %-style
    # args defer string formatting until the record is actually emitted.
    log.info("Indexed %s missing objects from model: %s", queryset.count(), model.__name__)

    # TODO: Figure out how to remove the objects from ES index that has been deleted

readthedocs/search/utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,3 +321,11 @@ def get_project_list_or_404(project_slug, user):
321321

322322
project_list = list(subprojects) + [project]
323323
return project_list
324+
325+
326+
def chunks(elements, chunk_size):
    """Yield successive ``chunk_size``-sized slices of ``elements``."""
    # Taken from https://stackoverflow.com/a/312464
    # licensed under cc by-sa 3.0
    for start in range(0, len(elements), chunk_size):
        end = start + chunk_size
        yield elements[start:end]

readthedocs/settings/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,8 @@ def USE_PROMOS(self): # noqa
320320
'hosts': '127.0.0.1:9200'
321321
},
322322
}
323+
# Chunk size for elasticsearch reindex celery tasks
324+
ES_TASK_CHUNK_SIZE = 100
323325

324326
# ANALYZER = 'analysis': {
325327
# 'analyzer': {

0 commit comments

Comments
 (0)