Skip to content

Commit 143ce7f

Browse files
committed
Optimizing reindexing management command
1 parent baf8421 commit 143ce7f

File tree

4 files changed

+31
-23
lines changed

4 files changed

+31
-23
lines changed

readthedocs/projects/models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from django.contrib.auth.models import User
1414
from django.core.urlresolvers import NoReverseMatch, reverse
1515
from django.db import models
16+
from django.utils import timezone
1617
from django.utils.encoding import python_2_unicode_compatible
1718
from django.utils.functional import cached_property
1819
from django.utils.translation import ugettext_lazy as _
@@ -911,6 +912,7 @@ class ImportedFile(models.Model):
911912
path = models.CharField(_('Path'), max_length=255)
912913
md5 = models.CharField(_('MD5 checksum'), max_length=255)
913914
commit = models.CharField(_('Commit'), max_length=255)
915+
modified_date = models.DateTimeField(_('Modified date'), auto_now=True)
914916

915917
def get_absolute_url(self):
916918
return resolve(project=self.project, version_slug=self.version.slug, filename=self.path)

readthedocs/search/management/commands/reindex_elasticsearch.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,34 +9,37 @@
99

1010
from ...tasks import (index_objects_to_es, switch_es_index, create_new_es_index,
1111
index_missing_objects)
12-
from ...utils import chunks
12+
from ...utils import chunk_queryset
1313

1414
log = logging.getLogger(__name__)
1515

1616

1717
class Command(BaseCommand):
1818

1919
@staticmethod
20-
def _get_indexing_tasks(app_label, model_name, instance_ids, document_class, index_name):
21-
chunk_objects = chunks(instance_ids, settings.ES_TASK_CHUNK_SIZE)
20+
def _get_indexing_tasks(app_label, model_name, queryset, document_class, index_name):
21+
queryset = queryset.values_list('id', flat=True)
22+
chunked_queryset = chunk_queryset(queryset, settings.ES_TASK_CHUNK_SIZE)
2223

23-
for chunk in chunk_objects:
24+
for chunk in chunked_queryset:
2425
data = {
2526
'app_label': app_label,
2627
'model_name': model_name,
2728
'document_class': document_class,
2829
'index_name': index_name,
29-
'objects_id': chunk
30+
'objects_id': list(chunk)
3031
}
3132
yield index_objects_to_es.si(**data)
3233

3334
def _run_reindex_tasks(self, models):
3435
for doc in registry.get_documents(models):
35-
qs = doc().get_queryset()
36-
instance_ids = list(qs.values_list('id', flat=True))
36+
queryset = doc().get_queryset()
37+
# Get latest object from the queryset
38+
latest_object = queryset.latest('modified_date')
39+
latest_object_time = latest_object.modified_date
3740

38-
app_label = qs.model._meta.app_label
39-
model_name = qs.model.__name__
41+
app_label = queryset.model._meta.app_label
42+
model_name = queryset.model.__name__
4043

4144
index_name = doc._doc_type.index
4245
timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
@@ -48,7 +51,7 @@ def _run_reindex_tasks(self, models):
4851
new_index_name=new_index_name)
4952

5053
indexing_tasks = self._get_indexing_tasks(app_label=app_label, model_name=model_name,
51-
instance_ids=instance_ids,
54+
queryset=queryset,
5255
document_class=str(doc),
5356
index_name=new_index_name)
5457

@@ -58,18 +61,19 @@ def _run_reindex_tasks(self, models):
5861

5962
# Task to run in order to add the objects
6063
# that have been inserted into the database while indexing_tasks was running
64+
# We pass the creation time of the latest object, so it's possible to index later items
6165
missed_index_task = index_missing_objects.si(app_label=app_label,
6266
model_name=model_name,
6367
document_class=str(doc),
64-
indexed_instance_ids=instance_ids)
68+
latest_indexed=latest_object_time)
6569

6670
# http://celery.readthedocs.io/en/latest/userguide/canvas.html#chords
6771
chord_tasks = chord(header=indexing_tasks, body=post_index_task)
6872
# http://celery.readthedocs.io/en/latest/userguide/canvas.html#chain
6973
chain(pre_index_task, chord_tasks, missed_index_task).apply_async()
7074

7175
message = ("Successfully issued tasks for {}.{}, total {} items"
72-
.format(app_label, model_name, len(instance_ids)))
76+
.format(app_label, model_name, queryset.count()))
7377
log.info(message)
7478

7579
def add_arguments(self, parser):

readthedocs/search/tasks.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,18 +77,19 @@ def index_objects_to_es(app_label, model_name, document_class, index_name, objec
7777

7878

7979
@app.task(queue='web')
80-
def index_missing_objects(app_label, model_name, document_class, indexed_instance_ids):
80+
def index_missing_objects(app_label, model_name, document_class, latest_indexed):
8181
"""
8282
Task to ensure that none of the objects are missed from indexing.
8383
84-
The object ids are sent to task for indexing.
85-
But in the meantime, new objects can be created/deleted in database
86-
and they will not be in the tasks.
87-
This task will index all the objects excluding the ones which have got indexed already
84+
The object IDs are sent to the `index_objects_to_es` task for indexing.
85+
While the task is running, new objects can be created/deleted in database
86+
and they will not be in the tasks for indexing into ES.
87+
This task will index all the objects that were added to the DB after the `latest_indexed` timestamp
88+
to ensure that everything is in ES index.
8889
"""
8990
model = apps.get_model(app_label, model_name)
9091
document = _get_document(model=model, document_class=document_class)
91-
queryset = document().get_queryset().exclude(id__in=indexed_instance_ids)
92+
queryset = document().get_queryset().exclude(modified_date__lte=latest_indexed)
9293
document().update(queryset.iterator())
9394

9495
log.info("Indexed {} missing objects from model: {}'".format(queryset.count(), model.__name__))

readthedocs/search/utils.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -323,9 +323,10 @@ def get_project_list_or_404(project_slug, user):
323323
return project_list
324324

325325

326-
def chunks(elements, chunk_size):
327-
"""Yield successive `chunk_size` chunks from elements."""
328-
# Taken from https://stackoverflow.com/a/312464
326+
def chunk_queryset(queryset, chunk_size):
327+
"""Yield successive `chunk_size` chunks of queryset."""
328+
# Based on https://stackoverflow.com/a/312464
329329
# licensed under cc by-sa 3.0
330-
for i in range(0, len(elements), chunk_size):
331-
yield elements[i:i + chunk_size]
330+
total = queryset.count()
331+
for i in range(0, total, chunk_size):
332+
yield queryset[i:i + chunk_size]

0 commit comments

Comments
 (0)