Skip to content

Commit 463f9e2

Browse files
authored
Merge pull request #4368 from safwanrahman/comman
[Fix #4333] Implement asynchronous search reindex functionality using celery
2 parents 665cc08 + 652f869 commit 463f9e2

20 files changed

+382
-93
lines changed

conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# -*- coding: utf-8 -*-
22
import pytest
3+
from django.conf import settings
34
from rest_framework.test import APIClient
45

56
try:
@@ -46,7 +47,6 @@ def pytest_configure(config):
4647
def settings_modification(settings):
4748
settings.CELERY_ALWAYS_EAGER = True
4849

49-
5050
@pytest.fixture
5151
def api_client():
5252
return APIClient()

readthedocs/core/management/commands/reindex_elasticsearch.py

Lines changed: 0 additions & 58 deletions
This file was deleted.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# -*- coding: utf-8 -*-
2+
# Generated by Django 1.9.13 on 2018-07-27 09:54
3+
from __future__ import unicode_literals
4+
5+
from django.db import migrations, models
6+
import django.utils.timezone
7+
8+
9+
class Migration(migrations.Migration):
10+
11+
dependencies = [
12+
('projects', '0027_add_htmlfile_model'),
13+
]
14+
15+
operations = [
16+
migrations.AddField(
17+
model_name='importedfile',
18+
name='modified_date',
19+
field=models.DateTimeField(auto_now=True, default=django.utils.timezone.now, verbose_name='Modified date'),
20+
preserve_default=False,
21+
),
22+
]

readthedocs/projects/models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from django.contrib.auth.models import User
1414
from django.core.urlresolvers import NoReverseMatch, reverse
1515
from django.db import models
16+
from django.utils import timezone
1617
from django.utils.encoding import python_2_unicode_compatible
1718
from django.utils.functional import cached_property
1819
from django.utils.translation import ugettext_lazy as _
@@ -911,6 +912,7 @@ class ImportedFile(models.Model):
911912
path = models.CharField(_('Path'), max_length=255)
912913
md5 = models.CharField(_('MD5 checksum'), max_length=255)
913914
commit = models.CharField(_('Commit'), max_length=255)
915+
modified_date = models.DateTimeField(_('Modified date'), auto_now=True)
914916

915917
def get_absolute_url(self):
916918
return resolve(project=self.project, version_slug=self.version.slug, filename=self.path)

readthedocs/projects/signals.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,7 @@
1414
project_import = django.dispatch.Signal(providing_args=["project"])
1515

1616
files_changed = django.dispatch.Signal(providing_args=["project", "files"])
17+
18+
bulk_post_create = django.dispatch.Signal(providing_args=["instance_list"])
19+
20+
bulk_post_delete = django.dispatch.Signal(providing_args=["instance_list"])

readthedocs/projects/tasks.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@
3232
from .constants import LOG_TEMPLATE
3333
from .exceptions import RepositoryError
3434
from .models import ImportedFile, Project, Domain, Feature, HTMLFile
35-
from .signals import before_vcs, after_vcs, before_build, after_build, files_changed
35+
from .signals import before_vcs, after_vcs, before_build, after_build, files_changed, \
36+
bulk_post_create, bulk_post_delete
3637
from readthedocs.builds.constants import (
3738
BUILD_STATE_BUILDING, BUILD_STATE_CLONING, BUILD_STATE_FINISHED,
3839
BUILD_STATE_INSTALLING, LATEST, LATEST_VERBOSE_NAME, STABLE_VERBOSE_NAME)
@@ -986,6 +987,7 @@ def _manage_imported_files(version, path, commit):
986987
:param commit: Commit that updated path
987988
"""
988989
changed_files = set()
990+
created_html_files = []
989991
for root, __, filenames in os.walk(path):
990992
for filename in filenames:
991993
if fnmatch.fnmatch(filename, '*.html'):
@@ -1015,15 +1017,27 @@ def _manage_imported_files(version, path, commit):
10151017
obj.commit = commit
10161018
obj.save()
10171019

1018-
# Delete the HTMLFile first from previous versions
1019-
HTMLFile.objects.filter(project=version.project,
1020-
version=version
1021-
).exclude(commit=commit).delete()
1020+
if model_class == HTMLFile:
1021+
# the `obj` is HTMLFile, so add it to the list
1022+
created_html_files.append(obj)
1023+
1024+
# Send bulk_post_create signal for bulk indexing to Elasticsearch
1025+
bulk_post_create.send(sender=HTMLFile, instance_list=created_html_files)
1026+
1027+
# Delete the HTMLFiles left over from previous commits first, then
1028+
# send the bulk_post_delete signal so they are bulk-removed from Elasticsearch
1029+
delete_queryset = (HTMLFile.objects.filter(project=version.project, version=version)
1030+
.exclude(commit=commit))
1031+
# Keep the objects in memory so they can be sent with the signal
1032+
instance_list = list(delete_queryset)
1033+
# Safely delete from database
1034+
delete_queryset.delete()
1035+
# Always pass a list of instances, not a queryset.
1036+
bulk_post_delete.send(sender=HTMLFile, instance_list=instance_list)
10221037

10231038
# Delete ImportedFiles of this version left over from previous commits
1024-
ImportedFile.objects.filter(project=version.project,
1025-
version=version
1026-
).exclude(commit=commit).delete()
1039+
(ImportedFile.objects.filter(project=version.project, version=version)
1040+
.exclude(commit=commit).delete())
10271041
changed_files = [
10281042
resolve_path(
10291043
version.project, filename=file, version_slug=version.slug,

readthedocs/search/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
default_app_config = 'readthedocs.search.apps.SearchConfig'

readthedocs/search/apps.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""Search app config"""
2+
3+
from django.apps import AppConfig
4+
5+
6+
class SearchConfig(AppConfig):
7+
name = 'readthedocs.search'
8+
9+
def ready(self):
10+
from .signals import index_html_file, remove_html_file

readthedocs/search/documents.py

Lines changed: 5 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@
33
from elasticsearch_dsl.query import SimpleQueryString, Bool
44

55
from readthedocs.projects.models import Project, HTMLFile
6-
from .conf import SEARCH_EXCLUDED_FILE
7-
86
from readthedocs.search.faceted_search import ProjectSearch, FileSearch
7+
from .conf import SEARCH_EXCLUDED_FILE
8+
from .mixins import RTDDocTypeMixin
99

1010
project_conf = settings.ES_INDEXES['project']
1111
project_index = Index(project_conf['name'])
@@ -17,7 +17,7 @@
1717

1818

1919
@project_index.doc_type
20-
class ProjectDocument(DocType):
20+
class ProjectDocument(RTDDocTypeMixin, DocType):
2121

2222
class Meta(object):
2323
model = Project
@@ -47,11 +47,12 @@ def faceted_search(cls, query, language=None, using=None, index=None):
4747

4848

4949
@page_index.doc_type
50-
class PageDocument(DocType):
50+
class PageDocument(RTDDocTypeMixin, DocType):
5151

5252
class Meta(object):
5353
model = HTMLFile
5454
fields = ('commit',)
55+
ignore_signals = settings.ES_PAGE_IGNORE_SIGNALS
5556

5657
project = fields.KeywordField(attr='project.slug')
5758
version = fields.KeywordField(attr='version.slug')
@@ -121,21 +122,3 @@ def get_queryset(self):
121122
queryset = (queryset.filter(project__documentation_type='sphinx')
122123
.exclude(name__in=SEARCH_EXCLUDED_FILE))
123124
return queryset
124-
125-
def update(self, thing, refresh=None, action='index', **kwargs):
126-
"""Overwrite in order to index only certain files"""
127-
# Object not exist in the provided queryset should not be indexed
128-
# TODO: remove this overwrite when the issue has been fixed
129-
# See below link for more information
130-
# https://github.com/sabricot/django-elasticsearch-dsl/issues/111
131-
# Moreover, do not need to check if its a delete action
132-
# Because while delete action, the object is already remove from database
133-
if isinstance(thing, HTMLFile) and action != 'delete':
134-
# Its a model instance.
135-
queryset = self.get_queryset()
136-
obj = queryset.filter(pk=thing.pk)
137-
if not obj.exists():
138-
return None
139-
140-
return super(PageDocument, self).update(thing=thing, refresh=refresh,
141-
action=action, **kwargs)

readthedocs/search/management/__init__.py

Whitespace-only changes.

readthedocs/search/management/commands/__init__.py

Whitespace-only changes.
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import datetime
2+
import logging
3+
4+
from celery import chord, chain
5+
from django.apps import apps
6+
from django.conf import settings
7+
from django.core.management import BaseCommand
8+
from django.utils import timezone
9+
from django_elasticsearch_dsl.registries import registry
10+
11+
from ...tasks import (index_objects_to_es, switch_es_index, create_new_es_index,
12+
index_missing_objects)
13+
from ...utils import chunk_queryset
14+
15+
log = logging.getLogger(__name__)
16+
17+
18+
class Command(BaseCommand):
19+
20+
@staticmethod
21+
def _get_indexing_tasks(app_label, model_name, queryset, document_class, index_name):
22+
queryset = queryset.values_list('id', flat=True)
23+
chunked_queryset = chunk_queryset(queryset, settings.ES_TASK_CHUNK_SIZE)
24+
25+
for chunk in chunked_queryset:
26+
data = {
27+
'app_label': app_label,
28+
'model_name': model_name,
29+
'document_class': document_class,
30+
'index_name': index_name,
31+
'objects_id': list(chunk)
32+
}
33+
yield index_objects_to_es.si(**data)
34+
35+
def _run_reindex_tasks(self, models):
36+
for doc in registry.get_documents(models):
37+
queryset = doc().get_queryset()
38+
# Record the current time before any indexing task is issued
39+
index_time = timezone.now()
40+
41+
app_label = queryset.model._meta.app_label
42+
model_name = queryset.model.__name__
43+
44+
index_name = doc._doc_type.index
45+
timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
46+
new_index_name = "{}_{}".format(index_name, timestamp)
47+
48+
pre_index_task = create_new_es_index.si(app_label=app_label,
49+
model_name=model_name,
50+
index_name=index_name,
51+
new_index_name=new_index_name)
52+
53+
indexing_tasks = self._get_indexing_tasks(app_label=app_label, model_name=model_name,
54+
queryset=queryset,
55+
document_class=str(doc),
56+
index_name=new_index_name)
57+
58+
post_index_task = switch_es_index.si(app_label=app_label, model_name=model_name,
59+
index_name=index_name,
60+
new_index_name=new_index_name)
61+
62+
# Task that indexes the objects
63+
# that were inserted into the database while indexing_tasks was running.
64+
# We pass the time recorded before indexing started, so it's possible to index later items
65+
missed_index_task = index_missing_objects.si(app_label=app_label,
66+
model_name=model_name,
67+
document_class=str(doc),
68+
index_generation_time=index_time)
69+
70+
# http://celery.readthedocs.io/en/latest/userguide/canvas.html#chords
71+
chord_tasks = chord(header=indexing_tasks, body=post_index_task)
72+
# http://celery.readthedocs.io/en/latest/userguide/canvas.html#chain
73+
chain(pre_index_task, chord_tasks, missed_index_task).apply_async()
74+
75+
message = ("Successfully issued tasks for {}.{}, total {} items"
76+
.format(app_label, model_name, queryset.count()))
77+
log.info(message)
78+
79+
def add_arguments(self, parser):
80+
parser.add_argument(
81+
'--models',
82+
dest='models',
83+
type=str,
84+
nargs='*',
85+
help=("Specify the model to be updated in elasticsearch."
86+
"The format is <app_label>.<model_name>")
87+
)
88+
89+
def handle(self, *args, **options):
90+
"""
91+
Index models into Elasticsearch index asynchronously using celery.
92+
93+
You can specify the models to be indexed by passing the
94+
`--models <app_label>.<model_name>` parameter.
95+
Otherwise, it will reindex all the models.
96+
"""
97+
models = None
98+
if options['models']:
99+
models = [apps.get_model(model_name) for model_name in options['models']]
100+
101+
self._run_reindex_tasks(models=models)

0 commit comments

Comments
 (0)