From b38423d7c62a844fbfa23e6bd73f9ed569060fbc Mon Sep 17 00:00:00 2001
From: dojutsu-user
Date: Tue, 16 Jul 2019 23:18:06 +0530
Subject: [PATCH 1/4] fix indexing speedup

---
 .../commands/reindex_elasticsearch.py | 36 ++++++++++++-------
 readthedocs/search/utils.py           |  8 -----
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py
index 32f9a4c8534..bc2c9965f47 100644
--- a/readthedocs/search/management/commands/reindex_elasticsearch.py
+++ b/readthedocs/search/management/commands/reindex_elasticsearch.py
@@ -10,7 +10,6 @@
 from ...tasks import (index_objects_to_es, switch_es_index,
                       create_new_es_index,
                       index_missing_objects)
-from ...utils import get_chunk
 
 log = logging.getLogger(__name__)
 
@@ -19,18 +18,29 @@ class Command(BaseCommand):
 
     @staticmethod
     def _get_indexing_tasks(app_label, model_name, index_name, queryset, document_class):
-        total = queryset.count()
-        chunks = get_chunk(total, settings.ES_TASK_CHUNK_SIZE)
-
-        for chunk in chunks:
-            data = {
-                'app_label': app_label,
-                'model_name': model_name,
-                'document_class': document_class,
-                'index_name': index_name,
-                'chunk': chunk
-            }
-            yield index_objects_to_es.si(**data)
+        chunk_size = settings.ES_TASK_CHUNK_SIZE
+        qs_iterator = queryset.only('pk').iterator()
+        is_iterator_empty = False
+
+        data = {
+            'app_label': app_label,
+            'model_name': model_name,
+            'document_class': document_class,
+            'index_name': index_name,
+        }
+
+        while not is_iterator_empty:
+            objects_id = []
+
+            try:
+                for _ in range(chunk_size):
+                    objects_id.append(qs_iterator.__next__().pk)
+            except StopIteration:
+                is_iterator_empty = True
+
+            if objects_id:
+                data['objects_id']: objects_id
+                yield index_objects_to_es.si(**data)
 
     def _run_reindex_tasks(self, models, queue):
         apply_async_kwargs = {'priority': 0}
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index 0ff42ddcdd2..cf1f0fb73aa 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -94,14 +94,6 @@ def get_project_list_or_404(project_slug, user, version_slug=None):
     return project_list
 
 
-def get_chunk(total, chunk_size):
-    """Yield successive `chunk_size` chunks."""
-    # Based on https://stackoverflow.com/a/312464
-    # licensed under cc by-sa 3.0
-    for i in range(0, total, chunk_size):
-        yield (i, i + chunk_size)
-
-
 def _get_index(indices, index_name):
     """
     Get Index from all the indices.
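Reviewer note (not part of the patch): the change above replaces the
count()-plus-offset chunking from get_chunk() with a loop that pulls up to
ES_TASK_CHUNK_SIZE primary keys at a time from a lazy queryset iterator. A
minimal standalone sketch of that chunking idea, using itertools.islice on a
plain iterator in place of the Django queryset and the explicit next() loop;
all names here are illustrative only:

    import itertools

    def chunked_pks(pks, chunk_size):
        """Yield lists of at most `chunk_size` items from an iterable of pks."""
        iterator = iter(pks)
        while True:
            chunk = list(itertools.islice(iterator, chunk_size))
            if not chunk:
                return
            yield chunk

    # e.g. 10 fake primary keys split into chunks of 3
    print(list(chunked_pks(range(1, 11), 3)))
    # -> [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]]

The patch expresses the same idea with a for/next() loop and a StopIteration
flag so it can build one Celery signature per chunk as it goes.
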
From cbbbb424c44d07360bdc38cf6e440acf1b7b0732 Mon Sep 17 00:00:00 2001
From: dojutsu-user
Date: Tue, 16 Jul 2019 23:21:44 +0530
Subject: [PATCH 2/4] remove if

---
 .../search/management/commands/reindex_elasticsearch.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py
index bc2c9965f47..c59a6a5dab9 100644
--- a/readthedocs/search/management/commands/reindex_elasticsearch.py
+++ b/readthedocs/search/management/commands/reindex_elasticsearch.py
@@ -38,9 +38,8 @@ def _get_indexing_tasks(app_label, model_name, index_name, queryset, document_cl
             except StopIteration:
                 is_iterator_empty = True
 
-            if objects_id:
-                data['objects_id']: objects_id
-                yield index_objects_to_es.si(**data)
+            data['objects_id'] = objects_id
+            yield index_objects_to_es.si(**data)
 
     def _run_reindex_tasks(self, models, queue):
         apply_async_kwargs = {'priority': 0}

From a6b8a1a6141b2efcd2c36d15fdc7eb675b8300b5 Mon Sep 17 00:00:00 2001
From: dojutsu-user
Date: Tue, 16 Jul 2019 23:35:42 +0530
Subject: [PATCH 3/4] use next()

---
 readthedocs/search/management/commands/reindex_elasticsearch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py
index c59a6a5dab9..fb2662ac2fb 100644
--- a/readthedocs/search/management/commands/reindex_elasticsearch.py
+++ b/readthedocs/search/management/commands/reindex_elasticsearch.py
@@ -34,7 +34,7 @@ def _get_indexing_tasks(app_label, model_name, index_name, queryset, document_cl
 
             try:
                 for _ in range(chunk_size):
-                    objects_id.append(qs_iterator.__next__().pk)
+                    objects_id.append(next(qs_iterator).pk)
             except StopIteration:
                 is_iterator_empty = True
 

From 985488d6afb6109bca4a7d988ec0a2ae73f47d50 Mon Sep 17 00:00:00 2001
From: dojutsu-user
Date: Tue, 16 Jul 2019 23:50:07 +0530
Subject: [PATCH 4/4] add logging

---
 .../search/management/commands/reindex_elasticsearch.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py
index fb2662ac2fb..7c0ea6982cf 100644
--- a/readthedocs/search/management/commands/reindex_elasticsearch.py
+++ b/readthedocs/search/management/commands/reindex_elasticsearch.py
@@ -34,7 +34,12 @@ def _get_indexing_tasks(app_label, model_name, index_name, queryset, document_cl
 
             try:
                 for _ in range(chunk_size):
-                    objects_id.append(next(qs_iterator).pk)
+                    pk = next(qs_iterator).pk
+                    objects_id.append(pk)
+
+                    if pk % 5000 == 0:
+                        log.info('Total: %s', pk)
+
             except StopIteration:
                 is_iterator_empty = True
 
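Reviewer note (not part of the patches): with all four commits applied, the
indexing loop pulls primary keys with next(), logs progress on every pk that
is a multiple of 5000, and yields one Celery signature per chunk. A standalone
sketch of that final shape, with plain integers standing in for model rows,
print() standing in for log.info(), and the yielded id list standing in for
index_objects_to_es.si(**data); all names below are illustrative only:

    def sketch_indexing_chunks(pks, chunk_size):
        it = iter(pks)
        is_iterator_empty = False

        while not is_iterator_empty:
            objects_id = []
            try:
                for _ in range(chunk_size):
                    pk = next(it)
                    objects_id.append(pk)
                    if pk % 5000 == 0:
                        print('Total: %s' % pk)  # stand-in for log.info()
            except StopIteration:
                is_iterator_empty = True

            # the patched code sets data['objects_id'] = objects_id and yields
            # index_objects_to_es.si(**data); here we just yield the id chunk
            yield objects_id

    for chunk in sketch_indexing_chunks(range(4998, 5007), 4):
        print(chunk)
    # Total: 5000
    # [4998, 4999, 5000, 5001]
    # [5002, 5003, 5004, 5005]
    # [5006]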