Commit a457fc0

Merge pull request #5939 from dojutsu-user/indexing-speedup
Indexing speedup
2 parents: 80f74e8 + 985488d

2 files changed: 26 additions, 20 deletions


readthedocs/search/management/commands/reindex_elasticsearch.py

Lines changed: 26 additions & 12 deletions
@@ -10,7 +10,6 @@
 
 from ...tasks import (index_objects_to_es, switch_es_index, create_new_es_index,
                       index_missing_objects)
-from ...utils import get_chunk
 
 log = logging.getLogger(__name__)
 
@@ -19,17 +18,32 @@ class Command(BaseCommand):
 
     @staticmethod
     def _get_indexing_tasks(app_label, model_name, index_name, queryset, document_class):
-        total = queryset.count()
-        chunks = get_chunk(total, settings.ES_TASK_CHUNK_SIZE)
-
-        for chunk in chunks:
-            data = {
-                'app_label': app_label,
-                'model_name': model_name,
-                'document_class': document_class,
-                'index_name': index_name,
-                'chunk': chunk
-            }
+        chunk_size = settings.ES_TASK_CHUNK_SIZE
+        qs_iterator = queryset.only('pk').iterator()
+        is_iterator_empty = False
+
+        data = {
+            'app_label': app_label,
+            'model_name': model_name,
+            'document_class': document_class,
+            'index_name': index_name,
+        }
+
+        while not is_iterator_empty:
+            objects_id = []
+
+            try:
+                for _ in range(chunk_size):
+                    pk = next(qs_iterator).pk
+                    objects_id.append(pk)
+
+                    if pk % 5000 == 0:
+                        log.info('Total: %s', pk)
+
+            except StopIteration:
+                is_iterator_empty = True
+
+            data['objects_id'] = objects_id
             yield index_objects_to_es.si(**data)
 
     def _run_reindex_tasks(self, models, queue):
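
The rewritten _get_indexing_tasks streams the queryset once with .only('pk').iterator(), collecting primary keys into objects_id batches of ES_TASK_CHUNK_SIZE, instead of running queryset.count() and precomputing offset ranges. A minimal sketch of that batching pattern follows; the generator name, the default chunk size, and the guard against yielding an empty final batch are illustrative assumptions, not code from this commit.

# Sketch of the pk-batching pattern used above. Assumes a Django queryset;
# the helper name and chunk size are hypothetical.
def iter_pk_chunks(queryset, chunk_size=500):
    """Yield lists of primary keys, at most ``chunk_size`` per batch."""
    qs_iterator = queryset.only('pk').iterator()  # one pass, no COUNT or OFFSET queries
    exhausted = False

    while not exhausted:
        objects_id = []
        try:
            for _ in range(chunk_size):
                objects_id.append(next(qs_iterator).pk)
        except StopIteration:
            exhausted = True

        if objects_id:  # unlike the command above, skip a possibly empty last batch
            yield objects_id

Each batch of ids can then be handed to a Celery task, as the command does with index_objects_to_es.si(**data).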

readthedocs/search/utils.py

Lines changed: 0 additions & 8 deletions
@@ -94,14 +94,6 @@ def get_project_list_or_404(project_slug, user, version_slug=None):
     return project_list
 
 
-def get_chunk(total, chunk_size):
-    """Yield successive `chunk_size` chunks."""
-    # Based on https://stackoverflow.com/a/312464
-    # licensed under cc by-sa 3.0
-    for i in range(0, total, chunk_size):
-        yield (i, i + chunk_size)
-
-
 def _get_index(indices, index_name):
     """
     Get Index from all the indices.
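
For context, the deleted get_chunk only produced (start, end) offset pairs, which the old command passed to the indexing task as 'chunk'. Slicing a Django queryset by such offsets translates to LIMIT/OFFSET SQL, so each later batch makes the database skip all preceding rows, and the up-front queryset.count() adds one more full query. A hypothetical illustration of that pattern (the consumer shown here is an assumption; the task that actually used the chunks is not part of this diff):

# Hypothetical consumer of the removed get_chunk() helper, for illustration only.
total = queryset.count()                 # extra COUNT(*) over the whole table
for start, end in get_chunk(total, 500):
    batch = list(queryset[start:end])    # SELECT ... LIMIT 500 OFFSET <start>
    index_batch(batch)                   # placeholder for the per-chunk indexing work

The new code in reindex_elasticsearch.py avoids both the count and the growing OFFSETs by streaming primary keys in a single pass.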
