Search: index in small chunks (#10914)

stsewd · web-flow · commit 6723e5954c0c · 2023-11-22T10:32:25.000-05:00
* Search: index in small chunks. The default chunk size is 500 documents; our documents have nested objects, which can make the final document really big and cause ES to time out. Ref #10911 (comment) * Test with 100
diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
@@ -215,6 +215,9 @@ def _create_imported_files_and_search_index(
             document=PageDocument,
             objects=html_files_to_index,
             index_name=search_index_name,
+            # Pages are indexed in small chunks to avoid a
+            # large payload that will probably timeout ES.
+            chunk_size=100,
         )
 
     # Remove old HTMLFiles from ElasticSearch
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
@@ -10,7 +10,7 @@
 log = structlog.get_logger(__name__)
 
 
-def index_objects(document, objects, index_name=None):
+def index_objects(document, objects, index_name=None, chunk_size=500):
     if not DEDConfig.autosync_enabled():
         log.info("Autosync disabled, skipping searh indexing.")
         return
@@ -21,7 +21,7 @@ def index_objects(document, objects, index_name=None):
     if index_name:
         document._index._name = index_name
 
-    document().update(objects)
+    document().update(objects, chunk_size=chunk_size)
 
     # Restore the old index name.
     if index_name:

Original file line number	Diff line number	Diff line change
`@@ -215,6 +215,9 @@ def _create_imported_files_and_search_index(`
`215`	`215`	`document=PageDocument,`
`216`	`216`	`objects=html_files_to_index,`
`217`	`217`	`index_name=search_index_name,`
	`218`	`+ # Pages are indexed in small chunks to avoid a`
	`219`	`+ # large payload that will probably timeout ES.`
	`220`	`+ chunk_size=100,`
`218`	`221`	`)`
`219`	`222`
`220`	`223`	`# Remove old HTMLFiles from ElasticSearch`