From 4c0f25e6df3f01e721b6c5ab6bef7320fc756a4c Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Mon, 20 Nov 2023 15:30:05 -0500 Subject: [PATCH 1/2] Search: index in small chunks The default chunk size is 500 documents. Our documents have nested objects, which can make the final payload really big and cause ES to time out. Ref https://github.com/readthedocs/readthedocs.org/issues/10911#issuecomment-1819688230 --- readthedocs/projects/tasks/search.py | 3 +++ readthedocs/search/utils.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index f065a957d28..f6751bde577 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -215,6 +215,9 @@ def _create_imported_files_and_search_index( document=PageDocument, objects=html_files_to_index, index_name=search_index_name, + # Pages are indexed in small chunks to avoid a + # large payload that will probably timeout ES. + chunk_size=25, ) # Remove old HTMLFiles from ElasticSearch diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py index 4dd10b49e39..81f50e63f8c 100644 --- a/readthedocs/search/utils.py +++ b/readthedocs/search/utils.py @@ -10,7 +10,7 @@ log = structlog.get_logger(__name__) -def index_objects(document, objects, index_name=None): +def index_objects(document, objects, index_name=None, chunk_size=500): if not DEDConfig.autosync_enabled(): log.info("Autosync disabled, skipping searh indexing.") return @@ -21,7 +21,7 @@ def index_objects(document, objects, index_name=None): if index_name: document._index._name = index_name - document().update(objects) + document().update(objects, chunk_size=chunk_size) # Restore the old index name. 
if index_name: From 8ca5ea0386c72708f5dfd47cc455d8ad91dd3622 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Wed, 22 Nov 2023 10:23:34 -0500 Subject: [PATCH 2/2] Test with 100 --- readthedocs/projects/tasks/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index f6751bde577..0423b2fe403 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -217,7 +217,7 @@ def _create_imported_files_and_search_index( index_name=search_index_name, # Pages are indexed in small chunks to avoid a # large payload that will probably timeout ES. - chunk_size=25, + chunk_size=100, ) # Remove old HTMLFiles from ElasticSearch