From 871bfe2cd3a9598206351f24dd3592d5dc6637b9 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 31 Aug 2023 18:45:12 -0500 Subject: [PATCH 01/12] Search: stop relying on the DB when indexing - Closes https://github.com/readthedocs/readthedocs.org/issues/10623 - Closes https://github.com/readthedocs/readthedocs.org/issues/10690 --- readthedocs/builds/models.py | 2 +- readthedocs/projects/tasks/search.py | 103 ++++++++++++++---- .../rtd_tests/tests/test_imported_file.py | 2 +- .../commands/reindex_elasticsearch.py | 13 +++ 4 files changed, 94 insertions(+), 26 deletions(-) diff --git a/readthedocs/builds/models.py b/readthedocs/builds/models.py index 03d91ca9142..4fcd24e16e4 100644 --- a/readthedocs/builds/models.py +++ b/readthedocs/builds/models.py @@ -319,7 +319,7 @@ def config(self): :rtype: dict """ last_build = ( - self.builds(manager=INTERNAL).filter( + self.builds.filter( state=BUILD_STATE_FINISHED, success=True, ).order_by('-date') diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index ae382e8dad2..cc70ca677b9 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -2,11 +2,12 @@ import structlog -from readthedocs.builds.constants import EXTERNAL +from readthedocs.builds.constants import BUILD_STATE_FINISHED, EXTERNAL from readthedocs.builds.models import Version from readthedocs.projects.models import HTMLFile, ImportedFile, Project from readthedocs.projects.signals import files_changed -from readthedocs.search.utils import index_new_files, remove_indexed_files +from readthedocs.search.utils import remove_indexed_files +from django_elasticsearch_dsl.registries import registry from readthedocs.storage import build_media_storage from readthedocs.worker import app @@ -43,7 +44,7 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore): _create_imported_files( version=version, commit=commit, - build=build, + build_id=build, search_ranking=search_ranking, search_ignore=search_ignore, ) @@ -65,9 +66,6 @@ def _sync_imported_files(version, build): """ project = version.project - # Index new HTMLFiles to ElasticSearch - index_new_files(model=HTMLFile, version=version, build=build) - # Remove old HTMLFiles from ElasticSearch remove_indexed_files( model=HTMLFile, @@ -95,7 +93,35 @@ def remove_search_indexes(project_slug, version_slug=None): ) -def _create_imported_files(*, version, commit, build, search_ranking, search_ignore): +def reindex_version(version): + """ + Reindex all files of this version. + """ + latest_successful_build = version.builds.filter( + state=BUILD_STATE_FINISHED, success=True + ).order_by("-date").first() + # If the version doesn't have a successful + # build, we don't have files to index. + if not latest_successful_build: + return + + search_ranking = [] + search_ignore = [] + build_config = latest_successful_build.config + if build_config: + search_ranking = build_config.search.ranking + search_ignore = build_config.search.ignore + + _create_imported_files( + version=version, + commit=latest_successful_build.commit, + build_id=latest_successful_build.id, + search_ranking=search_ranking, + search_ignore=search_ignore, + ) + + +def _create_imported_files(*, version, commit, build_id, search_ranking, search_ignore): """ Create imported files for version. 
@@ -107,6 +133,9 @@ def _create_imported_files(*, version, commit, build, search_ranking, search_ign
     storage_path = version.project.get_storage_path(
         type_='html', version_slug=version.slug, include_file=False
     )
+    html_files_to_index = []
+    html_files_to_save = []
+    reverse_rankings = reversed(list(search_ranking.items()))
     for root, __, filenames in build_media_storage.walk(storage_path):
         for filename in filenames:
             # We don't care about non-HTML files
@@ -118,34 +147,60 @@ def _create_imported_files(*, version, commit, build, search_ranking, search_ign
             # Generate a relative path for storage similar to os.path.relpath
             relpath = full_path.replace(storage_path, '', 1).lstrip('/')

-            page_rank = 0
-            # Last pattern to match takes precedence
-            # XXX: see if we can implement another type of precedence,
-            # like the longest pattern.
-            reverse_rankings = reversed(list(search_ranking.items()))
-            for pattern, rank in reverse_rankings:
-                if fnmatch(relpath, pattern):
-                    page_rank = rank
-                    break
-
             ignore = False
-            for pattern in search_ignore:
-                if fnmatch(relpath, pattern):
-                    ignore = True
-                    break
+            if version.is_external:
+                # Never index files from external versions.
+                ignore = True
+            else:
+                for pattern in search_ignore:
+                    if fnmatch(relpath, pattern):
+                        ignore = True
+                        break

-            # Create imported files from new build
-            HTMLFile.objects.create(
+            page_rank = 0
+            # If the file is ignored, we don't need to check for its ranking.
+            if not ignore:
+                # Last pattern to match takes precedence
+                # XXX: see if we can implement another type of precedence,
+                # like the longest pattern.
+                for pattern, rank in reverse_rankings:
+                    if fnmatch(relpath, pattern):
+                        page_rank = rank
+                        break
+
+            html_file = HTMLFile(
                 project=version.project,
                 version=version,
                 path=relpath,
                 name=filename,
                 rank=page_rank,
                 commit=commit,
-                build=build,
+                build=build_id,
                 ignore=ignore,
             )

+            # Don't index files that are ignored.
+            if not ignore:
+                html_files_to_index.append(html_file)
+
+            # Create the imported file only if it's a top-level 404 file,
+            # or if it's an index file. We don't need to keep track of all files.
+            is_top_level_404_file = filename == "404.html" and root == storage_path
+            is_index_file = filename in ["index.html", "README.html"]
+            if is_top_level_404_file or is_index_file:
+                html_files_to_save.append(html_file)
+
+    # We first index the files in ES, and then save the objects in the DB.
+    # This is because saving the objects in the DB will give them an id,
+    # and we need this id to be `None` when indexing the objects in ES.
+    # ES will generate a unique id for each document.
+    if html_files_to_index:
+        document = list(registry.get_documents(models=[HTMLFile]))[0]
+        document().update(html_files_to_index)
+
+    if html_files_to_save:
+        HTMLFile.objects.bulk_create(html_files_to_save)
+
     # This signal is used for purging the CDN.
files_changed.send(
        sender=Project,
diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py
index 4e1aea8ce9f..423d56d73e9 100644
--- a/readthedocs/rtd_tests/tests/test_imported_file.py
+++ b/readthedocs/rtd_tests/tests/test_imported_file.py
@@ -36,7 +36,7 @@ def _manage_imported_files(
         _create_imported_files(
             version=version,
             commit=commit,
-            build=build,
+            build_id=build,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py
index 50e412682d7..7a9a00eb0d2 100644
--- a/readthedocs/search/management/commands/reindex_elasticsearch.py
+++ b/readthedocs/search/management/commands/reindex_elasticsearch.py
@@ -8,6 +8,8 @@
 from django.conf import settings
 from django.core.management import BaseCommand
 from django_elasticsearch_dsl.registries import registry
+from readthedocs.builds.constants import BUILD_STATE_FINISHED
+from readthedocs.projects.tasks.search import reindex_version

 from readthedocs.builds.models import Version
 from readthedocs.projects.models import HTMLFile, Project
@@ -50,6 +52,17 @@ def _run_reindex_tasks(self, models, queue):

         timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

+        # TODO: move this to where it makes sense :D
+        qs = (
+            Version.objects
+            .filter(built=True, builds__state=BUILD_STATE_FINISHED, builds__success=True)
+            .exclude(project__delisted=True)
+            .exclude(project__is_spam=True)
+            .select_related("project")
+        )
+        for version in qs.iterator():
+            reindex_version(version)
+
         for doc in registry.get_documents(models):
             queryset = doc().get_queryset()

From c6df61167e8b9cc3f374b6a7560f87c77f208a74 Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Tue, 5 Sep 2023 18:53:11 -0500
Subject: [PATCH 02/12] Second draft

---
 docs/dev/server-side-search.rst               |   2 +-
 readthedocs/projects/tasks/builds.py          |  10 +-
 readthedocs/projects/tasks/search.py          | 220 +++++++++++-------
 .../commands/reindex_elasticsearch.py         | 201 ++++++++--------
 readthedocs/search/utils.py                   |  24 --
 5 files changed, 253 insertions(+), 204 deletions(-)

diff --git a/docs/dev/server-side-search.rst b/docs/dev/server-side-search.rst
index 56e1636e58d..18b7c82d612 100644
--- a/docs/dev/server-side-search.rst
+++ b/docs/dev/server-side-search.rst
@@ -62,7 +62,7 @@
 You can fix this by deleting the page index and :ref:`re-indexing
.` parameter. Otherwise, it will re-index all the models
         """
-        models = None
         if options["models"]:
             models = [apps.get_model(model_name) for model_name in options["models"]]
+        else:
+            models = [Project, HTMLFile]

         queue = options["queue"]
         change_index = options["change_index"]
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index daf6d47d790..8ef62345121 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -10,30 +10,6 @@
 log = structlog.get_logger(__name__)


-def index_new_files(model, version, build):
-    """Index new files from the version into the search index."""
-
-    log.bind(
-        project_slug=version.project.slug,
-        version_slug=version.slug,
-    )
-
-    if not DEDConfig.autosync_enabled():
-        log.info("Autosync disabled. 
Skipping indexing into the search index.")
-        return
-
-    try:
-        document = list(registry.get_documents(models=[model]))[0]
-        doc_obj = document()
-        queryset = doc_obj.get_queryset().filter(
-            project=version.project, version=version, build=build
-        )
-        log.info("Indexing new objecst into search index.")
-        doc_obj.update(queryset.iterator())
-    except Exception:
-        log.exception("Unable to index a subset of files. Continuing.")
-
-
 def remove_indexed_files(model, project_slug, version_slug=None, build_id=None):
     """
     Remove files from `version_slug` of `project_slug` from the search index.

From bc8e4ff13b8f7ef2f2e380a7b5c10f1c0ce85759 Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Tue, 12 Sep 2023 15:45:07 -0500
Subject: [PATCH 03/12] Fix tests

---
 readthedocs/projects/tasks/search.py          |   5 +-
 .../projects/tests/test_build_tasks.py        |  12 +-
 readthedocs/rtd_tests/files/404.html          |  11 +
 readthedocs/rtd_tests/files/index.html        |  11 +
 .../rtd_tests/tests/test_build_storage.py     |  30 ++-
 .../rtd_tests/tests/test_imported_file.py     | 254 ++++++++++++++----
 readthedocs/search/tests/test_api.py          |  59 ----
 7 files changed, 245 insertions(+), 137 deletions(-)
 create mode 100644 readthedocs/rtd_tests/files/404.html
 create mode 100644 readthedocs/rtd_tests/files/index.html

diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index 742c8f99f21..92fb2f3a819 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -159,7 +159,7 @@ def _create_imported_files_and_search_index(
     *, version, build_id, search_ranking, search_ignore, search_index_name=None
 ):
     """
-    Create imported files and search index for version.
+    Create imported files and search index for the build of the version.

     If the version is external, we don't create a search index for it, only imported files.
     After the process is completed, we delete the files and search index that
@@ -173,7 +173,7 @@ def _create_imported_files_and_search_index(
     )
     html_files_to_index = []
     html_files_to_save = []
-    reverse_rankings = reversed(list(search_ranking.items()))
+    reverse_rankings = list(reversed(search_ranking.items()))
     for root, __, filenames in build_media_storage.walk(storage_path):
         for filename in filenames:
             # We don't care about non-HTML files
@@ -234,7 +234,6 @@ def _create_imported_files_and_search_index(
     # and we need this id to be `None` when indexing the objects in ES.
     # ES will generate a unique id for each document.
if html_files_to_index: - # document = list(registry.get_documents(models=[HTMLFile]))[0] document = PageDocument # If a search index name is provided, we need to temporarily change diff --git a/readthedocs/projects/tests/test_build_tasks.py b/readthedocs/projects/tests/test_build_tasks.py index 37480500336..a1519bb707b 100644 --- a/readthedocs/projects/tests/test_build_tasks.py +++ b/readthedocs/projects/tests/test_build_tasks.py @@ -407,7 +407,7 @@ def test_get_env_vars(self, load_yaml_config, build_environment, config, externa expected_build_env_vars["PRIVATE_TOKEN"] = "a1b2c3" assert build_env_vars == expected_build_env_vars - @mock.patch("readthedocs.projects.tasks.builds.fileify") + @mock.patch("readthedocs.projects.tasks.builds.index_build") @mock.patch("readthedocs.projects.tasks.builds.build_complete") @mock.patch("readthedocs.projects.tasks.builds.send_external_build_status") @mock.patch("readthedocs.projects.tasks.builds.UpdateDocsTask.send_notifications") @@ -420,7 +420,7 @@ def test_successful_build( send_notifications, send_external_build_status, build_complete, - fileify, + index_build, ): load_yaml_config.return_value = self._config_file( { @@ -481,13 +481,7 @@ def test_successful_build( build=mock.ANY, ) - fileify.delay.assert_called_once_with( - version_pk=self.version.pk, - commit=self.build.commit, - build=self.build.pk, - search_ranking=mock.ANY, - search_ignore=mock.ANY, - ) + index_build.delay.assert_called_once_with(build_id=self.build.pk) # TODO: assert the verb and the path for each API call as well diff --git a/readthedocs/rtd_tests/files/404.html b/readthedocs/rtd_tests/files/404.html new file mode 100644 index 00000000000..a939588f6c8 --- /dev/null +++ b/readthedocs/rtd_tests/files/404.html @@ -0,0 +1,11 @@ + + + + + + 404 - Page not found + + + Page not found + + diff --git a/readthedocs/rtd_tests/files/index.html b/readthedocs/rtd_tests/files/index.html new file mode 100644 index 00000000000..03363c621ca --- /dev/null +++ b/readthedocs/rtd_tests/files/index.html @@ -0,0 +1,11 @@ + + + + + + Root index file + + + Root index file + + diff --git a/readthedocs/rtd_tests/tests/test_build_storage.py b/readthedocs/rtd_tests/tests/test_build_storage.py index 78e138e7b7e..0d44ed72595 100644 --- a/readthedocs/rtd_tests/tests/test_build_storage.py +++ b/readthedocs/rtd_tests/tests/test_build_storage.py @@ -58,9 +58,11 @@ def test_sync_directory(self): tree = [ ('api', ['index.html']), - 'api.fjson', - 'conf.py', - 'test.html', + "404.html", + "api.fjson", + "conf.py", + "index.html", + "test.html", ] with override_settings(DOCROOT=tmp_files_dir): self.storage.rclone_sync_directory(tmp_files_dir, storage_dir) @@ -68,7 +70,9 @@ def test_sync_directory(self): tree = [ ('api', ['index.html']), + "404.html", 'conf.py', + "index.html", 'test.html', ] os.remove(os.path.join(tmp_files_dir, 'api.fjson')) @@ -77,7 +81,9 @@ def test_sync_directory(self): self.assertFileTree(storage_dir, tree) tree = [ + "404.html", 'conf.py', + "index.html", 'test.html', ] shutil.rmtree(os.path.join(tmp_files_dir, 'api')) @@ -126,7 +132,9 @@ def test_delete_directory(self): self.storage.copy_directory(files_dir, "files") dirs, files = self.storage.listdir("files") self.assertEqual(dirs, ["api"]) - self.assertCountEqual(files, ["api.fjson", "conf.py", "test.html"]) + self.assertCountEqual( + files, ["404.html", "api.fjson", "conf.py", "index.html", "test.html"] + ) self.storage.delete_directory('files/') _, files = self.storage.listdir('files') @@ -147,9 +155,11 @@ def test_walk(self): 
self.assertEqual(len(output), 2) top, dirs, files = output[0] - self.assertEqual(top, 'files') - self.assertCountEqual(dirs, ['api']) - self.assertCountEqual(files, ['api.fjson', 'conf.py', 'test.html']) + self.assertEqual(top, "files") + self.assertCountEqual(dirs, ["api"]) + self.assertCountEqual( + files, ["404.html", "api.fjson", "conf.py", "index.html", "test.html"] + ) top, dirs, files = output[1] self.assertEqual(top, 'files/api') @@ -163,8 +173,10 @@ def test_rclone_sync(self): tree = [ ("api", ["index.html"]), + "404.html", "api.fjson", "conf.py", + "index.html", "test.html", ] with override_settings(DOCROOT=tmp_files_dir): @@ -173,7 +185,9 @@ def test_rclone_sync(self): tree = [ ("api", ["index.html"]), + "404.html", "conf.py", + "index.html", "test.html", ] (tmp_files_dir / "api.fjson").unlink() @@ -182,7 +196,9 @@ def test_rclone_sync(self): self.assertFileTree(storage_dir, tree) tree = [ + "404.html", "conf.py", + "index.html", "test.html", ] shutil.rmtree(tmp_files_dir / "api") diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index 423d56d73e9..9ea7515d9cd 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -6,14 +6,13 @@ from django.test.utils import override_settings from readthedocs.projects.models import HTMLFile, ImportedFile, Project -from readthedocs.projects.tasks.search import ( - _create_imported_files, - _sync_imported_files, -) +from readthedocs.projects.tasks.search import _create_imported_files_and_search_index +from readthedocs.search.documents import PageDocument base_dir = os.path.dirname(os.path.dirname(__file__)) +@override_settings(ELASTICSEARCH_DSL_AUTOSYNC=True) class ImportedFileTests(TestCase): fixtures = ["eric", "test_data"] @@ -27,20 +26,25 @@ def setUp(self): with override_settings(DOCROOT=self.test_dir): self._copy_storage_dir() + def tearDown(self): + try: + PageDocument().search().filter().delete() + except Exception: + # If there are no documents, the query fails. + pass + def _manage_imported_files( - self, version, commit, build, search_ranking=None, search_ignore=None + self, version, build_id, search_ranking=None, search_ignore=None ): """Helper function for the tests to create and sync ImportedFiles.""" search_ranking = search_ranking or {} search_ignore = search_ignore or [] - _create_imported_files( + _create_imported_files_and_search_index( version=version, - commit=commit, - build_id=build, + build_id=build_id, search_ranking=search_ranking, search_ignore=search_ignore, ) - _sync_imported_files(version, build) def _copy_storage_dir(self): """Copy the test directory (rtd_tests/files) to storage""" @@ -54,96 +58,208 @@ def _copy_storage_dir(self): ) def test_properly_created(self): - # Only 2 files in the directory is HTML (test.html, api/index.html) + """ + Only 4 files in the directory are HTML + + - index.html + - test.html + - api/index.html + - 404.html + + But we create imported files for index.html and 404.html pages only. 
+ """ self.assertEqual(ImportedFile.objects.count(), 0) - self._manage_imported_files(version=self.version, commit="commit01", build=1) - self.assertEqual(ImportedFile.objects.count(), 2) - self._manage_imported_files(version=self.version, commit="commit01", build=2) - self.assertEqual(ImportedFile.objects.count(), 2) - self.project.cdn_enabled = True - self.project.save() + self._manage_imported_files(version=self.version, build_id=1) + self.assertEqual(ImportedFile.objects.count(), 3) + self.assertEqual( + set(HTMLFile.objects.all().values_list("path", flat=True)), + {"index.html", "api/index.html", "404.html"}, + ) - def test_update_commit(self): + results = PageDocument().search().filter("term", build=1).execute() + self.assertEqual( + {result.path for result in results}, {"index.html", "404.html", "test.html"} + ) + + self._manage_imported_files(version=self.version, build_id=2) + self.assertEqual(ImportedFile.objects.count(), 3) + self.assertEqual( + set(HTMLFile.objects.all().values_list("path", flat=True)), + {"index.html", "api/index.html", "404.html"}, + ) + + results = PageDocument().search().filter("term", build=2).execute() + self.assertEqual( + {result.path for result in results}, {"index.html", "404.html", "test.html"} + ) + + def test_update_build(self): self.assertEqual(ImportedFile.objects.count(), 0) - self._manage_imported_files(self.version, "commit01", 1) - self.assertEqual(ImportedFile.objects.first().commit, "commit01") - self._manage_imported_files(self.version, "commit02", 2) - self.assertEqual(ImportedFile.objects.first().commit, "commit02") + self._manage_imported_files(self.version, build_id=1) + for obj in ImportedFile.objects.all(): + self.assertEqual(obj.build, 1) + + results = PageDocument().search().filter().execute() + for result in results: + self.assertEqual(result.build, 1) + + self._manage_imported_files(self.version, build_id=2) + for obj in ImportedFile.objects.all(): + self.assertEqual(obj.build, 2) + + # NOTE: we can't test the ES index here because the index is updated asynchronously, + # so we will get results for both builds. 
+ # results = PageDocument().search().filter().execute() + # for result in results: + # self.assertEqual(result.build, 2) def test_page_default_rank(self): search_ranking = {} self.assertEqual(HTMLFile.objects.count(), 0) - self._manage_imported_files(self.version, "commit01", 1, search_ranking) + self._manage_imported_files( + self.version, build_id=1, search_ranking=search_ranking + ) - self.assertEqual(HTMLFile.objects.count(), 2) - self.assertEqual(HTMLFile.objects.filter(rank=0).count(), 2) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 4) + for result in results: + self.assertEqual(result.project, self.project.slug) + self.assertEqual(result.version, self.version.slug) + self.assertEqual(result.build, 1) + self.assertEqual(result.rank, 0) def test_page_custom_rank_glob(self): search_ranking = { "*index.html": 5, } - self._manage_imported_files(self.version, "commit01", 1, search_ranking) + self._manage_imported_files( + self.version, build_id=1, search_ranking=search_ranking + ) - self.assertEqual(HTMLFile.objects.count(), 2) - file_api = HTMLFile.objects.get(path="api/index.html") - file_test = HTMLFile.objects.get(path="test.html") - self.assertEqual(file_api.rank, 5) - self.assertEqual(file_test.rank, 0) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 4) + for result in results: + self.assertEqual(result.project, self.project.slug) + self.assertEqual(result.version, self.version.slug) + self.assertEqual(result.build, 1) + if result.path.endswith("index.html"): + self.assertEqual(result.rank, 5, result.path) + else: + self.assertEqual(result.rank, 0, result.path) def test_page_custom_rank_several(self): search_ranking = { "test.html": 5, "api/index.html": 2, } - self._manage_imported_files(self.version, "commit01", 1, search_ranking) + self._manage_imported_files( + self.version, build_id=1, search_ranking=search_ranking + ) - self.assertEqual(HTMLFile.objects.count(), 2) - file_api = HTMLFile.objects.get(path="api/index.html") - file_test = HTMLFile.objects.get(path="test.html") - self.assertEqual(file_api.rank, 2) - self.assertEqual(file_test.rank, 5) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 4) + for result in results: + self.assertEqual(result.project, self.project.slug) + self.assertEqual(result.version, self.version.slug) + self.assertEqual(result.build, 1) + if result.path == "test.html": + self.assertEqual(result.rank, 5) + elif result.path == "api/index.html": + self.assertEqual(result.rank, 2) + else: + self.assertEqual(result.rank, 0) def test_page_custom_rank_precedence(self): search_ranking = { "*.html": 5, "api/index.html": 2, } - self._manage_imported_files(self.version, "commit01", 1, search_ranking) + self._manage_imported_files( + self.version, build_id=1, search_ranking=search_ranking + ) - self.assertEqual(HTMLFile.objects.count(), 2) - file_api = HTMLFile.objects.get(path="api/index.html") - file_test = HTMLFile.objects.get(path="test.html") - self.assertEqual(file_api.rank, 2) - self.assertEqual(file_test.rank, 5) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + 
.execute() + ) + self.assertEqual(len(results), 4) + for result in results: + self.assertEqual(result.project, self.project.slug) + self.assertEqual(result.version, self.version.slug) + self.assertEqual(result.build, 1) + if result.path == "api/index.html": + self.assertEqual(result.rank, 2, result.path) + else: + self.assertEqual(result.rank, 5, result.path) def test_page_custom_rank_precedence_inverted(self): search_ranking = { "api/index.html": 2, "*.html": 5, } - self._manage_imported_files(self.version, "commit01", 1, search_ranking) + self._manage_imported_files( + self.version, build_id=1, search_ranking=search_ranking + ) - self.assertEqual(HTMLFile.objects.count(), 2) - file_api = HTMLFile.objects.get(path="api/index.html") - file_test = HTMLFile.objects.get(path="test.html") - self.assertEqual(file_api.rank, 5) - self.assertEqual(file_test.rank, 5) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 4) + for result in results: + self.assertEqual(result.project, self.project.slug) + self.assertEqual(result.version, self.version.slug) + self.assertEqual(result.build, 1) + self.assertEqual(result.rank, 5) def test_search_page_ignore(self): search_ignore = ["api/index.html"] self._manage_imported_files( self.version, - "commit01", - 1, + build_id=1, search_ignore=search_ignore, ) - self.assertEqual(HTMLFile.objects.count(), 2) - file_api = HTMLFile.objects.get(path="api/index.html") - file_test = HTMLFile.objects.get(path="test.html") - self.assertTrue(file_api.ignore) - self.assertFalse(file_test.ignore) + self.assertEqual( + set(HTMLFile.objects.all().values_list("path", flat=True)), + {"index.html", "api/index.html", "404.html"}, + ) + results = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .execute() + ) + self.assertEqual(len(results), 3) + self.assertEqual( + {result.path for result in results}, {"index.html", "404.html", "test.html"} + ) def test_update_content(self): test_dir = os.path.join(base_dir, "files") @@ -155,8 +271,18 @@ def test_update_content(self): with override_settings(DOCROOT=self.test_dir): self._copy_storage_dir() - self._manage_imported_files(self.version, "commit01", 1) - self.assertEqual(ImportedFile.objects.count(), 2) + self._manage_imported_files(self.version, build_id=1) + self.assertEqual(ImportedFile.objects.count(), 3) + document = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .filter("term", path="test.html") + .filter("term", build=1) + .execute()[0] + ) + self.assertEqual(document.sections[0].content, "Woo") with open(os.path.join(test_dir, "test.html"), "w+") as f: f.write("Something Else") @@ -164,5 +290,15 @@ def test_update_content(self): with override_settings(DOCROOT=self.test_dir): self._copy_storage_dir() - self._manage_imported_files(self.version, "commit02", 2) - self.assertEqual(ImportedFile.objects.count(), 2) + self._manage_imported_files(self.version, build_id=2) + self.assertEqual(ImportedFile.objects.count(), 3) + document = ( + PageDocument() + .search() + .filter("term", project=self.project.slug) + .filter("term", version=self.version.slug) + .filter("term", path="test.html") + .filter("term", build=2) + .execute()[0] + ) + self.assertEqual(document.sections[0].content, "Something Else") diff --git a/readthedocs/search/tests/test_api.py 
b/readthedocs/search/tests/test_api.py index 123b0bbd652..3f4a868666f 100644 --- a/readthedocs/search/tests/test_api.py +++ b/readthedocs/search/tests/test_api.py @@ -21,7 +21,6 @@ SECTION_FIELDS, get_search_query_from_project_file, ) -from readthedocs.search.utils import index_new_files, remove_indexed_files @pytest.mark.django_db @@ -726,64 +725,6 @@ def test_search_custom_ranking(self, api_client): assert results[0]["path"] == "/en/latest/index.html" assert results[1]["path"] == "/en/latest/guides/index.html" - def test_search_ignore(self, api_client): - project = Project.objects.get(slug="docs") - version = project.versions.all().first() - - page_index = HTMLFile.objects.get( - version=version, - path="index.html", - ) - page_guides = HTMLFile.objects.get( - version=version, - path="guides/index.html", - ) - - search_params = { - "project": project.slug, - "version": version.slug, - "q": '"content from"', - } - - # Query with files not ignored. - assert page_index.ignore is None - assert page_guides.ignore is None - - resp = self.get_search(api_client, search_params) - assert resp.status_code == 200 - - results = resp.data["results"] - assert len(results) == 2 - assert results[0]["path"] == "/en/latest/index.html" - assert results[1]["path"] == "/en/latest/guides/index.html" - - # Query with guides/index.html ignored. - page_guides.ignore = True - page_guides.save() - - remove_indexed_files(HTMLFile, project.slug, version.slug) - index_new_files(HTMLFile, version, page_index.build) - - resp = self.get_search(api_client, search_params) - assert resp.status_code == 200 - - results = resp.data["results"] - assert len(results) == 1 - assert results[0]["path"] == "/en/latest/index.html" - - # Query with index.html and guides/index.html ignored. - page_index.ignore = True - page_index.save() - - remove_indexed_files(HTMLFile, project.slug, version.slug) - index_new_files(HTMLFile, version, page_index.build) - - resp = self.get_search(api_client, search_params) - assert resp.status_code == 200 - - results = resp.data["results"] - assert len(results) == 0 - class TestDocumentSearch(BaseTestDocumentSearch): pass From 3e2fd317ae12a9a3b00fab570c708b199b3c64f8 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Tue, 12 Sep 2023 19:34:50 -0500 Subject: [PATCH 04/12] Remove more code --- readthedocs/builds/admin.py | 39 ++----------- readthedocs/builds/querysets.py | 22 +++++++ readthedocs/projects/admin.py | 57 +++---------------- readthedocs/projects/tasks/search.py | 11 ++-- .../rtd_tests/tests/test_imported_file.py | 5 +- .../commands/reindex_elasticsearch.py | 17 +----- readthedocs/search/utils.py | 34 ----------- 7 files changed, 45 insertions(+), 140 deletions(-) diff --git a/readthedocs/builds/admin.py b/readthedocs/builds/admin.py index 9d08d3ed671..fe19cedff8c 100644 --- a/readthedocs/builds/admin.py +++ b/readthedocs/builds/admin.py @@ -12,8 +12,7 @@ ) from readthedocs.core.utils import trigger_build from readthedocs.core.utils.admin import pretty_json_field -from readthedocs.projects.models import HTMLFile -from readthedocs.search.utils import _indexing_helper +from readthedocs.projects.tasks.search import reindex_version class BuildCommandResultInline(admin.TabularInline): @@ -89,7 +88,7 @@ class VersionAdmin(admin.ModelAdmin): list_filter = ("type", "privacy_level", "active", "built") search_fields = ("slug", "project__slug") raw_id_fields = ("project",) - actions = ["build_version", "reindex_version", "wipe_version_indexes"] + actions = ["build_version", "reindex_version"] def 
project_slug(self, obj):
        return obj.project.slug

@@ -117,41 +116,11 @@ def build_version(self, request, queryset):
     @admin.action(description="Reindex version to ES")
     def reindex_version(self, request, queryset):
         """Reindexes all selected versions to ES."""
-        html_objs_qs = []
-        for version in queryset.iterator():
-            html_objs = HTMLFile.objects.filter(
-                project=version.project, version=version
-            )
-
-            if html_objs.exists():
-                html_objs_qs.append(html_objs)
-
-        if html_objs_qs:
-            _indexing_helper(html_objs_qs, wipe=False)
+        for version_id in queryset.values_list("id", flat=True).iterator():
+            reindex_version.delay(version_id)

         self.message_user(request, "Task initiated successfully.", messages.SUCCESS)

-    @admin.action(description="Wipe version from ES")
-    def wipe_version_indexes(self, request, queryset):
-        """Wipe selected versions from ES."""
-        html_objs_qs = []
-        for version in queryset.iterator():
-            html_objs = HTMLFile.objects.filter(
-                project=version.project, version=version
-            )
-
-            if html_objs.exists():
-                html_objs_qs.append(html_objs)
-
-        if html_objs_qs:
-            _indexing_helper(html_objs_qs, wipe=True)
-
-        self.message_user(
-            request,
-            "Task initiated successfully",
-            messages.SUCCESS,
-        )
-

 @admin.register(RegexAutomationRule)
 class RegexAutomationRuleAdmin(PolymorphicChildModelAdmin, admin.ModelAdmin):
diff --git a/readthedocs/builds/querysets.py b/readthedocs/builds/querysets.py
index bf2ebcc4963..3ab46436171 100644
--- a/readthedocs/builds/querysets.py
+++ b/readthedocs/builds/querysets.py
@@ -118,6 +118,28 @@ def public(
     def api(self, user=None):
         return self.public(user, only_active=False)

+    def for_reindex(self):
+        """
+        Get all versions that can be reindexed.
+
+        A version can be reindexed if:
+
+        - It's active and has been built at least once successfully,
+          since that means it has files to be indexed.
+        - Its project is not delisted or marked as spam. 
+ """ + return ( + self.filter( + active=True, + built=True, + builds__state=BUILD_STATE_FINISHED, + builds__success=True, + ) + .exclude(project__delisted=True) + .exclude(project__is_spam=True) + .distinct() + ) + class VersionQuerySet(SettingsOverrideObject): _default_class = VersionQuerySetBase diff --git a/readthedocs/projects/admin.py b/readthedocs/projects/admin.py index 534c19ae202..ff5a25ebb59 100644 --- a/readthedocs/projects/admin.py +++ b/readthedocs/projects/admin.py @@ -11,8 +11,8 @@ from readthedocs.core.history import ExtraSimpleHistoryAdmin, set_change_reason from readthedocs.core.utils import trigger_build from readthedocs.notifications.views import SendNotificationView +from readthedocs.projects.tasks.search import reindex_version from readthedocs.redirects.models import Redirect -from readthedocs.search.utils import _indexing_helper from .forms import FeatureForm from .models import ( @@ -264,7 +264,6 @@ class ProjectAdmin(ExtraSimpleHistoryAdmin): "run_spam_rule_checks", "build_default_version", "reindex_active_versions", - "wipe_all_versions", "import_tags_from_vcs", ] @@ -362,27 +361,19 @@ def reindex_active_versions(self, request, queryset): """Reindex all active versions of the selected projects to ES.""" qs_iterator = queryset.iterator() for project in qs_iterator: - version_qs = Version.internal.filter(project=project) - active_versions = version_qs.filter(active=True) + versions_id_to_reindex = project.versions.for_reindex().values_list( + "pk", flat=True + ) - if not active_versions.exists(): + if not versions_id_to_reindex.exists(): self.message_user( request, - "No active versions of project {}".format(project), + "No versions to be re-indexed for project {}".format(project), messages.ERROR, ) else: - html_objs_qs = [] - for version in active_versions.iterator(): - html_objs = HTMLFile.objects.filter( - project=project, version=version - ) - - if html_objs.exists(): - html_objs_qs.append(html_objs) - - if html_objs_qs: - _indexing_helper(html_objs_qs, wipe=False) + for version_id in versions_id_to_reindex.iterator(): + reindex_version.delay(version_id) self.message_user( request, @@ -390,38 +381,6 @@ def reindex_active_versions(self, request, queryset): messages.SUCCESS, ) - # TODO: rename method to mention "indexes" on its name - @admin.action(description="Wipe all versions from ES") - def wipe_all_versions(self, request, queryset): - """Wipe indexes of all versions of selected projects.""" - qs_iterator = queryset.iterator() - for project in qs_iterator: - version_qs = Version.internal.filter(project=project) - if not version_qs.exists(): - self.message_user( - request, - "No active versions of project {}.".format(project), - messages.ERROR, - ) - else: - html_objs_qs = [] - for version in version_qs.iterator(): - html_objs = HTMLFile.objects.filter( - project=project, version=version - ) - - if html_objs.exists(): - html_objs_qs.append(html_objs) - - if html_objs_qs: - _indexing_helper(html_objs_qs, wipe=True) - - self.message_user( - request, - "Task initiated successfully for {}.".format(project), - messages.SUCCESS, - ) - @admin.action(description="Import tags from the version control API") def import_tags_from_vcs(self, request, queryset): for project in queryset.iterator(): diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index 92fb2f3a819..fb41272458e 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -101,7 +101,7 @@ def reindex_version(version_id, 
search_index_name=None): The latest successful build is used for the re-creation. """ version = Version.objects.filter(pk=version_id).select_related("project").first() - if not version: + if not version or not version.built: return latest_successful_build = ( @@ -129,9 +129,9 @@ def reindex_version(version_id, search_index_name=None): search_ignore = search_config.get("ignore", []) # We need to use a build id that is different from the current one, - # since, otherwise we will end up with duplicated files. - # After the process is complete, we delete the files and search index - # that don't belong to the current build id. + # otherwise we will end up with duplicated files. After the process + # is complete, we delete the files and search index that don't belong + # to the current build id. The build id isn't used for anything else. build_id = latest_successful_build.id + 1 try: _create_imported_files_and_search_index( @@ -213,7 +213,8 @@ def _create_imported_files_and_search_index( name=filename, rank=page_rank, # TODO: We are setting this field since it's required, - # but it will be removed in the future together with other fields. + # but it isn't used, and will be removed in the future + # together with other fields. commit="unknown", build=build_id, ignore=skip_search_index, diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index 9ea7515d9cd..d97d897b6bb 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -66,7 +66,7 @@ def test_properly_created(self): - api/index.html - 404.html - But we create imported files for index.html and 404.html pages only. + But we create imported files for index.html and 404.html files only. 
""" self.assertEqual(ImportedFile.objects.count(), 0) @@ -79,7 +79,8 @@ def test_properly_created(self): results = PageDocument().search().filter("term", build=1).execute() self.assertEqual( - {result.path for result in results}, {"index.html", "404.html", "test.html"} + {result.path for result in results}, + {"index.html", "404.html", "test.html", "api/index.html"}, ) self._manage_imported_files(version=self.version, build_id=2) diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py index e9334b4f4a8..31c2a626dc3 100644 --- a/readthedocs/search/management/commands/reindex_elasticsearch.py +++ b/readthedocs/search/management/commands/reindex_elasticsearch.py @@ -9,7 +9,6 @@ from django.core.management import BaseCommand from django_elasticsearch_dsl.registries import registry -from readthedocs.builds.constants import BUILD_STATE_FINISHED from readthedocs.builds.models import Version from readthedocs.projects.models import HTMLFile, Project from readthedocs.projects.tasks.search import reindex_version @@ -162,18 +161,6 @@ def _reindex_projects_from(self, days_ago, queue): items=queryset.count(), ) - def _get_versions_to_reindex_queryset(self): - return ( - Version.objects.filter( - built=True, - builds__state=BUILD_STATE_FINISHED, - builds__success=True, - ) - .exclude(project__delisted=True) - .exclude(project__is_spam=True) - .distinct() - ) - def _reindex_files(self, queue, timestamp): document = PageDocument app_label = HTMLFile._meta.app_label @@ -188,7 +175,7 @@ def _reindex_files(self, queue, timestamp): ) log.info("Temporal index created.", index_name=new_index_name) - queryset = self._get_versions_to_reindex_queryset().values_list("pk", flat=True) + queryset = Version.objects.for_reindex().values_list("pk", flat=True) for version_id in queryset.iterator(): reindex_version.apply_async( kwargs={ @@ -207,7 +194,7 @@ def _reindex_files_from(self, days_ago, queue): """Reindex HTML files from versions with recent builds.""" since = datetime.now() - timedelta(days=days_ago) queryset = ( - self._get_versions_to_reindex_queryset() + Version.objects.for_reindex() .filter(builds__date__gte=since) .values_list("pk", flat=True) ) diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py index 8ef62345121..7f25e1991c2 100644 --- a/readthedocs/search/utils.py +++ b/readthedocs/search/utils.py @@ -5,8 +5,6 @@ from django_elasticsearch_dsl.apps import DEDConfig from django_elasticsearch_dsl.registries import registry -from readthedocs.projects.models import HTMLFile - log = structlog.get_logger(__name__) @@ -71,38 +69,6 @@ def _get_document(model, document_class): return document -def _indexing_helper(html_objs_qs, wipe=False): - """ - Helper function for reindexing and wiping indexes of projects and versions. - - If ``wipe`` is set to False, html_objs are deleted from the ES index, - else, html_objs are indexed. - """ - from readthedocs.search.documents import PageDocument - from readthedocs.search.tasks import delete_objects_in_es, index_objects_to_es - - if html_objs_qs: - obj_ids = [] - for html_objs in html_objs_qs: - obj_ids.extend([obj.id for obj in html_objs]) - - # removing redundant ids if exists. 
-        obj_ids = list(set(obj_ids))
-
-        if obj_ids:
-            kwargs = {
-                "app_label": HTMLFile._meta.app_label,
-                "model_name": HTMLFile.__name__,
-                "document_class": str(PageDocument),
-                "objects_id": obj_ids,
-            }
-
-            if not wipe:
-                index_objects_to_es.delay(**kwargs)
-            else:
-                delete_objects_in_es.delay(**kwargs)
-
-
 def _last_30_days_iter():
     """Returns iterator for previous 30 days (including today)."""
     thirty_days_ago = timezone.now().date() - timezone.timedelta(days=30)

From c5ec0806ab379d7ff6686977bc4b3653a21aef82 Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Tue, 12 Sep 2023 20:07:50 -0500
Subject: [PATCH 05/12] Simplify more things

---
 readthedocs/projects/tasks/search.py   | 22 ++++----------------
 readthedocs/search/tests/test_utils.py |  7 +------
 readthedocs/search/utils.py            | 20 ++++++++++++++++--
 3 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index fb41272458e..4b45acc6569 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -7,7 +7,7 @@
 from readthedocs.projects.models import HTMLFile, Project
 from readthedocs.projects.signals import files_changed
 from readthedocs.search.documents import PageDocument
-from readthedocs.search.utils import remove_indexed_files
+from readthedocs.search.utils import index_objects, remove_indexed_files
 from readthedocs.storage import build_media_storage
 from readthedocs.worker import app

@@ -149,7 +149,6 @@ def reindex_version(version_id, search_index_name=None):
 def remove_search_indexes(project_slug, version_slug=None):
     """Wrapper around ``remove_indexed_files`` to make it a task."""
     remove_indexed_files(
-        model=HTMLFile,
         project_slug=project_slug,
         version_slug=version_slug,
     )
@@ -235,23 +234,14 @@ def _create_imported_files_and_search_index(
     # and we need this id to be `None` when indexing the objects in ES.
     # ES will generate a unique id for each document.
     if html_files_to_index:
-        document = PageDocument
-
-        # If a search index name is provided, we need to temporarily change
-        # the index name of the document.
-        old_index_name = document._index._name
-        if search_index_name:
-            document._index._name = search_index_name
-
-        document().update(html_files_to_index)
-
-        # Restore the old index name. 
- if search_index_name: - document._index._name = old_index_name + index_objects( + document=PageDocument, + objects=html_files_to_index, + index_name=search_index_name, + ) # Remove old HTMLFiles from ElasticSearch remove_indexed_files( - model=HTMLFile, project_slug=version.project.slug, version_slug=version.slug, build_id=build_id, diff --git a/readthedocs/search/tests/test_utils.py b/readthedocs/search/tests/test_utils.py index 21583603534..8d7429f8b5f 100644 --- a/readthedocs/search/tests/test_utils.py +++ b/readthedocs/search/tests/test_utils.py @@ -4,7 +4,6 @@ from django.urls import reverse from readthedocs.builds.constants import LATEST, STABLE -from readthedocs.projects.models import HTMLFile from readthedocs.search import utils from readthedocs.search.tests.utils import get_search_query_from_project_file @@ -33,10 +32,7 @@ def test_remove_only_one_project_index(self, api_client, all_projects): assert self.has_results(api_client, project, LATEST) assert self.has_results(api_client, project, STABLE) - utils.remove_indexed_files( - HTMLFile, - project_slug=project, - ) + utils.remove_indexed_files(project_slug=project) # Deletion of indices from ES happens async, # so we need to wait a little before checking for results. time.sleep(1) @@ -55,7 +51,6 @@ def test_remove_only_one_version_index(self, api_client, all_projects): assert self.has_results(api_client, project, STABLE) utils.remove_indexed_files( - HTMLFile, project_slug=project, version_slug=LATEST, ) diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py index 7f25e1991c2..f473b4d0425 100644 --- a/readthedocs/search/utils.py +++ b/readthedocs/search/utils.py @@ -5,10 +5,26 @@ from django_elasticsearch_dsl.apps import DEDConfig from django_elasticsearch_dsl.registries import registry +from readthedocs.search.documents import PageDocument + log = structlog.get_logger(__name__) -def remove_indexed_files(model, project_slug, version_slug=None, build_id=None): +def index_objects(document, objects, index_name=None): + # If a search index name is provided, we need to temporarily change + # the index name of the document. + old_index_name = document._index._name + if index_name: + document._index._name = index_name + + document().update(objects) + + # Restore the old index name. + if index_name: + document._index._name = old_index_name + + +def remove_indexed_files(project_slug, version_slug=None, build_id=None): """ Remove files from `version_slug` of `project_slug` from the search index. 
@@ -29,7 +45,7 @@ def remove_indexed_files(model, project_slug, version_slug=None, build_id=None): return try: - document = list(registry.get_documents(models=[model]))[0] + document = PageDocument log.info("Deleting old files from search index.") documents = document().search().filter("term", project=project_slug) if version_slug: From d57c481a8e37cc7d1c9e76a5eb7ba9fb4831cabf Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Tue, 12 Sep 2023 20:09:41 -0500 Subject: [PATCH 06/12] Fix test --- readthedocs/rtd_tests/tests/test_imported_file.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index d97d897b6bb..1f61e5f51b0 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -92,7 +92,8 @@ def test_properly_created(self): results = PageDocument().search().filter("term", build=2).execute() self.assertEqual( - {result.path for result in results}, {"index.html", "404.html", "test.html"} + {result.path for result in results}, + {"index.html", "404.html", "test.html", "api/index.html"}, ) def test_update_build(self): From 022e7e92dd00d3dd013d0ef16cb941d4489301a1 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Tue, 12 Sep 2023 20:27:48 -0500 Subject: [PATCH 07/12] Small updates --- readthedocs/projects/tasks/search.py | 8 +++----- .../search/management/commands/reindex_elasticsearch.py | 2 -- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py index 4b45acc6569..9ad13e69d13 100644 --- a/readthedocs/projects/tasks/search.py +++ b/readthedocs/projects/tasks/search.py @@ -15,8 +15,8 @@ # TODO: remove after deploy. This is kept, so builds keep -# working while the deploy the new task below. -@app.task(queue='reindex') +# working while we deploy the task below. +@app.task(queue="reindex") def fileify(version_pk, commit, build, search_ranking, search_ignore): """ Create ImportedFile objects for all of a version's files. @@ -224,9 +224,7 @@ def _create_imported_files_and_search_index( # Create the imported file only if it's a top-level 404 file, # or if it's an index file. We don't need to keep track of all files. - is_top_level_404_file = filename == "404.html" and root == storage_path - is_index_file = filename in ["index.html", "README.html"] - if is_top_level_404_file or is_index_file: + if relpath == "404.html" or filename in ["index.html", "README.html"]: html_files_to_save.append(html_file) # We first index the files in ES, and then save the objects in the DB. diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py index 31c2a626dc3..27585c12334 100644 --- a/readthedocs/search/management/commands/reindex_elasticsearch.py +++ b/readthedocs/search/management/commands/reindex_elasticsearch.py @@ -99,8 +99,6 @@ def _reindex_from(self, days_ago, models, queue): functions[model](days_ago=days_ago, queue=queue) def _reindex_projects(self, queue, timestamp): - # TODO: can we just improt the project document? 
-        # document = list(registry.get_documents(models=[Project]))[0]
         document = ProjectDocument
         app_label = Project._meta.app_label
         model_name = Project.__name__

From 19d7c5c19febcb8a5d8f1c34a384169b040ba571 Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Tue, 12 Sep 2023 20:31:49 -0500
Subject: [PATCH 08/12] Update test

---
 readthedocs/rtd_tests/tests/test_imported_file.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py
index 1f61e5f51b0..c59b2a795e8 100644
--- a/readthedocs/rtd_tests/tests/test_imported_file.py
+++ b/readthedocs/rtd_tests/tests/test_imported_file.py
@@ -103,6 +103,7 @@ def test_update_build(self):
             self.assertEqual(obj.build, 1)

         results = PageDocument().search().filter().execute()
+        self.assertEqual(len(results), 4)
         for result in results:
             self.assertEqual(result.build, 1)

@@ -110,11 +111,10 @@ def test_update_build(self):
         for obj in ImportedFile.objects.all():
             self.assertEqual(obj.build, 2)

-        # NOTE: we can't test the ES index here because the index is updated asynchronously,
-        # so we will get results for both builds.
-        # results = PageDocument().search().filter().execute()
-        # for result in results:
-        #     self.assertEqual(result.build, 2)
+        # NOTE: we can't test that the files from the previous build
+        # were deleted, since deletion happens asynchronously.
+        results = PageDocument().search().filter("term", build=2).execute()
+        self.assertEqual(len(results), 4)

From 79e6141794d62e7380352734f60fcf599c38979c Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Wed, 13 Sep 2023 13:06:38 -0500
Subject: [PATCH 09/12] Updates from review and fixes

---
 readthedocs/projects/tasks/search.py          | 44 +++++++++----------
 .../commands/reindex_elasticsearch.py         |  5 +--
 readthedocs/search/utils.py                   | 18 +++++++-
 3 files changed, 40 insertions(+), 27 deletions(-)

diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index 9ad13e69d13..57c195e6fb1 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -61,11 +61,13 @@ def index_build(build_id):
         .first()
     )
     if not build:
+        log.debug("Skipping search indexing. Build object doesn't exist.", build_id=build_id)
         return

     # The version may have been deleted.
     version = build.version
     if not version:
+        log.debug("Skipping search indexing. 
Build doesn't have a version attached to it.", build_id=build_id)
         return

     log.bind(
@@ -74,13 +76,10 @@ def index_build(build_id):
         build_id=build.id,
     )

-    search_ranking = []
-    search_ignore = []
-    build_config = build.config
-    if build_config:
-        search_config = build_config.get("search", {})
-        search_ranking = search_config.get("ranking", [])
-        search_ignore = search_config.get("ignore", [])
+    build_config = build.config or {}
+    search_config = build_config.get("search", {})
+    search_ranking = search_config.get("ranking", [])
+    search_ignore = search_config.get("ignore", [])

     try:
         _create_imported_files_and_search_index(
@@ -102,6 +101,7 @@ def reindex_version(version_id, search_index_name=None):
     """
     version = Version.objects.filter(pk=version_id).select_related("project").first()
     if not version or not version.built:
+        log.debug("Skipping search indexing. Version doesn't exist or is not built.", version_id=version_id)
         return

     latest_successful_build = (
@@ -112,6 +112,7 @@ def reindex_version(version_id, search_index_name=None):
     # If the version doesn't have a successful
     # build, we don't have files to index.
     if not latest_successful_build:
+        log.debug("Skipping search indexing. Version doesn't have a successful build.", version_id=version_id)
         return

     log.bind(
@@ -120,18 +121,16 @@ def reindex_version(version_id, search_index_name=None):
         build_id=latest_successful_build.id,
     )

-    search_ranking = []
-    search_ignore = []
-    build_config = latest_successful_build.config
-    if build_config:
-        search_config = build_config.get("search", {})
-        search_ranking = search_config.get("ranking", [])
-        search_ignore = search_config.get("ignore", [])
+    build_config = latest_successful_build.config or {}
+    search_config = build_config.get("search", {})
+    search_ranking = search_config.get("ranking", [])
+    search_ignore = search_config.get("ignore", [])

-    # We need to use a build id that is different from the current one,
-    # otherwise we will end up with duplicated files. After the process
-    # is complete, we delete the files and search index that don't belong
-    # to the current build id. The build id isn't used for anything else.
+    # We need to use a build ID that is different from the current one,
+    # this ID is to differentiate the new files being indexed from the
+    # current ones, so we can delete the old ones easily.
+    # If we re-use the same build ID, we will end up with duplicated files.
+    # The build id isn't used for anything else.
     build_id = latest_successful_build.id + 1
     try:
         _create_imported_files_and_search_index(
@@ -213,7 +210,7 @@ def _create_imported_files_and_search_index(
             path=relpath,
             name=filename,
             rank=page_rank,
-            # TODO: We are setting this field since it's required,
+            # TODO: We are setting the commit field since it's required,
             # but it isn't used, and will be removed in the future
             # together with other fields.
commit="unknown",
             build=build_id,
             ignore=skip_search_index,
         )

diff --git a/readthedocs/search/management/commands/reindex_elasticsearch.py b/readthedocs/search/management/commands/reindex_elasticsearch.py
index 27585c12334..514c6a39299 100644
--- a/readthedocs/search/management/commands/reindex_elasticsearch.py
+++ b/readthedocs/search/management/commands/reindex_elasticsearch.py
@@ -86,10 +86,9 @@ def _change_index(self, models, timestamp):

     def _reindex_from(self, days_ago, models, queue):
         functions = {
-            apps.get_model("projects.HTMLFile"): self._reindex_files_from,
-            apps.get_model("projects.Project"): self._reindex_projects_from,
+            HTMLFile: self._reindex_files_from,
+            Project: self._reindex_projects_from,
         }
-        models = models or functions.keys()
         for model in models:
             if model not in functions:
                 log.warning(
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index f473b4d0425..e82ce75d3a6 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -11,6 +11,10 @@


 def index_objects(document, objects, index_name=None):
+    if not DEDConfig.autosync_enabled():
+        log.info("Autosync disabled, skipping search indexing.")
+        return
+
     # If a search index name is provided, we need to temporarily change
     # the index name of the document.
     old_index_name = document._index._name
@@ -24,7 +28,7 @@ def index_objects(document, objects, index_name=None):
     document._index._name = old_index_name


-def remove_indexed_files(project_slug, version_slug=None, build_id=None):
+def remove_indexed_files(project_slug, version_slug=None, build_id=None, index_name=None):
     """
     Remove files from `version_slug` of `project_slug` from the search index.

@@ -44,8 +48,14 @@ def remove_indexed_files(project_slug, version_slug=None, build_id=None):
         log.info("Autosync disabled, skipping removal from the search index.")
         return

+    # If a search index name is provided, we need to temporarily change
+    # the index name of the document.
+    document = PageDocument
+    old_index_name = document._index._name
+    if index_name:
+        document._index._name = index_name
+
     try:
-        document = PageDocument
         log.info("Deleting old files from search index.")
         documents = document().search().filter("term", project=project_slug)
         if version_slug:
@@ -56,6 +66,10 @@ def remove_indexed_files(
     except Exception:
         log.exception("Unable to delete a subset of files. Continuing.")

+    # Restore the old index name.
+    if index_name:
+        document._index._name = old_index_name
+

From 2ccf0e83c7cae036461efe6e73f8791e8d5b92cf Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Wed, 13 Sep 2023 13:10:55 -0500
Subject: [PATCH 10/12] Format

---
 readthedocs/projects/tasks/search.py | 19 +++++++++++++++----
 readthedocs/search/utils.py          |  4 +++-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index 57c195e6fb1..c29cc11b599 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -61,13 +61,18 @@ def index_build(build_id):
         .first()
     )
     if not build:
-        log.debug("Skipping search indexing. Build object doesn't exist.", build_id=build_id)
+        log.debug(
+            "Skipping search indexing. Build object doesn't exist.", build_id=build_id
+        )
         return

     # The version may have been deleted.
     version = build.version
     if not version:
-        log.debug("Skipping search indexing. Build doesn't have a version attached to it.", build_id=build_id)
+        log.debug(
+            "Skipping search indexing. Build doesn't have a version attached to it.",
+            build_id=build_id,
+        )
         return

     log.bind(
@@ -101,7 +106,10 @@ def reindex_version(version_id, search_index_name=None):
     """
     version = Version.objects.filter(pk=version_id).select_related("project").first()
     if not version or not version.built:
-        log.debug("Skipping search indexing. Version doesn't exist or is not built.", version_id=version_id)
+        log.debug(
+            "Skipping search indexing. Version doesn't exist or is not built.",
+            version_id=version_id,
+        )
         return

     latest_successful_build = (
@@ -112,7 +120,10 @@
     # If the version doesn't have a successful
     # build, we don't have files to index.
     if not latest_successful_build:
-        log.debug("Skipping search indexing. Version doesn't have a successful build.", version_id=version_id)
+        log.debug(
+            "Skipping search indexing. Version doesn't have a successful build.",
+            version_id=version_id,
+        )
         return

     log.bind(
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index e82ce75d3a6..3c2ed5d5ad6 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -28,7 +28,9 @@ def index_objects(document, objects, index_name=None):
     document._index._name = old_index_name


-def remove_indexed_files(project_slug, version_slug=None, build_id=None, index_name=None):
+def remove_indexed_files(
+    project_slug, version_slug=None, build_id=None, index_name=None
+):
     """
     Remove files from `version_slug` of `project_slug` from the search index.

From 8fdbc0681f05c5706332f65d2e27986dc60df13b Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Wed, 13 Sep 2023 14:03:19 -0500
Subject: [PATCH 11/12] Rename build -> sync_id

---
 readthedocs/projects/tasks/search.py          | 29 ++++----
 .../rtd_tests/tests/test_imported_file.py     | 66 +++++++++----------
 readthedocs/search/utils.py                   |  6 +-
 3 files changed, 50 insertions(+), 51 deletions(-)

diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index c29cc11b599..bd67f206b59 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -44,7 +44,6 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore):
     try:
         _create_imported_files_and_search_index(
             version=version,
-            build_id=build,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
@@ -89,7 +88,6 @@ def index_build(build_id):
     try:
         _create_imported_files_and_search_index(
             version=version,
-            build_id=build.id,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
@@ -137,16 +135,9 @@ def reindex_version(version_id, search_index_name=None):
     search_ranking = search_config.get("ranking", [])
     search_ignore = search_config.get("ignore", [])

-    # We need to use a build ID that is different from the current one;
-    # this ID differentiates the new files being indexed from the
-    # current ones, so we can easily delete the old ones.
-    # If we re-use the same build ID, we will end up with duplicated files.
-    # The build ID isn't used for anything else.
-    build_id = latest_successful_build.id + 1
     try:
         _create_imported_files_and_search_index(
             version=version,
-            build_id=build_id,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
             search_index_name=search_index_name,
@@ -165,7 +156,7 @@ def remove_search_indexes(project_slug, version_slug=None):


 def _create_imported_files_and_search_index(
-    *, version, build_id, search_ranking, search_ignore, search_index_name=None
+    *, version, search_ranking, search_ignore, search_index_name=None
 ):
     """
     Create imported files and search index for the build of the version.
@@ -180,6 +171,17 @@ def _create_imported_files_and_search_index(
     storage_path = version.project.get_storage_path(
         type_="html", version_slug=version.slug, include_file=False
     )
+    # A sync ID is a number different from the current `build` attribute (pending rename);
+    # it distinguishes the files of the current sync from those of the previous one.
+    # This makes it easy to delete the previous files from the DB and ES.
+    imported_file = version.imported_files.first()
+    sync_id = imported_file.build + 1 if imported_file else 1
+
+    log.debug(
+        "Using sync ID for search indexing",
+        sync_id=sync_id,
+    )
+
     html_files_to_index = []
     html_files_to_save = []
     reverse_rankings = list(reversed(search_ranking.items()))
@@ -225,7 +227,7 @@ def _create_imported_files_and_search_index(
             # but it isn't used, and will be removed in the future
             # together with other fields.
             commit="unknown",
-            build=build_id,
+            build=sync_id,
             ignore=skip_search_index,
         )

@@ -252,7 +254,7 @@ def _create_imported_files_and_search_index(
         remove_indexed_files(
             project_slug=version.project.slug,
             version_slug=version.slug,
-            build_id=build_id,
+            sync_id=sync_id,
             index_name=search_index_name,
         )

@@ -260,7 +262,7 @@ def _create_imported_files_and_search_index(
         HTMLFile.objects.bulk_create(html_files_to_save)

     # Delete imported files from the previous build of the version.
-    version.imported_files.exclude(build=build_id).delete()
+    version.imported_files.exclude(build=sync_id).delete()

     # This signal is used for purging the CDN.
     files_changed.send(
@@ -268,3 +270,4 @@ def _create_imported_files_and_search_index(
         project=version.project,
         version=version,
     )
+    return sync_id
diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py
index c59b2a795e8..82e25f98b88 100644
--- a/readthedocs/rtd_tests/tests/test_imported_file.py
+++ b/readthedocs/rtd_tests/tests/test_imported_file.py
@@ -33,15 +33,12 @@ def tearDown(self):
             # If there are no documents, the query fails.
             pass

-    def _manage_imported_files(
-        self, version, build_id, search_ranking=None, search_ignore=None
-    ):
+    def _manage_imported_files(self, version, search_ranking=None, search_ignore=None):
         """Helper function for the tests to create and sync ImportedFiles."""
         search_ranking = search_ranking or {}
         search_ignore = search_ignore or []
-        _create_imported_files_and_search_index(
+        return _create_imported_files_and_search_index(
             version=version,
-            build_id=build_id,
             search_ranking=search_ranking,
             search_ignore=search_ignore,
         )
@@ -70,27 +67,27 @@ def test_properly_created(self):
         """
         self.assertEqual(ImportedFile.objects.count(), 0)

-        self._manage_imported_files(version=self.version, build_id=1)
+        sync_id = self._manage_imported_files(version=self.version)
         self.assertEqual(ImportedFile.objects.count(), 3)
         self.assertEqual(
             set(HTMLFile.objects.all().values_list("path", flat=True)),
             {"index.html", "api/index.html", "404.html"},
         )

-        results = PageDocument().search().filter("term", build=1).execute()
+        results = PageDocument().search().filter("term", build=sync_id).execute()
         self.assertEqual(
             {result.path for result in results},
             {"index.html", "404.html", "test.html", "api/index.html"},
         )

-        self._manage_imported_files(version=self.version, build_id=2)
+        sync_id = self._manage_imported_files(version=self.version)
         self.assertEqual(ImportedFile.objects.count(), 3)
         self.assertEqual(
             set(HTMLFile.objects.all().values_list("path", flat=True)),
             {"index.html", "api/index.html", "404.html"},
         )

-        results = PageDocument().search().filter("term", build=2).execute()
+        results = PageDocument().search().filter("term", build=sync_id).execute()
         self.assertEqual(
             {result.path for result in results},
             {"index.html", "404.html", "test.html", "api/index.html"},
         )

@@ -98,29 +95,29 @@ def test_update_build(self):
         self.assertEqual(ImportedFile.objects.count(), 0)
-        self._manage_imported_files(self.version, build_id=1)
+        sync_id = self._manage_imported_files(self.version)
         for obj in ImportedFile.objects.all():
-            self.assertEqual(obj.build, 1)
+            self.assertEqual(obj.build, sync_id)

         results = PageDocument().search().filter().execute()
         self.assertEqual(len(results), 4)
         for result in results:
-            self.assertEqual(result.build, 1)
+            self.assertEqual(result.build, sync_id)

-        self._manage_imported_files(self.version, build_id=2)
+        sync_id = self._manage_imported_files(self.version)
         for obj in ImportedFile.objects.all():
-            self.assertEqual(obj.build, 2)
+            self.assertEqual(obj.build, sync_id)

         # NOTE: we can't test that the files from the previous build
         # were deleted, since deletion happens asynchronously.
-        results = PageDocument().search().filter("term", build=2).execute()
+        results = PageDocument().search().filter("term", build=sync_id).execute()
         self.assertEqual(len(results), 4)

     def test_page_default_rank(self):
         search_ranking = {}
         self.assertEqual(HTMLFile.objects.count(), 0)
-        self._manage_imported_files(
-            self.version, build_id=1, search_ranking=search_ranking
-        )
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )

         results = (
@@ -134,15 +131,15 @@ def test_page_default_rank(self):
         for result in results:
             self.assertEqual(result.project, self.project.slug)
             self.assertEqual(result.version, self.version.slug)
-            self.assertEqual(result.build, 1)
+            self.assertEqual(result.build, sync_id)
             self.assertEqual(result.rank, 0)

     def test_page_custom_rank_glob(self):
         search_ranking = {
             "*index.html": 5,
         }
-        self._manage_imported_files(
-            self.version, build_id=1, search_ranking=search_ranking
-        )
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )

         results = (
@@ -156,7 +153,7 @@ def test_page_custom_rank_glob(self):
         for result in results:
             self.assertEqual(result.project, self.project.slug)
             self.assertEqual(result.version, self.version.slug)
-            self.assertEqual(result.build, 1)
+            self.assertEqual(result.build, sync_id)
             if result.path.endswith("index.html"):
                 self.assertEqual(result.rank, 5, result.path)
             else:
@@ -167,8 +164,8 @@ def test_page_custom_rank_several(self):
             "test.html": 5,
             "api/index.html": 2,
         }
-        self._manage_imported_files(
-            self.version, build_id=1, search_ranking=search_ranking
-        )
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )

         results = (
@@ -182,7 +179,7 @@ def test_page_custom_rank_several(self):
         for result in results:
             self.assertEqual(result.project, self.project.slug)
             self.assertEqual(result.version, self.version.slug)
-            self.assertEqual(result.build, 1)
+            self.assertEqual(result.build, sync_id)
             if result.path == "test.html":
                 self.assertEqual(result.rank, 5)
             elif result.path == "api/index.html":
@@ -195,8 +192,8 @@ def test_page_custom_rank_precedence(self):
             "*.html": 5,
             "api/index.html": 2,
         }
-        self._manage_imported_files(
-            self.version, build_id=1, search_ranking=search_ranking
-        )
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )

         results = (
@@ -210,7 +207,7 @@ def test_page_custom_rank_precedence(self):
         for result in results:
             self.assertEqual(result.project, self.project.slug)
             self.assertEqual(result.version, self.version.slug)
-            self.assertEqual(result.build, 1)
+            self.assertEqual(result.build, sync_id)
             if result.path == "api/index.html":
                 self.assertEqual(result.rank, 2, result.path)
             else:
@@ -221,8 +218,8 @@ def test_page_custom_rank_precedence_inverted(self):
             "api/index.html": 2,
             "*.html": 5,
         }
-        self._manage_imported_files(
-            self.version, build_id=1, search_ranking=search_ranking
-        )
+        sync_id = self._manage_imported_files(
+            self.version, search_ranking=search_ranking
+        )

         results = (
@@ -236,14 +233,13 @@ def test_page_custom_rank_precedence_inverted(self):
         for result in results:
             self.assertEqual(result.project, self.project.slug)
             self.assertEqual(result.version, self.version.slug)
-            self.assertEqual(result.build, 1)
+            self.assertEqual(result.build, sync_id)
             self.assertEqual(result.rank, 5)

     def test_search_page_ignore(self):
         search_ignore = ["api/index.html"]
         self._manage_imported_files(
             self.version,
-            build_id=1,
             search_ignore=search_ignore,
         )

@@ -273,7 +269,7 @@ def test_update_content(self):
         with override_settings(DOCROOT=self.test_dir):
             self._copy_storage_dir()

-        self._manage_imported_files(self.version, build_id=1)
+        sync_id = self._manage_imported_files(self.version)
         self.assertEqual(ImportedFile.objects.count(), 3)
         document = (
             PageDocument()
             .search()
             .filter("term", project=self.project.slug)
             .filter("term", version=self.version.slug)
             .filter("term", path="test.html")
-            .filter("term", build=1)
+            .filter("term", build=sync_id)
             .execute()[0]
         )
         self.assertEqual(document.sections[0].content, "Woo")
@@ -292,7 +288,7 @@ def test_update_content(self):
         with override_settings(DOCROOT=self.test_dir):
             self._copy_storage_dir()

-        self._manage_imported_files(self.version, build_id=2)
+        sync_id = self._manage_imported_files(self.version)
         self.assertEqual(ImportedFile.objects.count(), 3)
         document = (
             PageDocument()
             .search()
             .filter("term", project=self.project.slug)
             .filter("term", version=self.version.slug)
             .filter("term", path="test.html")
-            .filter("term", build=2)
+            .filter("term", build=sync_id)
             .execute()[0]
         )
         self.assertEqual(document.sections[0].content, "Something Else")
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
index 3c2ed5d5ad6..4dd10b49e39 100644
--- a/readthedocs/search/utils.py
+++ b/readthedocs/search/utils.py
@@ -29,7 +29,7 @@ def index_objects(document, objects, index_name=None):


 def remove_indexed_files(
-    project_slug, version_slug=None, build_id=None, index_name=None
+    project_slug, version_slug=None, sync_id=None, index_name=None
 ):
     """
     Remove files from `version_slug` of `project_slug` from the search index.

@@ -62,8 +62,8 @@
         documents = document().search().filter("term", project=project_slug)
         if version_slug:
             documents = documents.filter("term", version=version_slug)
-        if build_id:
-            documents = documents.exclude("term", build=build_id)
+        if sync_id:
+            documents = documents.exclude("term", build=sync_id)
         documents.delete()
     except Exception:
         log.exception("Unable to delete a subset of files. Continuing.")

From dd9e226de2760bb6c6a5efd3ba7113549d050661 Mon Sep 17 00:00:00 2001
From: Santos Gallegos
Date: Thu, 14 Sep 2023 10:34:32 -0500
Subject: [PATCH 12/12] Updates from review

---
 readthedocs/projects/tasks/search.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/readthedocs/projects/tasks/search.py b/readthedocs/projects/tasks/search.py
index bd67f206b59..6b7127507e9 100644
--- a/readthedocs/projects/tasks/search.py
+++ b/readthedocs/projects/tasks/search.py
@@ -174,6 +174,7 @@ def _create_imported_files_and_search_index(
     # A sync ID is a number different from the current `build` attribute (pending rename);
     # it distinguishes the files of the current sync from those of the previous one.
     # This makes it easy to delete the previous files from the DB and ES.
+    # See https://github.com/readthedocs/readthedocs.org/issues/10734.
     imported_file = version.imported_files.first()
     sync_id = imported_file.build + 1 if imported_file else 1

@@ -210,8 +211,6 @@ def _create_imported_files_and_search_index(
         # If the file is ignored, we don't need to check for its ranking.
         if not skip_search_index:
             # Last pattern to match takes precedence
-            # XXX: see if we can implement another type of precedence,
-            # like the longest pattern.
             for pattern, rank in reverse_rankings:
                 if fnmatch(relpath, pattern):
                     page_rank = rank
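
A note on the final design. The sync-ID bookkeeping that PATCH 11 introduces and PATCH 12 documents boils down to two steps: tag every newly indexed file with a fresh sync ID, then delete everything that does not carry that ID. The sketch below is illustrative only: it is plain Python with in-memory lists standing in for the DB and Elasticsearch, and `sync_files` and `next_sync_id` are hypothetical helper names, not functions from the patches.

    # In-memory stand-ins (hypothetical): "database" plays the role of the
    # ImportedFile table, "search_index" the role of the Elasticsearch index.
    # Every entry is tagged with the sync ID it was created under.

    def next_sync_id(database):
        # Mirrors `imported_file.build + 1 if imported_file else 1` from PATCH 11.
        builds = [row["build"] for row in database]
        return max(builds) + 1 if builds else 1

    def sync_files(database, search_index, paths):
        sync_id = next_sync_id(database)

        # 1. Index all new files first, tagged with the fresh sync ID.
        for path in paths:
            search_index.append({"path": path, "build": sync_id})

        # 2. Persist only the subset the DB cares about (index and 404 pages).
        for path in paths:
            if path == "404.html" or path.endswith("index.html"):
                database.append({"path": path, "build": sync_id})

        # 3. Delete everything that does NOT carry the current sync ID;
        #    this drops the previous sync, plus any leftovers from failed runs.
        search_index[:] = [doc for doc in search_index if doc["build"] == sync_id]
        database[:] = [row for row in database if row["build"] == sync_id]

        return sync_id

    database, search_index = [], []
    assert sync_files(database, search_index, ["index.html", "404.html"]) == 1
    assert sync_files(database, search_index, ["index.html"]) == 2
    assert {doc["build"] for doc in search_index} == {2}

The design choice worth noting is that cleanup is expressed as "exclude the current sync ID" (see the `documents.exclude("term", build=sync_id)` call in PATCH 11) rather than "delete the previous ID": stale entries from interrupted or repeated syncs are removed as well, since they can never carry the newest ID.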