From b66e8174591b2b2f70d79769611759248ccac1f0 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Tue, 21 Jul 2020 14:14:15 -0500 Subject: [PATCH 1/3] Search: allow ignoring files from indexing Closes https://github.com/readthedocs/readthedocs.org/issues/5247 Ref https://github.com/readthedocs/readthedocs.org/issues/7217 --- docs/config-file/v2.rst | 44 ++++++++++++++- docs/server-side-search.rst | 4 ++ readthedocs/config/config.py | 21 ++++++- readthedocs/config/models.py | 2 +- readthedocs/config/tests/test_config.py | 46 ++++++++++++++++ .../0058_add_imported_file_ignore.py | 18 ++++++ readthedocs/projects/models.py | 6 ++ readthedocs/projects/tasks.py | 13 ++++- .../rtd_tests/fixtures/spec/v2/schema.yml | 4 ++ readthedocs/rtd_tests/tests/test_celery.py | 8 ++- .../rtd_tests/tests/test_imported_file.py | 29 +++++++++- readthedocs/search/documents.py | 29 ++++------ readthedocs/search/tests/conftest.py | 1 + readthedocs/search/tests/test_api.py | 55 ++++++++++++++++++- 14 files changed, 253 insertions(+), 27 deletions(-) create mode 100644 readthedocs/projects/migrations/0058_add_imported_file_ignore.py diff --git a/docs/config-file/v2.rst b/docs/config-file/v2.rst index 18487831589..b692aadc8e6 100644 --- a/docs/config-file/v2.rst +++ b/docs/config-file/v2.rst @@ -466,6 +466,8 @@ Settings for more control over :doc:`/server-side-search`. ranking: api/v1/*: -1 api/v2/*: 4 + ignore: + - 404.html search.ranking `````````````` @@ -488,6 +490,9 @@ Pages with a rank closer to -10 will appear further down the list of results, and pages with a rank closer to 10 will appear higher in the list of results. Note that 0 means *normal rank*, not *no rank*. +If you are looking to completely ignore a page, +check :ref:`config-file/v2:search.ignore`. + .. code-block:: yaml version: 2 @@ -495,7 +500,7 @@ Note that 0 means *normal rank*, not *no rank*. search: ranking: # Match a single file - tutorial.hml: 2 + tutorial.html: 2 # Match all files under the api/v1 directory api/v1/*: -5 @@ -512,6 +517,43 @@ Note that 0 means *normal rank*, not *no rank*. Is better to decrease the rank of pages you want to deprecate, rather than increasing the rank of the other pages. +search.ignore +````````````` + +Don't index files matching a pattern. +This is, you won't see search results from these files. + +:Type: ``list`` of patterns +:Default: ``['search.html', 'search/index.html', '404.html', '404/index.html']`` + +Patterns are matched against the final html pages produced by the build +(you should try to match `index.html`, not `docs/index.rst`). +Patterns can include some special characters: + +- ``*`` matches everything +- ``?`` matches any single character +- ``[seq]`` matches any character in ``seq`` + +.. code-block:: yaml + + version: 2 + + search: + ignore: + # Ignore a single file + - 404.html + + # Ignore all files under the search/ directory + - search/* + + # Ignore all files that end with ref.html + - '*/ref.html' + +.. note:: + + Since Read the Docs fallbacks to the original search engine when no results are found, + you may still see search results from ignored pages. + Schema ------ diff --git a/docs/server-side-search.rst b/docs/server-side-search.rst index fabd2f79814..49bbd1c3f26 100644 --- a/docs/server-side-search.rst +++ b/docs/server-side-search.rst @@ -40,6 +40,10 @@ Special query syntax for more specific results. We support a full range of search queries. You can see some examples in our :ref:`guides/searching-with-readthedocs:search query syntax` guide. +Configurable. + Tweak search results according to your needs using a + :ref:`configuration file `. + .. Code object searching With the user of :doc:`Sphinx Domains ` we are able to automatically provide direct search results to your Code objects. diff --git a/readthedocs/config/config.py b/readthedocs/config/config.py index 244135277eb..9166d171fc8 100644 --- a/readthedocs/config/config.py +++ b/readthedocs/config/config.py @@ -656,7 +656,7 @@ def submodules(self): @property def search(self): - return Search(ranking={}) + return Search(ranking={}, ignore=[]) class BuildConfigV2(BuildConfigBase): @@ -1023,7 +1023,8 @@ def validate_search(self): """ Validates the search key. - - Ranking is a map of path patterns to a rank. + - ``ranking`` is a map of path patterns to a rank. + - ``ignore`` is a list of patterns. - The path pattern supports basic globs (*, ?, [seq]). - The rank can be a integer number between -10 and 10. """ @@ -1046,6 +1047,22 @@ def validate_search(self): search['ranking'] = final_ranking + with self.catch_validation_error('search.ignore'): + ignore_default = [ + 'search.html', + 'search/index.html', + '404.html', + '404/index.html', + ] + search_ignore = self.pop_config('search.ignore', ignore_default) + validate_list(search_ignore) + + final_ignore = [ + validate_path_pattern(pattern) + for pattern in search_ignore + ] + search['ignore'] = final_ignore + return search def validate_keys(self): diff --git a/readthedocs/config/models.py b/readthedocs/config/models.py index 55932e991d4..998da8498be 100644 --- a/readthedocs/config/models.py +++ b/readthedocs/config/models.py @@ -72,4 +72,4 @@ class Submodules(Base): class Search(Base): - __slots__ = ('ranking',) + __slots__ = ('ranking', 'ignore') diff --git a/readthedocs/config/tests/test_config.py b/readthedocs/config/tests/test_config.py index 3ee20d1653c..a2834c78305 100644 --- a/readthedocs/config/tests/test_config.py +++ b/readthedocs/config/tests/test_config.py @@ -767,6 +767,7 @@ def test_as_dict(tmpdir): }, 'search': { 'ranking': {}, + 'ignore': [], }, } assert build.as_dict() == expected_dict @@ -1908,6 +1909,45 @@ def test_search_ranking_normilize_path(self, path, expected): build.validate() assert build.search.ranking == {expected: 1} + @pytest.mark.parametrize( + 'value', + [ + 'invalid', + True, + 0, + [2, 3], + {'foo/bar': 11}, + ], + ) + def test_search_ignore_invalid_type(self, value): + build = self.get_build_config({ + 'search': {'ignore': value}, + }) + with raises(InvalidConfig) as excinfo: + build.validate() + assert excinfo.value.key == 'search.ignore' + + @pytest.mark.parametrize('path, expected', [ + ('/foo/bar', 'foo/bar'), + ('///foo//bar', 'foo/bar'), + ('///foo//bar/', 'foo/bar'), + ('/foo/bar/../', 'foo'), + ('/foo*', 'foo*'), + ('/foo/bar/*', 'foo/bar/*'), + ('/foo/bar?/*', 'foo/bar?/*'), + ('foo/[bc]ar/*/', 'foo/[bc]ar/*'), + ('*', '*'), + ('index.html', 'index.html'), + ]) + def test_search_ignore_valid_type(self, path, expected): + build = self.get_build_config({ + 'search': { + 'ignore': [path], + }, + }) + build.validate() + assert build.search.ignore == [expected] + @pytest.mark.parametrize('value,key', [ ({'typo': 'something'}, 'typo'), ( @@ -2048,6 +2088,12 @@ def test_as_dict(self, tmpdir): }, 'search': { 'ranking': {}, + 'ignore': [ + 'search.html', + 'search/index.html', + '404.html', + '404/index.html', + ], }, } assert build.as_dict() == expected_dict diff --git a/readthedocs/projects/migrations/0058_add_imported_file_ignore.py b/readthedocs/projects/migrations/0058_add_imported_file_ignore.py new file mode 100644 index 00000000000..167611f21e8 --- /dev/null +++ b/readthedocs/projects/migrations/0058_add_imported_file_ignore.py @@ -0,0 +1,18 @@ +# Generated by Django 2.2.12 on 2020-07-21 18:21 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('projects', '0057_add_page_rank'), + ] + + operations = [ + migrations.AddField( + model_name='importedfile', + name='ignore', + field=models.BooleanField(null=True, verbose_name='Ignore this file from operations like indexing'), + ), + ] diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index fd89f761733..8ca2b845d76 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -1360,6 +1360,12 @@ class ImportedFile(models.Model): null=True, validators=[MinValueValidator(-10), MaxValueValidator(10)], ) + ignore = models.BooleanField( + _('Ignore this file from operations like indexing'), + # default=False, + # TODO: remove after migration + null=True, + ) def get_absolute_url(self): return resolve( diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index 4df043c0096..f6fdebd75c3 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -1136,6 +1136,7 @@ def update_app_instances( commit=self.build['commit'], build=self.build['id'], search_ranking=self.config.search.ranking, + search_ignore=self.config.search.ignore, ) def setup_python_environment(self): @@ -1278,7 +1279,7 @@ def is_type_sphinx(self): # Web tasks @app.task(queue='reindex') -def fileify(version_pk, commit, build, search_ranking): +def fileify(version_pk, commit, build, search_ranking, search_ignore): """ Create ImportedFile objects for all of a version's files. @@ -1317,6 +1318,7 @@ def fileify(version_pk, commit, build, search_ranking): commit=commit, build=build, search_ranking=search_ranking, + search_ignore=search_ignore, ) except Exception: changed_files = set() @@ -1494,7 +1496,7 @@ def clean_build(version_pk): return True -def _create_imported_files(*, version, commit, build, search_ranking): +def _create_imported_files(*, version, commit, build, search_ranking, search_ignore): """ Create imported files for version. @@ -1564,6 +1566,12 @@ def _create_imported_files(*, version, commit, build, search_ranking): page_rank = rank break + ignore = False + for pattern in search_ignore: + if fnmatch(relpath, pattern): + ignore = True + break + # Create imported files from new build model_class.objects.create( project=version.project, @@ -1574,6 +1582,7 @@ def _create_imported_files(*, version, commit, build, search_ranking): rank=page_rank, commit=commit, build=build, + ignore=ignore, ) # This signal is used for clearing the CDN, diff --git a/readthedocs/rtd_tests/fixtures/spec/v2/schema.yml b/readthedocs/rtd_tests/fixtures/spec/v2/schema.yml index 2e34d08c7f0..f88db8025d1 100644 --- a/readthedocs/rtd_tests/fixtures/spec/v2/schema.yml +++ b/readthedocs/rtd_tests/fixtures/spec/v2/schema.yml @@ -134,3 +134,7 @@ search: # Map of patterns to ranks # Default: {} ranking: map(str(), int(min=-10, max=10), required=False) + + # List of patterns + # Default: ['search.html', 'search/index.html', '404.html', '404/index.html'] + ignore: list(str(), required=False) diff --git a/readthedocs/rtd_tests/tests/test_celery.py b/readthedocs/rtd_tests/tests/test_celery.py index 5ff0eaf1cd9..aae93822b8e 100644 --- a/readthedocs/rtd_tests/tests/test_celery.py +++ b/readthedocs/rtd_tests/tests/test_celery.py @@ -327,7 +327,13 @@ def public_task_exception(): @patch('readthedocs.builds.managers.log') def test_fileify_logging_when_wrong_version_pk(self, mock_logger): self.assertFalse(Version.objects.filter(pk=345343).exists()) - tasks.fileify(version_pk=345343, commit=None, build=1, search_ranking={}) + tasks.fileify( + version_pk=345343, + commit=None, + build=1, + search_ranking={}, + search_ignore=[], + ) mock_logger.warning.assert_called_with("Version not found for given kwargs. {'pk': 345343}") @patch('readthedocs.oauth.services.github.GitHubService.send_build_status') diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py index 1452eeaeaaf..3633ffa3fca 100644 --- a/readthedocs/rtd_tests/tests/test_imported_file.py +++ b/readthedocs/rtd_tests/tests/test_imported_file.py @@ -28,14 +28,23 @@ def setUp(self): self.test_dir = os.path.join(base_dir, 'files') self._copy_storage_dir() - def _manage_imported_files(self, version, commit, build, search_ranking=None): + def _manage_imported_files( + self, + version, + commit, + build, + search_ranking=None, + search_ignore=None + ): """Helper function for the tests to create and sync ImportedFiles.""" search_ranking = search_ranking or {} + search_ignore = search_ignore or [] _create_imported_files( version=version, commit=commit, build=build, search_ranking=search_ranking, + search_ignore=search_ignore, ) _sync_imported_files(version, build, set()) @@ -131,6 +140,23 @@ def test_page_custom_rank_precedence_inverted(self): self.assertEqual(file_api.rank, 5) self.assertEqual(file_test.rank, 5) + def test_search_page_ignore(self): + search_ignore = [ + 'api/index.html' + ] + self._manage_imported_files( + self.version, + 'commit01', + 1, + search_ignore=search_ignore, + ) + + self.assertEqual(HTMLFile.objects.count(), 2) + file_api = HTMLFile.objects.get(path='api/index.html') + file_test = HTMLFile.objects.get(path='test.html') + self.assertTrue(file_api.ignore) + self.assertFalse(file_test.ignore) + def test_update_content(self): test_dir = os.path.join(base_dir, 'files') self.assertEqual(ImportedFile.objects.count(), 0) @@ -195,6 +221,7 @@ def test_create_intersphinx_data(self, mock_exists): commit='commit01', build=1, search_ranking={}, + search_ignore=[], ) _create_intersphinx_data(self.version, 'commit01', 1) diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py index 776e16338d5..0e153dd0dcb 100644 --- a/readthedocs/search/documents.py +++ b/readthedocs/search/documents.py @@ -141,23 +141,16 @@ def prepare_domains(self, html_file): return all_domains def get_queryset(self): - """Overwrite default queryset to filter certain files to index.""" - queryset = super().get_queryset() - - # Do not index files from external versions - queryset = queryset.internal().all() - - # TODO: Make this smarter - # This was causing issues excluding some valid user documentation pages - # excluded_files = [ - # 'search.html', - # 'genindex.html', - # 'py-modindex.html', - # 'search/index.html', - # 'genindex/index.html', - # 'py-modindex/index.html', - # ] - # for ending in excluded_files: - # queryset = queryset.exclude(path=ending) + """ + Ignore certain files from indexing. + - Files from external versions + - Ignored files + """ + queryset = super().get_queryset() + queryset = ( + queryset + .internal() + .exclude(ignore=True) + ) return queryset diff --git a/readthedocs/search/tests/conftest.py b/readthedocs/search/tests/conftest.py index 97bba15dc45..4a796b20730 100644 --- a/readthedocs/search/tests/conftest.py +++ b/readthedocs/search/tests/conftest.py @@ -47,6 +47,7 @@ def all_projects(es_index, mock_processed_json, db, settings): version=version, name=file_name, path=file_name, + build=1, ) # creating sphinx domain test objects diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py index 4e90ec40525..f01868a6d22 100644 --- a/readthedocs/search/tests/test_api.py +++ b/readthedocs/search/tests/test_api.py @@ -14,7 +14,7 @@ SPHINX_HTMLDIR, SPHINX_SINGLEHTML, ) -from readthedocs.projects.models import HTMLFile, Project, Feature +from readthedocs.projects.models import Feature, HTMLFile, Project from readthedocs.search.api import PageSearchAPIView from readthedocs.search.documents import PageDocument from readthedocs.search.tests.utils import ( @@ -22,6 +22,7 @@ SECTION_FIELDS, get_search_query_from_project_file, ) +from readthedocs.search.utils import index_new_files, remove_indexed_files OLD_TYPES = { 'domain': 'domains', @@ -613,6 +614,58 @@ def test_search_custom_ranking(self, api_client): assert results[0]['full_path'] == 'index.html' assert results[1]['full_path'] == 'guides/index.html' + def test_search_ignore(self, api_client): + project = Project.objects.get(slug='docs') + version = project.versions.all().first() + + page_index = HTMLFile.objects.get(path='index.html') + page_guides = HTMLFile.objects.get(path='guides/index.html') + + search_params = { + 'project': project.slug, + 'version': version.slug, + 'q': '"content from"', + } + + # Query with files not ignored. + assert page_index.ignore is None + assert page_guides.ignore is None + + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + + results = resp.data['results'] + assert len(results) == 2 + assert results[0]['full_path'] == 'index.html' + assert results[1]['full_path'] == 'guides/index.html' + + # Query with guides/index.html ignored. + page_guides.ignore = True + page_guides.save() + + remove_indexed_files(HTMLFile, project.slug, version.slug) + index_new_files(HTMLFile, version, page_index.build) + + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + + results = resp.data['results'] + assert len(results) == 1 + assert results[0]['full_path'] == 'index.html' + + # Query with index.html and guides/index.html ignored. + page_index.ignore = True + page_index.save() + + remove_indexed_files(HTMLFile, project.slug, version.slug) + index_new_files(HTMLFile, version, page_index.build) + + resp = self.get_search(api_client, search_params) + assert resp.status_code == 200 + + results = resp.data['results'] + assert len(results) == 0 + class TestDocumentSearch(BaseTestDocumentSearch): From 27717cbc2b1fb1bbab1af81c97b42b6f93490e85 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Wed, 9 Sep 2020 15:57:09 -0500 Subject: [PATCH 2/3] Update migration --- ...imported_file_ignore.py => 0061_add_imported_file_ignore.py} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename readthedocs/projects/migrations/{0058_add_imported_file_ignore.py => 0061_add_imported_file_ignore.py} (89%) diff --git a/readthedocs/projects/migrations/0058_add_imported_file_ignore.py b/readthedocs/projects/migrations/0061_add_imported_file_ignore.py similarity index 89% rename from readthedocs/projects/migrations/0058_add_imported_file_ignore.py rename to readthedocs/projects/migrations/0061_add_imported_file_ignore.py index 167611f21e8..6f03f7dd5bd 100644 --- a/readthedocs/projects/migrations/0058_add_imported_file_ignore.py +++ b/readthedocs/projects/migrations/0061_add_imported_file_ignore.py @@ -6,7 +6,7 @@ class Migration(migrations.Migration): dependencies = [ - ('projects', '0057_add_page_rank'), + ('projects', '0060_make_rank_not_null'), ] operations = [ From 84987796e76eb301a3e05f0d047ced44315b811a Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Wed, 9 Sep 2020 16:16:48 -0500 Subject: [PATCH 3/3] Update tests --- readthedocs/search/tests/test_api.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py index 2d66f0aae79..58abfb37f4f 100644 --- a/readthedocs/search/tests/test_api.py +++ b/readthedocs/search/tests/test_api.py @@ -632,8 +632,8 @@ def test_search_ignore(self, api_client): results = resp.data['results'] assert len(results) == 2 - assert results[0]['full_path'] == 'index.html' - assert results[1]['full_path'] == 'guides/index.html' + assert results[0]['path'] == '/en/latest/index.html' + assert results[1]['path'] == '/en/latest/guides/index.html' # Query with guides/index.html ignored. page_guides.ignore = True @@ -647,7 +647,7 @@ def test_search_ignore(self, api_client): results = resp.data['results'] assert len(results) == 1 - assert results[0]['full_path'] == 'index.html' + assert results[0]['path'] == '/en/latest/index.html' # Query with index.html and guides/index.html ignored. page_index.ignore = True