readthedocs · stsewd · Sep 9, 2020 · Jul 21, 2020 · Sep 9, 2020 · Sep 9, 2020
diff --git a/docs/config-file/v2.rst b/docs/config-file/v2.rst
@@ -468,6 +468,8 @@ Settings for more control over :doc:`/server-side-search`.
       ranking:
         api/v1/*: -1
         api/v2/*: 4
+      ignore:
+        - 404.html
 
 search.ranking
 ``````````````
@@ -490,14 +492,17 @@ Pages with a rank closer to -10 will appear further down the list of results,
 and pages with a rank closer to 10 will appear higher in the list of results.
 Note that 0 means *normal rank*, not *no rank*.
 
+If you are looking to completely ignore a page,
+check :ref:`config-file/v2:search.ignore`.
+
 .. code-block:: yaml
 
    version: 2
 
    search:
       ranking:
         # Match a single file
-        tutorial.hml: 2
+        tutorial.html: 2
 
         # Match all files under the api/v1 directory
         api/v1/*: -5
@@ -514,6 +519,43 @@ Note that 0 means *normal rank*, not *no rank*.
    Is better to decrease the rank of pages you want to deprecate,
    rather than increasing the rank of the other pages.
 
+search.ignore
+`````````````
+
+Don't index files matching a pattern.
+This is, you won't see search results from these files.
+
+:Type: ``list`` of patterns
+:Default: ``['search.html', 'search/index.html', '404.html', '404/index.html']``
+
+Patterns are matched against the final html pages produced by the build
+(you should try to match `index.html`, not `docs/index.rst`).
+Patterns can include some special characters:
+
+- ``*`` matches everything
+- ``?`` matches any single character
+- ``[seq]`` matches any character in ``seq``
+
+.. code-block:: yaml
+
+   version: 2
+
+   search:
+      ignore:
+        # Ignore a single file
+        - 404.html
+
+        # Ignore all files under the search/ directory
+        - search/*
+
+        # Ignore all files that end with ref.html
+        - '*/ref.html'
+
+.. note::
+
+   Since Read the Docs fallbacks to the original search engine when no results are found,
+   you may still see search results from ignored pages.
+
 Schema
 ------
 

diff --git a/docs/server-side-search.rst b/docs/server-side-search.rst
@@ -40,6 +40,10 @@ Special query syntax for more specific results.
    We support a full range of search queries.
    You can see some examples in our :ref:`guides/searching-with-readthedocs:search query syntax` guide.
 
+Configurable.
+   Tweak search results according to your needs using a
+   :ref:`configuration file <config-file/v2:search>`.
+
 ..
    Code object searching
       With the user of :doc:`Sphinx Domains <sphinx:/usage/restructuredtext/domains>` we are able to automatically provide direct search results to your Code objects.

diff --git a/readthedocs/config/config.py b/readthedocs/config/config.py
@@ -656,7 +656,7 @@ def submodules(self):
 
     @property
     def search(self):
-        return Search(ranking={})
+        return Search(ranking={}, ignore=[])
 
 
 class BuildConfigV2(BuildConfigBase):
@@ -1023,7 +1023,8 @@ def validate_search(self):
         """
         Validates the search key.
 
-        - Ranking is a map of path patterns to a rank.
+        - ``ranking`` is a map of path patterns to a rank.
+        - ``ignore`` is a list of patterns.
         - The path pattern supports basic globs (*, ?, [seq]).
         - The rank can be a integer number between -10 and 10.
         """
@@ -1046,6 +1047,22 @@ def validate_search(self):
 
             search['ranking'] = final_ranking
 
+        with self.catch_validation_error('search.ignore'):
+            ignore_default = [
+                'search.html',
+                'search/index.html',
+                '404.html',
+                '404/index.html',
+            ]
+            search_ignore = self.pop_config('search.ignore', ignore_default)
+            validate_list(search_ignore)
+
+            final_ignore = [
+                validate_path_pattern(pattern)
+                for pattern in search_ignore
+            ]
+            search['ignore'] = final_ignore
+
         return search
 
     def validate_keys(self):

diff --git a/readthedocs/config/models.py b/readthedocs/config/models.py
@@ -72,4 +72,4 @@ class Submodules(Base):
 
 class Search(Base):
 
-    __slots__ = ('ranking',)
+    __slots__ = ('ranking', 'ignore')
diff --git a/readthedocs/config/tests/test_config.py b/readthedocs/config/tests/test_config.py
@@ -767,6 +767,7 @@ def test_as_dict(tmpdir):
         },
         'search': {
             'ranking': {},
+            'ignore': [],
         },
     }
     assert build.as_dict() == expected_dict
@@ -1908,6 +1909,45 @@ def test_search_ranking_normilize_path(self, path, expected):
         build.validate()
         assert build.search.ranking == {expected: 1}
 
+    @pytest.mark.parametrize(
+        'value',
+        [
+            'invalid',
+            True,
+            0,
+            [2, 3],
+            {'foo/bar': 11},
+        ],
+    )
+    def test_search_ignore_invalid_type(self, value):
+        build = self.get_build_config({
+            'search': {'ignore': value},
+        })
+        with raises(InvalidConfig) as excinfo:
+            build.validate()
+        assert excinfo.value.key == 'search.ignore'
+
+    @pytest.mark.parametrize('path, expected', [
+        ('/foo/bar', 'foo/bar'),
+        ('///foo//bar', 'foo/bar'),
+        ('///foo//bar/', 'foo/bar'),
+        ('/foo/bar/../', 'foo'),
+        ('/foo*', 'foo*'),
+        ('/foo/bar/*', 'foo/bar/*'),
+        ('/foo/bar?/*', 'foo/bar?/*'),
+        ('foo/[bc]ar/*/', 'foo/[bc]ar/*'),
+        ('*', '*'),
+        ('index.html', 'index.html'),
+    ])
+    def test_search_ignore_valid_type(self, path, expected):
+        build = self.get_build_config({
+            'search': {
+                'ignore': [path],
+            },
+        })
+        build.validate()
+        assert build.search.ignore == [expected]
+
     @pytest.mark.parametrize('value,key', [
         ({'typo': 'something'}, 'typo'),
         (
@@ -2048,6 +2088,12 @@ def test_as_dict(self, tmpdir):
             },
             'search': {
                 'ranking': {},
+                'ignore': [
+                    'search.html',
+                    'search/index.html',
+                    '404.html',
+                    '404/index.html',
+                ],
             },
         }
         assert build.as_dict() == expected_dict
diff --git a/readthedocs/projects/migrations/0061_add_imported_file_ignore.py b/readthedocs/projects/migrations/0061_add_imported_file_ignore.py
@@ -0,0 +1,18 @@
+# Generated by Django 2.2.12 on 2020-07-21 18:21
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('projects', '0060_make_rank_not_null'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='importedfile',
+            name='ignore',
+            field=models.BooleanField(null=True, verbose_name='Ignore this file from operations like indexing'),
+        ),
+    ]
diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py
@@ -1358,6 +1358,12 @@ class ImportedFile(models.Model):
         default=0,
         validators=[MinValueValidator(-10), MaxValueValidator(10)],
     )
+    ignore = models.BooleanField(
+        _('Ignore this file from operations like indexing'),
+        # default=False,
+        # TODO: remove after migration
+        null=True,
+    )
 
     def get_absolute_url(self):
         return resolve(

diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py
@@ -1136,6 +1136,7 @@ def update_app_instances(
             commit=self.build['commit'],
             build=self.build['id'],
             search_ranking=self.config.search.ranking,
+            search_ignore=self.config.search.ignore,
         )
 
     def setup_python_environment(self):
@@ -1278,7 +1279,7 @@ def is_type_sphinx(self):
 
 # Web tasks
 @app.task(queue='reindex')
-def fileify(version_pk, commit, build, search_ranking):
+def fileify(version_pk, commit, build, search_ranking, search_ignore):
     """
     Create ImportedFile objects for all of a version's files.
 
@@ -1317,6 +1318,7 @@ def fileify(version_pk, commit, build, search_ranking):
             commit=commit,
             build=build,
             search_ranking=search_ranking,
+            search_ignore=search_ignore,
         )
     except Exception:
         changed_files = set()
@@ -1494,7 +1496,7 @@ def clean_build(version_pk):
         return True
 
 
-def _create_imported_files(*, version, commit, build, search_ranking):
+def _create_imported_files(*, version, commit, build, search_ranking, search_ignore):
     """
     Create imported files for version.
 
@@ -1564,6 +1566,12 @@ def _create_imported_files(*, version, commit, build, search_ranking):
                     page_rank = rank
                     break
 
+            ignore = False
+            for pattern in search_ignore:
+                if fnmatch(relpath, pattern):
+                    ignore = True
+                    break
+
             # Create imported files from new build
             model_class.objects.create(
                 project=version.project,
@@ -1574,6 +1582,7 @@ def _create_imported_files(*, version, commit, build, search_ranking):
                 rank=page_rank,
                 commit=commit,
                 build=build,
+                ignore=ignore,
             )
 
     # This signal is used for clearing the CDN,

diff --git a/readthedocs/rtd_tests/fixtures/spec/v2/schema.yml b/readthedocs/rtd_tests/fixtures/spec/v2/schema.yml
@@ -134,3 +134,7 @@ search:
   # Map of patterns to ranks
   # Default: {}
   ranking: map(str(), int(min=-10, max=10), required=False)
+
+  # List of patterns
+  # Default: ['search.html', 'search/index.html', '404.html', '404/index.html']
+  ignore: list(str(), required=False)
diff --git a/readthedocs/rtd_tests/tests/test_celery.py b/readthedocs/rtd_tests/tests/test_celery.py
@@ -327,7 +327,13 @@ def public_task_exception():
     @patch('readthedocs.builds.managers.log')
     def test_fileify_logging_when_wrong_version_pk(self, mock_logger):
         self.assertFalse(Version.objects.filter(pk=345343).exists())
-        tasks.fileify(version_pk=345343, commit=None, build=1, search_ranking={})
+        tasks.fileify(
+            version_pk=345343,
+            commit=None,
+            build=1,
+            search_ranking={},
+            search_ignore=[],
+        )
         mock_logger.warning.assert_called_with("Version not found for given kwargs. {'pk': 345343}")
 
     @patch('readthedocs.oauth.services.github.GitHubService.send_build_status')

diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py
@@ -28,14 +28,23 @@ def setUp(self):
         self.test_dir = os.path.join(base_dir, 'files')
         self._copy_storage_dir()
 
-    def _manage_imported_files(self, version, commit, build, search_ranking=None):
+    def _manage_imported_files(
+        self,
+        version,
+        commit,
+        build,
+        search_ranking=None,
+        search_ignore=None
+    ):
         """Helper function for the tests to create and sync ImportedFiles."""
         search_ranking = search_ranking or {}
+        search_ignore = search_ignore or []
         _create_imported_files(
             version=version,
             commit=commit,
             build=build,
             search_ranking=search_ranking,
+            search_ignore=search_ignore,
         )
         _sync_imported_files(version, build, set())
 
@@ -131,6 +140,23 @@ def test_page_custom_rank_precedence_inverted(self):
         self.assertEqual(file_api.rank, 5)
         self.assertEqual(file_test.rank, 5)
 
+    def test_search_page_ignore(self):
+        search_ignore = [
+            'api/index.html'
+        ]
+        self._manage_imported_files(
+            self.version,
+            'commit01',
+            1,
+            search_ignore=search_ignore,
+        )
+
+        self.assertEqual(HTMLFile.objects.count(), 2)
+        file_api = HTMLFile.objects.get(path='api/index.html')
+        file_test = HTMLFile.objects.get(path='test.html')
+        self.assertTrue(file_api.ignore)
+        self.assertFalse(file_test.ignore)
+
     def test_update_content(self):
         test_dir = os.path.join(base_dir, 'files')
         self.assertEqual(ImportedFile.objects.count(), 0)
@@ -195,6 +221,7 @@ def test_create_intersphinx_data(self, mock_exists):
                 commit='commit01',
                 build=1,
                 search_ranking={},
+                search_ignore=[],
             )
             _create_intersphinx_data(self.version, 'commit01', 1)
 

diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py
@@ -140,23 +140,16 @@ def prepare_domains(self, html_file):
         return all_domains
 
     def get_queryset(self):
-        """Overwrite default queryset to filter certain files to index."""
-        queryset = super().get_queryset()
-
-        # Do not index files from external versions
-        queryset = queryset.internal().all()
-
-        # TODO: Make this smarter
-        # This was causing issues excluding some valid user documentation pages
-        # excluded_files = [
-        #     'search.html',
-        #     'genindex.html',
-        #     'py-modindex.html',
-        #     'search/index.html',
-        #     'genindex/index.html',
-        #     'py-modindex/index.html',
-        # ]
-        # for ending in excluded_files:
-        #     queryset = queryset.exclude(path=ending)
+        """
+        Ignore certain files from indexing.
 
+        - Files from external versions
+        - Ignored files
+        """
+        queryset = super().get_queryset()
+        queryset = (
+            queryset
+            .internal()
+            .exclude(ignore=True)
+        )
         return queryset
Original file line number	Diff line number	Diff line change
Expand Up		@@ -72,4 +72,4 @@ class Submodules(Base):

		class Search(Base):

		__slots__ = ('ranking',)
		__slots__ = ('ranking', 'ignore')