From b66e8174591b2b2f70d79769611759248ccac1f0 Mon Sep 17 00:00:00 2001
From: Santos Gallegos <stsewd@protonmail.com>
Date: Tue, 21 Jul 2020 14:14:15 -0500
Subject: [PATCH 1/3] Search: allow ignoring files from indexing

Closes https://github.com/readthedocs/readthedocs.org/issues/5247
Ref https://github.com/readthedocs/readthedocs.org/issues/7217
---
 docs/config-file/v2.rst                       | 44 ++++++++++++++-
 docs/server-side-search.rst                   |  4 ++
 readthedocs/config/config.py                  | 21 ++++++-
 readthedocs/config/models.py                  |  2 +-
 readthedocs/config/tests/test_config.py       | 46 ++++++++++++++++
 .../0058_add_imported_file_ignore.py          | 18 ++++++
 readthedocs/projects/models.py                |  6 ++
 readthedocs/projects/tasks.py                 | 13 ++++-
 .../rtd_tests/fixtures/spec/v2/schema.yml     |  4 ++
 readthedocs/rtd_tests/tests/test_celery.py    |  8 ++-
 .../rtd_tests/tests/test_imported_file.py     | 29 +++++++++-
 readthedocs/search/documents.py               | 29 ++++------
 readthedocs/search/tests/conftest.py          |  1 +
 readthedocs/search/tests/test_api.py          | 55 ++++++++++++++++++-
 14 files changed, 253 insertions(+), 27 deletions(-)
 create mode 100644 readthedocs/projects/migrations/0058_add_imported_file_ignore.py

diff --git a/docs/config-file/v2.rst b/docs/config-file/v2.rst
index 18487831589..b692aadc8e6 100644
--- a/docs/config-file/v2.rst
+++ b/docs/config-file/v2.rst
@@ -466,6 +466,8 @@ Settings for more control over :doc:`/server-side-search`.
       ranking:
         api/v1/*: -1
         api/v2/*: 4
+      ignore:
+        - 404.html
 
 search.ranking
 ``````````````
@@ -488,6 +490,9 @@ Pages with a rank closer to -10 will appear further down the list of results,
 and pages with a rank closer to 10 will appear higher in the list of results.
 Note that 0 means *normal rank*, not *no rank*.
 
+If you are looking to completely ignore a page,
+check :ref:`config-file/v2:search.ignore`.
+
 .. code-block:: yaml
 
    version: 2
@@ -495,7 +500,7 @@ Note that 0 means *normal rank*, not *no rank*.
    search:
       ranking:
         # Match a single file
-        tutorial.hml: 2
+        tutorial.html: 2
 
         # Match all files under the api/v1 directory
         api/v1/*: -5
@@ -512,6 +517,43 @@ Note that 0 means *normal rank*, not *no rank*.
    Is better to decrease the rank of pages you want to deprecate,
    rather than increasing the rank of the other pages.
 
+search.ignore
+`````````````
+
+Don't index files matching a pattern.
+This is, you won't see search results from these files.
+
+:Type: ``list`` of patterns
+:Default: ``['search.html', 'search/index.html', '404.html', '404/index.html']``
+
+Patterns are matched against the final html pages produced by the build
+(you should try to match `index.html`, not `docs/index.rst`).
+Patterns can include some special characters:
+
+- ``*`` matches everything
+- ``?`` matches any single character
+- ``[seq]`` matches any character in ``seq``
+
+.. code-block:: yaml
+
+   version: 2
+
+   search:
+      ignore:
+        # Ignore a single file
+        - 404.html
+
+        # Ignore all files under the search/ directory
+        - search/*
+
+        # Ignore all files that end with ref.html
+        - '*/ref.html'
+
+.. note::
+
+   Since Read the Docs fallbacks to the original search engine when no results are found,
+   you may still see search results from ignored pages.
+
 Schema
 ------
 
diff --git a/docs/server-side-search.rst b/docs/server-side-search.rst
index fabd2f79814..49bbd1c3f26 100644
--- a/docs/server-side-search.rst
+++ b/docs/server-side-search.rst
@@ -40,6 +40,10 @@ Special query syntax for more specific results.
    We support a full range of search queries.
    You can see some examples in our :ref:`guides/searching-with-readthedocs:search query syntax` guide.
 
+Configurable.
+   Tweak search results according to your needs using a
+   :ref:`configuration file <config-file/v2:search>`.
+
 ..
    Code object searching
       With the user of :doc:`Sphinx Domains <sphinx:/usage/restructuredtext/domains>` we are able to automatically provide direct search results to your Code objects.
diff --git a/readthedocs/config/config.py b/readthedocs/config/config.py
index 244135277eb..9166d171fc8 100644
--- a/readthedocs/config/config.py
+++ b/readthedocs/config/config.py
@@ -656,7 +656,7 @@ def submodules(self):
 
     @property
     def search(self):
-        return Search(ranking={})
+        return Search(ranking={}, ignore=[])
 
 
 class BuildConfigV2(BuildConfigBase):
@@ -1023,7 +1023,8 @@ def validate_search(self):
         """
         Validates the search key.
 
-        - Ranking is a map of path patterns to a rank.
+        - ``ranking`` is a map of path patterns to a rank.
+        - ``ignore`` is a list of patterns.
         - The path pattern supports basic globs (*, ?, [seq]).
         - The rank can be a integer number between -10 and 10.
         """
@@ -1046,6 +1047,22 @@ def validate_search(self):
 
             search['ranking'] = final_ranking
 
+        with self.catch_validation_error('search.ignore'):
+            ignore_default = [
+                'search.html',
+                'search/index.html',
+                '404.html',
+                '404/index.html',
+            ]
+            search_ignore = self.pop_config('search.ignore', ignore_default)
+            validate_list(search_ignore)
+
+            final_ignore = [
+                validate_path_pattern(pattern)
+                for pattern in search_ignore
+            ]
+            search['ignore'] = final_ignore
+
         return search
 
     def validate_keys(self):
diff --git a/readthedocs/config/models.py b/readthedocs/config/models.py
index 55932e991d4..998da8498be 100644
--- a/readthedocs/config/models.py
+++ b/readthedocs/config/models.py
@@ -72,4 +72,4 @@ class Submodules(Base):
 
 class Search(Base):
 
-    __slots__ = ('ranking',)
+    __slots__ = ('ranking', 'ignore')
diff --git a/readthedocs/config/tests/test_config.py b/readthedocs/config/tests/test_config.py
index 3ee20d1653c..a2834c78305 100644
--- a/readthedocs/config/tests/test_config.py
+++ b/readthedocs/config/tests/test_config.py
@@ -767,6 +767,7 @@ def test_as_dict(tmpdir):
         },
         'search': {
             'ranking': {},
+            'ignore': [],
         },
     }
     assert build.as_dict() == expected_dict
@@ -1908,6 +1909,45 @@ def test_search_ranking_normilize_path(self, path, expected):
         build.validate()
         assert build.search.ranking == {expected: 1}
 
+    @pytest.mark.parametrize(
+        'value',
+        [
+            'invalid',
+            True,
+            0,
+            [2, 3],
+            {'foo/bar': 11},
+        ],
+    )
+    def test_search_ignore_invalid_type(self, value):
+        build = self.get_build_config({
+            'search': {'ignore': value},
+        })
+        with raises(InvalidConfig) as excinfo:
+            build.validate()
+        assert excinfo.value.key == 'search.ignore'
+
+    @pytest.mark.parametrize('path, expected', [
+        ('/foo/bar', 'foo/bar'),
+        ('///foo//bar', 'foo/bar'),
+        ('///foo//bar/', 'foo/bar'),
+        ('/foo/bar/../', 'foo'),
+        ('/foo*', 'foo*'),
+        ('/foo/bar/*', 'foo/bar/*'),
+        ('/foo/bar?/*', 'foo/bar?/*'),
+        ('foo/[bc]ar/*/', 'foo/[bc]ar/*'),
+        ('*', '*'),
+        ('index.html', 'index.html'),
+    ])
+    def test_search_ignore_valid_type(self, path, expected):
+        build = self.get_build_config({
+            'search': {
+                'ignore': [path],
+            },
+        })
+        build.validate()
+        assert build.search.ignore == [expected]
+
     @pytest.mark.parametrize('value,key', [
         ({'typo': 'something'}, 'typo'),
         (
@@ -2048,6 +2088,12 @@ def test_as_dict(self, tmpdir):
             },
             'search': {
                 'ranking': {},
+                'ignore': [
+                    'search.html',
+                    'search/index.html',
+                    '404.html',
+                    '404/index.html',
+                ],
             },
         }
         assert build.as_dict() == expected_dict
diff --git a/readthedocs/projects/migrations/0058_add_imported_file_ignore.py b/readthedocs/projects/migrations/0058_add_imported_file_ignore.py
new file mode 100644
index 00000000000..167611f21e8
--- /dev/null
+++ b/readthedocs/projects/migrations/0058_add_imported_file_ignore.py
@@ -0,0 +1,18 @@
+# Generated by Django 2.2.12 on 2020-07-21 18:21
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('projects', '0057_add_page_rank'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='importedfile',
+            name='ignore',
+            field=models.BooleanField(null=True, verbose_name='Ignore this file from operations like indexing'),
+        ),
+    ]
diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py
index fd89f761733..8ca2b845d76 100644
--- a/readthedocs/projects/models.py
+++ b/readthedocs/projects/models.py
@@ -1360,6 +1360,12 @@ class ImportedFile(models.Model):
         null=True,
         validators=[MinValueValidator(-10), MaxValueValidator(10)],
     )
+    ignore = models.BooleanField(
+        _('Ignore this file from operations like indexing'),
+        # default=False,
+        # TODO: remove after migration
+        null=True,
+    )
 
     def get_absolute_url(self):
         return resolve(
diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py
index 4df043c0096..f6fdebd75c3 100644
--- a/readthedocs/projects/tasks.py
+++ b/readthedocs/projects/tasks.py
@@ -1136,6 +1136,7 @@ def update_app_instances(
             commit=self.build['commit'],
             build=self.build['id'],
             search_ranking=self.config.search.ranking,
+            search_ignore=self.config.search.ignore,
         )
 
     def setup_python_environment(self):
@@ -1278,7 +1279,7 @@ def is_type_sphinx(self):
 
 # Web tasks
 @app.task(queue='reindex')
-def fileify(version_pk, commit, build, search_ranking):
+def fileify(version_pk, commit, build, search_ranking, search_ignore):
     """
     Create ImportedFile objects for all of a version's files.
 
@@ -1317,6 +1318,7 @@ def fileify(version_pk, commit, build, search_ranking):
             commit=commit,
             build=build,
             search_ranking=search_ranking,
+            search_ignore=search_ignore,
         )
     except Exception:
         changed_files = set()
@@ -1494,7 +1496,7 @@ def clean_build(version_pk):
         return True
 
 
-def _create_imported_files(*, version, commit, build, search_ranking):
+def _create_imported_files(*, version, commit, build, search_ranking, search_ignore):
     """
     Create imported files for version.
 
@@ -1564,6 +1566,12 @@ def _create_imported_files(*, version, commit, build, search_ranking):
                     page_rank = rank
                     break
 
+            ignore = False
+            for pattern in search_ignore:
+                if fnmatch(relpath, pattern):
+                    ignore = True
+                    break
+
             # Create imported files from new build
             model_class.objects.create(
                 project=version.project,
@@ -1574,6 +1582,7 @@ def _create_imported_files(*, version, commit, build, search_ranking):
                 rank=page_rank,
                 commit=commit,
                 build=build,
+                ignore=ignore,
             )
 
     # This signal is used for clearing the CDN,
diff --git a/readthedocs/rtd_tests/fixtures/spec/v2/schema.yml b/readthedocs/rtd_tests/fixtures/spec/v2/schema.yml
index 2e34d08c7f0..f88db8025d1 100644
--- a/readthedocs/rtd_tests/fixtures/spec/v2/schema.yml
+++ b/readthedocs/rtd_tests/fixtures/spec/v2/schema.yml
@@ -134,3 +134,7 @@ search:
   # Map of patterns to ranks
   # Default: {}
   ranking: map(str(), int(min=-10, max=10), required=False)
+
+  # List of patterns
+  # Default: ['search.html', 'search/index.html', '404.html', '404/index.html']
+  ignore: list(str(), required=False)
diff --git a/readthedocs/rtd_tests/tests/test_celery.py b/readthedocs/rtd_tests/tests/test_celery.py
index 5ff0eaf1cd9..aae93822b8e 100644
--- a/readthedocs/rtd_tests/tests/test_celery.py
+++ b/readthedocs/rtd_tests/tests/test_celery.py
@@ -327,7 +327,13 @@ def public_task_exception():
     @patch('readthedocs.builds.managers.log')
     def test_fileify_logging_when_wrong_version_pk(self, mock_logger):
         self.assertFalse(Version.objects.filter(pk=345343).exists())
-        tasks.fileify(version_pk=345343, commit=None, build=1, search_ranking={})
+        tasks.fileify(
+            version_pk=345343,
+            commit=None,
+            build=1,
+            search_ranking={},
+            search_ignore=[],
+        )
         mock_logger.warning.assert_called_with("Version not found for given kwargs. {'pk': 345343}")
 
     @patch('readthedocs.oauth.services.github.GitHubService.send_build_status')
diff --git a/readthedocs/rtd_tests/tests/test_imported_file.py b/readthedocs/rtd_tests/tests/test_imported_file.py
index 1452eeaeaaf..3633ffa3fca 100644
--- a/readthedocs/rtd_tests/tests/test_imported_file.py
+++ b/readthedocs/rtd_tests/tests/test_imported_file.py
@@ -28,14 +28,23 @@ def setUp(self):
         self.test_dir = os.path.join(base_dir, 'files')
         self._copy_storage_dir()
 
-    def _manage_imported_files(self, version, commit, build, search_ranking=None):
+    def _manage_imported_files(
+        self,
+        version,
+        commit,
+        build,
+        search_ranking=None,
+        search_ignore=None
+    ):
         """Helper function for the tests to create and sync ImportedFiles."""
         search_ranking = search_ranking or {}
+        search_ignore = search_ignore or []
         _create_imported_files(
             version=version,
             commit=commit,
             build=build,
             search_ranking=search_ranking,
+            search_ignore=search_ignore,
         )
         _sync_imported_files(version, build, set())
 
@@ -131,6 +140,23 @@ def test_page_custom_rank_precedence_inverted(self):
         self.assertEqual(file_api.rank, 5)
         self.assertEqual(file_test.rank, 5)
 
+    def test_search_page_ignore(self):
+        search_ignore = [
+            'api/index.html'
+        ]
+        self._manage_imported_files(
+            self.version,
+            'commit01',
+            1,
+            search_ignore=search_ignore,
+        )
+
+        self.assertEqual(HTMLFile.objects.count(), 2)
+        file_api = HTMLFile.objects.get(path='api/index.html')
+        file_test = HTMLFile.objects.get(path='test.html')
+        self.assertTrue(file_api.ignore)
+        self.assertFalse(file_test.ignore)
+
     def test_update_content(self):
         test_dir = os.path.join(base_dir, 'files')
         self.assertEqual(ImportedFile.objects.count(), 0)
@@ -195,6 +221,7 @@ def test_create_intersphinx_data(self, mock_exists):
                 commit='commit01',
                 build=1,
                 search_ranking={},
+                search_ignore=[],
             )
             _create_intersphinx_data(self.version, 'commit01', 1)
 
diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py
index 776e16338d5..0e153dd0dcb 100644
--- a/readthedocs/search/documents.py
+++ b/readthedocs/search/documents.py
@@ -141,23 +141,16 @@ def prepare_domains(self, html_file):
         return all_domains
 
     def get_queryset(self):
-        """Overwrite default queryset to filter certain files to index."""
-        queryset = super().get_queryset()
-
-        # Do not index files from external versions
-        queryset = queryset.internal().all()
-
-        # TODO: Make this smarter
-        # This was causing issues excluding some valid user documentation pages
-        # excluded_files = [
-        #     'search.html',
-        #     'genindex.html',
-        #     'py-modindex.html',
-        #     'search/index.html',
-        #     'genindex/index.html',
-        #     'py-modindex/index.html',
-        # ]
-        # for ending in excluded_files:
-        #     queryset = queryset.exclude(path=ending)
+        """
+        Ignore certain files from indexing.
 
+        - Files from external versions
+        - Ignored files
+        """
+        queryset = super().get_queryset()
+        queryset = (
+            queryset
+            .internal()
+            .exclude(ignore=True)
+        )
         return queryset
diff --git a/readthedocs/search/tests/conftest.py b/readthedocs/search/tests/conftest.py
index 97bba15dc45..4a796b20730 100644
--- a/readthedocs/search/tests/conftest.py
+++ b/readthedocs/search/tests/conftest.py
@@ -47,6 +47,7 @@ def all_projects(es_index, mock_processed_json, db, settings):
                 version=version,
                 name=file_name,
                 path=file_name,
+                build=1,
             )
 
             # creating sphinx domain test objects
diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py
index 4e90ec40525..f01868a6d22 100644
--- a/readthedocs/search/tests/test_api.py
+++ b/readthedocs/search/tests/test_api.py
@@ -14,7 +14,7 @@
     SPHINX_HTMLDIR,
     SPHINX_SINGLEHTML,
 )
-from readthedocs.projects.models import HTMLFile, Project, Feature
+from readthedocs.projects.models import Feature, HTMLFile, Project
 from readthedocs.search.api import PageSearchAPIView
 from readthedocs.search.documents import PageDocument
 from readthedocs.search.tests.utils import (
@@ -22,6 +22,7 @@
     SECTION_FIELDS,
     get_search_query_from_project_file,
 )
+from readthedocs.search.utils import index_new_files, remove_indexed_files
 
 OLD_TYPES = {
     'domain': 'domains',
@@ -613,6 +614,58 @@ def test_search_custom_ranking(self, api_client):
         assert results[0]['full_path'] == 'index.html'
         assert results[1]['full_path'] == 'guides/index.html'
 
+    def test_search_ignore(self, api_client):
+        project = Project.objects.get(slug='docs')
+        version = project.versions.all().first()
+
+        page_index = HTMLFile.objects.get(path='index.html')
+        page_guides = HTMLFile.objects.get(path='guides/index.html')
+
+        search_params = {
+            'project': project.slug,
+            'version': version.slug,
+            'q': '"content from"',
+        }
+
+        # Query with files not ignored.
+        assert page_index.ignore is None
+        assert page_guides.ignore is None
+
+        resp = self.get_search(api_client, search_params)
+        assert resp.status_code == 200
+
+        results = resp.data['results']
+        assert len(results) == 2
+        assert results[0]['full_path'] == 'index.html'
+        assert results[1]['full_path'] == 'guides/index.html'
+
+        # Query with guides/index.html ignored.
+        page_guides.ignore = True
+        page_guides.save()
+
+        remove_indexed_files(HTMLFile, project.slug, version.slug)
+        index_new_files(HTMLFile, version, page_index.build)
+
+        resp = self.get_search(api_client, search_params)
+        assert resp.status_code == 200
+
+        results = resp.data['results']
+        assert len(results) == 1
+        assert results[0]['full_path'] == 'index.html'
+
+        # Query with index.html and guides/index.html ignored.
+        page_index.ignore = True
+        page_index.save()
+
+        remove_indexed_files(HTMLFile, project.slug, version.slug)
+        index_new_files(HTMLFile, version, page_index.build)
+
+        resp = self.get_search(api_client, search_params)
+        assert resp.status_code == 200
+
+        results = resp.data['results']
+        assert len(results) == 0
+
 
 class TestDocumentSearch(BaseTestDocumentSearch):
 

From 27717cbc2b1fb1bbab1af81c97b42b6f93490e85 Mon Sep 17 00:00:00 2001
From: Santos Gallegos <stsewd@protonmail.com>
Date: Wed, 9 Sep 2020 15:57:09 -0500
Subject: [PATCH 2/3] Update migration

---
 ...imported_file_ignore.py => 0061_add_imported_file_ignore.py} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename readthedocs/projects/migrations/{0058_add_imported_file_ignore.py => 0061_add_imported_file_ignore.py} (89%)

diff --git a/readthedocs/projects/migrations/0058_add_imported_file_ignore.py b/readthedocs/projects/migrations/0061_add_imported_file_ignore.py
similarity index 89%
rename from readthedocs/projects/migrations/0058_add_imported_file_ignore.py
rename to readthedocs/projects/migrations/0061_add_imported_file_ignore.py
index 167611f21e8..6f03f7dd5bd 100644
--- a/readthedocs/projects/migrations/0058_add_imported_file_ignore.py
+++ b/readthedocs/projects/migrations/0061_add_imported_file_ignore.py
@@ -6,7 +6,7 @@
 class Migration(migrations.Migration):
 
     dependencies = [
-        ('projects', '0057_add_page_rank'),
+        ('projects', '0060_make_rank_not_null'),
     ]
 
     operations = [

From 84987796e76eb301a3e05f0d047ced44315b811a Mon Sep 17 00:00:00 2001
From: Santos Gallegos <stsewd@protonmail.com>
Date: Wed, 9 Sep 2020 16:16:48 -0500
Subject: [PATCH 3/3] Update tests

---
 readthedocs/search/tests/test_api.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py
index 2d66f0aae79..58abfb37f4f 100644
--- a/readthedocs/search/tests/test_api.py
+++ b/readthedocs/search/tests/test_api.py
@@ -632,8 +632,8 @@ def test_search_ignore(self, api_client):
 
         results = resp.data['results']
         assert len(results) == 2
-        assert results[0]['full_path'] == 'index.html'
-        assert results[1]['full_path'] == 'guides/index.html'
+        assert results[0]['path'] == '/en/latest/index.html'
+        assert results[1]['path'] == '/en/latest/guides/index.html'
 
         # Query with guides/index.html ignored.
         page_guides.ignore = True
@@ -647,7 +647,7 @@ def test_search_ignore(self, api_client):
 
         results = resp.data['results']
         assert len(results) == 1
-        assert results[0]['full_path'] == 'index.html'
+        assert results[0]['path'] == '/en/latest/index.html'
 
         # Query with index.html and guides/index.html ignored.
         page_index.ignore = True