Skip to content

Commit b66e817

Browse files
committed
Search: allow ignoring files from indexing
Closes #5247 Ref #7217
1 parent e4b618d commit b66e817

File tree

14 files changed

+253
-27
lines changed

14 files changed

+253
-27
lines changed

docs/config-file/v2.rst

+43-1
Original file line numberDiff line numberDiff line change
@@ -466,6 +466,8 @@ Settings for more control over :doc:`/server-side-search`.
466466
ranking:
467467
api/v1/*: -1
468468
api/v2/*: 4
469+
ignore:
470+
- 404.html
469471
470472
search.ranking
471473
``````````````
@@ -488,14 +490,17 @@ Pages with a rank closer to -10 will appear further down the list of results,
488490
and pages with a rank closer to 10 will appear higher in the list of results.
489491
Note that 0 means *normal rank*, not *no rank*.
490492

493+
If you are looking to completely ignore a page,
494+
check :ref:`config-file/v2:search.ignore`.
495+
491496
.. code-block:: yaml
492497
493498
version: 2
494499
495500
search:
496501
ranking:
497502
# Match a single file
498-
tutorial.hml: 2
503+
tutorial.html: 2
499504
500505
# Match all files under the api/v1 directory
501506
api/v1/*: -5
@@ -512,6 +517,43 @@ Note that 0 means *normal rank*, not *no rank*.
512517
It is better to decrease the rank of pages you want to deprecate,
513518
rather than increasing the rank of the other pages.
514519

520+
search.ignore
521+
`````````````
522+
523+
Don't index files matching a pattern.
524+
That is, you won't see search results from these files.
525+
526+
:Type: ``list`` of patterns
527+
:Default: ``['search.html', 'search/index.html', '404.html', '404/index.html']``
528+
529+
Patterns are matched against the final html pages produced by the build
530+
(you should try to match ``index.html``, not ``docs/index.rst``).
531+
Patterns can include some special characters:
532+
533+
- ``*`` matches everything
534+
- ``?`` matches any single character
535+
- ``[seq]`` matches any character in ``seq``
536+
537+
.. code-block:: yaml
538+
539+
version: 2
540+
541+
search:
542+
ignore:
543+
# Ignore a single file
544+
- 404.html
545+
546+
# Ignore all files under the search/ directory
547+
- search/*
548+
549+
# Ignore all files whose path ends with /ref.html
550+
- '*/ref.html'
551+
552+
.. note::
553+
554+
Since Read the Docs falls back to the original search engine when no results are found,
555+
you may still see search results from ignored pages.
556+
515557
Schema
516558
------
517559

docs/server-side-search.rst

+4
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ Special query syntax for more specific results.
4040
We support a full range of search queries.
4141
You can see some examples in our :ref:`guides/searching-with-readthedocs:search query syntax` guide.
4242

43+
Configurable.
44+
Tweak search results according to your needs using a
45+
:ref:`configuration file <config-file/v2:search>`.
46+
4347
..
4448
Code object searching
4549
With the use of :doc:`Sphinx Domains <sphinx:/usage/restructuredtext/domains>` we are able to automatically provide direct search results to your Code objects.

readthedocs/config/config.py

+19-2
Original file line numberDiff line numberDiff line change
@@ -656,7 +656,7 @@ def submodules(self):
656656

657657
@property
658658
def search(self):
659-
return Search(ranking={})
659+
return Search(ranking={}, ignore=[])
660660

661661

662662
class BuildConfigV2(BuildConfigBase):
@@ -1023,7 +1023,8 @@ def validate_search(self):
10231023
"""
10241024
Validates the search key.
10251025
1026-
- Ranking is a map of path patterns to a rank.
1026+
- ``ranking`` is a map of path patterns to a rank.
1027+
- ``ignore`` is a list of patterns.
10271028
- The path pattern supports basic globs (*, ?, [seq]).
10281029
- The rank can be an integer number between -10 and 10.
10291030
"""
@@ -1046,6 +1047,22 @@ def validate_search(self):
10461047

10471048
search['ranking'] = final_ranking
10481049

1050+
with self.catch_validation_error('search.ignore'):
1051+
ignore_default = [
1052+
'search.html',
1053+
'search/index.html',
1054+
'404.html',
1055+
'404/index.html',
1056+
]
1057+
search_ignore = self.pop_config('search.ignore', ignore_default)
1058+
validate_list(search_ignore)
1059+
1060+
final_ignore = [
1061+
validate_path_pattern(pattern)
1062+
for pattern in search_ignore
1063+
]
1064+
search['ignore'] = final_ignore
1065+
10491066
return search
10501067

10511068
def validate_keys(self):

readthedocs/config/models.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -72,4 +72,4 @@ class Submodules(Base):
7272

7373
class Search(Base):
7474

75-
__slots__ = ('ranking',)
75+
__slots__ = ('ranking', 'ignore')

readthedocs/config/tests/test_config.py

+46
Original file line numberDiff line numberDiff line change
@@ -767,6 +767,7 @@ def test_as_dict(tmpdir):
767767
},
768768
'search': {
769769
'ranking': {},
770+
'ignore': [],
770771
},
771772
}
772773
assert build.as_dict() == expected_dict
@@ -1908,6 +1909,45 @@ def test_search_ranking_normilize_path(self, path, expected):
19081909
build.validate()
19091910
assert build.search.ranking == {expected: 1}
19101911

1912+
@pytest.mark.parametrize(
1913+
'value',
1914+
[
1915+
'invalid',
1916+
True,
1917+
0,
1918+
[2, 3],
1919+
{'foo/bar': 11},
1920+
],
1921+
)
1922+
def test_search_ignore_invalid_type(self, value):
1923+
build = self.get_build_config({
1924+
'search': {'ignore': value},
1925+
})
1926+
with raises(InvalidConfig) as excinfo:
1927+
build.validate()
1928+
assert excinfo.value.key == 'search.ignore'
1929+
1930+
@pytest.mark.parametrize('path, expected', [
1931+
('/foo/bar', 'foo/bar'),
1932+
('///foo//bar', 'foo/bar'),
1933+
('///foo//bar/', 'foo/bar'),
1934+
('/foo/bar/../', 'foo'),
1935+
('/foo*', 'foo*'),
1936+
('/foo/bar/*', 'foo/bar/*'),
1937+
('/foo/bar?/*', 'foo/bar?/*'),
1938+
('foo/[bc]ar/*/', 'foo/[bc]ar/*'),
1939+
('*', '*'),
1940+
('index.html', 'index.html'),
1941+
])
1942+
def test_search_ignore_valid_type(self, path, expected):
1943+
build = self.get_build_config({
1944+
'search': {
1945+
'ignore': [path],
1946+
},
1947+
})
1948+
build.validate()
1949+
assert build.search.ignore == [expected]
1950+
19111951
@pytest.mark.parametrize('value,key', [
19121952
({'typo': 'something'}, 'typo'),
19131953
(
@@ -2048,6 +2088,12 @@ def test_as_dict(self, tmpdir):
20482088
},
20492089
'search': {
20502090
'ranking': {},
2091+
'ignore': [
2092+
'search.html',
2093+
'search/index.html',
2094+
'404.html',
2095+
'404/index.html',
2096+
],
20512097
},
20522098
}
20532099
assert build.as_dict() == expected_dict
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Generated by Django 2.2.12 on 2020-07-21 18:21
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
7+
8+
dependencies = [
9+
('projects', '0057_add_page_rank'),
10+
]
11+
12+
operations = [
13+
migrations.AddField(
14+
model_name='importedfile',
15+
name='ignore',
16+
field=models.BooleanField(null=True, verbose_name='Ignore this file from operations like indexing'),
17+
),
18+
]

readthedocs/projects/models.py

+6
Original file line numberDiff line numberDiff line change
@@ -1360,6 +1360,12 @@ class ImportedFile(models.Model):
13601360
null=True,
13611361
validators=[MinValueValidator(-10), MaxValueValidator(10)],
13621362
)
1363+
ignore = models.BooleanField(
1364+
_('Ignore this file from operations like indexing'),
1365+
# default=False,
1366+
# TODO: remove after migration
1367+
null=True,
1368+
)
13631369

13641370
def get_absolute_url(self):
13651371
return resolve(

readthedocs/projects/tasks.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -1136,6 +1136,7 @@ def update_app_instances(
11361136
commit=self.build['commit'],
11371137
build=self.build['id'],
11381138
search_ranking=self.config.search.ranking,
1139+
search_ignore=self.config.search.ignore,
11391140
)
11401141

11411142
def setup_python_environment(self):
@@ -1278,7 +1279,7 @@ def is_type_sphinx(self):
12781279

12791280
# Web tasks
12801281
@app.task(queue='reindex')
1281-
def fileify(version_pk, commit, build, search_ranking):
1282+
def fileify(version_pk, commit, build, search_ranking, search_ignore):
12821283
"""
12831284
Create ImportedFile objects for all of a version's files.
12841285
@@ -1317,6 +1318,7 @@ def fileify(version_pk, commit, build, search_ranking):
13171318
commit=commit,
13181319
build=build,
13191320
search_ranking=search_ranking,
1321+
search_ignore=search_ignore,
13201322
)
13211323
except Exception:
13221324
changed_files = set()
@@ -1494,7 +1496,7 @@ def clean_build(version_pk):
14941496
return True
14951497

14961498

1497-
def _create_imported_files(*, version, commit, build, search_ranking):
1499+
def _create_imported_files(*, version, commit, build, search_ranking, search_ignore):
14981500
"""
14991501
Create imported files for version.
15001502
@@ -1564,6 +1566,12 @@ def _create_imported_files(*, version, commit, build, search_ranking):
15641566
page_rank = rank
15651567
break
15661568

1569+
ignore = False
1570+
for pattern in search_ignore:
1571+
if fnmatch(relpath, pattern):
1572+
ignore = True
1573+
break
1574+
15671575
# Create imported files from new build
15681576
model_class.objects.create(
15691577
project=version.project,
@@ -1574,6 +1582,7 @@ def _create_imported_files(*, version, commit, build, search_ranking):
15741582
rank=page_rank,
15751583
commit=commit,
15761584
build=build,
1585+
ignore=ignore,
15771586
)
15781587

15791588
# This signal is used for clearing the CDN,

readthedocs/rtd_tests/fixtures/spec/v2/schema.yml

+4
Original file line numberDiff line numberDiff line change
@@ -134,3 +134,7 @@ search:
134134
# Map of patterns to ranks
135135
# Default: {}
136136
ranking: map(str(), int(min=-10, max=10), required=False)
137+
138+
# List of patterns
139+
# Default: ['search.html', 'search/index.html', '404.html', '404/index.html']
140+
ignore: list(str(), required=False)

readthedocs/rtd_tests/tests/test_celery.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,13 @@ def public_task_exception():
327327
@patch('readthedocs.builds.managers.log')
328328
def test_fileify_logging_when_wrong_version_pk(self, mock_logger):
329329
self.assertFalse(Version.objects.filter(pk=345343).exists())
330-
tasks.fileify(version_pk=345343, commit=None, build=1, search_ranking={})
330+
tasks.fileify(
331+
version_pk=345343,
332+
commit=None,
333+
build=1,
334+
search_ranking={},
335+
search_ignore=[],
336+
)
331337
mock_logger.warning.assert_called_with("Version not found for given kwargs. {'pk': 345343}")
332338

333339
@patch('readthedocs.oauth.services.github.GitHubService.send_build_status')

readthedocs/rtd_tests/tests/test_imported_file.py

+28-1
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,23 @@ def setUp(self):
2828
self.test_dir = os.path.join(base_dir, 'files')
2929
self._copy_storage_dir()
3030

31-
def _manage_imported_files(self, version, commit, build, search_ranking=None):
31+
def _manage_imported_files(
32+
self,
33+
version,
34+
commit,
35+
build,
36+
search_ranking=None,
37+
search_ignore=None
38+
):
3239
"""Helper function for the tests to create and sync ImportedFiles."""
3340
search_ranking = search_ranking or {}
41+
search_ignore = search_ignore or []
3442
_create_imported_files(
3543
version=version,
3644
commit=commit,
3745
build=build,
3846
search_ranking=search_ranking,
47+
search_ignore=search_ignore,
3948
)
4049
_sync_imported_files(version, build, set())
4150

@@ -131,6 +140,23 @@ def test_page_custom_rank_precedence_inverted(self):
131140
self.assertEqual(file_api.rank, 5)
132141
self.assertEqual(file_test.rank, 5)
133142

143+
def test_search_page_ignore(self):
144+
search_ignore = [
145+
'api/index.html'
146+
]
147+
self._manage_imported_files(
148+
self.version,
149+
'commit01',
150+
1,
151+
search_ignore=search_ignore,
152+
)
153+
154+
self.assertEqual(HTMLFile.objects.count(), 2)
155+
file_api = HTMLFile.objects.get(path='api/index.html')
156+
file_test = HTMLFile.objects.get(path='test.html')
157+
self.assertTrue(file_api.ignore)
158+
self.assertFalse(file_test.ignore)
159+
134160
def test_update_content(self):
135161
test_dir = os.path.join(base_dir, 'files')
136162
self.assertEqual(ImportedFile.objects.count(), 0)
@@ -195,6 +221,7 @@ def test_create_intersphinx_data(self, mock_exists):
195221
commit='commit01',
196222
build=1,
197223
search_ranking={},
224+
search_ignore=[],
198225
)
199226
_create_intersphinx_data(self.version, 'commit01', 1)
200227

readthedocs/search/documents.py

+11-18
Original file line numberDiff line numberDiff line change
@@ -141,23 +141,16 @@ def prepare_domains(self, html_file):
141141
return all_domains
142142

143143
def get_queryset(self):
144-
"""Overwrite default queryset to filter certain files to index."""
145-
queryset = super().get_queryset()
146-
147-
# Do not index files from external versions
148-
queryset = queryset.internal().all()
149-
150-
# TODO: Make this smarter
151-
# This was causing issues excluding some valid user documentation pages
152-
# excluded_files = [
153-
# 'search.html',
154-
# 'genindex.html',
155-
# 'py-modindex.html',
156-
# 'search/index.html',
157-
# 'genindex/index.html',
158-
# 'py-modindex/index.html',
159-
# ]
160-
# for ending in excluded_files:
161-
# queryset = queryset.exclude(path=ending)
144+
"""
145+
Ignore certain files from indexing.
162146
147+
- Files from external versions
148+
- Ignored files
149+
"""
150+
queryset = super().get_queryset()
151+
queryset = (
152+
queryset
153+
.internal()
154+
.exclude(ignore=True)
155+
)
163156
return queryset

0 commit comments

Comments
 (0)