
Commit 7667580

Search: stop creating SphinxDomain objects (#10451)
We are indexing Sphinx domains as sections now, so we no longer need to create SphinxDomain objects or index them separately. This should give a small improvement in build time, and possibly in search. The change requires a re-index, since projects that haven't been rebuilt still have Sphinx domain objects indexed separately from sections (that step can be done before or after the deploy). It also fixes a small bug: we were still indexing Sphinx domain titles without their content.

![Screenshot 2023-06-20 at 14-36-48 Read the Docs documentation simplified](https://github.com/readthedocs/readthedocs.org/assets/4975310/eeb1865b-a859-4010-9cff-95a95d877882)

ref #10272
1 parent 0df0520 commit 7667580

33 files changed: +44, -626 lines
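To make the build-time effect concrete, below is a minimal sketch of the indexing flow the `fileify` task is left with after this commit, reassembled from the context lines kept in the `readthedocs/projects/tasks/search.py` diff further down. The wrapper name `index_build_output` and the `structlog` logger setup are illustrative only, and the log message in the final `except` block is paraphrased because it is not shown in the diff.

```python
import structlog

from readthedocs.projects.tasks.search import (
    _create_imported_files,
    _sync_imported_files,
)

log = structlog.get_logger(__name__)


def index_build_output(version, commit, build, search_ranking, search_ignore):
    """Sketch of the indexing steps left in fileify after this commit."""
    try:
        # Creates HTMLFile/ImportedFile records; sections are indexed from
        # these, which is why separate SphinxDomain objects are no longer needed.
        _create_imported_files(
            version=version,
            commit=commit,
            build=build,
            search_ranking=search_ranking,
            search_ignore=search_ignore,
        )
    except Exception:
        log.exception('Failed during ImportedFile creation')

    try:
        # Indexes the new files and removes stale ones from the previous build.
        _sync_imported_files(version, build)
    except Exception:
        # The real task logs and continues here as well; message paraphrased.
        log.exception("Failed during ImportedFile sync")
```

As the commit message notes, the sections indexed from these files now cover what the removed `_create_intersphinx_data` pass used to provide.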

readthedocs/projects/models.py (-5 lines)
@@ -1913,7 +1913,6 @@ def add_features(sender, **kwargs):
     RECORD_404_PAGE_VIEWS = "record_404_page_views"
     ALLOW_FORCED_REDIRECTS = "allow_forced_redirects"
     DISABLE_PAGEVIEWS = "disable_pageviews"
-    DISABLE_SPHINX_DOMAINS = "disable_sphinx_domains"
     RESOLVE_PROJECT_FROM_HEADER = "resolve_project_from_header"
     USE_UNRESOLVER_WITH_PROXITO = "use_unresolver_with_proxito"

@@ -1999,10 +1998,6 @@ def add_features(sender, **kwargs):
             DISABLE_PAGEVIEWS,
             _("Proxito: Disable all page views"),
         ),
-        (
-            DISABLE_SPHINX_DOMAINS,
-            _("Sphinx: Disable indexing of sphinx domains"),
-        ),
         (
             RESOLVE_PROJECT_FROM_HEADER,
             _("Proxito: Allow usage of the X-RTD-Slug header"),

readthedocs/projects/tasks/search.py (+1, -151 lines)
@@ -1,16 +1,12 @@
-import json
 from fnmatch import fnmatch
 
 import structlog
-from django.conf import settings
-from sphinx.ext import intersphinx
 
 from readthedocs.builds.constants import EXTERNAL
 from readthedocs.builds.models import Version
-from readthedocs.projects.models import Feature, HTMLFile, ImportedFile, Project
+from readthedocs.projects.models import HTMLFile, ImportedFile, Project
 from readthedocs.projects.signals import files_changed
 from readthedocs.search.utils import index_new_files, remove_indexed_files
-from readthedocs.sphinx_domains.models import SphinxDomain
 from readthedocs.storage import build_media_storage
 from readthedocs.worker import app
 
@@ -54,14 +50,6 @@ def fileify(version_pk, commit, build, search_ranking, search_ignore):
     except Exception:
         log.exception('Failed during ImportedFile creation')
 
-    # XXX: Don't access the sphinx domains table while we migrate the ID type
-    # https://github.com/readthedocs/readthedocs.org/pull/9482.
-    if not project.has_feature(Feature.DISABLE_SPHINX_DOMAINS):
-        try:
-            _create_intersphinx_data(version, commit, build)
-        except Exception:
-            log.exception("Failed during SphinxDomain creation")
-
     try:
         _sync_imported_files(version, build)
     except Exception:
@@ -88,18 +76,6 @@ def _sync_imported_files(version, build):
         build_id=build,
     )
 
-    # Delete SphinxDomain objects from previous versions
-    # This has to be done before deleting ImportedFiles and not with a cascade,
-    # because multiple Domain's can reference a specific HTMLFile.
-    # XXX: Don't access the sphinx domains table while we migrate the ID type
-    # https://github.com/readthedocs/readthedocs.org/pull/9482.
-    if not project.has_feature(Feature.DISABLE_SPHINX_DOMAINS):
-        (
-            SphinxDomain.objects.filter(project=project, version=version)
-            .exclude(build=build)
-            .delete()
-        )
-
     # Delete ImportedFiles objects (including HTMLFiles)
     # from the previous build of the version.
     (
@@ -119,132 +95,6 @@ def remove_search_indexes(project_slug, version_slug=None):
     )
 
 
-def _create_intersphinx_data(version, commit, build):
-    """
-    Create intersphinx data for this version.
-
-    :param version: Version instance
-    :param commit: Commit that updated path
-    :param build: Build id
-    """
-    if not version.is_sphinx_type:
-        return
-
-    html_storage_path = version.project.get_storage_path(
-        type_='html', version_slug=version.slug, include_file=False
-    )
-    json_storage_path = version.project.get_storage_path(
-        type_='json', version_slug=version.slug, include_file=False
-    )
-
-    object_file = build_media_storage.join(html_storage_path, 'objects.inv')
-    if not build_media_storage.exists(object_file):
-        log.debug('No objects.inv, skipping intersphinx indexing.')
-        return
-
-    type_file = build_media_storage.join(json_storage_path, 'readthedocs-sphinx-domain-names.json')
-    types = {}
-    titles = {}
-    if build_media_storage.exists(type_file):
-        try:
-            data = json.load(build_media_storage.open(type_file))
-            types = data['types']
-            titles = data['titles']
-        except Exception:
-            log.exception('Exception parsing readthedocs-sphinx-domain-names.json')
-
-    # These classes are copied from Sphinx
-    # https://github.com/sphinx-doc/sphinx/blob/d79d041f4f90818e0b495523fdcc28db12783caf/sphinx/ext/intersphinx.py#L400-L403 # noqa
-    class MockConfig:
-        intersphinx_timeout = None
-        tls_verify = False
-        user_agent = None
-
-    class MockApp:
-        srcdir = ''
-        config = MockConfig()
-
-        def warn(self, msg):
-            log.warning('Sphinx MockApp.', msg=msg)
-
-    # Re-create all objects from the new build of the version
-    object_file_url = build_media_storage.url(object_file)
-    if object_file_url.startswith('/'):
-        # Filesystem backed storage simply prepends MEDIA_URL to the path to get the URL
-        # This can cause an issue if MEDIA_URL is not fully qualified
-        object_file_url = settings.RTD_INTERSPHINX_URL + object_file_url
-
-    invdata = intersphinx.fetch_inventory(MockApp(), '', object_file_url)
-    for key, value in sorted(invdata.items() or {}):
-        domain, _type = key.split(':', 1)
-        for name, einfo in sorted(value.items()):
-            # project, version, url, display_name
-            # ('Sphinx', '1.7.9', 'faq.html#epub-faq', 'Epub info')
-            try:
-                url = einfo[2]
-                if '#' in url:
-                    doc_name, anchor = url.split(
-                        '#',
-                        # The anchor can contain ``#`` characters
-                        maxsplit=1
-                    )
-                else:
-                    doc_name, anchor = url, ''
-                display_name = einfo[3]
-            except Exception:
-                log.exception(
-                    'Error while getting sphinx domain information. Skipping...',
-                    project_slug=version.project.slug,
-                    version_slug=version.slug,
-                    sphinx_domain=f"{domain}->{name}",
-                )
-                continue
-
-            # HACK: This is done because the difference between
-            # ``sphinx.builders.html.StandaloneHTMLBuilder``
-            # and ``sphinx.builders.dirhtml.DirectoryHTMLBuilder``.
-            # They both have different ways of generating HTML Files,
-            # and therefore the doc_name generated is different.
-            # More info on: http://www.sphinx-doc.org/en/master/usage/builders/index.html#builders
-            # Also see issue: https://github.com/readthedocs/readthedocs.org/issues/5821
-            if doc_name.endswith('/'):
-                doc_name += 'index.html'
-
-            html_file = HTMLFile.objects.filter(
-                project=version.project, version=version,
-                path=doc_name, build=build,
-            ).first()
-
-            if not html_file:
-                log.debug(
-                    'HTMLFile object not found.',
-                    project_slug=version.project.slug,
-                    version_slug=version.slug,
-                    build_id=build,
-                    doc_name=doc_name
-                )
-
-                # Don't create Sphinx Domain objects
-                # if the HTMLFile object is not found.
-                continue
-
-            SphinxDomain.objects.create(
-                project=version.project,
-                version=version,
-                html_file=html_file,
-                domain=domain,
-                name=name,
-                display_name=display_name,
-                type=_type,
-                type_display=types.get(f'{domain}:{_type}', ''),
-                doc_name=doc_name,
-                doc_display=titles.get(doc_name, ''),
-                anchor=anchor,
-                commit=commit,
-                build=build,
-            )
-
-
 def _create_imported_files(*, version, commit, build, search_ranking, search_ignore):
     """
     Create imported files for version.

readthedocs/rtd_tests/tests/test_imported_file.py (-100 lines)
@@ -1,5 +1,4 @@
 import os
-from unittest import mock
 
 from django.conf import settings
 from django.core.files.storage import get_storage_class
@@ -9,10 +8,8 @@
 from readthedocs.projects.models import HTMLFile, ImportedFile, Project
 from readthedocs.projects.tasks.search import (
     _create_imported_files,
-    _create_intersphinx_data,
     _sync_imported_files,
 )
-from readthedocs.sphinx_domains.models import SphinxDomain
 
 base_dir = os.path.dirname(os.path.dirname(__file__))
 
@@ -176,100 +173,3 @@ def test_update_content(self):
 
         self._manage_imported_files(self.version, 'commit02', 2)
         self.assertEqual(ImportedFile.objects.count(), 2)
-
-    @override_settings(PRODUCTION_DOMAIN="readthedocs.org")
-    @override_settings(RTD_INTERSPHINX_URL="https://readthedocs.org")
-    @mock.patch("readthedocs.doc_builder.director.os.path.exists")
-    def test_create_intersphinx_data(self, mock_exists):
-        mock_exists.return_Value = True
-
-        # Test data for objects.inv file
-        test_objects_inv = {
-            'cpp:function': {
-                'sphinx.test.function': [
-                    'dummy-proj-1',
-                    'dummy-version-1',
-                    'test.html#epub-faq',  # file generated by ``sphinx.builders.html.StandaloneHTMLBuilder``
-                    'dummy-func-name-1',
-                ]
-            },
-            'py:function': {
-                'sample.test.function': [
-                    'dummy-proj-2',
-                    'dummy-version-2',
-                    'test.html#sample-test-func',  # file generated by ``sphinx.builders.html.StandaloneHTMLBuilder``
-                    'dummy-func-name-2'
-                ]
-            },
-            'js:function': {
-                'testFunction': [
-                    'dummy-proj-3',
-                    'dummy-version-3',
-                    'api/#test-func',  # file generated by ``sphinx.builders.dirhtml.DirectoryHTMLBuilder``
-                    'dummy-func-name-3'
-                ]
-            }
-        }
-
-        with mock.patch(
-            'sphinx.ext.intersphinx.fetch_inventory',
-            return_value=test_objects_inv
-        ) as mock_fetch_inventory:
-
-            _create_imported_files(
-                version=self.version,
-                commit='commit01',
-                build=1,
-                search_ranking={},
-                search_ignore=[],
-            )
-            _create_intersphinx_data(self.version, 'commit01', 1)
-
-        # there will be two html files,
-        # `api/index.html` and `test.html`
-        self.assertEqual(
-            HTMLFile.objects.all().count(),
-            2
-        )
-        self.assertEqual(
-            HTMLFile.objects.filter(path='test.html').count(),
-            1
-        )
-        self.assertEqual(
-            HTMLFile.objects.filter(path='api/index.html').count(),
-            1
-        )
-
-        html_file_api = HTMLFile.objects.filter(path='api/index.html').first()
-
-        self.assertEqual(
-            SphinxDomain.objects.all().count(),
-            3
-        )
-        self.assertEqual(
-            SphinxDomain.objects.filter(html_file=html_file_api).count(),
-            1
-        )
-        mock_fetch_inventory.assert_called_once()
-        self.assertRegex(
-            mock_fetch_inventory.call_args[0][2],
-            r'^https://readthedocs\.org/media/.*/objects\.inv$'
-        )
-        self.assertEqual(ImportedFile.objects.count(), 2)
-
-    @override_settings(RTD_INTERSPHINX_URL="http://localhost:8080")
-    @mock.patch("readthedocs.doc_builder.director.os.path.exists")
-    def test_custom_intersphinx_url(self, mock_exists):
-        mock_exists.return_Value = True
-
-        with mock.patch(
-            'sphinx.ext.intersphinx.fetch_inventory',
-            return_value={}
-        ) as mock_fetch_inventory:
-            _create_intersphinx_data(self.version, 'commit01', 1)
-
-        mock_fetch_inventory.assert_called_once()
-        self.assertRegex(
-            mock_fetch_inventory.call_args[0][2],
-            '^http://localhost:8080/media/.*/objects.inv$'
-        )

readthedocs/search/api/v2/serializers.py (+3, -40 lines)
@@ -6,7 +6,6 @@
 They should be renamed in the ES index too.
 """
 
-import itertools
 import re
 from functools import namedtuple
 from operator import attrgetter
@@ -157,52 +156,16 @@ def _get_full_path(self, obj):
 
     def get_blocks(self, obj):
         """Combine and sort inner results (domains and sections)."""
-        serializers = {
-            "domain": DomainSearchSerializer,
-            "section": SectionSearchSerializer,
-        }
-
-        inner_hits = obj.meta.inner_hits
-        sections = inner_hits.sections or []
-        domains = inner_hits.domains or []
-
-        # Make them identifiable before merging them
-        for s in sections:
-            s.type = "section"
-        for d in domains:
-            d.type = "domain"
-
+        sections = obj.meta.inner_hits.sections or []
         sorted_results = sorted(
-            itertools.chain(sections, domains),
+            sections,
             key=attrgetter("meta.score"),
             reverse=True,
         )
-        sorted_results = [serializers[hit.type](hit).data for hit in sorted_results]
+        sorted_results = [SectionSearchSerializer(hit).data for hit in sorted_results]
         return sorted_results
 
 
-class DomainHighlightSerializer(serializers.Serializer):
-
-    name = serializers.SerializerMethodField()
-    content = serializers.SerializerMethodField()
-
-    def get_name(self, obj):
-        return list(getattr(obj, "domains.name", []))
-
-    def get_content(self, obj):
-        return list(getattr(obj, "domains.docstrings", []))
-
-
-class DomainSearchSerializer(serializers.Serializer):
-
-    type = serializers.CharField(default="domain", source=None, read_only=True)
-    role = serializers.CharField(source="role_name")
-    name = serializers.CharField()
-    id = serializers.CharField(source="anchor")
-    content = serializers.CharField(source="docstrings")
-    highlights = DomainHighlightSerializer(source="meta.highlight", default=dict)
-
-
 class SectionHighlightSerializer(serializers.Serializer):
 
     title = serializers.SerializerMethodField()
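For readability, here is the new `get_blocks` from the diff above, reassembled as plain code; the enclosing serializer class, the `attrgetter` import, and `SectionSearchSerializer` are assumed from the rest of the module. Note that the docstring (an unchanged context line) still mentions domains even though only section inner hits remain.

```python
    def get_blocks(self, obj):
        """Combine and sort inner results (domains and sections)."""
        # Only section inner hits are serialized now; domain hits are gone.
        sections = obj.meta.inner_hits.sections or []
        sorted_results = sorted(
            sections,
            key=attrgetter("meta.score"),
            reverse=True,
        )
        sorted_results = [SectionSearchSerializer(hit).data for hit in sorted_results]
        return sorted_results
```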
