Commit bbae0dc

Merge pull request #5290 from rtfd/domain-search

Add search for DomainData objects

2 parents 5da21c0 + 479e4e8

19 files changed: +536 −356 lines

readthedocs/projects/admin.py (+1)

@@ -307,6 +307,7 @@ class ImportedFileAdmin(admin.ModelAdmin):

     raw_id_fields = ('project', 'version')
     list_display = ('path', 'name', 'version')
+    search_fields = ('project', 'path')


 class DomainAdmin(admin.ModelAdmin):
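
Note: Django's admin can only search text columns directly; for a ForeignKey such as `project`, the documented form is a related lookup like `project__slug`. A minimal sketch of that convention (illustrative only, not part of this commit):

from django.contrib import admin

from readthedocs.projects.models import ImportedFile


class ImportedFileAdminSketch(admin.ModelAdmin):
    raw_id_fields = ('project', 'version')
    list_display = ('path', 'name', 'version')
    # Follow the FK into Project and match on its slug column;
    # a bare 'project' names the FK itself rather than a text field.
    search_fields = ('project__slug', 'path')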

readthedocs/projects/models.py (+2 −1)

@@ -13,6 +13,7 @@
 from django.db.models import Prefetch
 from django.urls import NoReverseMatch, reverse
 from django.utils.translation import ugettext_lazy as _
+from django.utils.functional import cached_property
 from django_extensions.db.models import TimeStampedModel
 from guardian.shortcuts import assign
 from six.moves import shlex_quote
@@ -1215,7 +1216,7 @@ def get_processed_json(self):
             'sections': [],
         }

-    @property
+    @cached_property
     def processed_json(self):
         return self.get_processed_json()

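Note: swapping `@property` for `@cached_property` makes `processed_json` parse its underlying JSON artifact once per model instance instead of on every attribute access — which matters because `PageDocument` (see readthedocs/search/documents.py below) reads `processed_json.headers` and `processed_json.content` as separate document fields. A minimal sketch of the semantics (illustrative class, not from the codebase):

from django.utils.functional import cached_property


class Page:
    @cached_property
    def processed_json(self):
        print('parsing...')  # runs only on the first access
        return {'title': 'Home', 'sections': []}


page = Page()
page.processed_json      # prints 'parsing...', caches the result on the instance
page.processed_json      # served from page.__dict__, no re-parse
del page.processed_json  # dropping the cached value forces a re-parse next time
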
readthedocs/projects/signals.py (+2 −2)

@@ -15,9 +15,9 @@

 files_changed = django.dispatch.Signal(providing_args=['project', 'files'])

-bulk_post_create = django.dispatch.Signal(providing_args=['instance_list'])
+bulk_post_create = django.dispatch.Signal(providing_args=['instance_list', 'commit'])

-bulk_post_delete = django.dispatch.Signal(providing_args=['instance_list'])
+bulk_post_delete = django.dispatch.Signal(providing_args=['instance_list', 'commit'])

 # Used to force verify a domain (eg. for SSL cert issuance)
 domain_verify = django.dispatch.Signal(providing_args=['domain'])
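
Note: both bulk signals now advertise a `commit` argument, so receivers know which build produced (or superseded) the instances. Receivers pick it up as a keyword argument; a minimal receiver sketch (logging only — the real handlers live in the search app and feed Elasticsearch):

import logging

from django.dispatch import receiver

from readthedocs.projects.signals import bulk_post_create, bulk_post_delete

log = logging.getLogger(__name__)


@receiver(bulk_post_create)
def log_bulk_create(sender, instance_list, commit=None, **kwargs):
    # `sender` is a model class (HTMLFile or SphinxDomain) and
    # `instance_list` is always a concrete list, never a queryset.
    log.info('%d %s objects created at commit %s',
             len(instance_list), sender.__name__, commit)


@receiver(bulk_post_delete)
def log_bulk_delete(sender, instance_list, commit=None, **kwargs):
    log.info('%d %s objects superseded by commit %s',
             len(instance_list), sender.__name__, commit)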

readthedocs/projects/tasks.py (+65 −28)

@@ -1177,13 +1177,12 @@ def fileify(version_pk, commit):
     project = version.project

     if not commit:
-        log.info(
+        log.warning(
             LOG_TEMPLATE.format(
                 project=project.slug,
                 version=version.slug,
                 msg=(
-                    'Imported File not being built because no commit '
-                    'information'
+                    'Search index not being built because no commit information'
                 ),
             ),
         )
@@ -1198,16 +1197,15 @@ def fileify(version_pk, commit):
                 msg='Creating ImportedFiles',
             ),
         )
-        _manage_imported_files(version, path, commit)
-        _update_intersphinx_data(version, path, commit)
-    else:
-        log.info(
-            LOG_TEMPLATE.format(
-                project=project.slug,
-                version=version.slug,
-                msg='No ImportedFile files',
-            ),
-        )
+        try:
+            _manage_imported_files(version, path, commit)
+        except Exception:
+            log.exception('Failed during ImportedFile creation')
+
+        try:
+            _update_intersphinx_data(version, path, commit)
+        except Exception:
+            log.exception('Failed during SphinxDomain creation')


 def _update_intersphinx_data(version, path, commit):
@@ -1223,6 +1221,20 @@ def _update_intersphinx_data(version, path, commit):
         log.debug('No objects.inv, skipping intersphinx indexing.')
         return

+    full_json_path = version.project.get_production_media_path(
+        type_='json', version_slug=version.slug, include_file=False
+    )
+    type_file = os.path.join(full_json_path, 'readthedocs-sphinx-domain-names.json')
+    types = {}
+    titles = {}
+    if os.path.exists(type_file):
+        try:
+            data = json.load(open(type_file))
+            types = data['types']
+            titles = data['titles']
+        except Exception:
+            log.exception('Exception parsing readthedocs-sphinx-domain-names.json')
+
     # These classes are copied from Sphinx
     # https://git.io/fhFbI
     class MockConfig:
@@ -1236,6 +1248,8 @@ class MockApp:
         def warn(self, msg):
             log.warning('Sphinx MockApp: %s', msg)

+    created_sphinx_domains = []
+
     invdata = intersphinx.fetch_inventory(MockApp(), '', object_file)
     for key, value in sorted(invdata.items() or {}):
         domain, _type = key.split(':')
@@ -1252,22 +1266,41 @@ def warn(self, msg):
         else:
            doc_name, anchor = url, ''
         display_name = einfo[3]
-        obj, _ = SphinxDomain.objects.get_or_create(
+        obj, created = SphinxDomain.objects.get_or_create(
             project=version.project,
             version=version,
             domain=domain,
             name=name,
             display_name=display_name,
             type=_type,
+            type_display=types.get(f'{domain}:{_type}', ''),
             doc_name=doc_name,
+            doc_display=titles.get(doc_name, ''),
             anchor=anchor,
         )
         if obj.commit != commit:
             obj.commit = commit
             obj.save()
-    SphinxDomain.objects.filter(project=version.project,
-                                version=version
-                                ).exclude(commit=commit).delete()
+        if created:
+            created_sphinx_domains.append(obj)
+
+    # Send bulk_post_create signal for bulk indexing to Elasticsearch
+    bulk_post_create.send(sender=SphinxDomain, instance_list=created_sphinx_domains, commit=commit)
+
+    # Delete the SphinxDomain first from previous commit and
+    # send bulk_post_delete signal for bulk removing from Elasticsearch
+    delete_queryset = (
+        SphinxDomain.objects.filter(project=version.project,
+                                    version=version
+                                    ).exclude(commit=commit)
+    )
+    # Keep the objects into memory to send it to signal
+    instance_list = list(delete_queryset)
+    # Always pass the list of instance, not queryset.
+    bulk_post_delete.send(sender=SphinxDomain, instance_list=instance_list, commit=commit)
+
+    # Delete from previous versions
+    delete_queryset.delete()


 def _manage_imported_files(version, path, commit):
@@ -1294,7 +1327,7 @@ def _manage_imported_files(version, path, commit):
             md5 = hashlib.md5(open(full_path, 'rb').read()).hexdigest()
             try:
                 # pylint: disable=unpacking-non-sequence
-                obj, __ = model_class.objects.get_or_create(
+                obj, created = model_class.objects.get_or_create(
                     project=version.project,
                     version=version,
                     path=dirpath,
@@ -1310,34 +1343,38 @@ def _manage_imported_files(version, path, commit):
                 obj.commit = commit
                 obj.save()

-            if model_class == HTMLFile:
+            if created and model_class == HTMLFile:
                 # the `obj` is HTMLFile, so add it to the list
                 created_html_files.append(obj)

     # Send bulk_post_create signal for bulk indexing to Elasticsearch
-    bulk_post_create.send(sender=HTMLFile, instance_list=created_html_files)
+    bulk_post_create.send(sender=HTMLFile, instance_list=created_html_files,
+                          version=version, commit=commit)

     # Delete the HTMLFile first from previous commit and
     # send bulk_post_delete signal for bulk removing from Elasticsearch
     delete_queryset = (
         HTMLFile.objects.filter(project=version.project,
                                 version=version).exclude(commit=commit)
     )
+
     # Keep the objects into memory to send it to signal
     instance_list = list(delete_queryset)
+
     # Always pass the list of instance, not queryset.
     # These objects must exist though,
     # because the task will query the DB for the objects before deleting
-    bulk_post_delete.send(sender=HTMLFile, instance_list=instance_list)
-    # Safely delete from database
-    delete_queryset.delete()
+    bulk_post_delete.send(sender=HTMLFile, instance_list=instance_list,
+                          version=version, commit=commit)

     # Delete ImportedFiles from previous versions
-    (
-        ImportedFile.objects.filter(project=version.project,
-                                    version=version).exclude(commit=commit
-                                    ).delete()
-    )
+    delete_queryset.delete()
+
+    # This is required to delete ImportedFile objects that aren't HTMLFile objects,
+    ImportedFile.objects.filter(
+        project=version.project, version=version
+    ).exclude(commit=commit).delete()
+
     changed_files = [
         resolve_path(
             version.project,

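Note: `_update_intersphinx_data` and `_manage_imported_files` now share the same commit-based reconciliation pattern: upsert rows for the current build, signal the newly created ones for bulk indexing, then snapshot and signal the stale rows before deleting them. A condensed sketch of the pattern (generic names, not the actual task code):

from readthedocs.projects.signals import bulk_post_create, bulk_post_delete


def sync_indexed_objects(model, rows, version, commit):
    """Reconcile `model` rows for one build, keyed by `commit`."""
    created = []
    for lookup_fields in rows:
        obj, was_created = model.objects.get_or_create(version=version, **lookup_fields)
        if obj.commit != commit:
            obj.commit = commit
            obj.save()
        if was_created:
            created.append(obj)

    # New rows are indexed in one bulk call on the receiver side.
    bulk_post_create.send(sender=model, instance_list=created, commit=commit)

    # Rows from earlier commits are stale. Materialize them as a list and
    # send the signal *before* deleting, so receivers can still resolve them.
    stale = model.objects.filter(version=version).exclude(commit=commit)
    instance_list = list(stale)
    bulk_post_delete.send(sender=model, instance_list=instance_list, commit=commit)
    stale.delete()
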
readthedocs/projects/urls/public.py (+1 −1)

@@ -51,7 +51,7 @@
     ),
     url(
         r'^(?P<project_slug>{project_slug})/search/$'.format(**pattern_opts),
-        search_views.elastic_project_search,
+        search_views.elastic_search,
         name='elastic_project_search',
     ),
     url(

readthedocs/restapi/urls.py (+1 −1)

@@ -35,7 +35,7 @@
 router.register(r'project', ProjectViewSet, basename='project')
 router.register(r'notification', NotificationViewSet, basename='emailhook')
 router.register(r'domain', DomainViewSet, basename='domain')
-router.register(r'sphinx_domains', SphinxDomainAPIView, basename='sphinxdomain')
+router.register(r'sphinx_domain', SphinxDomainAPIView, basename='sphinxdomain')
 router.register(
     r'remote/org',
     RemoteOrganizationViewSet,
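
Note: renaming the prefix from `sphinx_domains` to `sphinx_domain` matches the singular style of the neighboring registrations and only changes the URL path; reversing goes through the basename, so route names are unaffected. A quick sketch (paths assume the router is mounted under /api/v2/, as elsewhere in readthedocs/restapi):

from django.urls import reverse

# A DRF router derives two routes from register(prefix, viewset, basename):
#   /api/v2/sphinx_domain/        -> name='sphinxdomain-list'
#   /api/v2/sphinx_domain/{pk}/   -> name='sphinxdomain-detail'
reverse('sphinxdomain-list')
reverse('sphinxdomain-detail', kwargs={'pk': 42})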

readthedocs/search/api.py (+8 −8)

@@ -6,14 +6,14 @@
 from rest_framework.exceptions import ValidationError
 from rest_framework.pagination import PageNumberPagination

-from readthedocs.search.documents import PageDocument
+from readthedocs.search.faceted_search import PageSearch
 from readthedocs.search.utils import get_project_list_or_404

 log = logging.getLogger(__name__)


 class SearchPagination(PageNumberPagination):
-    page_size = 25
+    page_size = 50
     page_size_query_param = 'page_size'
     max_page_size = 100

@@ -62,15 +62,15 @@ def get_queryset(self):
         # Validate all the required params are there
         self.validate_query_params()
         query = self.request.query_params.get('q', '')
-        kwargs = {'filter_by_user': False}
-        kwargs['projects_list'] = [p.slug for p in self.get_all_projects()]
-        kwargs['versions_list'] = self.request.query_params.get('version')
-        if not kwargs['projects_list']:
+        kwargs = {'filter_by_user': False, 'filters': {}}
+        kwargs['filters']['project'] = [p.slug for p in self.get_all_projects()]
+        kwargs['filters']['version'] = self.request.query_params.get('version')
+        if not kwargs['filters']['project']:
             raise ValidationError("Unable to find a project to search")
-        if not kwargs['versions_list']:
+        if not kwargs['filters']['version']:
             raise ValidationError("Unable to find a version to search")
         user = self.request.user
-        queryset = PageDocument.faceted_search(
+        queryset = PageSearch(
             query=query, user=user, **kwargs
         )
         return queryset
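
Note: folding the ad-hoc `projects_list`/`versions_list` kwargs into one `filters` dict gives `PageSearch` a single, uniform place to apply term filters. A sketch of how such a mapping is typically consumed with elasticsearch-dsl (illustrative; not the actual PageSearch implementation):

from elasticsearch_dsl import Search


def apply_filters(search: Search, filters: dict) -> Search:
    """Turn {'project': ['docs'], 'version': 'latest'} into term(s) filters."""
    for field, value in filters.items():
        if isinstance(value, (list, tuple)):
            search = search.filter('terms', **{field: value})
        elif value:
            search = search.filter('term', **{field: value})
    return search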

readthedocs/search/documents.py (+49 −1)

@@ -1,10 +1,10 @@
-# -*- coding: utf-8 -*-
 import logging

 from django.conf import settings
 from django_elasticsearch_dsl import DocType, Index, fields

 from readthedocs.projects.models import HTMLFile, Project
+from readthedocs.sphinx_domains.models import SphinxDomain


 project_conf = settings.ES_INDEXES['project']
@@ -15,9 +15,53 @@
 page_index = Index(page_conf['name'])
 page_index.settings(**page_conf['settings'])

+domain_conf = settings.ES_INDEXES['domain']
+domain_index = Index(domain_conf['name'])
+domain_index.settings(**domain_conf['settings'])
+
 log = logging.getLogger(__name__)


+@domain_index.doc_type
+class SphinxDomainDocument(DocType):
+    project = fields.KeywordField(attr='project.slug')
+    version = fields.KeywordField(attr='version.slug')
+    role_name = fields.KeywordField(attr='role_name')
+
+    # For linking to the URL
+    doc_name = fields.KeywordField(attr='doc_name')
+    anchor = fields.KeywordField(attr='anchor')
+
+    # For showing in the search result
+    type_display = fields.TextField(attr='type_display')
+    doc_display = fields.TextField(attr='doc_display')
+
+    # Simple analyzer breaks on `.`,
+    # otherwise search results are too strict for this use case
+    name = fields.TextField(attr='name', analyzer='simple')
+    display_name = fields.TextField(attr='display_name', analyzer='simple')
+
+    modified_model_field = 'modified'
+
+    class Meta(object):
+        model = SphinxDomain
+        fields = ('commit',)
+        ignore_signals = True
+
+    def get_queryset(self):
+        """Overwrite default queryset to filter certain files to index."""
+        queryset = super().get_queryset()
+
+        excluded_types = [
+            {'domain': 'std', 'type': 'doc'},
+            {'domain': 'std', 'type': 'label'},
+        ]
+
+        for exclude in excluded_types:
+            queryset = queryset.exclude(**exclude)
+        return queryset
+
+
 @project_index.doc_type
 class ProjectDocument(DocType):

@@ -31,6 +75,8 @@ class ProjectDocument(DocType):
     )
     language = fields.KeywordField()

+    modified_model_field = 'modified_date'
+
     class Meta(object):
         model = Project
         fields = ('name', 'slug', 'description')
@@ -63,6 +109,8 @@ class PageDocument(DocType):
     headers = fields.TextField(attr='processed_json.headers')
     content = fields.TextField(attr='processed_json.content')

+    modified_model_field = 'modified_date'
+
     class Meta(object):
         model = HTMLFile
         fields = ('commit',)
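
Note on the analyzer comment above: Elasticsearch's built-in `simple` analyzer lowercases and splits on every non-letter character, so a dotted name like `readthedocs.projects.models.HTMLFile` is indexed as four separate tokens and a query for just `models` can match it, whereas the default `standard` analyzer would keep the dotted name as a single token and make partial matches fail. A rough pure-Python approximation of that tokenization (illustrative only):

import re


def simple_analyze(text):
    """Approximate ES's `simple` analyzer: lowercase, split on non-letters."""
    return [token for token in re.split(r'[^a-z]+', text.lower()) if token]


print(simple_analyze('readthedocs.projects.models.HTMLFile'))
# ['readthedocs', 'projects', 'models', 'htmlfile']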
