Skip to content

Commit 4383072

Browse files
committed
Add initial search implementation to DomainData
This moves project search to be a subset of site search. It removes the second entry point to site search and unifies all of our searching so that it uses fancy facets and is nice.
1 parent 0986866 commit 4383072

File tree

14 files changed

+1276
-293
lines changed

14 files changed

+1276
-293
lines changed

readthedocs/domaindata/__init__.py

Whitespace-only changes.

readthedocs/domaindata/admin.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from django.contrib import admin
2+
from .models import DomainData
3+
4+
5+
@admin.register(DomainData)
class DomainDataAdmin(admin.ModelAdmin):

    """Admin configuration for ``DomainData``, registered on the default site."""

    # Free-text search over the document name and the object name.
    search_fields = ('doc_name', 'name')
    # Render the foreign keys as raw id inputs instead of select boxes.
    raw_id_fields = ('project', 'version')
    # Sidebar filters shown on the change list.
    list_filter = ('type', 'project')

readthedocs/domaindata/api.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from rest_framework import serializers
2+
3+
from readthedocs.restapi.views.model_views import UserSelectViewSet
4+
from .models import DomainData
5+
6+
7+
class DomainDataSerializer(serializers.ModelSerializer):

    """Serialize ``DomainData``, exposing project and version by their slug."""

    # Both relations are read-only and rendered as the related object's slug.
    project = serializers.SlugRelatedField(read_only=True, slug_field='slug')
    version = serializers.SlugRelatedField(read_only=True, slug_field='slug')

    class Meta:
        model = DomainData
        fields = (
            'project',
            'version',
            'name',
            'display_name',
            'doc_type',
            'doc_url',
        )
14+
15+
16+
class DomainDataAdminSerializer(DomainDataSerializer):

    """
    Variant of ``DomainDataSerializer`` that serializes all fields.

    Used as ``admin_serializer_class`` by ``DomainDataAPIView``.
    """

    class Meta(DomainDataSerializer.Meta):
        # Override the explicit field list inherited from the parent Meta.
        fields = '__all__'
20+
21+
22+
class DomainDataAPIView(UserSelectViewSet):

    """API viewset for ``DomainData`` objects."""

    model = DomainData

    # Two serializers are configured: the regular one and the admin one.
    serializer_class = DomainDataSerializer
    admin_serializer_class = DomainDataAdminSerializer

    # Fields that may be used to filter the listing.
    filter_fields = (
        'project__slug',
        'version__slug',
        'domain',
        'type',
        'doc_name',
        'name',
    )

readthedocs/domaindata/models.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
from django.db import models
2+
from django.utils.encoding import python_2_unicode_compatible
3+
from django.utils.translation import ugettext_lazy as _
4+
5+
from readthedocs.builds.models import Version
6+
from readthedocs.core.resolver import resolve
7+
from readthedocs.projects.models import Project
8+
from readthedocs.projects.querysets import RelatedProjectQuerySet
9+
10+
11+
@python_2_unicode_compatible
class DomainData(models.Model):

    """
    Information from a project about its Sphinx domains.

    This captures data about API objects that exist in that codebase.
    """

    project = models.ForeignKey(
        Project,
        related_name='domain_data',
    )
    version = models.ForeignKey(
        Version,
        verbose_name=_('Version'),
        related_name='domain_data',
    )
    # ``auto_now`` refreshes this timestamp on every save.
    modified_date = models.DateTimeField(_('Publication date'), auto_now=True)
    commit = models.CharField(_('Commit'), max_length=255)

    # ``domain`` and ``type`` are stored separately; ``doc_type`` joins them
    # back together as ``domain:type``.
    domain = models.CharField(
        _('Domain'),
        max_length=255,
    )
    name = models.CharField(
        _('Name'),
        max_length=255,
    )
    display_name = models.CharField(
        _('Display Name'),
        max_length=255,
    )
    type = models.CharField(
        _('Type'),
        max_length=255,
    )
    doc_name = models.CharField(
        _('Doc Name'),
        max_length=255,
    )
    anchor = models.CharField(
        _('Anchor'),
        max_length=255,
    )
    objects = RelatedProjectQuerySet.as_manager()

    def __str__(self):
        """Return a single-line, human readable description of the object."""
        # Built from adjacent f-strings rather than one triple-quoted literal
        # so the result has no leading newline or embedded indentation.
        return (
            f'DomainData [{self.project.slug}:{self.version.slug}] '
            f'[{self.domain}:{self.type}] '
            f'{self.name} -> {self.doc_name}#{self.anchor}'
        )

    @property
    def doc_type(self):
        """Return the domain and type joined as ``domain:type``."""
        return f'{self.domain}:{self.type}'

    @property
    def doc_url(self):
        """
        Return the URL of the documentation page that defines this object.

        The path handed to ``resolve`` is ``doc_name``, followed by
        ``#anchor`` when an anchor is set.
        """
        path = self.doc_name
        if self.anchor:
            path += f'#{self.anchor}'
        full_url = resolve(
            project=self.project,
            version_slug=self.version.slug,
            filename=path,
        )
        return full_url

readthedocs/projects/tasks.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import json
1212
import logging
1313
import os
14+
import sys
1415
import shutil
1516
import socket
1617
from collections import Counter, defaultdict

readthedocs/projects/urls/public.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
),
5252
url(
5353
r'^(?P<project_slug>{project_slug})/search/$'.format(**pattern_opts),
54-
search_views.elastic_project_search,
54+
search_views.elastic_search,
5555
name='elastic_project_search',
5656
),
5757
url(

readthedocs/search/documents.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
from django.conf import settings
55
from django_elasticsearch_dsl import DocType, Index, fields
66

7-
from readthedocs.projects.models import HTMLFile, Project
8-
7+
from readthedocs.projects.models import Project, HTMLFile
8+
from readthedocs.domaindata.models import DomainData
99

1010
project_conf = settings.ES_INDEXES['project']
1111
project_index = Index(project_conf['name'])
@@ -15,9 +15,52 @@
1515
page_index = Index(page_conf['name'])
1616
page_index.settings(**page_conf['settings'])
1717

18+
domain_conf = settings.ES_INDEXES['domain']
19+
domain_index = Index(domain_conf['name'])
20+
domain_index.settings(**domain_conf['settings'])
21+
1822
log = logging.getLogger(__name__)
1923

2024

25+
@domain_index.doc_type
class DomainDocument(DocType):

    """Search document built from ``DomainData`` objects."""

    # Keyword fields populated from attributes of the model instance.
    project = fields.KeywordField(attr='project.slug')
    version = fields.KeywordField(attr='version.slug')
    doc_type = fields.KeywordField(attr='doc_type')
    anchor = fields.KeywordField(attr='anchor')

    class Meta(object):
        model = DomainData
        fields = ('name', 'display_name', 'doc_name')
        ignore_signals = True

    @classmethod
    def faceted_search(cls, query, user, doc_type=None):
        """
        Build a ``DomainSearch`` for ``query`` on behalf of ``user``.

        :param query: the search query string
        :param user: the user performing the search
        :param doc_type: optional ``domain:type`` value to filter results by
        :returns: a ``DomainSearch`` instance
        """
        # Imported locally: ``faceted_search`` imports this module, so a
        # module-level import here would be circular.
        from readthedocs.search.faceted_search import DomainSearch

        kwargs = {
            'user': user,
            'query': query,
        }

        if doc_type:
            kwargs['filters'] = {'doc_type': doc_type}

        return DomainSearch(**kwargs)

    def get_queryset(self):
        """Overwrite the default queryset to leave certain objects unindexed."""
        queryset = super().get_queryset()

        # (domain, type) pairs that must not be indexed. The model stores the
        # two parts in separate columns, so filtering ``type`` against the
        # combined ``'std:doc'`` form would never match any row.
        excluded_types = (
            ('std', 'doc'),
            ('std', 'label'),
        )

        for domain, type_ in excluded_types:
            queryset = queryset.exclude(domain=domain, type=type_)
        return queryset
62+
63+
2164
@project_index.doc_type
2265
class ProjectDocument(DocType):
2366

readthedocs/search/faceted_search.py

Lines changed: 83 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,40 @@
1+
# -*- coding: utf-8 -*-
12
import logging
23

34
from elasticsearch_dsl import FacetedSearch, TermsFacet
45
from elasticsearch_dsl.query import Bool, SimpleQueryString
56

7+
from readthedocs.search.documents import (
8+
DomainDocument,
9+
PageDocument,
10+
ProjectDocument,
11+
)
612
from readthedocs.core.utils.extend import SettingsOverrideObject
7-
from readthedocs.search.documents import PageDocument, ProjectDocument
813

914
log = logging.getLogger(__name__)
1015

16+
ALL_FACETS = ['project', 'version', 'doc_type', 'language', 'index']
17+
1118

1219
class RTDFacetedSearch(FacetedSearch):
1320

1421
def __init__(self, user, **kwargs):
    """
    Store the user and move facet keyword arguments into ``filters``.

    Keyword arguments named after one of this class' facets become entries
    of ``kwargs['filters']``; any remaining name listed in ``ALL_FACETS``
    is dropped before the parent constructor is called.
    """
    self.user = user
    self.filter_by_user = kwargs.pop('filter_by_user', None)

    # Set filters properly
    for facet_name in self.facets:
        if facet_name not in kwargs:
            continue
        facet_value = kwargs.pop(facet_name)
        kwargs.setdefault('filters', {})[facet_name] = facet_value

    # Don't pass along unnecessary filters
    for facet_name in ALL_FACETS:
        kwargs.pop(facet_name, None)

    super().__init__(**kwargs)
36+
37+
def search(self):
1538
"""
1639
Pass in a user in order to filter search results by privacy.
1740
@@ -20,19 +43,36 @@ def __init__(self, user, **kwargs):
2043
The `self.user` attribute isn't currently used on the .org,
2144
but is used on the .com
2245
"""
23-
self.user = user
24-
self.filter_by_user = kwargs.pop('filter_by_user', None)
25-
super().__init__(**kwargs)
46+
s = super().search()
47+
s = s.source(exclude=['content', 'headers'])
48+
# Return 25 results
49+
return s[:25]
2650

2751
def query(self, search, query):
2852
"""
2953
Add query part to ``search`` when needed.
3054
31-
Also does HTML encoding of results to avoid XSS issues.
55+
Also:
56+
57+
* Adds SimpleQueryString instead of default query.
58+
* Adds HTML encoding of results to avoid XSS issues.
3259
"""
33-
search = super().query(search, query)
3460
search = search.highlight_options(encoder='html', number_of_fragments=3)
35-
search = search.source(exclude=['content', 'headers'])
61+
62+
all_queries = []
63+
64+
# need to search for both 'and' and 'or' operations
65+
# the score of and should be higher as it satisfies both or and and
66+
for operator in ['and', 'or']:
67+
query_string = SimpleQueryString(
68+
query=query, fields=self.fields, default_operator=operator
69+
)
70+
all_queries.append(query_string)
71+
72+
# run bool query with should, so it returns result where either of the query matches
73+
bool_query = Bool(should=all_queries)
74+
75+
search = search.query(bool_query)
3676
return search
3777

3878

@@ -52,26 +92,16 @@ class PageSearchBase(RTDFacetedSearch):
5292
index = PageDocument._doc_type.index
5393
fields = ['title^10', 'headers^5', 'content']
5494

55-
def query(self, search, query):
56-
"""Use a custom SimpleQueryString instead of default query."""
57-
58-
search = super().query(search, query)
59-
60-
all_queries = []
6195

62-
# need to search for both 'and' and 'or' operations
63-
# the score of and should be higher as it satisfies both or and and
64-
for operator in ['AND', 'OR']:
65-
query_string = SimpleQueryString(
66-
query=query, fields=self.fields, default_operator=operator
67-
)
68-
all_queries.append(query_string)
69-
70-
# run bool query with should, so it returns result where either of the query matches
71-
bool_query = Bool(should=all_queries)
72-
73-
search = search.query(bool_query)
74-
return search
96+
class DomainSearchBase(RTDFacetedSearch):

    """Faceted search over the index of ``DomainDocument``."""

    index = DomainDocument._doc_type.index
    doc_types = [DomainDocument]
    fields = ('display_name^5', 'name')
    # Each facet is a terms facet on the document field of the same name.
    facets = {
        facet_name: TermsFacet(field=facet_name)
        for facet_name in ('project', 'version', 'doc_type')
    }
75105

76106

77107
class PageSearch(SettingsOverrideObject):
@@ -94,3 +124,30 @@ class ProjectSearch(SettingsOverrideObject):
94124
"""
95125

96126
_default_class = ProjectSearchBase
127+
128+
129+
class DomainSearch(SettingsOverrideObject):

    """
    Allow this class to be overridden based on CLASS_OVERRIDES setting.

    This is primarily used on the .com to adjust how we filter our search queries.
    """

    _default_class = DomainSearchBase
138+
139+
140+
class AllSearch(RTDFacetedSearch):

    """Faceted search spanning the domain, page and project documents."""

    doc_types = [DomainDocument, PageDocument, ProjectDocument]
    # One index per document type, in the same order as ``doc_types``.
    index = [document._doc_type.index for document in doc_types]
    fields = (
        'title^10',
        'headers^5',
        'content',
        'name^20',
        'slug^5',
        'description',
        'display_name^5',
    )
    facets = {
        'project': TermsFacet(field='project'),
        'version': TermsFacet(field='version'),
        'language': TermsFacet(field='language'),
        'doc_type': TermsFacet(field='doc_type'),
        # The ``index`` facet is backed by the ``_index`` field.
        'index': TermsFacet(field='_index'),
    }

0 commit comments

Comments
 (0)