Skip to content

Commit e8ac769

Browse files
committed
[fix readthedocs#4265] Port Document search API for Elasticsearch 6.x
1 parent e9bfeee commit e8ac769

File tree

9 files changed

+108
-24
lines changed

9 files changed

+108
-24
lines changed

readthedocs/search/api.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from rest_framework import generics
2+
3+
from readthedocs.search.documents import PageDocument
4+
from readthedocs.search.filters import SearchFilterBackend
5+
from readthedocs.search.pagination import SearchPagination
6+
from readthedocs.search.serializers import PageSearchSerializer
7+
8+
9+
class PageSearchAPIView(generics.ListAPIView):

    """
    List API endpoint returning the pages that match a search query.

    The search string is read from the ``query`` GET parameter. Filtering is
    delegated to ``SearchFilterBackend``, pagination to ``SearchPagination``
    and output formatting to ``PageSearchSerializer``.
    """

    pagination_class = SearchPagination
    filter_backends = [SearchFilterBackend]
    serializer_class = PageSearchSerializer

    def get_queryset(self):
        """
        Return the search object built by ``PageDocument.search``.

        A missing ``query`` parameter is treated as an empty search string so
        that ``None`` is never handed to the query builder.
        """
        query = self.request.query_params.get('query', '')
        return PageDocument.search(query=query)

readthedocs/search/documents.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from django.conf import settings
22
from django_elasticsearch_dsl import DocType, Index, fields
3+
from elasticsearch_dsl.query import SimpleQueryString, Bool
34

45
from readthedocs.projects.models import Project, HTMLFile
56
from .conf import SEARCH_EXCLUDED_FILE
@@ -60,14 +61,19 @@ class Meta(object):
6061
content = fields.TextField(attr='processed_json.content')
6162
path = fields.TextField(attr='processed_json.path')
6263

64+
# Fields to perform search with weight
65+
search_fields = ['title^10', 'headers^5', 'content']
66+
6367
@classmethod
6468
def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None):
69+
es_query = cls.get_es_query(query=query)
6570
kwargs = {
6671
'using': using or cls._doc_type.using,
6772
'index': index or cls._doc_type.index,
6873
'doc_types': [cls],
6974
'model': cls._doc_type.model,
70-
'query': query
75+
'query': es_query,
76+
'fields': cls.search_fields
7177
}
7278
filters = {}
7379

@@ -80,6 +86,32 @@ def faceted_search(cls, query, projects_list=None, versions_list=None, using=Non
8086

8187
return FileSearch(**kwargs)
8288

89+
@classmethod
def search(cls, using=None, index=None, **kwargs):
    """
    Return the parent class' search with the page query applied.

    ``kwargs`` must contain ``query`` (the raw query string); it is popped
    and converted with ``get_es_query`` before being attached to the search.
    """
    base_search = super(PageDocument, cls).search(using=using, index=index)
    return base_search.query(cls.get_es_query(query=kwargs.pop('query')))
97+
98+
@classmethod
def get_es_query(cls, query):
    """Build the Elasticsearch query object for the given query string."""
    # One simple-query-string clause per default operator: a document
    # satisfying 'AND' also satisfies 'OR', so it should score higher.
    operator_clauses = [
        SimpleQueryString(query=query, fields=cls.search_fields,
                          default_operator=operator)
        for operator in ['AND', 'OR']
    ]

    # ``should`` returns results where either of the clauses matches
    return Bool(should=operator_clauses)
114+
83115
def get_queryset(self):
84116
"""Overwrite default queryset to filter certain files to index"""
85117
queryset = super(PageDocument, self).get_queryset()

readthedocs/search/faceted_search.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@ class RTDFacetedSearch(FacetedSearch):
99
# TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916
1010
# See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916
1111

12-
def __init__(self, using, index, doc_types, model, **kwargs):
12+
def __init__(self, using, index, doc_types, model, fields=None, **kwargs):
1313
self.using = using
1414
self.index = index
1515
self.doc_types = doc_types
1616
self._model = model
17+
if fields:
18+
self.fields = fields
1719
super(RTDFacetedSearch, self).__init__(**kwargs)
1820

1921

@@ -25,7 +27,6 @@ class ProjectSearch(RTDFacetedSearch):
2527

2628

2729
class FileSearch(RTDFacetedSearch):
28-
fields = ['title^10', 'headers^5', 'content']
2930
facets = {
3031
'project': TermsFacet(field='project'),
3132
'version': TermsFacet(field='version')
@@ -34,17 +35,6 @@ class FileSearch(RTDFacetedSearch):
3435
def query(self, search, query):
3536
"""Add query part to ``search``"""
3637
if query:
37-
all_queries = []
38-
39-
# Need to search for both 'AND' and 'OR' operations
40-
# The score of AND should be higher as it comes first
41-
for operator in ['AND', 'OR']:
42-
query_string = SimpleQueryString(query=query, fields=self.fields,
43-
default_operator=operator)
44-
all_queries.append(query_string)
45-
46-
# Run bool query with should, so it returns result where either of the query matches
47-
bool_query = Bool(should=all_queries)
48-
search = search.query(bool_query)
38+
search = search.query(query)
4939

5040
return search

readthedocs/search/filters.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from rest_framework import filters
2+
3+
from readthedocs.search.utils import get_project_slug_list_or_404
4+
5+
6+
class SearchFilterBackend(filters.BaseFilterBackend):

    """Filter backend limiting search results to the requested project."""

    def filter_queryset(self, request, queryset, view):
        """
        Apply a ``terms`` filter on ``project`` to ``queryset``.

        The slugs come from ``get_project_slug_list_or_404``, called with the
        ``project`` GET parameter and the requesting user.
        """
        allowed_slugs = get_project_slug_list_or_404(
            project_slug=request.query_params.get('project'),
            user=request.user,
        )
        return queryset.filter('terms', project=allowed_slugs)

readthedocs/search/pagination.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from rest_framework.pagination import PageNumberPagination
2+
3+
4+
class SearchPagination(PageNumberPagination):

    """Page-number pagination settings used by the search API."""

    # Name of the GET parameter a client uses to request a page size
    page_size_query_param = 'page_size'
    # Page size used when the client does not request one
    page_size = 10
    # Largest page size a client is allowed to request
    max_page_size = 100

readthedocs/search/serializers.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from rest_framework import serializers
2+
3+
4+
class PageSearchSerializer(serializers.Serializer):

    """Serializer exposing the fields of a page search result."""

    # Declaration order is kept as-is: it determines the output field order.
    title = serializers.CharField()
    headers = serializers.ListField()
    content = serializers.CharField()
    path = serializers.CharField()

readthedocs/search/utils.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
import json
1212

1313
from builtins import next, range
14+
from django.shortcuts import get_object_or_404
1415
from pyquery import PyQuery
1516

17+
from readthedocs.projects.models import Project
1618

1719
log = logging.getLogger(__name__)
1820

@@ -306,3 +308,19 @@ def parse_sections(documentation_type, content):
306308
return ''
307309

308310
return sections
311+
312+
313+
# TODO: Rewrite all the views using this in Class Based View,
314+
# and move this function to a mixin
315+
def get_project_slug_list_or_404(project_slug, user):
    """
    Return the project's own slug followed by the slugs of its subprojects.

    Only projects returned by ``Project.objects.api(user)`` are considered.
    The project is looked up with ``get_object_or_404``, so a slug that is
    not found among them results in a 404 instead of a return value.
    """
    visible_projects = Project.objects.api(user).only('slug')
    parent = get_object_or_404(visible_projects, slug=project_slug)

    child_slugs = visible_projects.filter(
        superprojects__parent_id=parent.id
    ).values_list('slug', flat=True)

    slugs = [parent.slug]
    slugs.extend(child_slugs)
    return slugs

readthedocs/search/views.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from readthedocs.projects.models import Project
1515
from readthedocs.search import lib as search_lib
1616
from readthedocs.search.documents import ProjectDocument, PageDocument
17+
from readthedocs.search.utils import get_project_slug_list_or_404
1718

1819
log = logging.getLogger(__name__)
1920
LOG_TEMPLATE = u'(Elastic Search) [{user}:{type}] [{project}:{version}:{language}] {msg}'
@@ -54,14 +55,9 @@ def elastic_search(request):
5455
elif user_input.type == 'file':
5556
kwargs = {}
5657
if user_input.project:
57-
queryset = Project.objects.api(request.user).only('slug')
58-
project = get_object_or_404(queryset, slug=user_input.project)
59-
60-
subprojects_slug = (queryset.filter(superprojects__parent_id=project.id)
61-
.values_list('slug', flat=True))
62-
63-
projects_list = [project.slug] + list(subprojects_slug)
64-
kwargs['projects_list'] = projects_list
58+
project_slug_list = get_project_slug_list_or_404(project_slug=user_input.project,
59+
user=request.user)
60+
kwargs['projects_list'] = project_slug_list
6561
if user_input.version:
6662
kwargs['versions_list'] = user_input.version
6763

readthedocs/urls.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
do_not_track,
2323
)
2424
from readthedocs.search import views as search_views
25-
25+
from readthedocs.search.api import PageSearchAPIView
2626

2727
v1_api = Api(api_name='v1')
2828
v1_api.register(UserResource())
@@ -67,6 +67,7 @@
6767
url(r'^api/', include(v1_api.urls)),
6868
url(r'^api/v2/', include('readthedocs.restapi.urls')),
6969
url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')),
70+
url(r'^api/search/', PageSearchAPIView.as_view()),
7071
]
7172

7273
i18n_urls = [

0 commit comments

Comments
 (0)