[fix readthedocs#4265] Port Document search API for Elasticsearch 6.x

safwanrahman · safwanrahman · commit e8ac7691f13b · 2018-07-16T09:20:50.000+06:00
diff --git a/readthedocs/search/api.py b/readthedocs/search/api.py
@@ -0,0 +1,17 @@
+from rest_framework import generics
+
+from readthedocs.search.documents import PageDocument
+from readthedocs.search.filters import SearchFilterBackend
+from readthedocs.search.pagination import SearchPagination
+from readthedocs.search.serializers import PageSearchSerializer
+
+
+class PageSearchAPIView(generics.ListAPIView):
+    pagination_class = SearchPagination
+    filter_backends = [SearchFilterBackend]
+    serializer_class = PageSearchSerializer
+
+    def get_queryset(self):
+        query = self.request.query_params.get('query')
+        queryset = PageDocument.search(query=query)
+        return queryset
diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py
@@ -1,5 +1,6 @@
 from django.conf import settings
 from django_elasticsearch_dsl import DocType, Index, fields
+from elasticsearch_dsl.query import SimpleQueryString, Bool
 
 from readthedocs.projects.models import Project, HTMLFile
 from .conf import SEARCH_EXCLUDED_FILE
@@ -60,14 +61,19 @@ class Meta(object):
     content = fields.TextField(attr='processed_json.content')
     path = fields.TextField(attr='processed_json.path')
 
+    # Fields to search; a ``^N`` suffix is the boost weight applied to that field
+    search_fields = ['title^10', 'headers^5', 'content']
+
     @classmethod
     def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None):
+        es_query = cls.get_es_query(query=query)
         kwargs = {
             'using': using or cls._doc_type.using,
             'index': index or cls._doc_type.index,
             'doc_types': [cls],
             'model': cls._doc_type.model,
-            'query': query
+            'query': es_query,
+            'fields': cls.search_fields
         }
         filters = {}
 
@@ -80,6 +86,32 @@ def faceted_search(cls, query, projects_list=None, versions_list=None, using=Non
 
         return FileSearch(**kwargs)
 
+    @classmethod
+    def search(cls, using=None, index=None, **kwargs):
+        es_search = super(PageDocument, cls).search(using=using, index=index)
+        query = kwargs.pop('query')
+        es_query = cls.get_es_query(query=query)
+
+        es_search = es_search.query(es_query)
+        return es_search
+
+    @classmethod
+    def get_es_query(cls, query):
+        """Return the Elasticsearch query generated from the query string"""
+        all_queries = []
+
+        # Build one query per operator: 'AND' (all terms) and 'OR' (any term).
+        # A document matching the AND query also matches OR, so it scores higher.
+        for operator in ['AND', 'OR']:
+            query_string = SimpleQueryString(query=query, fields=cls.search_fields,
+                                             default_operator=operator)
+            all_queries.append(query_string)
+
+        # Combine them in a bool ``should`` so a document matching either query is returned
+        bool_query = Bool(should=all_queries)
+
+        return bool_query
+
     def get_queryset(self):
         """Overwrite default queryset to filter certain files to index"""
         queryset = super(PageDocument, self).get_queryset()
diff --git a/readthedocs/search/faceted_search.py b/readthedocs/search/faceted_search.py
@@ -9,11 +9,13 @@ class RTDFacetedSearch(FacetedSearch):
     # TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916
     # See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916
 
-    def __init__(self, using, index, doc_types, model, **kwargs):
+    def __init__(self, using, index, doc_types, model, fields=None, **kwargs):
         self.using = using
         self.index = index
         self.doc_types = doc_types
         self._model = model
+        if fields:
+            self.fields = fields
         super(RTDFacetedSearch, self).__init__(**kwargs)
 
 
@@ -25,7 +27,6 @@ class ProjectSearch(RTDFacetedSearch):
 
 
 class FileSearch(RTDFacetedSearch):
-    fields = ['title^10', 'headers^5', 'content']
     facets = {
         'project': TermsFacet(field='project'),
         'version': TermsFacet(field='version')
@@ -34,17 +35,6 @@ class FileSearch(RTDFacetedSearch):
     def query(self, search, query):
         """Add query part to ``search``"""
         if query:
-            all_queries = []
-
-            # Need to search for both 'AND' and 'OR' operations
-            # The score of AND should be higher as it comes first
-            for operator in ['AND', 'OR']:
-                query_string = SimpleQueryString(query=query, fields=self.fields,
-                                                 default_operator=operator)
-                all_queries.append(query_string)
-
-            # Run bool query with should, so it returns result where either of the query matches
-            bool_query = Bool(should=all_queries)
-            search = search.query(bool_query)
+            search = search.query(query)
 
         return search
diff --git a/readthedocs/search/filters.py b/readthedocs/search/filters.py
@@ -0,0 +1,15 @@
+from rest_framework import filters
+
+from readthedocs.search.utils import get_project_slug_list_or_404
+
+
+class SearchFilterBackend(filters.BaseFilterBackend):
+    """
+    Restrict search results to the requested project and its subprojects.
+    """
+
+    def filter_queryset(self, request, queryset, view):
+        project_slug = request.query_params.get('project')
+        project_slug_list = get_project_slug_list_or_404(project_slug=project_slug,
+                                                         user=request.user)
+        return queryset.filter('terms', project=project_slug_list)
diff --git a/readthedocs/search/pagination.py b/readthedocs/search/pagination.py
@@ -0,0 +1,7 @@
+from rest_framework.pagination import PageNumberPagination
+
+
+class SearchPagination(PageNumberPagination):
+    page_size = 10
+    page_size_query_param = 'page_size'
+    max_page_size = 100
diff --git a/readthedocs/search/serializers.py b/readthedocs/search/serializers.py
@@ -0,0 +1,8 @@
+from rest_framework import serializers
+
+
+class PageSearchSerializer(serializers.Serializer):
+    title = serializers.CharField()
+    headers = serializers.ListField()
+    content = serializers.CharField()
+    path = serializers.CharField()
diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py
@@ -11,8 +11,10 @@
 import json
 
 from builtins import next, range
+from django.shortcuts import get_object_or_404
 from pyquery import PyQuery
 
+from readthedocs.projects.models import Project
 
 log = logging.getLogger(__name__)
 
@@ -306,3 +308,19 @@ def parse_sections(documentation_type, content):
             return ''
 
     return sections
+
+
+# TODO: Rewrite all the views that use this as class-based views,
+# and move this function to a mixin.
+def get_project_slug_list_or_404(project_slug, user):
+    """Return the project's own slug followed by the slugs of its subprojects.
+    If the project is not available to the user, raise ``Http404``.
+    """
+    queryset = Project.objects.api(user).only('slug')
+    project = get_object_or_404(queryset, slug=project_slug)
+
+    subprojects_slug = (queryset.filter(superprojects__parent_id=project.id)
+                        .values_list('slug', flat=True))
+
+    slug_list = [project.slug] + list(subprojects_slug)
+    return slug_list
diff --git a/readthedocs/search/views.py b/readthedocs/search/views.py
@@ -14,6 +14,7 @@
 from readthedocs.projects.models import Project
 from readthedocs.search import lib as search_lib
 from readthedocs.search.documents import ProjectDocument, PageDocument
+from readthedocs.search.utils import get_project_slug_list_or_404
 
 log = logging.getLogger(__name__)
 LOG_TEMPLATE = u'(Elastic Search) [{user}:{type}] [{project}:{version}:{language}] {msg}'
@@ -54,14 +55,9 @@ def elastic_search(request):
         elif user_input.type == 'file':
             kwargs = {}
             if user_input.project:
-                queryset = Project.objects.api(request.user).only('slug')
-                project = get_object_or_404(queryset, slug=user_input.project)
-
-                subprojects_slug = (queryset.filter(superprojects__parent_id=project.id)
-                                            .values_list('slug', flat=True))
-
-                projects_list = [project.slug] + list(subprojects_slug)
-                kwargs['projects_list'] = projects_list
+                project_slug_list = get_project_slug_list_or_404(project_slug=user_input.project,
+                                                                 user=request.user)
+                kwargs['projects_list'] = project_slug_list
             if user_input.version:
                 kwargs['versions_list'] = user_input.version
 
diff --git a/readthedocs/urls.py b/readthedocs/urls.py
@@ -22,7 +22,7 @@
     do_not_track,
 )
 from readthedocs.search import views as search_views
-
+from readthedocs.search.api import PageSearchAPIView
 
 v1_api = Api(api_name='v1')
 v1_api.register(UserResource())
@@ -67,6 +67,7 @@
     url(r'^api/', include(v1_api.urls)),
     url(r'^api/v2/', include('readthedocs.restapi.urls')),
     url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')),
+    url(r'^api/search/', PageSearchAPIView.as_view()),
 ]
 
 i18n_urls = [

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,7 @@`
`22`	`22`	`do_not_track,`
`23`	`23`	`)`
`24`	`24`	`from readthedocs.search import views as search_views`
`25`		`-`
	`25`	`+from readthedocs.search.api import PageSearchAPIView`
`26`	`26`
`27`	`27`	`v1_api = Api(api_name='v1')`
`28`	`28`	`v1_api.register(UserResource())`
`@@ -67,6 +67,7 @@`
`67`	`67`	`url(r'^api/', include(v1_api.urls)),`
`68`	`68`	`url(r'^api/v2/', include('readthedocs.restapi.urls')),`
`69`	`69`	`url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')),`
	`70`	`+ url(r'^api/search/', PageSearchAPIView.as_view()),`
`70`	`71`	`]`
`71`	`72`
`72`	`73`	`i18n_urls = [`