readthedocs · ericholscher · Feb 6, 2019 · Jan 29, 2019 · Jan 29, 2019 · Jan 29, 2019
diff --git a/docs/development/search.rst b/docs/development/search.rst
@@ -37,13 +37,14 @@ By default, Auto Indexing is turned off in development mode. To turn it on, chan
 After that, whenever a documentation successfully builds, or project gets added,
 the search index will update automatically.
 
-
 Architecture
 ------------
 The search architecture is devided into 2 parts.
-One part is responsible for **indexing** the documents and projects and
-the other part is responsible for querying the Index to show the proper results to users.
-We use the `django-elasticsearch-dsl`_ package mostly to the keep the search working.
+
+* One part is responsible for **indexing** the documents and projects (``documents.py``)
+* The other part is responsible for **querying** the Index to show the proper results to users (``faceted_search.py``)
+
+We use the `django-elasticsearch-dsl`_ package for our Document abstraction.
 `django-elasticsearch-dsl`_ is a wrapper around `elasticsearch-dsl`_ for easy configuration
 with Django.
 
@@ -72,11 +73,11 @@ and index/delete the documentation content from the `HTMLFile` instances.
 
 How we index projects
 ~~~~~~~~~~~~~~~~~~~~~
+
 We also index project information in our search index so that the user can search for projects
-from the main site. `django-elasticsearch-dsl`_ listen `post_create` and `post_delete` signals of
+from the main site. We listen to the `post_create` and `post_delete` signals of
 `Project` model and index/delete into Elasticsearch accordingly.
 
-
 Elasticsearch Document
 ~~~~~~~~~~~~~~~~~~~~~~
 
@@ -88,9 +89,7 @@ As per requirements of `django-elasticsearch-dsl`_, it is stored in the
     `django-elasticsearch-dsl`_ listens to the `post_save` signal of `Project` model and
     then index/delete into Elasticsearch.
 
-    **PageDocument**: It is used for indexing documentation of projects. By default, the auto
-    indexing is turned off by `ignore_signals = settings.ES_PAGE_IGNORE_SIGNALS`.
-    `settings.ES_PAGE_IGNORE_SIGNALS` is `False` both in development and production.
+    **PageDocument**: It is used for indexing documentation of projects. 
     As mentioned above, our `Search` app listens to the `bulk_post_create` and `bulk_post_delete`
     signals and indexes/deleted documentation into Elasticsearch. The signal listeners are in
     the `readthedocs/search/signals.py` file. Both of the signals are dispatched

diff --git a/readthedocs/core/static-src/core/js/doc-embed/search.js b/readthedocs/core/static-src/core/js/doc-embed/search.js
@@ -55,12 +55,18 @@ function attach_elastic_search_query(data) {
 
                         // Show highlighted texts
                         if (highlight.content) {
-                            var content_text = xss(highlight.content[0]);
-                            var contents = $('<div class="context">');
-
-                            contents.html(content_text);
-                            contents.find('em').addClass('highlighted');
-                            list_item.append(contents);
+                            for (var index = 0; index < highlight.content.length; index += 1) {
+                                if (index < 3) {
+                                    // Show up to 3 results for search
+                                    var content = highlight.content[index];
+                                    var content_text = xss(content);
+                                    var contents = $('<div class="context">');
+
+                                    contents.html("..." + content_text + "...");
+                                    contents.find('em').addClass('highlighted');
+                                    list_item.append(contents);
+                                }
+                            }
                         }
 
                         Search.output.append(list_item);
@@ -71,10 +77,11 @@ function attach_elastic_search_query(data) {
                 if (!hit_list.length) {
                     // Fallback to Sphinx's indexes
                     Search.query_fallback(query);
+                    console.log('Read the Docs search failed. Falling back to Sphinx search.');
                 }
                 else {
                     Search.status.text(
-                        _('Search finished, found %s page(s) matching the search query.').replace('%s', total_count)
+                        _('Search finished, found %s page(s) matching the search query.').replace('%s', hit_list.length)
                     );
                 }
             })

diff --git a/readthedocs/core/static/core/js/readthedocs-doc-embed.js b/readthedocs/core/static/core/js/readthedocs-doc-embed.js
diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py
@@ -1145,11 +1145,15 @@ def get_processed_json(self):
             return process_file(file_path)
         except Exception:
             log.warning(
-                'Unhandled exception during search processing file: %s' % file_path
+                'Unhandled exception during search processing file: %s',
+                file_path,
             )
         return {
-            'headers': [], 'content': '', 'path': file_path, 'title': '',
-            'sections': []
+            'headers': [],
+            'content': '',
+            'path': file_path,
+            'title': '',
+            'sections': [],
         }
 
     @cached_property

diff --git a/readthedocs/projects/urls/public.py b/readthedocs/projects/urls/public.py
@@ -8,6 +8,7 @@
 from readthedocs.constants import pattern_opts
 from readthedocs.projects.views import public
 from readthedocs.projects.views.public import ProjectDetailView, ProjectIndex
+from readthedocs.search import views as search_views
 
 
 urlpatterns = [
@@ -50,7 +51,7 @@
     ),
     url(
         r'^(?P<project_slug>{project_slug})/search/$'.format(**pattern_opts),
-        public.elastic_project_search,
+        search_views.elastic_project_search,
         name='elastic_project_search',
     ),
     url(

diff --git a/readthedocs/projects/views/public.py b/readthedocs/projects/views/public.py
@@ -25,9 +25,7 @@
 from readthedocs.builds.models import Version
 from readthedocs.builds.views import BuildTriggerMixin
 from readthedocs.projects.models import Project
-from readthedocs.search.documents import PageDocument
 from readthedocs.projects.templatetags.projects_tags import sort_version_aware
-from readthedocs.search.views import LOG_TEMPLATE
 
 from .base import ProjectOnboardMixin
 
@@ -240,48 +238,6 @@ def project_download_media(request, project_slug, type_, version_slug):
     return response
 
 
-def elastic_project_search(request, project_slug):
-    """Use elastic search to search in a project."""
-    queryset = Project.objects.protected(request.user)
-    project = get_object_or_404(queryset, slug=project_slug)
-    version_slug = request.GET.get('version', LATEST)
-    query = request.GET.get('q', None)
-    results = None
-    if query:
-        user = ''
-        if request.user.is_authenticated:
-            user = request.user
-        log.info(
-            LOG_TEMPLATE.format(
-                user=user,
-                project=project or '',
-                type='inproject',
-                version=version_slug or '',
-                language='',
-                msg=query or '',
-            ),
-        )
-
-    if query:
-        req = PageDocument.simple_search(query=query)
-        filtered_query = (
-            req.filter('term', project=project.slug)
-            .filter('term', version=version_slug)
-        )
-        paginated_query = filtered_query[:50]
-        results = paginated_query.execute()
-
-    return render(
-        request,
-        'search/elastic_project_search.html',
-        {
-            'project': project,
-            'query': query,
-            'results': results,
-        },
-    )
-
-
 def project_versions(request, project_slug):
     """
     Project version list view.

diff --git a/readthedocs/search/api.py b/readthedocs/search/api.py
@@ -1,16 +1,54 @@
+import logging
+from pprint import pformat
+
 from rest_framework import generics
+from rest_framework import serializers
 from rest_framework.exceptions import ValidationError
+from rest_framework.pagination import PageNumberPagination
 
 from readthedocs.search.documents import PageDocument
-from readthedocs.search.filters import SearchFilterBackend
-from readthedocs.search.pagination import SearchPagination
-from readthedocs.search.serializers import PageSearchSerializer
 from readthedocs.search.utils import get_project_list_or_404
 
+log = logging.getLogger(__name__)
+
+
+class SearchPagination(PageNumberPagination):
+    page_size = 25
+    page_size_query_param = 'page_size'
+    max_page_size = 100
+
+
+class PageSearchSerializer(serializers.Serializer):
+    project = serializers.CharField()
+    version = serializers.CharField()
+    title = serializers.CharField()
+    path = serializers.CharField()
+    link = serializers.SerializerMethodField()
+    highlight = serializers.SerializerMethodField()
+
+    def get_link(self, obj):
+        projects_url = self.context.get('projects_url')
+        if projects_url:
+            docs_url = projects_url[obj.project]
+            return docs_url + obj.path
+
+    def get_highlight(self, obj):
+        highlight = getattr(obj.meta, 'highlight', None)
+        if highlight:
+            if hasattr(highlight, 'content'):
+                # Change results to turn newlines in highlight into periods
+                # https://github.com/rtfd/readthedocs.org/issues/5168
+                highlight.content = [result.replace('\n', '. ') for result in highlight.content]
+            ret = highlight.to_dict()
+            log.debug('API Search highlight: %s', pformat(ret))
+            return ret
+
 
 class PageSearchAPIView(generics.ListAPIView):
+
+    """Main entry point to perform a search using Elasticsearch."""
+
     pagination_class = SearchPagination
-    filter_backends = [SearchFilterBackend]
     serializer_class = PageSearchSerializer
 
     def get_queryset(self):
@@ -24,10 +62,25 @@ def get_queryset(self):
         # Validate all the required params are there
         self.validate_query_params()
         query = self.request.query_params.get('q', '')
-        queryset = PageDocument.simple_search(query=query)
+        kwargs = {'filter_user': False}
+        kwargs['projects_list'] = [p.slug for p in self.get_all_projects()]
+        kwargs['versions_list'] = self.request.query_params.get('version')
+        user = self.request.user
+        queryset = PageDocument.faceted_search(
+            query=query, user=user, **kwargs
+        )
         return queryset
 
     def validate_query_params(self):
+        """
+        Validate all required query params are passed on the request.
+
+        Query params required are: ``q``, ``project`` and ``version``.
+
+        :rtype: None
+
+        :raises: ValidationError if one of them is missing.
+        """
         required_query_params = {'q', 'project', 'version'}  # python `set` literal is `{}`
         request_params = set(self.request.query_params.keys())
         missing_params = required_query_params - request_params
@@ -39,17 +92,47 @@ def validate_query_params(self):
             raise ValidationError(errors)
 
     def get_serializer_context(self):
-        context = super(PageSearchAPIView, self).get_serializer_context()
+        context = super().get_serializer_context()
         context['projects_url'] = self.get_all_projects_url()
         return context
 
+    def get_all_projects(self):
+        """
+        Return a list containing the project itself and all its subprojects.
+
+        The project slug is retrieved from ``project`` query param.
+
+        :rtype: list
+
+        :raises: Http404 if project is not found
+        """
+        project_slug = self.request.query_params.get('project')
+        version_slug = self.request.query_params.get('version')
+        all_projects = get_project_list_or_404(
+            project_slug=project_slug, user=self.request.user, version_slug=version_slug,
+        )
+        return all_projects
+
     def get_all_projects_url(self):
+        """
+        Return a dict containing the project slug and its version URL.
+
+        The dictionary contains the project and its subprojects . Each project's
+        slug is used as a key and the documentation URL for that project and
+        version as the value.
+
+        Example:
+
+        {
+            "requests": "https://requests.readthedocs.io/en/latest/",
+            "requests-oauth": "https://requests-oauth.readthedocs.io/en/latest/",
+        }
+
+        :rtype: dict
+        """
+        all_projects = self.get_all_projects()
         version_slug = self.request.query_params.get('version')
-        project_slug = self.request.query_params.get('project')
-        all_projects = get_project_list_or_404(project_slug=project_slug, user=self.request.user)
         projects_url = {}
-
         for project in all_projects:
             projects_url[project.slug] = project.get_docs_url(version_slug=version_slug)
-
         return projects_url
diff --git a/readthedocs/search/apps.py b/readthedocs/search/apps.py
@@ -1,10 +1,8 @@
-"""Project app config"""
-
 from django.apps import AppConfig
 
 
 class SearchConfig(AppConfig):
     name = 'readthedocs.search'
 
     def ready(self):
-        from .signals import index_html_file, remove_html_file
+        import readthedocs.search.signals  # noqa