readthedocs · ericholscher · Feb 6, 2019 · Jan 29, 2019 · Jan 29, 2019 · Jan 29, 2019
diff --git a/readthedocs/core/static-src/core/js/doc-embed/search.js b/readthedocs/core/static-src/core/js/doc-embed/search.js
@@ -55,12 +55,18 @@ function attach_elastic_search_query(data) {
 
                         // Show highlighted texts
                         if (highlight.content) {
-                            var content_text = xss(highlight.content[0]);
-                            var contents = $('<div class="context">');
-
-                            contents.html(content_text);
-                            contents.find('em').addClass('highlighted');
-                            list_item.append(contents);
+                            // Show up to 3 highlighted fragments per result. A counted
+                            // loop keeps `index` numeric and local (unlike `for...in`).
+                            var fragment_count = Math.min(highlight.content.length, 3);
+                            for (var index = 0; index < fragment_count; index += 1) {
+                                var content = highlight.content[index];
+                                var content_text = xss(content);
+                                var contents = $('<div class="context">');
+
+                                contents.html("..." + content_text + "...");
+                                contents.find('em').addClass('highlighted');
+                                list_item.append(contents);
+                            }
                         }
 
                         Search.output.append(list_item);

diff --git a/readthedocs/projects/views/public.py b/readthedocs/projects/views/public.py
@@ -8,6 +8,7 @@
 import operator
 import os
 from collections import OrderedDict
+from pprint import pformat
 
 import requests
 from django.conf import settings
@@ -247,10 +248,23 @@ def elastic_project_search(request, project_slug):
     version_slug = request.GET.get('version', LATEST)
     query = request.GET.get('q', None)
     results = None
+
     if query:
+        kwargs = {}
+        kwargs['projects_list'] = [project.slug]
+        kwargs['versions_list'] = version_slug
         user = ''
         if request.user.is_authenticated:
             user = request.user
+
+        page_search = PageDocument.faceted_search(
+            query=query, user=user, **kwargs
+        )
+        results = page_search.execute()
+
+        log.debug('Search results: %s', pformat(results.to_dict()))
+        log.debug('Search facets: %s', pformat(results.facets.to_dict()))
+
         log.info(
             LOG_TEMPLATE.format(
                 user=user,
@@ -262,15 +276,6 @@ def elastic_project_search(request, project_slug):
             ),
         )
 
-    if query:
-        req = PageDocument.simple_search(query=query)
-        filtered_query = (
-            req.filter('term', project=project.slug)
-            .filter('term', version=version_slug)
-        )
-        paginated_query = filtered_query[:50]
-        results = paginated_query.execute()
-
     return render(
         request,
         'search/elastic_project_search.html',

diff --git a/readthedocs/search/api.py b/readthedocs/search/api.py
@@ -2,15 +2,13 @@
 from rest_framework.exceptions import ValidationError
 
 from readthedocs.search.documents import PageDocument
-from readthedocs.search.filters import SearchFilterBackend
 from readthedocs.search.pagination import SearchPagination
 from readthedocs.search.serializers import PageSearchSerializer
 from readthedocs.search.utils import get_project_list_or_404
 
 
 class PageSearchAPIView(generics.ListAPIView):
     pagination_class = SearchPagination
-    filter_backends = [SearchFilterBackend]
     serializer_class = PageSearchSerializer
 
     def get_queryset(self):
@@ -24,7 +22,15 @@ def get_queryset(self):
         # Validate all the required params are there
         self.validate_query_params()
         query = self.request.query_params.get('q', '')
-        queryset = PageDocument.simple_search(query=query)
+        kwargs = {}
+        kwargs['projects_list'] = [p.slug for p in self.get_all_projects()]
+        kwargs['versions_list'] = self.request.query_params.get('version')
+        user = ''
+        if self.request.user.is_authenticated:
+            user = self.request.user
+        queryset = PageDocument.faceted_search(
+            query=query, user=user, **kwargs
+        )
         return queryset
 
     def validate_query_params(self):
@@ -43,13 +49,15 @@ def get_serializer_context(self):
         context['projects_url'] = self.get_all_projects_url()
         return context
 
-    def get_all_projects_url(self):
-        version_slug = self.request.query_params.get('version')
+    def get_all_projects(self):
         project_slug = self.request.query_params.get('project')
         all_projects = get_project_list_or_404(project_slug=project_slug, user=self.request.user)
-        projects_url = {}
+        return all_projects
 
+    def get_all_projects_url(self):
+        all_projects = self.get_all_projects()
+        version_slug = self.request.query_params.get('version')
+        projects_url = {}
         for project in all_projects:
             projects_url[project.slug] = project.get_docs_url(version_slug=version_slug)
-
         return projects_url
diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py
@@ -30,14 +30,19 @@ class Meta(object):
     })
     language = fields.KeywordField()
 
+    # Fields to perform search with weight
+    search_fields = ['name^5', 'description']
+
     @classmethod
-    def faceted_search(cls, query, language=None, using=None, index=None):
+    def faceted_search(cls, query, user, language=None, using=None, index=None):
         kwargs = {
+            'user': user,
             'using': using or cls._doc_type.using,
             'index': index or cls._doc_type.index,
             'doc_types': [cls],
             'model': cls._doc_type.model,
-            'query': query
+            'query': query,
+            'fields': cls.search_fields
         }
 
         if language:
@@ -69,9 +74,12 @@ class Meta(object):
                       'search/index.html', 'genindex/index.html', 'py-modindex/index.html']
 
     @classmethod
-    def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None):
+    def faceted_search(
+        cls, query, user, projects_list=None, versions_list=None, using=None, index=None
+    ):
         es_query = cls.get_es_query(query=query)
         kwargs = {
+            'user': user,
             'using': using or cls._doc_type.using,
             'index': index or cls._doc_type.index,
             'doc_types': [cls],
@@ -90,26 +98,6 @@ def faceted_search(cls, query, projects_list=None, versions_list=None, using=Non
 
         return FileSearch(**kwargs)
 
-    @classmethod
-    def simple_search(cls, query, using=None, index=None):
-        """
-        Do a search without facets.
-
-        This is used in:
-
-        * The Docsearch API
-        * The Project Admin Search page
-        """
-
-        es_search = cls.search(using=using, index=index)
-        es_search = es_search.highlight_options(encoder='html')
-
-        es_query = cls.get_es_query(query=query)
-        highlighted_fields = [f.split('^', 1)[0] for f in cls.search_fields]
-        es_search = es_search.query(es_query).highlight(*highlighted_fields)
-
-        return es_search
-
     @classmethod
     def get_es_query(cls, query):
         """Return the Elasticsearch query generated from the query string"""

diff --git a/readthedocs/search/faceted_search.py b/readthedocs/search/faceted_search.py
@@ -1,4 +1,9 @@
+import logging
+
 from elasticsearch_dsl import FacetedSearch, TermsFacet
+from readthedocs.search.signals import before_file_search, before_project_search
+
+log = logging.getLogger(__name__)
 
 
 class RTDFacetedSearch(FacetedSearch):
@@ -8,7 +13,8 @@ class RTDFacetedSearch(FacetedSearch):
     # TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916
     # See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916
 
-    def __init__(self, using, index, doc_types, model, fields=None, **kwargs):
+    def __init__(self, user, using, index, doc_types, model, fields=None, **kwargs):
+        self.user = user
         self.using = using
         self.index = index
         self.doc_types = doc_types
@@ -17,28 +23,47 @@ def __init__(self, using, index, doc_types, model, fields=None, **kwargs):
             self.fields = fields
         super(RTDFacetedSearch, self).__init__(**kwargs)
 
+    def search(self):
+        """
+        Build the base search without the bulky ``content``/``headers`` fields.
+
+        The full indexed content was being returned but never used client side.
+        The first ``self.signal`` receiver may return a replacement search.
+        """
+        s = super().search()
+        s = s.source(exclude=['content', 'headers'])
+        resp = self.signal.send(sender=self, user=self.user, search=s)
+        if resp:
+            # ``resp`` is ``[(receiver, response), ...]``; a receiver that returns
+            # nothing yields ``None``, which must not replace the search object.
+            if resp[0][1] is not None:
+                return resp[0][1]
+            log.error('Failed to return a search object from search signals')
+        return s
+
+    def query(self, search, query):
+        """
+        Add the query part to ``search`` and HTML-encode highlights (avoids XSS).
+
+        Strings and ``None`` go through the stock ``FacetedSearch.query``;
+        anything else is treated as an Elasticsearch query object.
+        """
+        search = search.highlight_options(encoder='html', number_of_fragments=3)
+        if query is None or isinstance(query, str):
+            return super().query(search, query)
+        return search.query(query)
+
 
 class ProjectSearch(RTDFacetedSearch):
-    fields = ['name^5', 'description']
     facets = {
         'language': TermsFacet(field='language')
     }
+    signal = before_project_search
 
 
 class FileSearch(RTDFacetedSearch):
     facets = {
         'project': TermsFacet(field='project'),
         'version': TermsFacet(field='version')
     }
-
-    def query(self, search, query):
-        """
-        Add query part to ``search``
-
-        Overriding because we pass ES Query object instead of string
-        """
-        search = search.highlight_options(encoder='html')
-        if query:
-            search = search.query(query)
-
-        return search
+    signal = before_file_search
diff --git a/readthedocs/search/filters.py b/readthedocs/search/filters.py
diff --git a/readthedocs/search/serializers.py b/readthedocs/search/serializers.py
@@ -1,5 +1,10 @@
+import logging
+from pprint import pformat
+
 from rest_framework import serializers
 
+log = logging.getLogger(__name__)
+
 
 class PageSearchSerializer(serializers.Serializer):
     project = serializers.CharField()
@@ -18,4 +23,6 @@ def get_link(self, obj):
     def get_highlight(self, obj):
         highlight = getattr(obj.meta, 'highlight', None)
         if highlight:
-            return highlight.to_dict()
+            ret = highlight.to_dict()
+            log.debug('API Search highlight: %s', pformat(ret))
+            return ret
diff --git a/readthedocs/search/signals.py b/readthedocs/search/signals.py
@@ -9,17 +9,16 @@
 
 from readthedocs.projects.models import HTMLFile, Project
 from readthedocs.projects.signals import bulk_post_create, bulk_post_delete
-from readthedocs.search.documents import PageDocument, ProjectDocument
 from readthedocs.search.tasks import index_objects_to_es
 
 
 before_project_search = django.dispatch.Signal(providing_args=['body'])
 before_file_search = django.dispatch.Signal(providing_args=['body'])
-before_section_search = django.dispatch.Signal(providing_args=['body'])
 
 
 @receiver(bulk_post_create, sender=HTMLFile)
 def index_html_file(instance_list, **_):
+    from readthedocs.search.documents import PageDocument
     kwargs = {
         'app_label': HTMLFile._meta.app_label,
         'model_name': HTMLFile.__name__,
@@ -42,6 +41,7 @@ def remove_html_file(instance_list, **_):
 
 @receiver(post_save, sender=Project)
 def index_project(instance, *args, **kwargs):
+    from readthedocs.search.documents import ProjectDocument
     kwargs = {
         'app_label': Project._meta.app_label,
         'model_name': Project.__name__,

diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py
@@ -12,8 +12,8 @@
 @pytest.mark.search
 class TestDocumentSearch(object):
 
-    def __init__(self):
-        # This reverse needs to be inside the ``__init__`` method because from
+    def setUp(self):
+        # This reverse needs to be inside the ``setUp`` method because from
         # the Corporate site we don't define this URL if ``-ext`` module is not
         # installed
         self.url = reverse('doc_search')

diff --git a/readthedocs/search/tests/test_faceted_search.py b/readthedocs/search/tests/test_faceted_search.py
@@ -21,7 +21,7 @@ def test_search_exact_match(self, client, project, case):
         cased_query = getattr(query_text, case)
         query = cased_query()
 
-        page_search = PageDocument.faceted_search(query=query)
+        page_search = PageDocument.faceted_search(query=query, user='')
         results = page_search.execute()
 
         assert len(results) == 1
@@ -37,7 +37,7 @@ def test_search_combined_result(self, client, project):
         - Where `Foo` or `Bar` is present
         """
         query = 'Official Support'
-        page_search = PageDocument.faceted_search(query=query)
+        page_search = PageDocument.faceted_search(query=query, user='')
         results = page_search.execute()
         assert len(results) == 3
 

diff --git a/readthedocs/search/tests/test_xss.py b/readthedocs/search/tests/test_xss.py
@@ -9,16 +9,7 @@ class TestXSS:
 
     def test_facted_page_xss(self, client, project):
         query = 'XSS'
-        page_search = PageDocument.faceted_search(query=query)
-        results = page_search.execute()
-        expected = """
-        &lt;h3&gt;<em>XSS</em> exploit&lt;&#x2F;h3&gt;
-        """.strip()
-        assert results[0].meta.highlight.content[0][:len(expected)] == expected
-
-    def test_simple_page_xss(self, client, project):
-        query = 'XSS'
-        page_search = PageDocument.simple_search(query=query)
+        page_search = PageDocument.faceted_search(query=query, user='')
         results = page_search.execute()
         expected = """
         &lt;h3&gt;<em>XSS</em> exploit&lt;&#x2F;h3&gt;