readthedocs · stsewd · Jun 23, 2020 · Jun 4, 2020 · Jun 4, 2020 · Jun 4, 2020
diff --git a/readthedocs/search/api.py b/readthedocs/search/api.py
@@ -110,6 +110,16 @@ def paginate_queryset(self, queryset, request, view=None):
 
 
 class PageSearchSerializer(serializers.Serializer):
+
+    """
+    Serializer for page search results.
+
+    .. note::
+
+       This serializer is deprecated in favor of
+       `readthedocs.search.serializers.PageSearchSerializer`.
+    """
+
     project = serializers.CharField()
     version = serializers.CharField()
     title = serializers.CharField()

diff --git a/readthedocs/search/serializers.py b/readthedocs/search/serializers.py
@@ -0,0 +1,143 @@
+"""
+Serializers for the ES's search result object.
+
+.. note::
+   Some fields are re-named to make their meaning more clear.
+   They should be renamed in the ES index too.
+"""
+
+import itertools
+from operator import attrgetter
+
+from django.shortcuts import get_object_or_404
+from rest_framework import serializers
+
+from readthedocs.core.resolver import resolve
+from readthedocs.projects.models import Project
+
+
+class ProjectHighlightSerializer(serializers.Serializer):
+
+    name = serializers.ListField(child=serializers.CharField(), default=list)
+    slug = serializers.ListField(child=serializers.CharField(), default=list)
+    description = serializers.ListField(child=serializers.CharField(), default=list)
+
+
+class ProjectSearchSerializer(serializers.Serializer):
+
+    type = serializers.CharField(default='project', source=None, read_only=True)
+    name = serializers.CharField()
+    slug = serializers.CharField()
+    link = serializers.CharField(source='url')
+    highlight = ProjectHighlightSerializer(source='meta.highlight', default=dict)
+
+
+class PageHighlightSerializer(serializers.Serializer):
+
+    title = serializers.ListField(child=serializers.CharField(), default=list)
+
+
+class PageSearchSerializer(serializers.Serializer):
+
+    type = serializers.CharField(default='page', source=None, read_only=True)
+    project = serializers.CharField()
+    version = serializers.CharField()
+    title = serializers.CharField()
+    path = serializers.CharField(source='full_path')
+    link = serializers.SerializerMethodField()
+    highlight = PageHighlightSerializer(source='meta.highlight', default=dict)
+    inner_hits = serializers.SerializerMethodField()
+
+    def get_link(self, obj):
+        # TODO: optimize this to not query the db for each result.
+        project = Project.objects.filter(slug=obj.project).first()
+        if project:
+            return resolve(
+                project=project,
+                version_slug=obj.version,
+                filename=obj.full_path,
+            )
+        return None
+
+    def get_inner_hits(self, obj):
+        """Combine and sort inner results (domains and sections)."""
+        serializers = {
+            'domain': DomainSearchSerializer,
+            'section': SectionSearchSerializer,
+        }
+
+        inner_hits = obj.meta.inner_hits
+        sections = inner_hits.sections or []
+        domains = inner_hits.domains or []
+
+        # Make them identifiable before merging them
+        for s in sections:
+            s.type = 'section'
+        for d in domains:
+            d.type = 'domain'
+
+        sorted_results = sorted(
+            itertools.chain(sections, domains),
+            key=attrgetter('_score'),
+            reverse=True,
+        )
+        sorted_results = [
+            serializers[hit.type](hit).data
+            for hit in sorted_results
+        ]
+        return sorted_results
+
+
+class DomainHighlightSerializer(serializers.Serializer):
+
+    """
+    Serializer for domain results.
+
+    .. note::
+
+       We override the `to_representation` method instead of declaring each field
+       becuase serializers don't play nice with keys that include `.`.
+    """
+
+    def to_representation(self, instance):
+        return {
+            'name': getattr(instance, 'domains.name', []),
+            'docstring': getattr(instance, 'domains.docstrings', []),
+        }
+
+
+class DomainSearchSerializer(serializers.Serializer):
+
+    type = serializers.CharField(default='domain', source=None, read_only=True)
+    role_name = serializers.CharField(source='_source.role_name')
+    name = serializers.CharField(source='_source.name')
+    id = serializers.CharField(source='_source.anchor')
+    docstring = serializers.CharField(source='_source.docstrings')
+    highlight = DomainHighlightSerializer(default=dict)
+
+
+class SectionHighlightSerializer(serializers.Serializer):
+
+    """
+    Serializer for section results.
+
+    .. note::
+
+       We override the `to_representation` method instead of declaring each field
+       becuase serializers don't play nice with keys that include `.`.
+    """
+
+    def to_representation(self, instance):
+        return {
+            'title': getattr(instance, 'sections.title', []),
+            'content': getattr(instance, 'sections.content', []),
+        }
+
+
+class SectionSearchSerializer(serializers.Serializer):
+
+    type = serializers.CharField(default='section', source=None, read_only=True)
+    id = serializers.CharField(source='_source.id')
+    title = serializers.CharField(source='_source.title')
+    content = serializers.CharField(source='_source.content')
+    highlight = SectionHighlightSerializer(default=dict)
diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py
@@ -23,6 +23,14 @@
     get_search_query_from_project_file,
 )
 
+OLD_TYPES = {
+    'domain': 'domains',
+    'section': 'sections',
+}
+OLD_FIELDS = {
+    'docstring': 'docstrings',
+}
+
 
 @pytest.mark.django_db
 @pytest.mark.search
@@ -43,7 +51,7 @@ def test_search_works_with_title_query(self, api_client, project, page_num):
         query = get_search_query_from_project_file(
             project_slug=project.slug,
             page_num=page_num,
-            data_type='title'
+            field='title'
         )
 
         version = project.versions.all().first()
@@ -77,10 +85,12 @@ def test_search_works_with_sections_and_domains_query(
         page_num,
         data_type
     ):
+        type, field = data_type.split('.')
         query = get_search_query_from_project_file(
             project_slug=project.slug,
             page_num=page_num,
-            data_type=data_type
+            type=type,
+            field=field,
         )
         version = project.versions.all().first()
         search_params = {
@@ -105,10 +115,11 @@ def test_search_works_with_sections_and_domains_query(
 
         inner_hit_0 = inner_hits[0]  # first inner_hit
 
-        expected_type = data_type.split('.')[0]  # can be "sections" or "domains"
-        assert inner_hit_0['type'] == expected_type
+        old_type = OLD_TYPES.get(type, type)
+        assert inner_hit_0['type'] == old_type
 
-        highlight = inner_hit_0['highlight'][data_type]
+        old_field = old_type + '.' + OLD_FIELDS.get(field, field)
+        highlight = inner_hit_0['highlight'][old_field]
         assert (
             len(highlight) == 1
         ), 'number_of_fragments is set to 1'

diff --git a/readthedocs/search/tests/test_views.py b/readthedocs/search/tests/test_views.py
@@ -38,9 +38,9 @@ def test_search_by_project_name(self, client, project, all_projects):
         )
 
         assert len(results) == 1
-        assert project.name.encode('utf-8') in results[0].name.encode('utf-8')
+        assert project.name == results[0]['name']
         for proj in all_projects[1:]:
-            assert proj.name.encode('utf-8') not in results[0].name.encode('utf-8')
+            assert proj.name != results[0]['name']
 
     def test_search_project_have_correct_language_facets(self, client, project):
         """Test that searching project should have correct language facets in the results"""
@@ -102,23 +102,22 @@ def _get_search_result(self, url, client, search_params):
 
         return results, facets
 
-    def _get_highlight(self, result, data_type):
+    def _get_highlight(self, result, field, type=None):
         # if query is from page title,
         # highlighted title is present in 'result.meta.highlight.title'
-        if data_type == 'title':
-            highlight = result.meta.highlight.title
+        if not type and field == 'title':
+            highlight = result['highlight']['title']
 
         # if result is not from page title,
         # then results and highlighted results are present inside 'inner_hits'
         else:
-            inner_hits = result.meta.inner_hits
+            inner_hits = result['inner_hits']
             assert len(inner_hits) >= 1
 
             # checking first inner_hit
             inner_hit_0 = inner_hits[0]
-            expected_type = data_type.split('.')[0]  # can be either 'sections' or 'domains'
-            assert inner_hit_0['type'] == expected_type
-            highlight = inner_hit_0['highlight'][data_type]
+            assert inner_hit_0['type'] == type
+            highlight = inner_hit_0['highlight'][field]
 
         return highlight
 
@@ -132,10 +131,17 @@ def _get_highlighted_words(self, string):
     @pytest.mark.parametrize('data_type', DATA_TYPES_VALUES)
     @pytest.mark.parametrize('page_num', [0, 1])
     def test_file_search(self, client, project, data_type, page_num):
+        data_type = data_type.split('.')
+        type, field = None, None
+        if len(data_type) < 2:
+            field = data_type[0]
+        else:
+            type, field = data_type
         query = get_search_query_from_project_file(
             project_slug=project.slug,
             page_num=page_num,
-            data_type=data_type
+            type=type,
+            field=field,
         )
         results, _ = self._get_search_result(
             url=self.url,
@@ -146,7 +152,7 @@ def test_file_search(self, client, project, data_type, page_num):
 
         # checking first result
         result_0 = results[0]
-        highlight = self._get_highlight(result_0, data_type)
+        highlight = self._get_highlight(result_0, field, type)
         assert len(highlight) == 1
 
         highlighted_words = self._get_highlighted_words(highlight[0])
@@ -204,11 +210,11 @@ def test_file_search_filter_role_name(self, client):
         # in `signals` page
         assert len(new_results) == 1
         first_result = new_results[0]  # first result
-        inner_hits = first_result.meta.inner_hits  # inner_hits of first results
+        inner_hits = first_result['inner_hits']  # inner_hits of first results
         assert len(inner_hits) >= 1
         inner_hit_0 = inner_hits[0]  # first inner_hit
-        assert inner_hit_0.type == 'domains'
-        assert inner_hit_0.source.role_name == confval_facet
+        assert inner_hit_0['type'] == 'domain'
+        assert inner_hit_0['role_name'] == confval_facet
 
         for facet in new_role_names_facets:
             if facet[0] == confval_facet:
@@ -224,9 +230,16 @@ def test_file_search_case_insensitive(self, client, project, case, data_type):
 
         It tests with uppercase, lowercase and camelcase.
         """
+        type, field = None, None
+        data_type = data_type.split('.')
+        if len(data_type) < 2:
+            field = data_type[0]
+        else:
+            type, field = data_type
         query_text = get_search_query_from_project_file(
             project_slug=project.slug,
-            data_type=data_type
+            type=type,
+            field=field,
         )
         cased_query = getattr(query_text, case)
         query = cased_query()
@@ -239,7 +252,7 @@ def test_file_search_case_insensitive(self, client, project, case, data_type):
         assert len(results) >= 1
 
         first_result = results[0]
-        highlight = self._get_highlight(first_result, data_type)
+        highlight = self._get_highlight(first_result, field, type)
         assert len(highlight) == 1
         highlighted_words = self._get_highlighted_words(highlight[0])
         assert len(highlighted_words) >= 1
@@ -267,13 +280,13 @@ def test_file_search_exact_match(self, client, project):
         # because the phrase is present in
         # only one project
         assert len(results) == 1
-        assert results[0].project == 'kuma'
-        assert results[0].path == 'testdocumentation'
+        assert results[0]['project'] == 'kuma'
+        assert results[0]['path'] == 'documentation.html'
 
-        inner_hits = results[0].meta.inner_hits
+        inner_hits = results[0]['inner_hits']
         assert len(inner_hits) == 1
-        assert inner_hits[0].type == 'sections'
-        highlight = self._get_highlight(results[0], 'sections.content')
+        assert inner_hits[0]['type'] == 'section'
+        highlight = self._get_highlight(results[0], 'content', 'section')
         assert len(highlight) == 1
         highlighted_words = self._get_highlighted_words(highlight[0])
         assert len(highlighted_words) >= 1
@@ -316,12 +329,12 @@ def test_file_search_filter_by_project(self, client):
             search_params=search_params,
         )
         project_facets = facets['project']
-        resulted_project_facets = [ facet[0] for facet in project_facets ]
+        resulted_project_facets = [facet[0] for facet in project_facets]
 
         # There should be 1 search result as we have filtered
         assert len(results) == 1
         # kuma should should be there only
-        assert 'kuma' == results[0].project
+        assert 'kuma' == results[0]['project']
 
         # But there should be 2 projects in the project facets
         # as the query is present in both projects