diff --git a/readthedocs/search/api.py b/readthedocs/search/api.py
index 0570e659e5a..cd5c87fbce3 100644
--- a/readthedocs/search/api.py
+++ b/readthedocs/search/api.py
@@ -121,6 +121,16 @@ def paginate_queryset(self, queryset, request, view=None):
 
 
 class PageSearchSerializer(serializers.Serializer):
+
+    """
+    Serializer for page search results.
+
+    .. note::
+
+       This serializer is deprecated in favor of
+       `readthedocs.search.serializers.PageSearchSerializer`.
+    """
+
     project = serializers.CharField()
     version = serializers.CharField()
     title = serializers.CharField()
diff --git a/readthedocs/search/serializers.py b/readthedocs/search/serializers.py
new file mode 100644
index 00000000000..d01eb901d8e
--- /dev/null
+++ b/readthedocs/search/serializers.py
@@ -0,0 +1,143 @@
+"""
+Serializers for Elasticsearch search result objects.
+
+.. note::
+   Some fields are renamed to make their meaning clearer.
+   They should be renamed in the ES index too.
+"""
+
+import itertools
+from operator import attrgetter
+
+from django.shortcuts import get_object_or_404
+from rest_framework import serializers
+
+from readthedocs.core.resolver import resolve
+from readthedocs.projects.models import Project
+
+
+class ProjectHighlightSerializer(serializers.Serializer):
+
+    name = serializers.ListField(child=serializers.CharField(), default=list)
+    slug = serializers.ListField(child=serializers.CharField(), default=list)
+    description = serializers.ListField(child=serializers.CharField(), default=list)
+
+
+class ProjectSearchSerializer(serializers.Serializer):
+
+    type = serializers.CharField(default='project', source=None, read_only=True)
+    name = serializers.CharField()
+    slug = serializers.CharField()
+    link = serializers.CharField(source='url')
+    highlight = ProjectHighlightSerializer(source='meta.highlight', default=dict)
+
+
+class PageHighlightSerializer(serializers.Serializer):
+
+    title = serializers.ListField(child=serializers.CharField(), default=list)
+
+
+class PageSearchSerializer(serializers.Serializer):
+
+    type = serializers.CharField(default='page', source=None, read_only=True)
+    project = serializers.CharField()
+    version = serializers.CharField()
+    title = serializers.CharField()
+    link = serializers.SerializerMethodField()
+    highlight = PageHighlightSerializer(source='meta.highlight', default=dict)
+    blocks = serializers.SerializerMethodField()
+
+    def get_link(self, obj):
+        # TODO: optimize this to not query the db for each result.
+        # TODO: return a relative URL when this is called from the in-doc search.
+        project = Project.objects.filter(slug=obj.project).first()
+        if project:
+            return resolve(
+                project=project,
+                version_slug=obj.version,
+                filename=obj.full_path,
+            )
+        return None
+
+    def get_blocks(self, obj):
+        """Combine and sort inner results (domains and sections)."""
+        serializers = {
+            'domain': DomainSearchSerializer,
+            'section': SectionSearchSerializer,
+        }
+
+        inner_hits = obj.meta.inner_hits
+        sections = inner_hits.sections or []
+        domains = inner_hits.domains or []
+
+        # Make them identifiable before merging them
+        for s in sections:
+            s.type = 'section'
+        for d in domains:
+            d.type = 'domain'
+
+        sorted_results = sorted(
+            itertools.chain(sections, domains),
+            key=attrgetter('_score'),
+            reverse=True,
+        )
+        sorted_results = [
+            serializers[hit.type](hit).data
+            for hit in sorted_results
+        ]
+        return sorted_results
+
+
+class DomainHighlightSerializer(serializers.Serializer):
+
+    """
+    Serializer for domain highlight results.
+
+    .. note::
+
+       We override the `to_representation` method instead of declaring each field
+       because serializers don't play nice with keys that include `.`.
+    """
+
+    def to_representation(self, instance):
+        return {
+            'name': getattr(instance, 'domains.name', []),
+            'docstring': getattr(instance, 'domains.docstrings', []),
+        }
+
+
+class DomainSearchSerializer(serializers.Serializer):
+
+    type = serializers.CharField(default='domain', source=None, read_only=True)
+    role = serializers.CharField(source='_source.role_name')
+    name = serializers.CharField(source='_source.name')
+    id = serializers.CharField(source='_source.anchor')
+    docstring = serializers.CharField(source='_source.docstrings')
+    highlight = DomainHighlightSerializer(default=dict)
+
+
+class SectionHighlightSerializer(serializers.Serializer):
+
+    """
+    Serializer for section highlight results.
+
+    .. note::
+
+       We override the `to_representation` method instead of declaring each field
+       because serializers don't play nice with keys that include `.`.
+    """
+
+    def to_representation(self, instance):
+        return {
+            'title': getattr(instance, 'sections.title', []),
+            'content': getattr(instance, 'sections.content', []),
+        }
+
+
+class SectionSearchSerializer(serializers.Serializer):
+
+    type = serializers.CharField(default='section', source=None, read_only=True)
+    id = serializers.CharField(source='_source.id')
+    title = serializers.CharField(source='_source.title')
+    content = serializers.CharField(source='_source.content')
+    highlight = SectionHighlightSerializer(default=dict)
diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py
index 90ae4e7f248..1331a5e0aa0 100644
--- a/readthedocs/search/tests/test_api.py
+++ b/readthedocs/search/tests/test_api.py
@@ -23,6 +23,14 @@
     get_search_query_from_project_file,
 )
 
+OLD_TYPES = {
+    'domain': 'domains',
+    'section': 'sections',
+}
+OLD_FIELDS = {
+    'docstring': 'docstrings',
+}
+
 
 @pytest.mark.django_db
 @pytest.mark.search
@@ -43,7 +51,7 @@ def test_search_works_with_title_query(self, api_client, project, page_num):
         query = get_search_query_from_project_file(
             project_slug=project.slug,
             page_num=page_num,
-            data_type='title'
+            field='title'
         )
 
         version = project.versions.all().first()
@@ -77,10 +85,12 @@ def test_search_works_with_sections_and_domains_query(
         page_num,
         data_type
     ):
+        type, field = data_type.split('.')
        query = get_search_query_from_project_file(
             project_slug=project.slug,
             page_num=page_num,
-            data_type=data_type
+            type=type,
+            field=field,
         )
         version = project.versions.all().first()
         search_params = {
@@ -105,10 +115,11 @@ def test_search_works_with_sections_and_domains_query(
 
         inner_hit_0 = inner_hits[0]  # first inner_hit
 
-        expected_type = data_type.split('.')[0]  # can be "sections" or "domains"
-        assert inner_hit_0['type'] == expected_type
+        old_type = OLD_TYPES.get(type, type)
+        assert inner_hit_0['type'] == old_type
 
-        highlight = inner_hit_0['highlight'][data_type]
+        old_field = old_type + '.' + OLD_FIELDS.get(field, field)
+        highlight = inner_hit_0['highlight'][old_field]
         assert (
             len(highlight) == 1
         ), 'number_of_fragments is set to 1'
diff --git a/readthedocs/search/tests/test_views.py b/readthedocs/search/tests/test_views.py
index 648f029e960..609e261af30 100644
--- a/readthedocs/search/tests/test_views.py
+++ b/readthedocs/search/tests/test_views.py
@@ -38,9 +38,9 @@ def test_search_by_project_name(self, client, project, all_projects):
         )
         assert len(results) == 1
 
-        assert project.name.encode('utf-8') in results[0].name.encode('utf-8')
+        assert project.name == results[0]['name']
         for proj in all_projects[1:]:
-            assert proj.name.encode('utf-8') not in results[0].name.encode('utf-8')
+            assert proj.name != results[0]['name']
 
     def test_search_project_have_correct_language_facets(self, client, project):
         """Test that searching project should have correct language facets in the results"""
@@ -102,23 +102,22 @@ def _get_search_result(self, url, client, search_params):
 
         return results, facets
 
-    def _get_highlight(self, result, data_type):
+    def _get_highlight(self, result, field, type=None):
         # if query is from page title,
         # highlighted title is present in 'result.meta.highlight.title'
-        if data_type == 'title':
-            highlight = result.meta.highlight.title
+        if not type and field == 'title':
+            highlight = result['highlight']['title']
 
         # if result is not from page title,
-        # then results and highlighted results are present inside 'inner_hits'
+        # then results and highlighted results are present inside 'blocks'
         else:
-            inner_hits = result.meta.inner_hits
-            assert len(inner_hits) >= 1
+            blocks = result['blocks']
+            assert len(blocks) >= 1
 
             # checking first inner_hit
-            inner_hit_0 = inner_hits[0]
-            expected_type = data_type.split('.')[0]  # can be either 'sections' or 'domains'
-            assert inner_hit_0['type'] == expected_type
-            highlight = inner_hit_0['highlight'][data_type]
+            inner_hit_0 = blocks[0]
+            assert inner_hit_0['type'] == type
+            highlight = inner_hit_0['highlight'][field]
 
         return highlight
 
@@ -132,10 +131,17 @@ def _get_highlighted_words(self, string):
 
     @pytest.mark.parametrize('data_type', DATA_TYPES_VALUES)
     @pytest.mark.parametrize('page_num', [0, 1])
     def test_file_search(self, client, project, data_type, page_num):
+        data_type = data_type.split('.')
+        type, field = None, None
+        if len(data_type) < 2:
+            field = data_type[0]
+        else:
+            type, field = data_type
         query = get_search_query_from_project_file(
             project_slug=project.slug,
             page_num=page_num,
-            data_type=data_type
+            type=type,
+            field=field,
         )
         results, _ = self._get_search_result(
             url=self.url,
@@ -146,7 +152,7 @@ def test_file_search(self, client, project, data_type, page_num):
 
         # checking first result
         result_0 = results[0]
-        highlight = self._get_highlight(result_0, data_type)
+        highlight = self._get_highlight(result_0, field, type)
         assert len(highlight) == 1
 
         highlighted_words = self._get_highlighted_words(highlight[0])
@@ -204,11 +210,11 @@ def test_file_search_filter_role_name(self, client):
         # in `signals` page
         assert len(new_results) == 1
         first_result = new_results[0]  # first result
-        inner_hits = first_result.meta.inner_hits  # inner_hits of first results
-        assert len(inner_hits) >= 1
-        inner_hit_0 = inner_hits[0]  # first inner_hit
-        assert inner_hit_0.type == 'domains'
-        assert inner_hit_0.source.role_name == confval_facet
+        blocks = first_result['blocks']  # blocks of first results
+        assert len(blocks) >= 1
+        inner_hit_0 = blocks[0]  # first inner_hit
+        assert inner_hit_0['type'] == 'domain'
+        assert inner_hit_0['role'] == confval_facet
 
         for facet in new_role_names_facets:
             if facet[0] == confval_facet:
@@ -224,9 +230,16 @@ def test_file_search_case_insensitive(self, client, project, case, data_type):
 
         It tests with uppercase, lowercase and camelcase.
         """
+        type, field = None, None
+        data_type = data_type.split('.')
+        if len(data_type) < 2:
+            field = data_type[0]
+        else:
+            type, field = data_type
         query_text = get_search_query_from_project_file(
             project_slug=project.slug,
-            data_type=data_type
+            type=type,
+            field=field,
         )
         cased_query = getattr(query_text, case)
         query = cased_query()
@@ -239,7 +252,7 @@ def test_file_search_case_insensitive(self, client, project, case, data_type):
 
         assert len(results) >= 1
         first_result = results[0]
-        highlight = self._get_highlight(first_result, data_type)
+        highlight = self._get_highlight(first_result, field, type)
         assert len(highlight) == 1
         highlighted_words = self._get_highlighted_words(highlight[0])
         assert len(highlighted_words) >= 1
@@ -267,13 +280,13 @@ def test_file_search_exact_match(self, client, project):
         # because the phrase is present in
         # only one project
         assert len(results) == 1
-        assert results[0].project == 'kuma'
-        assert results[0].path == 'testdocumentation'
+        assert results[0]['project'] == 'kuma'
+        assert results[0]['link'] == 'http://readthedocs.org/docs/kuma/en/latest/documentation.html'
 
-        inner_hits = results[0].meta.inner_hits
-        assert len(inner_hits) == 1
-        assert inner_hits[0].type == 'sections'
-        highlight = self._get_highlight(results[0], 'sections.content')
+        blocks = results[0]['blocks']
+        assert len(blocks) == 1
+        assert blocks[0]['type'] == 'section'
+        highlight = self._get_highlight(results[0], 'content', 'section')
         assert len(highlight) == 1
         highlighted_words = self._get_highlighted_words(highlight[0])
         assert len(highlighted_words) >= 1
@@ -316,12 +329,12 @@ def test_file_search_filter_by_project(self, client):
             search_params=search_params,
         )
         project_facets = facets['project']
-        resulted_project_facets = [ facet[0] for facet in project_facets ]
+        resulted_project_facets = [facet[0] for facet in project_facets]
 
         # There should be 1 search result as we have filtered
         assert len(results) == 1
         # kuma should should be there only
-        assert 'kuma' == results[0].project
+        assert 'kuma' == results[0]['project']
 
         # But there should be 2 projects in the project facets
         # as the query is present in both projects
diff --git a/readthedocs/search/tests/utils.py b/readthedocs/search/tests/utils.py
index 8a82a42ccc5..c55380b7fe8 100644
--- a/readthedocs/search/tests/utils.py
+++ b/readthedocs/search/tests/utils.py
@@ -2,13 +2,12 @@
 
 from readthedocs.projects.models import HTMLFile
 
-
-SECTION_FIELDS = [ 'sections.title', 'sections.content' ]
-DOMAIN_FIELDS = [ 'domains.name', 'domains.docstrings' ]
+SECTION_FIELDS = [ 'section.title', 'section.content' ]
+DOMAIN_FIELDS = [ 'domain.name', 'domain.docstring' ]
 DATA_TYPES_VALUES = ['title'] + SECTION_FIELDS + DOMAIN_FIELDS
 
 
-def get_search_query_from_project_file(project_slug, page_num=0, data_type='title'):
+def get_search_query_from_project_file(project_slug, page_num=0, field='title', type=None):
     """
     Return search query from the project's page file.
 
@@ -18,14 +17,19 @@ def get_search_query_from_project_file(project_slug, page_num=0, data_type='titl
 
     html_file = HTMLFile.objects.filter(project__slug=project_slug).order_by('id')[page_num]
     file_data = html_file.processed_json
-    query_data = file_data[data_type.split('.')[0]]
+    internal_type = {
+        'section': 'sections',
+        'domain': 'domains',
+        'title': 'title',
+    }
+    query_data = file_data[internal_type[type or field]]
 
-    if data_type == 'title':
+    if not type and field == 'title':
 
         # uses first word of page title as query
         query = query_data.split()[0]
 
-    elif data_type == 'sections.title':
+    elif type == 'section' and field == 'title':
 
         # generates query from section title
         query_data = query_data[0]['title'].split()
@@ -34,7 +38,7 @@ def get_search_query_from_project_file(project_slug, page_num=0, data_type='titl
         query = query_data[start:end]
         query = ' '.join(query)
 
-    elif data_type == 'sections.content':
+    elif type == 'section' and field == 'content':
 
         # generates query from section content
         query_data = query_data[0]['content'].split()
@@ -48,7 +52,7 @@ def get_search_query_from_project_file(project_slug, page_num=0, data_type='titl
         query = query_data[start:end]
         query = ' '.join(query)
 
-    elif data_type == 'domains.name':
+    elif type == 'domain' and field == 'name':
         # test data contains domains.name
         # some of which contains '.' and some '/'
         # and others are plain words.
@@ -73,7 +77,7 @@ def get_search_query_from_project_file(project_slug, page_num=0, data_type='titl
         else:
             query = query_data[0]['name'].split()[0]
 
-    elif data_type == 'domains.docstrings':
+    elif type == 'domain' and field == 'docstring':
 
         # generates query from domain docstrings
         anchor = query_data[0]['anchor']
diff --git a/readthedocs/search/views.py b/readthedocs/search/views.py
index 89e2e8ddaa5..4cb3033bb46 100644
--- a/readthedocs/search/views.py
+++ b/readthedocs/search/views.py
@@ -15,6 +15,8 @@
     ProjectSearch,
 )
 
+from .serializers import PageSearchSerializer, ProjectSearchSerializer
+
 log = logging.getLogger(__name__)
 LOG_TEMPLATE = '(Elastic Search) [%(user)s:%(type)s] [%(project)s:%(version)s:%(language)s] %(msg)s'
 
@@ -64,7 +66,7 @@ def elastic_search(request, project_slug=None):
         }
     )
 
-    results = None
+    results = []
     facets = {}
 
     if user_input.query:
@@ -76,7 +78,9 @@ def elastic_search(request, project_slug=None):
                filters[avail_facet] = value
 
         search = search_facets[user_input.type](
-            query=user_input.query, filters=filters, user=request.user,
+            query=user_input.query,
+            filters=filters,
+            user=request.user,
         )
         results = search[:50].execute()
         facets = results.facets
@@ -99,29 +103,12 @@ def elastic_search(request, project_slug=None):
             if value and value not in (val[0] for val in facets[facet]):
                 facets[facet].insert(0, (value, 0, True))
 
-    if results:
-
-        # sorting inner_hits (if present)
-        if user_input.type == 'file':
-
-            try:
-                for result in results:
-                    inner_hits = result.meta.inner_hits
-                    sections = inner_hits.sections or []
-                    domains = inner_hits.domains or []
-                    all_results = itertools.chain(sections, domains)
-
-                    sorted_results = utils._get_sorted_results(
-                        results=all_results,
-                        source_key='source',
-                    )
-
-                    result.meta.inner_hits = sorted_results
-            except Exception:
-                log.exception('Error while sorting the results (inner_hits).')
-
-        log.debug('Search results: %s', results.to_dict())
-        log.debug('Search facets: %s', results.facets.to_dict())
+    serializers = {
+        'project': ProjectSearchSerializer,
+        'file': PageSearchSerializer,
+    }
+    serializer = serializers.get(user_input.type, ProjectSearchSerializer)
+    results = serializer(results, many=True).data
 
     template_vars = user_input._asdict()
     template_vars.update({
diff --git a/readthedocs/templates/search/elastic_search.html b/readthedocs/templates/search/elastic_search.html
index b01118e4cfb..64eeee991fa 100644
--- a/readthedocs/templates/search/elastic_search.html
+++ b/readthedocs/templates/search/elastic_search.html
@@ -166,11 +166,11 @@
 </p>
- {% if 'project' in result.meta.index %} - + {% if result.type == 'project' %} + {{ result.name }} ({{ result.slug }}) - {% for fragment in result.meta.highlight.description %} + {% for fragment in result.highlight.description %}
...{{ fragment|safe }}...
@@ -182,50 +182,50 @@- - {% if inner_hit.highlight|get_key_or_none:"domains.name" %} - {% with domain_name=inner_hit.highlight|get_key_or_none:"domains.name" %} - [{{ inner_hit.source.role_name }}]: {{ domain_name.0|safe }} + + {% if block.highlight.name %} + {% with domain_name=block.highlight.name %} + [{{ block.role }}]: {{ domain_name.0|safe }} {% endwith %} {% else %} - [{{ inner_hit.source.role_name }}]: {{ inner_hit.source.name }} + [{{ block.role }}]: {{ block.name }} {% endif %}
- {% if inner_hit.highlight|get_key_or_none:"domains.docstrings" %} - {% with domain_docstrings=inner_hit.highlight|get_key_or_none:"domains.docstrings" %} + {% if block.highlight.docstring %} + {% with domain_docstrings=block.highlight.docstring %} {{ domain_docstrings.0|safe }} {% endwith %} {% else %} - {% if inner_hit.source.docstrings %} - {{ inner_hit.source.docstrings|slice:MAX_SUBSTRING_LIMIT }} ... + {% if block.docstring %} + {{ block.docstring|slice:MAX_SUBSTRING_LIMIT }} ... {% endif %} {% endif %}
- {% elif inner_hit.type == 'sections' %} + {% elif block.type == 'section' %}- - {% if inner_hit.highlight|get_key_or_none:"sections.title" %} - {% with section_title=inner_hit.highlight|get_key_or_none:"sections.title" %} + + {% if block.highlight.title %} + {% with section_title=block.highlight.title %} {{ section_title.0|safe }} {% endwith %} {% else %} - {{ inner_hit.source.title }} + {{ block.title }} {% endif %}
- {% if inner_hit.highlight|get_key_or_none:"sections.content" %} - {% with section_content=inner_hit.highlight|get_key_or_none:"sections.content" %} + {% if block.highlight.content %} + {% with section_content=block.highlight.content %} {% for content in section_content %}... {{ content|safe }} ... @@ -234,7 +234,7 @@
- {{ inner_hit.source.content|slice:MAX_SUBSTRING_LIMIT }} ... + {{ block.content|slice:MAX_SUBSTRING_LIMIT }} ...
{% endif %} {% endif %}
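
For orientation only (not part of the patch): below is a minimal sketch of the result shape the template consumes after this change, assembled from the fields declared in `PageSearchSerializer`, `DomainSearchSerializer`, and `SectionSearchSerializer` above. The concrete values are made up for illustration; only the `project` and `link` values mirror what `test_file_search_exact_match` asserts.

# Hypothetical example of one serialized 'file' result (PageSearchSerializer output).
# Field names come from the serializers added in readthedocs/search/serializers.py;
# the values below are illustrative only.
example_page_result = {
    'type': 'page',
    'project': 'kuma',
    'version': 'latest',
    'title': 'Documentation',
    'link': 'http://readthedocs.org/docs/kuma/en/latest/documentation.html',
    'highlight': {'title': ['<em>Documentation</em>']},
    'blocks': [
        # get_blocks() merges section and domain inner hits and sorts them by _score.
        {
            'type': 'section',
            'id': 'getting-started',
            'title': 'Getting started',
            'content': 'How to get started with this project ...',
            'highlight': {'title': [], 'content': ['How to get <em>started</em> ...']},
        },
        {
            'type': 'domain',
            'role': 'py:function',
            'name': 'resolve',
            'id': 'resolve-anchor',
            'docstring': 'Resolve a URL for a given project and version ...',
            'highlight': {'name': [], 'docstring': ['<em>Resolve</em> a URL ...']},
        },
    ],
}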