Skip to content

Search: Use serializers to parse search results #7157

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jun 23, 2020
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions readthedocs/search/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,16 @@ def paginate_queryset(self, queryset, request, view=None):


class PageSearchSerializer(serializers.Serializer):

"""
Serializer for page search results.

.. note::

This serializer is deprecated in favor of
`readthedocs.search.serializers.PageSearchSerializer`.
"""

project = serializers.CharField()
version = serializers.CharField()
title = serializers.CharField()
Expand Down
143 changes: 143 additions & 0 deletions readthedocs/search/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
"""
Serializers for the ES's search result object.

.. note::
Some fields are re-named to make their meaning more clear.
They should be renamed in the ES index too.
"""

import itertools
from operator import attrgetter

from django.shortcuts import get_object_or_404
from rest_framework import serializers

from readthedocs.core.resolver import resolve
from readthedocs.projects.models import Project


class ProjectHighlightSerializer(serializers.Serializer):

name = serializers.ListField(child=serializers.CharField(), default=list)
slug = serializers.ListField(child=serializers.CharField(), default=list)
description = serializers.ListField(child=serializers.CharField(), default=list)


class ProjectSearchSerializer(serializers.Serializer):

type = serializers.CharField(default='project', source=None, read_only=True)
name = serializers.CharField()
slug = serializers.CharField()
link = serializers.CharField(source='url')
highlight = ProjectHighlightSerializer(source='meta.highlight', default=dict)


class PageHighlightSerializer(serializers.Serializer):

title = serializers.ListField(child=serializers.CharField(), default=list)


class PageSearchSerializer(serializers.Serializer):

type = serializers.CharField(default='page', source=None, read_only=True)
project = serializers.CharField()
version = serializers.CharField()
title = serializers.CharField()
path = serializers.CharField(source='full_path')
link = serializers.SerializerMethodField()
highlight = PageHighlightSerializer(source='meta.highlight', default=dict)
inner_hits = serializers.SerializerMethodField()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this should just be called hits? Or something even more explicit? search_results or something?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about sections or blocks? This should be something that includes the results from domains and sections (and maybe images/codeblocks in the future?)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, but they are still results, no? I guess I don't see what value inner has here, and seems confusing?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, they are results (and the object itself is a result, so that was kind of my idea of dropping the results suffix). And, yeah inner hits is more an ES thing than something users should care about.


def get_link(self, obj):
# TODO: optimize this to not query the db for each result.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are already doing one query per result, so this isn't something I'm introducing in this PR. We can fix this when merging the serializer from the api.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's doing more than 1 query if we're calling resolve. We probably want to add this to the index, instead of computing it on render -- is that the plan?

Copy link
Member Author

@stsewd stsewd Jun 15, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably want to add this to the index

My plan was to so something similar to subprojects, fetch the project once and resolve the main domain for that project and then pass it down here.

project = Project.objects.filter(slug=obj.project).first()
if project:
return resolve(
project=project,
version_slug=obj.version,
filename=obj.full_path,
)
return None

def get_inner_hits(self, obj):
"""Combine and sort inner results (domains and sections)."""
serializers = {
'domain': DomainSearchSerializer,
'section': SectionSearchSerializer,
}

inner_hits = obj.meta.inner_hits
sections = inner_hits.sections or []
domains = inner_hits.domains or []

# Make them identifiable before merging them
for s in sections:
s.type = 'section'
for d in domains:
d.type = 'domain'

sorted_results = sorted(
itertools.chain(sections, domains),
key=attrgetter('_score'),
reverse=True,
)
sorted_results = [
serializers[hit.type](hit).data
for hit in sorted_results
]
return sorted_results


class DomainHighlightSerializer(serializers.Serializer):

"""
Serializer for domain results.

.. note::

We override the `to_representation` method instead of declaring each field
becuase serializers don't play nice with keys that include `.`.
"""

def to_representation(self, instance):
return {
'name': getattr(instance, 'domains.name', []),
'docstring': getattr(instance, 'domains.docstrings', []),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is named docstrings, but we only store one docstring per domain. (note here it defaults to a list bc this is the result of the highlight)

}


class DomainSearchSerializer(serializers.Serializer):

type = serializers.CharField(default='domain', source=None, read_only=True)
role_name = serializers.CharField(source='_source.role_name')
name = serializers.CharField(source='_source.name')
id = serializers.CharField(source='_source.anchor')
docstring = serializers.CharField(source='_source.docstrings')
highlight = DomainHighlightSerializer(default=dict)


class SectionHighlightSerializer(serializers.Serializer):

"""
Serializer for section results.

.. note::

We override the `to_representation` method instead of declaring each field
becuase serializers don't play nice with keys that include `.`.
"""

def to_representation(self, instance):
return {
'title': getattr(instance, 'sections.title', []),
'content': getattr(instance, 'sections.content', []),
}


class SectionSearchSerializer(serializers.Serializer):

type = serializers.CharField(default='section', source=None, read_only=True)
id = serializers.CharField(source='_source.id')
title = serializers.CharField(source='_source.title')
content = serializers.CharField(source='_source.content')
highlight = SectionHighlightSerializer(default=dict)
21 changes: 16 additions & 5 deletions readthedocs/search/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@
get_search_query_from_project_file,
)

OLD_TYPES = {
'domain': 'domains',
'section': 'sections',
}
OLD_FIELDS = {
'docstring': 'docstrings',
}

Comment on lines +26 to +33
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be removed once we start using the same serializer in the api


@pytest.mark.django_db
@pytest.mark.search
Expand All @@ -43,7 +51,7 @@ def test_search_works_with_title_query(self, api_client, project, page_num):
query = get_search_query_from_project_file(
project_slug=project.slug,
page_num=page_num,
data_type='title'
field='title'
)

version = project.versions.all().first()
Expand Down Expand Up @@ -77,10 +85,12 @@ def test_search_works_with_sections_and_domains_query(
page_num,
data_type
):
type, field = data_type.split('.')
query = get_search_query_from_project_file(
project_slug=project.slug,
page_num=page_num,
data_type=data_type
type=type,
field=field,
)
version = project.versions.all().first()
search_params = {
Expand All @@ -105,10 +115,11 @@ def test_search_works_with_sections_and_domains_query(

inner_hit_0 = inner_hits[0] # first inner_hit

expected_type = data_type.split('.')[0] # can be "sections" or "domains"
assert inner_hit_0['type'] == expected_type
old_type = OLD_TYPES.get(type, type)
assert inner_hit_0['type'] == old_type

highlight = inner_hit_0['highlight'][data_type]
old_field = old_type + '.' + OLD_FIELDS.get(field, field)
highlight = inner_hit_0['highlight'][old_field]
assert (
len(highlight) == 1
), 'number_of_fragments is set to 1'
Expand Down
59 changes: 36 additions & 23 deletions readthedocs/search/tests/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ def test_search_by_project_name(self, client, project, all_projects):
)

assert len(results) == 1
assert project.name.encode('utf-8') in results[0].name.encode('utf-8')
assert project.name == results[0]['name']
for proj in all_projects[1:]:
assert proj.name.encode('utf-8') not in results[0].name.encode('utf-8')
assert proj.name != results[0]['name']

def test_search_project_have_correct_language_facets(self, client, project):
"""Test that searching project should have correct language facets in the results"""
Expand Down Expand Up @@ -102,23 +102,22 @@ def _get_search_result(self, url, client, search_params):

return results, facets

def _get_highlight(self, result, data_type):
def _get_highlight(self, result, field, type=None):
# if query is from page title,
# highlighted title is present in 'result.meta.highlight.title'
if data_type == 'title':
highlight = result.meta.highlight.title
if not type and field == 'title':
highlight = result['highlight']['title']

# if result is not from page title,
# then results and highlighted results are present inside 'inner_hits'
else:
inner_hits = result.meta.inner_hits
inner_hits = result['inner_hits']
assert len(inner_hits) >= 1

# checking first inner_hit
inner_hit_0 = inner_hits[0]
expected_type = data_type.split('.')[0] # can be either 'sections' or 'domains'
assert inner_hit_0['type'] == expected_type
highlight = inner_hit_0['highlight'][data_type]
assert inner_hit_0['type'] == type
highlight = inner_hit_0['highlight'][field]

return highlight

Expand All @@ -132,10 +131,17 @@ def _get_highlighted_words(self, string):
@pytest.mark.parametrize('data_type', DATA_TYPES_VALUES)
@pytest.mark.parametrize('page_num', [0, 1])
def test_file_search(self, client, project, data_type, page_num):
data_type = data_type.split('.')
type, field = None, None
if len(data_type) < 2:
field = data_type[0]
else:
type, field = data_type
query = get_search_query_from_project_file(
project_slug=project.slug,
page_num=page_num,
data_type=data_type
type=type,
field=field,
)
results, _ = self._get_search_result(
url=self.url,
Expand All @@ -146,7 +152,7 @@ def test_file_search(self, client, project, data_type, page_num):

# checking first result
result_0 = results[0]
highlight = self._get_highlight(result_0, data_type)
highlight = self._get_highlight(result_0, field, type)
assert len(highlight) == 1

highlighted_words = self._get_highlighted_words(highlight[0])
Expand Down Expand Up @@ -204,11 +210,11 @@ def test_file_search_filter_role_name(self, client):
# in `signals` page
assert len(new_results) == 1
first_result = new_results[0] # first result
inner_hits = first_result.meta.inner_hits # inner_hits of first results
inner_hits = first_result['inner_hits'] # inner_hits of first results
assert len(inner_hits) >= 1
inner_hit_0 = inner_hits[0] # first inner_hit
assert inner_hit_0.type == 'domains'
assert inner_hit_0.source.role_name == confval_facet
assert inner_hit_0['type'] == 'domain'
assert inner_hit_0['role_name'] == confval_facet

for facet in new_role_names_facets:
if facet[0] == confval_facet:
Expand All @@ -224,9 +230,16 @@ def test_file_search_case_insensitive(self, client, project, case, data_type):

It tests with uppercase, lowercase and camelcase.
"""
type, field = None, None
data_type = data_type.split('.')
if len(data_type) < 2:
field = data_type[0]
else:
type, field = data_type
query_text = get_search_query_from_project_file(
project_slug=project.slug,
data_type=data_type
type=type,
field=field,
)
cased_query = getattr(query_text, case)
query = cased_query()
Expand All @@ -239,7 +252,7 @@ def test_file_search_case_insensitive(self, client, project, case, data_type):
assert len(results) >= 1

first_result = results[0]
highlight = self._get_highlight(first_result, data_type)
highlight = self._get_highlight(first_result, field, type)
assert len(highlight) == 1
highlighted_words = self._get_highlighted_words(highlight[0])
assert len(highlighted_words) >= 1
Expand Down Expand Up @@ -267,13 +280,13 @@ def test_file_search_exact_match(self, client, project):
# because the phrase is present in
# only one project
assert len(results) == 1
assert results[0].project == 'kuma'
assert results[0].path == 'testdocumentation'
assert results[0]['project'] == 'kuma'
assert results[0]['path'] == 'documentation.html'

inner_hits = results[0].meta.inner_hits
inner_hits = results[0]['inner_hits']
assert len(inner_hits) == 1
assert inner_hits[0].type == 'sections'
highlight = self._get_highlight(results[0], 'sections.content')
assert inner_hits[0]['type'] == 'section'
highlight = self._get_highlight(results[0], 'content', 'section')
assert len(highlight) == 1
highlighted_words = self._get_highlighted_words(highlight[0])
assert len(highlighted_words) >= 1
Expand Down Expand Up @@ -316,12 +329,12 @@ def test_file_search_filter_by_project(self, client):
search_params=search_params,
)
project_facets = facets['project']
resulted_project_facets = [ facet[0] for facet in project_facets ]
resulted_project_facets = [facet[0] for facet in project_facets]

# There should be 1 search result as we have filtered
assert len(results) == 1
# kuma should should be there only
assert 'kuma' == results[0].project
assert 'kuma' == results[0]['project']

# But there should be 2 projects in the project facets
# as the query is present in both projects
Expand Down
Loading