Skip to content

Commit 9846324

Browse files
authored
Search: Use serializers to parse search results (#7157)
* Use serializers to parse search results * Update templates * Update tests * Update * Typo * update docstring * Linter * Feedback from review * Add todo
1 parent dc20232 commit 9846324

File tree

7 files changed

+262
-94
lines changed

7 files changed

+262
-94
lines changed

readthedocs/search/api.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,16 @@ def paginate_queryset(self, queryset, request, view=None):
121121

122122

123123
class PageSearchSerializer(serializers.Serializer):
124+
125+
"""
126+
Serializer for page search results.
127+
128+
.. note::
129+
130+
This serializer is deprecated in favor of
131+
`readthedocs.search.serializers.PageSearchSerializer`.
132+
"""
133+
124134
project = serializers.CharField()
125135
version = serializers.CharField()
126136
title = serializers.CharField()

readthedocs/search/serializers.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
"""
2+
Serializers for the ES's search result object.
3+
4+
.. note::
5+
Some fields are re-named to make their meaning more clear.
6+
They should be renamed in the ES index too.
7+
"""
8+
9+
import itertools
10+
from operator import attrgetter
11+
12+
from django.shortcuts import get_object_or_404
13+
from rest_framework import serializers
14+
15+
from readthedocs.core.resolver import resolve
16+
from readthedocs.projects.models import Project
17+
18+
19+
class ProjectHighlightSerializer(serializers.Serializer):
20+
21+
name = serializers.ListField(child=serializers.CharField(), default=list)
22+
slug = serializers.ListField(child=serializers.CharField(), default=list)
23+
description = serializers.ListField(child=serializers.CharField(), default=list)
24+
25+
26+
class ProjectSearchSerializer(serializers.Serializer):
27+
28+
type = serializers.CharField(default='project', source=None, read_only=True)
29+
name = serializers.CharField()
30+
slug = serializers.CharField()
31+
link = serializers.CharField(source='url')
32+
highlight = ProjectHighlightSerializer(source='meta.highlight', default=dict)
33+
34+
35+
class PageHighlightSerializer(serializers.Serializer):
36+
37+
title = serializers.ListField(child=serializers.CharField(), default=list)
38+
39+
40+
class PageSearchSerializer(serializers.Serializer):
41+
42+
type = serializers.CharField(default='page', source=None, read_only=True)
43+
project = serializers.CharField()
44+
version = serializers.CharField()
45+
title = serializers.CharField()
46+
link = serializers.SerializerMethodField()
47+
highlight = PageHighlightSerializer(source='meta.highlight', default=dict)
48+
blocks = serializers.SerializerMethodField()
49+
50+
def get_link(self, obj):
51+
# TODO: optimize this to not query the db for each result.
52+
# TODO: return an relative URL when this is called from the indoc search.
53+
project = Project.objects.filter(slug=obj.project).first()
54+
if project:
55+
return resolve(
56+
project=project,
57+
version_slug=obj.version,
58+
filename=obj.full_path,
59+
)
60+
return None
61+
62+
def get_blocks(self, obj):
63+
"""Combine and sort inner results (domains and sections)."""
64+
serializers = {
65+
'domain': DomainSearchSerializer,
66+
'section': SectionSearchSerializer,
67+
}
68+
69+
inner_hits = obj.meta.inner_hits
70+
sections = inner_hits.sections or []
71+
domains = inner_hits.domains or []
72+
73+
# Make them identifiable before merging them
74+
for s in sections:
75+
s.type = 'section'
76+
for d in domains:
77+
d.type = 'domain'
78+
79+
sorted_results = sorted(
80+
itertools.chain(sections, domains),
81+
key=attrgetter('_score'),
82+
reverse=True,
83+
)
84+
sorted_results = [
85+
serializers[hit.type](hit).data
86+
for hit in sorted_results
87+
]
88+
return sorted_results
89+
90+
91+
class DomainHighlightSerializer(serializers.Serializer):
92+
93+
"""
94+
Serializer for domain results.
95+
96+
.. note::
97+
98+
We override the `to_representation` method instead of declaring each field
99+
because serializers don't play nice with keys that include `.`.
100+
"""
101+
102+
def to_representation(self, instance):
103+
return {
104+
'name': getattr(instance, 'domains.name', []),
105+
'docstring': getattr(instance, 'domains.docstrings', []),
106+
}
107+
108+
109+
class DomainSearchSerializer(serializers.Serializer):
110+
111+
type = serializers.CharField(default='domain', source=None, read_only=True)
112+
role = serializers.CharField(source='_source.role_name')
113+
name = serializers.CharField(source='_source.name')
114+
id = serializers.CharField(source='_source.anchor')
115+
docstring = serializers.CharField(source='_source.docstrings')
116+
highlight = DomainHighlightSerializer(default=dict)
117+
118+
119+
class SectionHighlightSerializer(serializers.Serializer):
120+
121+
"""
122+
Serializer for section results.
123+
124+
.. note::
125+
126+
We override the `to_representation` method instead of declaring each field
127+
because serializers don't play nice with keys that include `.`.
128+
"""
129+
130+
def to_representation(self, instance):
131+
return {
132+
'title': getattr(instance, 'sections.title', []),
133+
'content': getattr(instance, 'sections.content', []),
134+
}
135+
136+
137+
class SectionSearchSerializer(serializers.Serializer):
138+
139+
type = serializers.CharField(default='section', source=None, read_only=True)
140+
id = serializers.CharField(source='_source.id')
141+
title = serializers.CharField(source='_source.title')
142+
content = serializers.CharField(source='_source.content')
143+
highlight = SectionHighlightSerializer(default=dict)

readthedocs/search/tests/test_api.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,14 @@
2323
get_search_query_from_project_file,
2424
)
2525

26+
OLD_TYPES = {
27+
'domain': 'domains',
28+
'section': 'sections',
29+
}
30+
OLD_FIELDS = {
31+
'docstring': 'docstrings',
32+
}
33+
2634

2735
@pytest.mark.django_db
2836
@pytest.mark.search
@@ -43,7 +51,7 @@ def test_search_works_with_title_query(self, api_client, project, page_num):
4351
query = get_search_query_from_project_file(
4452
project_slug=project.slug,
4553
page_num=page_num,
46-
data_type='title'
54+
field='title'
4755
)
4856

4957
version = project.versions.all().first()
@@ -77,10 +85,12 @@ def test_search_works_with_sections_and_domains_query(
7785
page_num,
7886
data_type
7987
):
88+
type, field = data_type.split('.')
8089
query = get_search_query_from_project_file(
8190
project_slug=project.slug,
8291
page_num=page_num,
83-
data_type=data_type
92+
type=type,
93+
field=field,
8494
)
8595
version = project.versions.all().first()
8696
search_params = {
@@ -105,10 +115,11 @@ def test_search_works_with_sections_and_domains_query(
105115

106116
inner_hit_0 = inner_hits[0] # first inner_hit
107117

108-
expected_type = data_type.split('.')[0] # can be "sections" or "domains"
109-
assert inner_hit_0['type'] == expected_type
118+
old_type = OLD_TYPES.get(type, type)
119+
assert inner_hit_0['type'] == old_type
110120

111-
highlight = inner_hit_0['highlight'][data_type]
121+
old_field = old_type + '.' + OLD_FIELDS.get(field, field)
122+
highlight = inner_hit_0['highlight'][old_field]
112123
assert (
113124
len(highlight) == 1
114125
), 'number_of_fragments is set to 1'

readthedocs/search/tests/test_views.py

Lines changed: 42 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@ def test_search_by_project_name(self, client, project, all_projects):
3838
)
3939

4040
assert len(results) == 1
41-
assert project.name.encode('utf-8') in results[0].name.encode('utf-8')
41+
assert project.name == results[0]['name']
4242
for proj in all_projects[1:]:
43-
assert proj.name.encode('utf-8') not in results[0].name.encode('utf-8')
43+
assert proj.name != results[0]['name']
4444

4545
def test_search_project_have_correct_language_facets(self, client, project):
4646
"""Test that searching project should have correct language facets in the results"""
@@ -102,23 +102,22 @@ def _get_search_result(self, url, client, search_params):
102102

103103
return results, facets
104104

105-
def _get_highlight(self, result, data_type):
105+
def _get_highlight(self, result, field, type=None):
106106
# if query is from page title,
107107
# highlighted title is present in 'result.meta.highlight.title'
108-
if data_type == 'title':
109-
highlight = result.meta.highlight.title
108+
if not type and field == 'title':
109+
highlight = result['highlight']['title']
110110

111111
# if result is not from page title,
112-
# then results and highlighted results are present inside 'inner_hits'
112+
# then results and highlighted results are present inside 'blocks'
113113
else:
114-
inner_hits = result.meta.inner_hits
115-
assert len(inner_hits) >= 1
114+
blocks = result['blocks']
115+
assert len(blocks) >= 1
116116

117117
# checking first inner_hit
118-
inner_hit_0 = inner_hits[0]
119-
expected_type = data_type.split('.')[0] # can be either 'sections' or 'domains'
120-
assert inner_hit_0['type'] == expected_type
121-
highlight = inner_hit_0['highlight'][data_type]
118+
inner_hit_0 = blocks[0]
119+
assert inner_hit_0['type'] == type
120+
highlight = inner_hit_0['highlight'][field]
122121

123122
return highlight
124123

@@ -132,10 +131,17 @@ def _get_highlighted_words(self, string):
132131
@pytest.mark.parametrize('data_type', DATA_TYPES_VALUES)
133132
@pytest.mark.parametrize('page_num', [0, 1])
134133
def test_file_search(self, client, project, data_type, page_num):
134+
data_type = data_type.split('.')
135+
type, field = None, None
136+
if len(data_type) < 2:
137+
field = data_type[0]
138+
else:
139+
type, field = data_type
135140
query = get_search_query_from_project_file(
136141
project_slug=project.slug,
137142
page_num=page_num,
138-
data_type=data_type
143+
type=type,
144+
field=field,
139145
)
140146
results, _ = self._get_search_result(
141147
url=self.url,
@@ -146,7 +152,7 @@ def test_file_search(self, client, project, data_type, page_num):
146152

147153
# checking first result
148154
result_0 = results[0]
149-
highlight = self._get_highlight(result_0, data_type)
155+
highlight = self._get_highlight(result_0, field, type)
150156
assert len(highlight) == 1
151157

152158
highlighted_words = self._get_highlighted_words(highlight[0])
@@ -204,11 +210,11 @@ def test_file_search_filter_role_name(self, client):
204210
# in `signals` page
205211
assert len(new_results) == 1
206212
first_result = new_results[0] # first result
207-
inner_hits = first_result.meta.inner_hits # inner_hits of first results
208-
assert len(inner_hits) >= 1
209-
inner_hit_0 = inner_hits[0] # first inner_hit
210-
assert inner_hit_0.type == 'domains'
211-
assert inner_hit_0.source.role_name == confval_facet
213+
blocks = first_result['blocks'] # blocks of first results
214+
assert len(blocks) >= 1
215+
inner_hit_0 = blocks[0] # first inner_hit
216+
assert inner_hit_0['type'] == 'domain'
217+
assert inner_hit_0['role'] == confval_facet
212218

213219
for facet in new_role_names_facets:
214220
if facet[0] == confval_facet:
@@ -224,9 +230,16 @@ def test_file_search_case_insensitive(self, client, project, case, data_type):
224230
225231
It tests with uppercase, lowercase and camelcase.
226232
"""
233+
type, field = None, None
234+
data_type = data_type.split('.')
235+
if len(data_type) < 2:
236+
field = data_type[0]
237+
else:
238+
type, field = data_type
227239
query_text = get_search_query_from_project_file(
228240
project_slug=project.slug,
229-
data_type=data_type
241+
type=type,
242+
field=field,
230243
)
231244
cased_query = getattr(query_text, case)
232245
query = cased_query()
@@ -239,7 +252,7 @@ def test_file_search_case_insensitive(self, client, project, case, data_type):
239252
assert len(results) >= 1
240253

241254
first_result = results[0]
242-
highlight = self._get_highlight(first_result, data_type)
255+
highlight = self._get_highlight(first_result, field, type)
243256
assert len(highlight) == 1
244257
highlighted_words = self._get_highlighted_words(highlight[0])
245258
assert len(highlighted_words) >= 1
@@ -267,13 +280,13 @@ def test_file_search_exact_match(self, client, project):
267280
# because the phrase is present in
268281
# only one project
269282
assert len(results) == 1
270-
assert results[0].project == 'kuma'
271-
assert results[0].path == 'testdocumentation'
283+
assert results[0]['project'] == 'kuma'
284+
assert results[0]['link'] == 'http://readthedocs.org/docs/kuma/en/latest/documentation.html'
272285

273-
inner_hits = results[0].meta.inner_hits
274-
assert len(inner_hits) == 1
275-
assert inner_hits[0].type == 'sections'
276-
highlight = self._get_highlight(results[0], 'sections.content')
286+
blocks = results[0]['blocks']
287+
assert len(blocks) == 1
288+
assert blocks[0]['type'] == 'section'
289+
highlight = self._get_highlight(results[0], 'content', 'section')
277290
assert len(highlight) == 1
278291
highlighted_words = self._get_highlighted_words(highlight[0])
279292
assert len(highlighted_words) >= 1
@@ -316,12 +329,12 @@ def test_file_search_filter_by_project(self, client):
316329
search_params=search_params,
317330
)
318331
project_facets = facets['project']
319-
resulted_project_facets = [ facet[0] for facet in project_facets ]
332+
resulted_project_facets = [facet[0] for facet in project_facets]
320333

321334
# There should be 1 search result as we have filtered
322335
assert len(results) == 1
323336
# kuma should should be there only
324-
assert 'kuma' == results[0].project
337+
assert 'kuma' == results[0]['project']
325338

326339
# But there should be 2 projects in the project facets
327340
# as the query is present in both projects

0 commit comments

Comments
 (0)