
Commit 1b47227

ericholscher, safwanrahman authored and committed
Merge pull request readthedocs#4309 from safwanrahman/search_api
[fix readthedocs#4265] Port Document search API for Elasticsearch 6.x
2 parents e9bfeee + d30bac3 commit 1b47227

12 files changed: +310 -36 lines changed


conftest.py

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 import pytest
+from rest_framework.test import APIClient

 try:
     # TODO: this file is read/executed even when called from ``readthedocsinc``,
@@ -44,3 +45,8 @@ def pytest_configure(config):
 @pytest.fixture(autouse=True)
 def settings_modification(settings):
     settings.CELERY_ALWAYS_EAGER = True
+
+
+@pytest.fixture
+def api_client():
+    return APIClient()
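
For context, a minimal sketch of how a test can consume the new api_client fixture. The URL name 'doc_search' is the one used by the tests added later in this commit, and the assertion mirrors test_doc_search_without_parameters:

import pytest
from django.core.urlresolvers import reverse


@pytest.mark.django_db
def test_doc_search_requires_params(api_client):
    # ``api_client`` is the DRF ``APIClient`` instance provided by the fixture above.
    resp = api_client.get(reverse('doc_search'))
    # Requests missing the ``query``/``project``/``version`` params are rejected.
    assert resp.status_code == 400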

readthedocs/restapi/urls.py

Lines changed: 3 additions & 11 deletions
@@ -48,7 +48,7 @@
     url(r'index_search/',
         search_views.index_search,
         name='index_search'),
-    url(r'search/$', views.search_views.search, name='api_search'),
+    url(r'^search/$', views.search_views.search, name='api_search'),
     url(r'search/project/$',
         search_views.project_search,
         name='api_project_search'),
@@ -85,20 +85,12 @@
         name='api_webhook'),
 ]

+
 urlpatterns += function_urls
-urlpatterns += search_urls
 urlpatterns += task_urls
+urlpatterns += search_urls
 urlpatterns += integration_urls

-try:
-    from readthedocsext.search.docsearch import DocSearch
-    api_search_urls = [
-        url(r'^docsearch/$', DocSearch.as_view(), name='doc_search'),
-    ]
-    urlpatterns += api_search_urls
-except ImportError:
-    pass
-
 try:
     from readthedocsext.donate.restapi.urls import urlpatterns as sustainability_urls
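
The one-character change in the search URL adds a ^ anchor. Django resolves URL patterns with re.search, so an unanchored pattern can also match paths that merely end in search/; a rough illustration using plain re (the example paths are hypothetical):

import re

# Without the anchor, the pattern also matches other paths that end in 'search/'.
assert re.search(r'search/$', 'index_search/') is not None
# With the anchor, only the exact 'search/' path matches.
assert re.search(r'^search/$', 'index_search/') is None
assert re.search(r'^search/$', 'search/') is not None

The old readthedocsext docsearch block is dropped here; the doc_search URL name used by the new tests appears to be registered elsewhere in this PR.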

readthedocs/search/api.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+from rest_framework import generics
+from rest_framework import exceptions
+from rest_framework.exceptions import ValidationError
+
+from readthedocs.projects.models import Project
+from readthedocs.search.documents import PageDocument
+from readthedocs.search.filters import SearchFilterBackend
+from readthedocs.search.pagination import SearchPagination
+from readthedocs.search.serializers import PageSearchSerializer
+from readthedocs.search.utils import get_project_list_or_404
+
+
+class PageSearchAPIView(generics.ListAPIView):
+    pagination_class = SearchPagination
+    filter_backends = [SearchFilterBackend]
+    serializer_class = PageSearchSerializer
+
+    def get_queryset(self):
+        """
+        Return an Elasticsearch DSL ``Search`` object instead of a Django queryset.
+
+        A Django queryset and an elasticsearch-dsl ``Search`` object follow a similar
+        pattern, so for searching it is possible to return a ``Search`` object instead
+        of a queryset. The ``filter_backends`` and ``pagination_class`` are compatible
+        with ``Search``.
+        """
+        # Validate that all the required params are there
+        self.validate_query_params()
+        query = self.request.query_params.get('query', '')
+        queryset = PageDocument.simple_search(query=query)
+        return queryset
+
+    def validate_query_params(self):
+        required_query_params = {'query', 'project', 'version'}  # a python `set` literal uses `{}`
+        request_params = set(self.request.query_params.keys())
+        missing_params = required_query_params - request_params
+        if missing_params:
+            errors = {}
+            for param in missing_params:
+                errors[param] = ["This query param is required"]
+
+            raise ValidationError(errors)
+
+    def get_serializer_context(self):
+        context = super(PageSearchAPIView, self).get_serializer_context()
+        context['projects_url'] = self.get_all_projects_url()
+        return context
+
+    def get_all_projects_url(self):
+        version_slug = self.request.query_params.get('version')
+        project_slug = self.request.query_params.get('project')
+        all_projects = get_project_list_or_404(project_slug=project_slug, user=self.request.user)
+        projects_url = {}
+
+        for project in all_projects:
+            projects_url[project.slug] = project.get_docs_url(version_slug=version_slug)
+
+        return projects_url
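
A minimal sketch of exercising the new view with DRF's test client. The URL name 'doc_search' and the project/version slugs are taken from, or modeled on, the tests added below; the concrete URL path is registered elsewhere in this PR, so it is assumed here:

from django.core.urlresolvers import reverse
from rest_framework.test import APIClient

client = APIClient()
url = reverse('doc_search')

# Missing any of ``query``, ``project`` or ``version`` is rejected by validate_query_params().
resp = client.get(url, {'query': 'celery'})
assert resp.status_code == 400  # e.g. {'project': [...], 'version': [...]}

# A complete request returns paginated results (count / next / previous / results).
resp = client.get(url, {'query': 'celery', 'project': 'kuma', 'version': 'latest'})
assert resp.status_code == 200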

readthedocs/search/documents.py

Lines changed: 33 additions & 1 deletion
@@ -1,5 +1,6 @@
 from django.conf import settings
 from django_elasticsearch_dsl import DocType, Index, fields
+from elasticsearch_dsl.query import SimpleQueryString, Bool

 from readthedocs.projects.models import Project, HTMLFile
 from .conf import SEARCH_EXCLUDED_FILE
@@ -60,14 +61,19 @@ class Meta(object):
     content = fields.TextField(attr='processed_json.content')
     path = fields.TextField(attr='processed_json.path')

+    # Fields to perform search with weight
+    search_fields = ['title^10', 'headers^5', 'content']
+
     @classmethod
     def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None):
+        es_query = cls.get_es_query(query=query)
         kwargs = {
             'using': using or cls._doc_type.using,
             'index': index or cls._doc_type.index,
             'doc_types': [cls],
             'model': cls._doc_type.model,
-            'query': query
+            'query': es_query,
+            'fields': cls.search_fields
         }
         filters = {}

@@ -80,6 +86,32 @@ def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None):

         return FileSearch(**kwargs)

+    @classmethod
+    def simple_search(cls, query, using=None, index=None):
+        es_search = cls.search(using=using, index=index)
+        es_query = cls.get_es_query(query=query)
+        highlighted_fields = [f.split('^', 1)[0] for f in cls.search_fields]
+
+        es_search = es_search.query(es_query).highlight(*highlighted_fields)
+        return es_search
+
+    @classmethod
+    def get_es_query(cls, query):
+        """Return the Elasticsearch query generated from the query string"""
+        all_queries = []
+
+        # Need to search for both 'AND' and 'OR' operations
+        # The score of AND should be higher as it satisfies both OR and AND
+        for operator in ['AND', 'OR']:
+            query_string = SimpleQueryString(query=query, fields=cls.search_fields,
+                                             default_operator=operator)
+            all_queries.append(query_string)
+
+        # Run bool query with should, so it returns result where either of the query matches
+        bool_query = Bool(should=all_queries)
+
+        return bool_query
+
     def get_queryset(self):
         """Overwrite default queryset to filter certain files to index"""
         queryset = super(PageDocument, self).get_queryset()
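
For reference, a rough sketch of the query body that get_es_query() produces: two simple_query_string clauses over the weighted search_fields, combined with a bool/should. The exact serialized dict depends on the elasticsearch-dsl version, so treat this as an approximation:

from readthedocs.search.documents import PageDocument

print(PageDocument.get_es_query(query='read the docs').to_dict())
# Approximately:
# {'bool': {'should': [
#     {'simple_query_string': {'query': 'read the docs',
#                              'fields': ['title^10', 'headers^5', 'content'],
#                              'default_operator': 'AND'}},
#     {'simple_query_string': {'query': 'read the docs',
#                              'fields': ['title^10', 'headers^5', 'content'],
#                              'default_operator': 'OR'}},
# ]}}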

readthedocs/search/faceted_search.py

Lines changed: 9 additions & 15 deletions
@@ -9,11 +9,13 @@ class RTDFacetedSearch(FacetedSearch):
     # TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916
     # See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916

-    def __init__(self, using, index, doc_types, model, **kwargs):
+    def __init__(self, using, index, doc_types, model, fields=None, **kwargs):
         self.using = using
         self.index = index
         self.doc_types = doc_types
         self._model = model
+        if fields:
+            self.fields = fields
         super(RTDFacetedSearch, self).__init__(**kwargs)


@@ -25,26 +27,18 @@ class ProjectSearch(RTDFacetedSearch):


 class FileSearch(RTDFacetedSearch):
-    fields = ['title^10', 'headers^5', 'content']
     facets = {
         'project': TermsFacet(field='project'),
         'version': TermsFacet(field='version')
     }

     def query(self, search, query):
-        """Add query part to ``search``"""
+        """
+        Add query part to ``search``
+
+        Overriding because we pass ES Query object instead of string
+        """
         if query:
-            all_queries = []
-
-            # Need to search for both 'AND' and 'OR' operations
-            # The score of AND should be higher as it comes first
-            for operator in ['AND', 'OR']:
-                query_string = SimpleQueryString(query=query, fields=self.fields,
-                                                 default_operator=operator)
-                all_queries.append(query_string)
-
-            # Run bool query with should, so it returns result where either of the query matches
-            bool_query = Bool(should=all_queries)
-            search = search.query(bool_query)
+            search = search.query(query)

         return search
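
With the AND/OR bool construction moved into PageDocument.get_es_query(), FileSearch no longer builds the query itself; it applies whatever query object faceted_search() hands it. A short sketch of the resulting call path (the project slug is hypothetical):

from readthedocs.search.documents import PageDocument

# ``faceted_search`` passes a prebuilt Bool query plus the weighted fields into
# FileSearch, whose ``query()`` now just calls ``search.query(query)``.
file_search = PageDocument.faceted_search(query='install', projects_list=['kuma'])
response = file_search.execute()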

readthedocs/search/filters.py

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+from rest_framework import filters
+
+
+class SearchFilterBackend(filters.BaseFilterBackend):
+
+    """Filter search results by project"""
+
+    def filter_queryset(self, request, queryset, view):
+        """Overwrite the method to be compatible with an Elasticsearch DSL ``Search`` object."""
+        # ``queryset`` is actually an Elasticsearch DSL ``Search`` object,
+        # so rename the variable.
+        es_search = queryset
+        version_slug = request.query_params.get('version')
+        projects_info = view.get_all_projects_url()
+        project_slug_list = list(projects_info.keys())
+        # The Elasticsearch ``terms`` query can take multiple values as a list,
+        # while the ``term`` query takes a single value.
+        filtered_es_search = (es_search.filter('terms', project=project_slug_list)
+                                       .filter('term', version=version_slug))
+        return filtered_es_search
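
The two chained filter() calls end up in the bool/filter context of the search body. A standalone sketch with elasticsearch-dsl (the slugs are hypothetical, and the exact dict shape may vary slightly between versions):

from elasticsearch_dsl import Search

s = (Search().filter('terms', project=['kuma', 'pipeline'])
             .filter('term', version='latest'))
print(s.to_dict())
# Approximately:
# {'query': {'bool': {'filter': [
#     {'terms': {'project': ['kuma', 'pipeline']}},
#     {'term': {'version': 'latest'}},
# ]}}}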

readthedocs/search/pagination.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+from rest_framework.pagination import PageNumberPagination
+
+
+class SearchPagination(PageNumberPagination):
+    page_size = 10
+    page_size_query_param = 'page_size'
+    max_page_size = 100
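
A small sketch of the page_size override, mirroring test_doc_search_pagination below; the query term and slugs are illustrative:

from django.core.urlresolvers import reverse
from rest_framework.test import APIClient

client = APIClient()
params = {'query': 'celery', 'project': 'kuma', 'version': 'latest', 'page_size': 5}
resp = client.get(reverse('doc_search'), params)
# The default page size is 10; ``page_size`` overrides it, capped at max_page_size (100).
assert len(resp.data['results']) <= 5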

readthedocs/search/serializers.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+from rest_framework import serializers
+
+from readthedocs.projects.models import Project
+
+
+class PageSearchSerializer(serializers.Serializer):
+    project = serializers.CharField()
+    version = serializers.CharField()
+    title = serializers.CharField()
+    path = serializers.CharField()
+    link = serializers.SerializerMethodField()
+    highlight = serializers.SerializerMethodField()
+
+    def get_link(self, obj):
+        projects_url = self.context.get('projects_url')
+        if projects_url:
+            docs_url = projects_url[obj.project]
+            return docs_url + obj.path
+
+    def get_highlight(self, obj):
+        highlight = getattr(obj.meta, 'highlight', None)
+        if highlight:
+            return highlight.to_dict()
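
For illustration, one result item roughly as PageSearchSerializer shapes it. All field values are invented; link is built from the projects_url context added by the view, and highlight comes from Elasticsearch's highlighter when a fragment matches:

# Hypothetical serialized result (values are illustrative only):
result = {
    'project': 'kuma',
    'version': 'latest',
    'title': 'Getting started',
    'path': 'getting-started.html',
    'link': 'https://kuma.readthedocs.io/en/latest/getting-started.html',
    'highlight': {'content': ['Getting started with <em>GitHub</em> integration ...']},
}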

readthedocs/search/tests/test_api.py

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
+import pytest
+from django.core.urlresolvers import reverse
+from django_dynamic_fixture import G
+
+from readthedocs.builds.models import Version
+from readthedocs.projects.models import HTMLFile
+from readthedocs.search.tests.utils import get_search_query_from_project_file
+
+
+@pytest.mark.django_db
+@pytest.mark.search
+class TestDocumentSearch(object):
+    url = reverse('doc_search')
+
+    @pytest.mark.parametrize('data_type', ['content', 'headers', 'title'])
+    @pytest.mark.parametrize('page_num', [0, 1])
+    def test_search_works(self, api_client, project, data_type, page_num):
+        query = get_search_query_from_project_file(project_slug=project.slug, page_num=page_num,
+                                                   data_type=data_type)
+
+        version = project.versions.all()[0]
+        search_params = {'project': project.slug, 'version': version.slug, 'query': query}
+        resp = api_client.get(self.url, search_params)
+        assert resp.status_code == 200
+
+        data = resp.data['results']
+        assert len(data) == 1
+        project_data = data[0]
+        assert project_data['project'] == project.slug
+
+        # Check that highlight returns the correct object
+        all_highlights = project_data['highlight'][data_type]
+        for highlight in all_highlights:
+            # Lowercase both sides because our search is case insensitive
+            assert query.lower() in highlight.lower()
+
+    def test_doc_search_filter_by_project(self, api_client):
+        """Test that doc search results are filtered by project"""
+
+        # The word `Github` is present in both `kuma` and `pipeline` files,
+        # so search with this phrase but filter by the `kuma` project
+        search_params = {'query': 'GitHub', 'project': 'kuma', 'version': 'latest'}
+        resp = api_client.get(self.url, search_params)
+        assert resp.status_code == 200
+
+        data = resp.data['results']
+        assert len(data) == 1
+        assert data[0]['project'] == 'kuma'
+
+    def test_doc_search_filter_by_version(self, api_client, project):
+        """Test that doc search results are filtered by version"""
+        query = get_search_query_from_project_file(project_slug=project.slug)
+        latest_version = project.versions.all()[0]
+        # Create another version
+        dummy_version = G(Version, project=project)
+        # Create HTMLFiles identical to those of the latest version
+        latest_version_files = HTMLFile.objects.all().filter(version=latest_version)
+        for f in latest_version_files:
+            f.version = dummy_version
+            # Set the primary key to None, so Django will create a new object
+            f.pk = None
+            f.save()
+
+        search_params = {'query': query, 'project': project.slug, 'version': dummy_version.slug}
+        resp = api_client.get(self.url, search_params)
+        assert resp.status_code == 200
+
+        data = resp.data['results']
+        assert len(data) == 1
+        assert data[0]['project'] == project.slug
+
+    def test_doc_search_pagination(self, api_client, project):
+        """Test that doc search results can be paginated"""
+        latest_version = project.versions.all()[0]
+        html_file = HTMLFile.objects.filter(version=latest_version)[0]
+        title = html_file.processed_json['title']
+        query = title.split()[0]
+
+        # Create 15 more copies of the same HTML file
+        for _ in range(15):
+            # Set the primary key to None, so Django will create a new object
+            html_file.pk = None
+            html_file.save()
+
+        search_params = {'query': query, 'project': project.slug, 'version': latest_version.slug}
+        resp = api_client.get(self.url, search_params)
+        assert resp.status_code == 200
+
+        # Check the count is 16 (1 existing and 15 newly created)
+        assert resp.data['count'] == 16
+        # Check there is a next URL
+        assert resp.data['next'] is not None
+        # There should be only 10 results, as the default page size is 10
+        assert len(resp.data['results']) == 10
+
+        # Add the `page_size` parameter and check the data is paginated accordingly
+        search_params['page_size'] = 5
+        resp = api_client.get(self.url, search_params)
+        assert resp.status_code == 200
+
+        assert len(resp.data['results']) == 5
+
+    def test_doc_search_without_parameters(self, api_client, project):
+        """Hitting the document search endpoint without query parameters should return an error"""
+        resp = api_client.get(self.url)
+        assert resp.status_code == 400
+        # Check the error message is there
+        assert sorted(['query', 'project', 'version']) == sorted(resp.data.keys())
+
+    def test_doc_search_subprojects(self, api_client, all_projects):
+        """Test that document search also returns results from subprojects"""
+        project = all_projects[0]
+        subproject = all_projects[1]
+        version = project.versions.all()[0]
+        # Add another project as a subproject of the project
+        project.add_subproject(subproject)
+
+        # Now search with subproject content but explicitly filter by the parent project
+        query = get_search_query_from_project_file(project_slug=subproject.slug)
+        search_params = {'query': query, 'project': project.slug, 'version': version.slug}
+        resp = api_client.get(self.url, search_params)
+        assert resp.status_code == 200
+
+        data = resp.data['results']
+        assert len(data) == 1
+        assert data[0]['project'] == subproject.slug
+        # Check the link is the subproject document link
+        document_link = subproject.get_docs_url(version_slug=version.slug)
+        assert document_link in data[0]['link']
