diff --git a/conftest.py b/conftest.py index 73474ea6726..e19351615de 100644 --- a/conftest.py +++ b/conftest.py @@ -1,6 +1,7 @@ import logging import pytest +from rest_framework.test import APIClient def pytest_addoption(parser): @@ -17,3 +18,8 @@ def pytest_configure(config): @pytest.fixture(autouse=True) def settings_modification(settings): settings.CELERY_ALWAYS_EAGER = True + + +@pytest.fixture +def api_client(): + return APIClient() diff --git a/readthedocs/restapi/urls.py b/readthedocs/restapi/urls.py index 51fc72e17c6..7e019ba597c 100644 --- a/readthedocs/restapi/urls.py +++ b/readthedocs/restapi/urls.py @@ -48,7 +48,7 @@ url(r'index_search/', search_views.index_search, name='index_search'), - url(r'search/$', views.search_views.search, name='api_search'), + url(r'^search/$', views.search_views.search, name='api_search'), url(r'search/project/$', search_views.project_search, name='api_project_search'), @@ -85,20 +85,12 @@ name='api_webhook'), ] + urlpatterns += function_urls -urlpatterns += search_urls urlpatterns += task_urls +urlpatterns += search_urls urlpatterns += integration_urls -try: - from readthedocsext.search.docsearch import DocSearch - api_search_urls = [ - url(r'^docsearch/$', DocSearch.as_view(), name='doc_search'), - ] - urlpatterns += api_search_urls -except ImportError: - pass - try: from readthedocsext.donate.restapi.urls import urlpatterns as sustainability_urls diff --git a/readthedocs/search/api.py b/readthedocs/search/api.py new file mode 100644 index 00000000000..2602bdabcf4 --- /dev/null +++ b/readthedocs/search/api.py @@ -0,0 +1,57 @@ +from rest_framework import generics +from rest_framework import exceptions +from rest_framework.exceptions import ValidationError + +from readthedocs.projects.models import Project +from readthedocs.search.documents import PageDocument +from readthedocs.search.filters import SearchFilterBackend +from readthedocs.search.pagination import SearchPagination +from readthedocs.search.serializers import PageSearchSerializer +from readthedocs.search.utils import get_project_list_or_404 + + +class PageSearchAPIView(generics.ListAPIView): + pagination_class = SearchPagination + filter_backends = [SearchFilterBackend] + serializer_class = PageSearchSerializer + + def get_queryset(self): + """ + Return Elasticsearch DSL Search object instead of Django Queryset. + + Django Queryset and elasticsearch-dsl ``Search`` object is similar pattern. + So for searching, its possible to return ``Search`` object instead of queryset. + The ``filter_backends`` and ``pagination_class`` is compatible with ``Search`` + """ + # Validate all the required params are there + self.validate_query_params() + query = self.request.query_params.get('query', '') + queryset = PageDocument.simple_search(query=query) + return queryset + + def validate_query_params(self): + required_query_params = {'query', 'project', 'version'} # python `set` literal is `{}` + request_params = set(self.request.query_params.keys()) + missing_params = required_query_params - request_params + if missing_params: + errors = {} + for param in missing_params: + errors[param] = ["This query param is required"] + + raise ValidationError(errors) + + def get_serializer_context(self): + context = super(PageSearchAPIView, self).get_serializer_context() + context['projects_url'] = self.get_all_projects_url() + return context + + def get_all_projects_url(self): + version_slug = self.request.query_params.get('version') + project_slug = self.request.query_params.get('project') + all_projects = get_project_list_or_404(project_slug=project_slug, user=self.request.user) + projects_url = {} + + for project in all_projects: + projects_url[project.slug] = project.get_docs_url(version_slug=version_slug) + + return projects_url diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py index 2b8259e8ce2..3cc70343c35 100644 --- a/readthedocs/search/documents.py +++ b/readthedocs/search/documents.py @@ -1,5 +1,6 @@ from django.conf import settings from django_elasticsearch_dsl import DocType, Index, fields +from elasticsearch_dsl.query import SimpleQueryString, Bool from readthedocs.projects.models import Project, HTMLFile from .conf import SEARCH_EXCLUDED_FILE @@ -60,14 +61,19 @@ class Meta(object): content = fields.TextField(attr='processed_json.content') path = fields.TextField(attr='processed_json.path') + # Fields to perform search with weight + search_fields = ['title^10', 'headers^5', 'content'] + @classmethod def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None): + es_query = cls.get_es_query(query=query) kwargs = { 'using': using or cls._doc_type.using, 'index': index or cls._doc_type.index, 'doc_types': [cls], 'model': cls._doc_type.model, - 'query': query + 'query': es_query, + 'fields': cls.search_fields } filters = {} @@ -80,6 +86,32 @@ def faceted_search(cls, query, projects_list=None, versions_list=None, using=Non return FileSearch(**kwargs) + @classmethod + def simple_search(cls, query, using=None, index=None): + es_search = cls.search(using=using, index=index) + es_query = cls.get_es_query(query=query) + highlighted_fields = [f.split('^', 1)[0] for f in cls.search_fields] + + es_search = es_search.query(es_query).highlight(*highlighted_fields) + return es_search + + @classmethod + def get_es_query(cls, query): + """Return the Elasticsearch query generated from the query string""" + all_queries = [] + + # Need to search for both 'AND' and 'OR' operations + # The score of AND should be higher as it satisfies both OR and AND + for operator in ['AND', 'OR']: + query_string = SimpleQueryString(query=query, fields=cls.search_fields, + default_operator=operator) + all_queries.append(query_string) + + # Run bool query with should, so it returns result where either of the query matches + bool_query = Bool(should=all_queries) + + return bool_query + def get_queryset(self): """Overwrite default queryset to filter certain files to index""" queryset = super(PageDocument, self).get_queryset() diff --git a/readthedocs/search/faceted_search.py b/readthedocs/search/faceted_search.py index 77d9f13cc76..4a14cb8c541 100644 --- a/readthedocs/search/faceted_search.py +++ b/readthedocs/search/faceted_search.py @@ -9,11 +9,13 @@ class RTDFacetedSearch(FacetedSearch): # TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916 # See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916 - def __init__(self, using, index, doc_types, model, **kwargs): + def __init__(self, using, index, doc_types, model, fields=None, **kwargs): self.using = using self.index = index self.doc_types = doc_types self._model = model + if fields: + self.fields = fields super(RTDFacetedSearch, self).__init__(**kwargs) @@ -25,26 +27,18 @@ class ProjectSearch(RTDFacetedSearch): class FileSearch(RTDFacetedSearch): - fields = ['title^10', 'headers^5', 'content'] facets = { 'project': TermsFacet(field='project'), 'version': TermsFacet(field='version') } def query(self, search, query): - """Add query part to ``search``""" + """ + Add query part to ``search`` + + Overriding because we pass ES Query object instead of string + """ if query: - all_queries = [] - - # Need to search for both 'AND' and 'OR' operations - # The score of AND should be higher as it comes first - for operator in ['AND', 'OR']: - query_string = SimpleQueryString(query=query, fields=self.fields, - default_operator=operator) - all_queries.append(query_string) - - # Run bool query with should, so it returns result where either of the query matches - bool_query = Bool(should=all_queries) - search = search.query(bool_query) + search = search.query(query) return search diff --git a/readthedocs/search/filters.py b/readthedocs/search/filters.py new file mode 100644 index 00000000000..ba0154def93 --- /dev/null +++ b/readthedocs/search/filters.py @@ -0,0 +1,20 @@ +from rest_framework import filters + + +class SearchFilterBackend(filters.BaseFilterBackend): + + """Filter search result with project""" + + def filter_queryset(self, request, queryset, view): + """Overwrite the method to compatible with Elasticsearch DSL Search object.""" + # ``queryset`` is actually a Elasticsearch DSL ``Search`` object. + # So change the variable name + es_search = queryset + version_slug = request.query_params.get('version') + projects_info = view.get_all_projects_url() + project_slug_list = list(projects_info.keys()) + # Elasticsearch ``terms`` query can take multiple values as list, + # while ``term`` query takes single value. + filtered_es_search = (es_search.filter('terms', project=project_slug_list) + .filter('term', version=version_slug)) + return filtered_es_search diff --git a/readthedocs/search/pagination.py b/readthedocs/search/pagination.py new file mode 100644 index 00000000000..6abfba31b98 --- /dev/null +++ b/readthedocs/search/pagination.py @@ -0,0 +1,7 @@ +from rest_framework.pagination import PageNumberPagination + + +class SearchPagination(PageNumberPagination): + page_size = 10 + page_size_query_param = 'page_size' + max_page_size = 100 diff --git a/readthedocs/search/serializers.py b/readthedocs/search/serializers.py new file mode 100644 index 00000000000..7aa8f01c93f --- /dev/null +++ b/readthedocs/search/serializers.py @@ -0,0 +1,23 @@ +from rest_framework import serializers + +from readthedocs.projects.models import Project + + +class PageSearchSerializer(serializers.Serializer): + project = serializers.CharField() + version = serializers.CharField() + title = serializers.CharField() + path = serializers.CharField() + link = serializers.SerializerMethodField() + highlight = serializers.SerializerMethodField() + + def get_link(self, obj): + projects_url = self.context.get('projects_url') + if projects_url: + docs_url = projects_url[obj.project] + return docs_url + obj.path + + def get_highlight(self, obj): + highlight = getattr(obj.meta, 'highlight', None) + if highlight: + return highlight.to_dict() diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py new file mode 100644 index 00000000000..23fc879cf2b --- /dev/null +++ b/readthedocs/search/tests/test_api.py @@ -0,0 +1,129 @@ +import pytest +from django.core.urlresolvers import reverse +from django_dynamic_fixture import G + +from readthedocs.builds.models import Version +from readthedocs.projects.models import HTMLFile +from readthedocs.search.tests.utils import get_search_query_from_project_file + + +@pytest.mark.django_db +@pytest.mark.search +class TestDocumentSearch(object): + url = reverse('doc_search') + + @pytest.mark.parametrize('data_type', ['content', 'headers', 'title']) + @pytest.mark.parametrize('page_num', [0, 1]) + def test_search_works(self, api_client, project, data_type, page_num): + query = get_search_query_from_project_file(project_slug=project.slug, page_num=page_num, + data_type=data_type) + + version = project.versions.all()[0] + search_params = {'project': project.slug, 'version': version.slug, 'query': query} + resp = api_client.get(self.url, search_params) + assert resp.status_code == 200 + + data = resp.data['results'] + assert len(data) == 1 + project_data = data[0] + assert project_data['project'] == project.slug + + # Check highlight return correct object + all_highlights = project_data['highlight'][data_type] + for highlight in all_highlights: + # Make it lower because our search is case insensitive + assert query.lower() in highlight.lower() + + def test_doc_search_filter_by_project(self, api_client): + """Test Doc search result are filtered according to project""" + + # `Github` word is present both in `kuma` and `pipeline` files + # so search with this phrase but filter through `kuma` project + search_params = {'query': 'GitHub', 'project': 'kuma', 'version': 'latest'} + resp = api_client.get(self.url, search_params) + assert resp.status_code == 200 + + data = resp.data['results'] + assert len(data) == 1 + assert data[0]['project'] == 'kuma' + + def test_doc_search_filter_by_version(self, api_client, project): + """Test Doc search result are filtered according to version""" + query = get_search_query_from_project_file(project_slug=project.slug) + latest_version = project.versions.all()[0] + # Create another version + dummy_version = G(Version, project=project) + # Create HTMLFile same as the latest version + latest_version_files = HTMLFile.objects.all().filter(version=latest_version) + for f in latest_version_files: + f.version = dummy_version + # Make primary key to None, so django will create new object + f.pk = None + f.save() + + search_params = {'query': query, 'project': project.slug, 'version': dummy_version.slug} + resp = api_client.get(self.url, search_params) + assert resp.status_code == 200 + + data = resp.data['results'] + assert len(data) == 1 + assert data[0]['project'] == project.slug + + def test_doc_search_pagination(self, api_client, project): + """Test Doc search result can be paginated""" + latest_version = project.versions.all()[0] + html_file = HTMLFile.objects.filter(version=latest_version)[0] + title = html_file.processed_json['title'] + query = title.split()[0] + + # Create 15 more same html file + for _ in range(15): + # Make primary key to None, so django will create new object + html_file.pk = None + html_file.save() + + search_params = {'query': query, 'project': project.slug, 'version': latest_version.slug} + resp = api_client.get(self.url, search_params) + assert resp.status_code == 200 + + # Check the count is 16 (1 existing and 15 new created) + assert resp.data['count'] == 16 + # Check there are next url + assert resp.data['next'] is not None + # There should be only 10 data as the pagination is 10 by default + assert len(resp.data['results']) == 10 + + # Add `page_size` parameter and check the data is paginated accordingly + search_params['page_size'] = 5 + resp = api_client.get(self.url, search_params) + assert resp.status_code == 200 + + assert len(resp.data['results']) == 5 + + def test_doc_search_without_parameters(self, api_client, project): + """Hitting Document Search endpoint without query parameters should return error""" + resp = api_client.get(self.url) + assert resp.status_code == 400 + # Check error message is there + assert sorted(['query', 'project', 'version']) == sorted(resp.data.keys()) + + def test_doc_search_subprojects(self, api_client, all_projects): + """Test Document search return results from subprojects also""" + project = all_projects[0] + subproject = all_projects[1] + version = project.versions.all()[0] + # Add another project as subproject of the project + project.add_subproject(subproject) + + # Now search with subproject content but explicitly filter by the parent project + query = get_search_query_from_project_file(project_slug=subproject.slug) + search_params = {'query': query, 'project': project.slug, 'version': version.slug} + resp = api_client.get(self.url, search_params) + assert resp.status_code == 200 + + data = resp.data['results'] + assert len(data) == 1 + assert data[0]['project'] == subproject.slug + # Check the link is the subproject document link + document_link = subproject.get_docs_url(version_slug=version.slug) + assert document_link in data[0]['link'] diff --git a/readthedocs/search/utils.py b/readthedocs/search/utils.py index 081104c201d..659effb1544 100644 --- a/readthedocs/search/utils.py +++ b/readthedocs/search/utils.py @@ -11,8 +11,10 @@ import json from builtins import next, range +from django.shortcuts import get_object_or_404 from pyquery import PyQuery +from readthedocs.projects.models import Project log = logging.getLogger(__name__) @@ -306,3 +308,16 @@ def parse_sections(documentation_type, content): return '' return sections + + +# TODO: Rewrite all the views using this in Class Based View, +# and move this function to a mixin +def get_project_list_or_404(project_slug, user): + """Return list of project and its subprojects.""" + queryset = Project.objects.api(user).only('slug') + + project = get_object_or_404(queryset, slug=project_slug) + subprojects = queryset.filter(superprojects__parent_id=project.id) + + project_list = list(subprojects) + [project] + return project_list diff --git a/readthedocs/search/views.py b/readthedocs/search/views.py index 385b7d371ca..062ee1fed50 100644 --- a/readthedocs/search/views.py +++ b/readthedocs/search/views.py @@ -14,6 +14,7 @@ from readthedocs.projects.models import Project from readthedocs.search import lib as search_lib from readthedocs.search.documents import ProjectDocument, PageDocument +from readthedocs.search.utils import get_project_list_or_404 log = logging.getLogger(__name__) LOG_TEMPLATE = u'(Elastic Search) [{user}:{type}] [{project}:{version}:{language}] {msg}' @@ -54,14 +55,10 @@ def elastic_search(request): elif user_input.type == 'file': kwargs = {} if user_input.project: - queryset = Project.objects.api(request.user).only('slug') - project = get_object_or_404(queryset, slug=user_input.project) - - subprojects_slug = (queryset.filter(superprojects__parent_id=project.id) - .values_list('slug', flat=True)) - - projects_list = [project.slug] + list(subprojects_slug) - kwargs['projects_list'] = projects_list + projects_list = get_project_list_or_404(project_slug=user_input.project, + user=request.user) + project_slug_list = [project.slug for project in projects_list] + kwargs['projects_list'] = project_slug_list if user_input.version: kwargs['versions_list'] = user_input.version diff --git a/readthedocs/urls.py b/readthedocs/urls.py index 19914cd72e7..9a50b9b7619 100644 --- a/readthedocs/urls.py +++ b/readthedocs/urls.py @@ -22,7 +22,7 @@ do_not_track, ) from readthedocs.search import views as search_views - +from readthedocs.search.api import PageSearchAPIView v1_api = Api(api_name='v1') v1_api.register(UserResource()) @@ -66,6 +66,8 @@ api_urls = [ url(r'^api/', include(v1_api.urls)), url(r'^api/v2/', include('readthedocs.restapi.urls')), + # Keep the `doc_search` at root level, so the test does not fail for other API + url(r'^api/v2/docsearch/$', PageSearchAPIView.as_view(), name='doc_search'), url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')), ]