Skip to content

[fix #4265] Port Document search API for Elasticsearch 6.x #4309

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jul 6, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions readthedocs/search/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from rest_framework import generics

from readthedocs.search.documents import PageDocument
from readthedocs.search.filters import SearchFilterBackend
from readthedocs.search.pagination import SearchPagination
from readthedocs.search.serializers import PageSearchSerializer


class PageSearchAPIView(generics.ListAPIView):
pagination_class = SearchPagination
filter_backends = [SearchFilterBackend]
serializer_class = PageSearchSerializer

def get_queryset(self):
query = self.request.query_params.get('query')
queryset = PageDocument.search(query=query)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does search() return a queryset? Seems like it would return a search object or similar.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. it returns a Search object.

return queryset
34 changes: 33 additions & 1 deletion readthedocs/search/documents.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from django.conf import settings
from django_elasticsearch_dsl import DocType, Index, fields
from elasticsearch_dsl.query import SimpleQueryString, Bool

from readthedocs.projects.models import Project, HTMLFile
from .conf import SEARCH_EXCLUDED_FILE
Expand Down Expand Up @@ -60,14 +61,19 @@ class Meta(object):
content = fields.TextField(attr='processed_json.content')
path = fields.TextField(attr='processed_json.path')

# Fields to perform search with weight
search_fields = ['title^10', 'headers^5', 'content']
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this used by the DocType class directly, or are we storing it here to add later? If the prior, we should probably set it in an __init__ method or somewhere else, so it doesn't get confused with the other data here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Its not actually used by DocType class, but we are using it in class method. I dont know if keeping it in __init__ method will make it work in class method!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hrm, good point. I guess it's fine for now then.


@classmethod
def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None):
es_query = cls.get_es_query(query=query)
kwargs = {
'using': using or cls._doc_type.using,
'index': index or cls._doc_type.index,
'doc_types': [cls],
'model': cls._doc_type.model,
'query': query
'query': es_query,
'fields': cls.search_fields
}
filters = {}

Expand All @@ -80,6 +86,32 @@ def faceted_search(cls, query, projects_list=None, versions_list=None, using=Non

return FileSearch(**kwargs)

@classmethod
def search(cls, using=None, index=None, **kwargs):
es_search = super(PageDocument, cls).search(using=using, index=index)
query = kwargs.pop('query')
es_query = cls.get_es_query(query=query)

es_search = es_search.query(es_query)
return es_search

@classmethod
def get_es_query(cls, query):
"""Return the Elasticsearch query generated from the query string"""
all_queries = []

# Need to search for both 'AND' and 'OR' operations
# The score of AND should be higher as it satisfies both OR and AND
for operator in ['AND', 'OR']:
query_string = SimpleQueryString(query=query, fields=cls.search_fields,
default_operator=operator)
all_queries.append(query_string)

# Run bool query with should, so it returns result where either of the query matches
bool_query = Bool(should=all_queries)

return bool_query

def get_queryset(self):
"""Overwrite default queryset to filter certain files to index"""
queryset = super(PageDocument, self).get_queryset()
Expand Down
18 changes: 4 additions & 14 deletions readthedocs/search/faceted_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ class RTDFacetedSearch(FacetedSearch):
# TODO: Remove the overwrite when the elastic/elasticsearch-dsl-py#916
# See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916

def __init__(self, using, index, doc_types, model, **kwargs):
def __init__(self, using, index, doc_types, model, fields=None, **kwargs):
self.using = using
self.index = index
self.doc_types = doc_types
self._model = model
if fields:
self.fields = fields
super(RTDFacetedSearch, self).__init__(**kwargs)


Expand All @@ -25,7 +27,6 @@ class ProjectSearch(RTDFacetedSearch):


class FileSearch(RTDFacetedSearch):
fields = ['title^10', 'headers^5', 'content']
facets = {
'project': TermsFacet(field='project'),
'version': TermsFacet(field='version')
Expand All @@ -34,17 +35,6 @@ class FileSearch(RTDFacetedSearch):
def query(self, search, query):
"""Add query part to ``search``"""
if query:
all_queries = []

# Need to search for both 'AND' and 'OR' operations
# The score of AND should be higher as it comes first
for operator in ['AND', 'OR']:
query_string = SimpleQueryString(query=query, fields=self.fields,
default_operator=operator)
all_queries.append(query_string)

# Run bool query with should, so it returns result where either of the query matches
bool_query = Bool(should=all_queries)
search = search.query(bool_query)
search = search.query(query)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feels like we shouldn't be overwriting the object here. Will we return something invalid if there is no query because we have the same object name?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have actually a mixed feeling about this. Do you have any idea about how to do this withtout overriding the search method?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean mostly just the name of the object. search = search.query(query) -- is that similar to a queryset = queryset.filter()? It just feels like a weird bit of syntax to rewrite the name of the object over again.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. search = search.query(query) -- is that similar to a queryset = queryset.filter(). If we do not overwrite the same object, then another variable need to be assigned and check if that is not None


return search
15 changes: 15 additions & 0 deletions readthedocs/search/filters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from rest_framework import filters

from readthedocs.search.utils import get_project_slug_list_or_404


class SearchFilterBackend(filters.BaseFilterBackend):
"""
Filter search result with project
"""

def filter_queryset(self, request, queryset, view):
project_slug = request.query_params.get('project')
project_slug_list = get_project_slug_list_or_404(project_slug=project_slug,
user=request.user)
return queryset.filter('terms', project=project_slug_list)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feels like we're conflating querysets and Search queries here again. It feels a bit odd.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah! I understand. Its not queryset, but a Search object but it is similar to queryset.

7 changes: 7 additions & 0 deletions readthedocs/search/pagination.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from rest_framework.pagination import PageNumberPagination


class SearchPagination(PageNumberPagination):
page_size = 10
page_size_query_param = 'page_size'
max_page_size = 100
8 changes: 8 additions & 0 deletions readthedocs/search/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from rest_framework import serializers


class PageSearchSerializer(serializers.Serializer):
title = serializers.CharField()
headers = serializers.ListField()
content = serializers.CharField()
path = serializers.CharField()
18 changes: 18 additions & 0 deletions readthedocs/search/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
import json

from builtins import next, range
from django.shortcuts import get_object_or_404
from pyquery import PyQuery

from readthedocs.projects.models import Project

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -306,3 +308,19 @@ def parse_sections(documentation_type, content):
return ''

return sections


# TODO: Rewrite all the views using this in Class Based View,
# and move this function to a mixin
def get_project_slug_list_or_404(project_slug, user):
"""Return list of subproject's slug including own slug.
If the project is not available to user, redirect to 404
"""
queryset = Project.objects.api(user).only('slug')
project = get_object_or_404(queryset, slug=project_slug)

subprojects_slug = (queryset.filter(superprojects__parent_id=project.id)
.values_list('slug', flat=True))

slug_list = [project.slug] + list(subprojects_slug)
return slug_list
12 changes: 4 additions & 8 deletions readthedocs/search/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from readthedocs.projects.models import Project
from readthedocs.search import lib as search_lib
from readthedocs.search.documents import ProjectDocument, PageDocument
from readthedocs.search.utils import get_project_slug_list_or_404

log = logging.getLogger(__name__)
LOG_TEMPLATE = u'(Elastic Search) [{user}:{type}] [{project}:{version}:{language}] {msg}'
Expand Down Expand Up @@ -54,14 +55,9 @@ def elastic_search(request):
elif user_input.type == 'file':
kwargs = {}
if user_input.project:
queryset = Project.objects.api(request.user).only('slug')
project = get_object_or_404(queryset, slug=user_input.project)

subprojects_slug = (queryset.filter(superprojects__parent_id=project.id)
.values_list('slug', flat=True))

projects_list = [project.slug] + list(subprojects_slug)
kwargs['projects_list'] = projects_list
project_slug_list = get_project_slug_list_or_404(project_slug=user_input.project,
user=request.user)
kwargs['projects_list'] = project_slug_list
if user_input.version:
kwargs['versions_list'] = user_input.version

Expand Down
3 changes: 2 additions & 1 deletion readthedocs/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
do_not_track,
)
from readthedocs.search import views as search_views

from readthedocs.search.api import PageSearchAPIView

v1_api = Api(api_name='v1')
v1_api.register(UserResource())
Expand Down Expand Up @@ -67,6 +67,7 @@
url(r'^api/', include(v1_api.urls)),
url(r'^api/v2/', include('readthedocs.restapi.urls')),
url(r'^api-auth/', include('rest_framework.urls', namespace='rest_framework')),
url(r'^api/search/', PageSearchAPIView.as_view()),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should likely continue to live in v2 for now. Perhaps as api/v2/indocsearch or something, so it can live beside the old docsearch during rollout?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah! I added it for testing purpose actually!

]

i18n_urls = [
Expand Down