From 01f5c42a2fe0cc234ae95f9d239cbb5018d0217e Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Wed, 6 Dec 2017 21:54:21 -0800 Subject: [PATCH 1/9] Upgrade search app to Elastic Search 5.x --- .../commands/reindex_elasticsearch.py | 4 +- readthedocs/restapi/urls.py | 2 +- readthedocs/restapi/utils.py | 3 +- readthedocs/search/indexes.py | 73 +++++++++---------- readthedocs/settings/base.py | 1 + readthedocs/urls.py | 8 +- requirements/pip.txt | 3 +- 7 files changed, 48 insertions(+), 46 deletions(-) diff --git a/readthedocs/core/management/commands/reindex_elasticsearch.py b/readthedocs/core/management/commands/reindex_elasticsearch.py index b736a1cd426..0e85ee7be9f 100644 --- a/readthedocs/core/management/commands/reindex_elasticsearch.py +++ b/readthedocs/core/management/commands/reindex_elasticsearch.py @@ -29,7 +29,7 @@ def handle(self, *args, **options): """Build/index all versions or a single project's version""" project = options['project'] - queryset = Version.objects.all() + queryset = Version.objects.filter(active=True) if project: queryset = queryset.filter(project__slug=project) @@ -37,7 +37,7 @@ def handle(self, *args, **options): raise CommandError( 'No project with slug: {slug}'.format(slug=project)) log.info("Building all versions for %s", project) - elif getattr(settings, 'INDEX_ONLY_LATEST', True): + if getattr(settings, 'INDEX_ONLY_LATEST', True): queryset = queryset.filter(slug=LATEST) for version in queryset: diff --git a/readthedocs/restapi/urls.py b/readthedocs/restapi/urls.py index 5480ce98093..08315000a1c 100644 --- a/readthedocs/restapi/urls.py +++ b/readthedocs/restapi/urls.py @@ -50,7 +50,7 @@ url(r'index_search/', search_views.index_search, name='index_search'), - url(r'search/$', views.search_views.search, name='api_search'), + url(r'^search/$', views.search_views.search, name='api_search'), url(r'search/project/$', search_views.project_search, name='api_project_search'), diff --git a/readthedocs/restapi/utils.py 
b/readthedocs/restapi/utils.py index 00e9ec15937..74eb6d8b413 100644 --- a/readthedocs/restapi/utils.py +++ b/readthedocs/restapi/utils.py @@ -161,12 +161,11 @@ def index_search_request( for route in routes: section_obj.bulk_index( section_index_list, - parent=page_id, routing=route, ) for route in routes: - page_obj.bulk_index(index_list, parent=project.slug, routing=route) + page_obj.bulk_index(index_list, routing=route) if delete: log.info('Deleting files not in commit: %s', commit) diff --git a/readthedocs/search/indexes.py b/readthedocs/search/indexes.py index 1b2ede6aaa9..a3a409e7e1f 100644 --- a/readthedocs/search/indexes.py +++ b/readthedocs/search/indexes.py @@ -19,7 +19,7 @@ import datetime from elasticsearch import Elasticsearch, exceptions -from elasticsearch.helpers import bulk_index +from elasticsearch.helpers import bulk from django.conf import settings @@ -48,8 +48,6 @@ def get_settings(self, settings_override=None): 'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS, 'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS, 'refresh_interval': '5s', - 'store.compress.tv': True, - 'store.compress.stored': True, 'analysis': self.get_analysis(), } if settings_override: @@ -139,7 +137,7 @@ def bulk_index(self, data, index=None, chunk_size=500, parent=None, docs.append(doc) # TODO: This doesn't work with the new ES setup. - bulk_index(self.es, docs, chunk_size=chunk_size) + bulk(self.es, docs, chunk_size=chunk_size) def index_document(self, data, index=None, parent=None, routing=None): doc = self.extract_document(data) @@ -220,25 +218,24 @@ def get_mapping(self): # Disable _all field to reduce index size. 
'_all': {'enabled': False}, 'properties': { - 'id': {'type': 'long'}, - 'name': {'type': 'string', 'analyzer': 'default_icu'}, - 'description': {'type': 'string', 'analyzer': 'default_icu'}, - - 'slug': {'type': 'string', 'index': 'not_analyzed'}, - 'lang': {'type': 'string', 'index': 'not_analyzed'}, - 'tags': {'type': 'string', 'index': 'not_analyzed'}, - 'privacy': {'type': 'string', 'index': 'not_analyzed'}, + 'id': {'type': 'keyword'}, + 'name': {'type': 'text', 'analyzer': 'default_icu'}, + 'description': {'type': 'text', 'analyzer': 'default_icu'}, + + 'slug': {'type': 'keyword'}, + 'lang': {'type': 'keyword'}, + 'tags': {'type': 'keyword'}, + 'privacy': {'type': 'keyword'}, 'author': { - 'type': 'string', + 'type': 'text', 'analyzer': 'default_icu', 'fields': { 'raw': { - 'type': 'string', - 'index': 'not_analyzed', + 'type': 'keyword', }, }, }, - 'url': {'type': 'string', 'index': 'not_analyzed'}, + 'url': {'type': 'keyword'}, # Add a weight field to enhance relevancy scoring. 'weight': {'type': 'float'}, } @@ -273,19 +270,19 @@ def get_mapping(self): # Disable _all field to reduce index size. '_all': {'enabled': False}, # Associate a page with a project. 
- '_parent': {'type': self._parent}, + # '_parent': {'type': self._parent}, 'properties': { - 'id': {'type': 'string', 'index': 'not_analyzed'}, - 'sha': {'type': 'string', 'index': 'not_analyzed'}, - 'project': {'type': 'string', 'index': 'not_analyzed'}, - 'version': {'type': 'string', 'index': 'not_analyzed'}, - 'path': {'type': 'string', 'index': 'not_analyzed'}, - 'taxonomy': {'type': 'string', 'index': 'not_analyzed'}, - 'commit': {'type': 'string', 'index': 'not_analyzed'}, - - 'title': {'type': 'string', 'analyzer': 'default_icu'}, - 'headers': {'type': 'string', 'analyzer': 'default_icu'}, - 'content': {'type': 'string', 'analyzer': 'default_icu'}, + 'id': {'type': 'keyword'}, + 'sha': {'type': 'keyword'}, + 'project': {'type': 'keyword'}, + 'version': {'type': 'keyword'}, + 'path': {'type': 'keyword'}, + 'taxonomy': {'type': 'keyword'}, + 'commit': {'type': 'keyword'}, + + 'title': {'type': 'text', 'analyzer': 'default_icu'}, + 'headers': {'type': 'text', 'analyzer': 'default_icu'}, + 'content': {'type': 'text', 'analyzer': 'default_icu'}, # Add a weight field to enhance relevancy scoring. 'weight': {'type': 'float'}, } @@ -321,7 +318,7 @@ def get_mapping(self): # Disable _all field to reduce index size. '_all': {'enabled': False}, # Associate a section with a page. - '_parent': {'type': self._parent}, + # '_parent': {'type': self._parent}, # Commenting this out until we need it. 
# 'suggest': { # "type": "completion", @@ -330,18 +327,18 @@ def get_mapping(self): # "payloads": True, # }, 'properties': { - 'id': {'type': 'string', 'index': 'not_analyzed'}, - 'project': {'type': 'string', 'index': 'not_analyzed'}, - 'version': {'type': 'string', 'index': 'not_analyzed'}, - 'path': {'type': 'string', 'index': 'not_analyzed'}, - 'page_id': {'type': 'string', 'index': 'not_analyzed'}, - 'commit': {'type': 'string', 'index': 'not_analyzed'}, - 'title': {'type': 'string', 'analyzer': 'default_icu'}, - 'content': {'type': 'string', 'analyzer': 'default_icu'}, + 'id': {'type': 'keyword'}, + 'project': {'type': 'keyword'}, + 'version': {'type': 'keyword'}, + 'path': {'type': 'keyword'}, + 'page_id': {'type': 'keyword'}, + 'commit': {'type': 'keyword'}, + 'title': {'type': 'text', 'analyzer': 'default_icu'}, + 'content': {'type': 'text', 'analyzer': 'default_icu'}, 'blocks': { 'type': 'object', 'properties': { - 'code': {'type': 'string', 'analyzer': 'default_icu'} + 'code': {'type': 'text', 'analyzer': 'default_icu'} } }, # Add a weight field to enhance relevancy scoring. 
diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index 8057901534f..31754880ee0 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -120,6 +120,7 @@ def INSTALLED_APPS(self): # noqa apps.append('django_countries') apps.append('readthedocsext.donate') apps.append('readthedocsext.embed') + apps.append('readthedocsext.search') return apps @property diff --git a/readthedocs/urls.py b/readthedocs/urls.py index b6053ab1983..25e04135d84 100644 --- a/readthedocs/urls.py +++ b/readthedocs/urls.py @@ -88,7 +88,7 @@ if settings.USE_PROMOS: # Include donation URL's - groups.append([ + groups.insert(0, [ url(r'^sustainability/', include('readthedocsext.donate.urls')), ]) @@ -98,6 +98,12 @@ url(r'^api/v1/embed/', include('readthedocsext.embed.urls')) ) +if 'readthedocsext.search' in settings.INSTALLED_APPS: + for num, _url in enumerate(rtd_urls): + if _url and hasattr(_url, 'name') and _url.name == 'search': + rtd_urls[num] = \ + url(r'^search/', 'readthedocsext.search.mainsearch.elastic_search', name='search'), + if not getattr(settings, 'USE_SUBDOMAIN', False) or settings.DEBUG: groups.insert(0, docs_urls) if getattr(settings, 'ALLOW_ADMIN', True): diff --git a/requirements/pip.txt b/requirements/pip.txt index a6dc76d9588..9b3fb8072af 100644 --- a/requirements/pip.txt +++ b/requirements/pip.txt @@ -51,8 +51,7 @@ httplib2==0.10.3 GitPython==2.1.8 # Search -elasticsearch==1.5.0 -pyelasticsearch==0.7.1 +elasticsearch==5.5.2 pyquery==1.4.0 # Utils From 953ae34b03c255d5f382d6f32b446d4dcc9e2365 Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Tue, 12 Dec 2017 15:20:54 -0800 Subject: [PATCH 2/9] Default sections off and fix unicode on index --- .../core/management/commands/reindex_elasticsearch.py | 8 ++++---- readthedocs/restapi/views/search_views.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/readthedocs/core/management/commands/reindex_elasticsearch.py 
b/readthedocs/core/management/commands/reindex_elasticsearch.py index 0e85ee7be9f..686afa77a40 100644 --- a/readthedocs/core/management/commands/reindex_elasticsearch.py +++ b/readthedocs/core/management/commands/reindex_elasticsearch.py @@ -35,13 +35,13 @@ def handle(self, *args, **options): queryset = queryset.filter(project__slug=project) if not queryset.exists(): raise CommandError( - 'No project with slug: {slug}'.format(slug=project)) - log.info("Building all versions for %s", project) + u'No project with slug: {slug}'.format(slug=project)) + log.info(u"Building all versions for %s", project) if getattr(settings, 'INDEX_ONLY_LATEST', True): queryset = queryset.filter(slug=LATEST) for version in queryset: - log.info("Reindexing %s", version) + log.info(u"Reindexing %s", version) try: commit = version.project.vcs_repo(version.slug).commit except: # pylint: disable=bare-except @@ -53,4 +53,4 @@ def handle(self, *args, **options): update_search(version.pk, commit, delete_non_commit_files=False) except Exception: - log.exception('Reindex failed for %s', version) + log.exception(u'Reindex failed for %s', version) diff --git a/readthedocs/restapi/views/search_views.py b/readthedocs/restapi/views/search_views.py index abe36174097..1db28af08e2 100644 --- a/readthedocs/restapi/views/search_views.py +++ b/readthedocs/restapi/views/search_views.py @@ -32,7 +32,7 @@ def index_search(request): utils.index_search_request( version=version, page_list=data['page_list'], commit=commit, - project_scale=project_scale, page_scale=page_scale) + project_scale=project_scale, page_scale=page_scale, section=False) return Response({'indexed': True}) From d8761d8fe321913f756e3daa95afe443bdf1cb00 Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Tue, 19 Dec 2017 10:29:28 -0800 Subject: [PATCH 3/9] Run task via celery to do concurrency --- .../core/management/commands/reindex_elasticsearch.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git 
a/readthedocs/core/management/commands/reindex_elasticsearch.py b/readthedocs/core/management/commands/reindex_elasticsearch.py index 686afa77a40..02580287451 100644 --- a/readthedocs/core/management/commands/reindex_elasticsearch.py +++ b/readthedocs/core/management/commands/reindex_elasticsearch.py @@ -3,6 +3,7 @@ from __future__ import absolute_import import logging from optparse import make_option +import socket from django.core.management.base import BaseCommand from django.core.management.base import CommandError @@ -38,6 +39,7 @@ def handle(self, *args, **options): u'No project with slug: {slug}'.format(slug=project)) log.info(u"Building all versions for %s", project) if getattr(settings, 'INDEX_ONLY_LATEST', True): + log.warning('Indexing only latest') queryset = queryset.filter(slug=LATEST) for version in queryset: @@ -50,7 +52,9 @@ def handle(self, *args, **options): commit = None try: - update_search(version.pk, commit, - delete_non_commit_files=False) + update_search.apply_async(args=[version.pk, commit], + kwargs=dict(delete_non_commit_files=False), + priority=0, + queue=socket.gethostname()) except Exception: log.exception(u'Reindex failed for %s', version) From 6e1c91e5aae00f5e40ef18de398c65b7c1199ad5 Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Tue, 19 Dec 2017 10:55:48 -0800 Subject: [PATCH 4/9] Lint reindex_elasticsearch --- .../commands/reindex_elasticsearch.py | 52 +++++++++++-------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/readthedocs/core/management/commands/reindex_elasticsearch.py b/readthedocs/core/management/commands/reindex_elasticsearch.py index 02580287451..24927a02f3e 100644 --- a/readthedocs/core/management/commands/reindex_elasticsearch.py +++ b/readthedocs/core/management/commands/reindex_elasticsearch.py @@ -1,13 +1,14 @@ -"""Reindex Elastic Search indexes""" +# -*- coding: utf-8 -*- +"""Reindex Elastic Search indexes.""" + +from __future__ import ( + absolute_import, division, print_function, 
unicode_literals) -from __future__ import absolute_import import logging -from optparse import make_option import socket +from optparse import make_option -from django.core.management.base import BaseCommand -from django.core.management.base import CommandError -from django.conf import settings +from django.core.management.base import BaseCommand, CommandError from readthedocs.builds.constants import LATEST from readthedocs.builds.models import Version @@ -24,11 +25,17 @@ class Command(BaseCommand): dest='project', default='', help='Project to index'), + make_option('-l', + dest='only_latest', + default=False, + action='store_true', + help='Only index latest'), ) def handle(self, *args, **options): - """Build/index all versions or a single project's version""" + """Build/index all versions or a single project's version.""" project = options['project'] + only_latest = options['only_latest'] queryset = Version.objects.filter(active=True) @@ -37,24 +44,23 @@ def handle(self, *args, **options): if not queryset.exists(): raise CommandError( u'No project with slug: {slug}'.format(slug=project)) - log.info(u"Building all versions for %s", project) - if getattr(settings, 'INDEX_ONLY_LATEST', True): + log.info(u'Building all versions for %s', project) + if only_latest: log.warning('Indexing only latest') queryset = queryset.filter(slug=LATEST) - for version in queryset: - log.info(u"Reindexing %s", version) - try: - commit = version.project.vcs_repo(version.slug).commit - except: # pylint: disable=bare-except - # An exception can be thrown here in production, but it's not - # documented what the exception here is - commit = None - + for version_pk, version_slug, project_slug in queryset.values_list( + 'pk', 'slug', 'project__slug'): + log.info(u'Reindexing %s:%s' % (project_slug, version_slug)) try: - update_search.apply_async(args=[version.pk, commit], - kwargs=dict(delete_non_commit_files=False), - priority=0, - queue=socket.gethostname()) + update_search.apply_async( + 
kwargs=dict( + version_pk=version_pk, + commit='reindex', + delete_non_commit_files=False + ), + priority=0, + queue=socket.gethostname() + ) except Exception: - log.exception(u'Reindex failed for %s', version) + log.exception(u'Reindexing failed for %s:%s' % (project_slug, version_slug)) From 2a2e0a42dbe1f767149ed78c77d5b970d2ff4fae Mon Sep 17 00:00:00 2001 From: Eric Holscher Date: Wed, 20 Dec 2017 14:37:47 -0800 Subject: [PATCH 5/9] Make reindexing better --- readthedocs/search/indexes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readthedocs/search/indexes.py b/readthedocs/search/indexes.py index a3a409e7e1f..378774e3b9e 100644 --- a/readthedocs/search/indexes.py +++ b/readthedocs/search/indexes.py @@ -74,7 +74,7 @@ def get_analysis(self): analyzers['default_icu'] = { 'type': 'custom', 'tokenizer': 'icu_tokenizer', - 'filter': ['word_delimiter', 'icu_folding', 'icu_normalizer'], + 'filter': ['custom_word_delimiter', 'icu_folding', 'icu_normalizer', 'lowercase'], } # Customize the word_delimiter filter to set various options. From cc7008216dd876b7f9e5a1c1747c782fc41bd892 Mon Sep 17 00:00:00 2001 From: Riccardo Magliocchetti Date: Tue, 13 Mar 2018 09:58:26 +0100 Subject: [PATCH 6/9] search: don't index page id As elasticsearch expects a long while we have an hash as string. While at it remove references to _parent and _id as they are gone. 
--- readthedocs/search/indexes.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/readthedocs/search/indexes.py b/readthedocs/search/indexes.py index 378774e3b9e..0184b086292 100644 --- a/readthedocs/search/indexes.py +++ b/readthedocs/search/indexes.py @@ -127,11 +127,8 @@ def bulk_index(self, data, index=None, chunk_size=500, parent=None, doc = { '_index': index, '_type': self._type, - '_id': source['id'], '_source': source, } - if parent: - doc['_parent'] = parent if routing: doc['_routing'] = routing docs.append(doc) @@ -294,7 +291,7 @@ def get_mapping(self): def extract_document(self, data): doc = {} - attrs = ('id', 'project', 'title', 'headers', 'version', 'path', + attrs = ('project', 'title', 'headers', 'version', 'path', 'content', 'taxonomy', 'commit') for attr in attrs: doc[attr] = data.get(attr, '') From 1b90a0117933b98e3d95d29a8dadf9dbf2d7504f Mon Sep 17 00:00:00 2001 From: Riccardo Magliocchetti Date: Tue, 13 Mar 2018 10:45:57 +0100 Subject: [PATCH 7/9] search: fixup elasticsearch query syntax for 5.5 fields is now _source and filter should be inside the query. aggs is the new facets and we can drop filtering on aggs as they should be calculated on the query results so after filtering. 
--- readthedocs/projects/views/public.py | 19 +++--- readthedocs/restapi/views/search_views.py | 7 ++- readthedocs/search/lib.py | 70 +++++++++-------------- readthedocs/search/views.py | 15 +++-- 4 files changed, 52 insertions(+), 59 deletions(-) diff --git a/readthedocs/projects/views/public.py b/readthedocs/projects/views/public.py index 61f6cfeca64..6db89a93ce0 100644 --- a/readthedocs/projects/views/public.py +++ b/readthedocs/projects/views/public.py @@ -305,6 +305,10 @@ def elastic_project_search(request, project_slug): {'match': {'title': {'query': query, 'boost': 10}}}, {'match': {'headers': {'query': query, 'boost': 5}}}, {'match': {'content': {'query': query}}}, + ], + 'filter': [ + {'term': {'project': project_slug}}, + {'term': {'version': version_slug}}, ] } }, @@ -315,13 +319,7 @@ def elastic_project_search(request, project_slug): 'content': {}, } }, - 'fields': ['title', 'project', 'version', 'path'], - 'filter': { - 'and': [ - {'term': {'project': project_slug}}, - {'term': {'version': version_slug}}, - ] - }, + '_source': ['title', 'project', 'version', 'path'], 'size': 50, # TODO: Support pagination. 
} @@ -335,9 +333,12 @@ def elastic_project_search(request, project_slug): if results: # pre and post 1.0 compat for num, hit in enumerate(results['hits']['hits']): - for key, val in list(hit['fields'].items()): + for key, val in list(hit['_source'].items()): if isinstance(val, list): - results['hits']['hits'][num]['fields'][key] = val[0] + results['hits']['hits'][num]['_source'][key] = val[0] + # we cannot render attributes starting with an underscore + hit['fields'] = hit['_source'] + del hit['_source'] return render( request, diff --git a/readthedocs/restapi/views/search_views.py b/readthedocs/restapi/views/search_views.py index 1db28af08e2..a0d64b401bd 100644 --- a/readthedocs/restapi/views/search_views.py +++ b/readthedocs/restapi/views/search_views.py @@ -64,7 +64,7 @@ def search(request): # Supplement result paths with domain information on project hits = results.get('hits', {}).get('hits', []) for (n, hit) in enumerate(hits): - fields = hit.get('fields', {}) + fields = hit.get('_source', {}) search_project = fields.get('project')[0] search_version = fields.get('version')[0] path = fields.get('path')[0] @@ -77,9 +77,12 @@ def search(request): ) except ProjectRelationship.DoesNotExist: pass - results['hits']['hits'][n]['fields']['link'] = ( + results['hits']['hits'][n]['_source']['link'] = ( canonical_url + path ) + # we cannot render attributes starting with an underscore + results['hits']['hits'][n]['fields'] = results['hits']['hits'][n]['_source'] + del results['hits']['hits'][n]['_source'] return Response({'results': results}) diff --git a/readthedocs/search/lib.py b/readthedocs/search/lib.py index 8500a829b03..5ace5d84094 100644 --- a/readthedocs/search/lib.py +++ b/readthedocs/search/lib.py @@ -25,9 +25,9 @@ def search_project(request, query, language=None): ] }, }, - "facets": { + "aggs": { "language": { - "terms": {"field": "lang"}, + "terms": {"field": "lang.keyword"}, }, }, "highlight": { @@ -36,13 +36,12 @@ def search_project(request, query, 
language=None): "description": {}, } }, - "fields": ["name", "slug", "description", "lang", "url"], + "_source": ["name", "slug", "description", "lang", "url"], "size": 50 # TODO: Support pagination. } if language: - body['facets']['language']['facet_filter'] = {"term": {"lang": language}} - body['filter'] = {"term": {"lang": language}} + body['query']['bool']['filter'] = {"term": {"lang": language}} before_project_search.send(request=request, sender=ProjectIndex, body=body) @@ -89,15 +88,15 @@ def search_file(request, query, project_slug=None, version_slug=LATEST, taxonomy ] } }, - "facets": { + "aggs": { "taxonomy": { - "terms": {"field": "taxonomy"}, + "terms": {"field": "taxonomy.keyword"}, }, "project": { - "terms": {"field": "project"}, + "terms": {"field": "project.keyword"}, }, "version": { - "terms": {"field": "version"}, + "terms": {"field": "version.keyword"}, }, }, "highlight": { @@ -107,12 +106,12 @@ def search_file(request, query, project_slug=None, version_slug=LATEST, taxonomy "content": {}, } }, - "fields": ["title", "project", "version", "path"], + "_source": ["title", "project", "version", "path"], "size": 50 # TODO: Support pagination. } if project_slug or version_slug or taxonomy: - final_filter = {"and": []} + final_filter = [] if project_slug: try: @@ -126,7 +125,7 @@ def search_file(request, query, project_slug=None, version_slug=LATEST, taxonomy in Project.objects.public( request.user).filter( superprojects__parent__slug=project.slug)) - final_filter['and'].append({"terms": {"project": project_slugs}}) + final_filter.append({"terms": {"project": project_slugs}}) # Add routing to optimize search by hitting the right shard. 
# This purposely doesn't apply routing if the project has more @@ -141,15 +140,12 @@ def search_file(request, query, project_slug=None, version_slug=LATEST, taxonomy return None if version_slug: - final_filter['and'].append({'term': {'version': version_slug}}) + final_filter.append({'term': {'version': version_slug}}) if taxonomy: - final_filter['and'].append({'term': {'taxonomy': taxonomy}}) + final_filter.append({'term': {'taxonomy': taxonomy}}) - body['filter'] = final_filter - body['facets']['project']['facet_filter'] = final_filter - body['facets']['version']['facet_filter'] = final_filter - body['facets']['taxonomy']['facet_filter'] = final_filter + body['query']['bool']['filter'] = final_filter if settings.DEBUG: print("Before Signal") @@ -167,9 +163,9 @@ def search_section(request, query, project_slug=None, version_slug=LATEST, """ Search for a section of content. - When you search, you will have a ``project`` facet, which includes the + When you search, you will have a ``project`` facet (aggs), which includes the number of matching sections per project. When you search inside a project, - the ``path`` facet will show the number of matching sections per page. + the ``path`` aggs will show the number of matching sections per page. :param request: Request instance :param query: string to use in query @@ -198,12 +194,9 @@ def search_section(request, query, project_slug=None, version_slug=LATEST, ] } }, - "facets": { + "aggs": { "project": { - "terms": {"field": "project"}, - "facet_filter": { - "term": {"version": version_slug}, - } + "terms": {"field": "project.keyword"}, }, }, "highlight": { @@ -212,36 +205,29 @@ def search_section(request, query, project_slug=None, version_slug=LATEST, "content": {}, } }, - "fields": ["title", "project", "version", "path", "page_id", "content"], + "_source": ["title", "project", "version", "path", "page_id", "content"], "size": 10 # TODO: Support pagination. 
} if project_slug: - body['filter'] = { - "and": [ - {"term": {"project": project_slug}}, - {"term": {"version": version_slug}}, - ] - } - body['facets']['path'] = { + body['query']['bool']['filter'] = [ + {"term": {"project": project_slug}}, + {"term": {"version": version_slug}}, + ] + body['aggs']['path'] = { "terms": {"field": "path"}, - "facet_filter": { - "term": {"project": project_slug}, - } }, # Add routing to optimize search by hitting the right shard. kwargs['routing'] = project_slug if path: - body['filter'] = { - "and": [ - {"term": {"path": path}}, - ] - } + body['query']['bool']['filter'] = [ + {"term": {"path": path}}, + ] if path and not project_slug: # Show facets when we only have a path - body['facets']['path'] = { + body['aggs']['path'] = { "terms": {"field": "path"} } diff --git a/readthedocs/search/views.py b/readthedocs/search/views.py index 7d3a51d5fc2..2cf7a057218 100644 --- a/readthedocs/search/views.py +++ b/readthedocs/search/views.py @@ -55,16 +55,19 @@ def elastic_search(request): if results: # pre and post 1.0 compat for num, hit in enumerate(results['hits']['hits']): - for key, val in list(hit['fields'].items()): + for key, val in list(hit['_source'].items()): if isinstance(val, list): - results['hits']['hits'][num]['fields'][key] = val[0] + results['hits']['hits'][num]['_source'][key] = val[0] + # we cannot render attributes starting with an underscore + hit['fields'] = hit['_source'] + del hit['_source'] - if 'facets' in results: + if 'aggregations' in results: for facet_type in ['project', 'version', 'taxonomy', 'language']: - if facet_type in results['facets']: + if facet_type in results['aggregations']: facets[facet_type] = collections.OrderedDict() - for term in results['facets'][facet_type]['terms']: - facets[facet_type][term['term']] = term['count'] + for term in results['aggregations'][facet_type]['buckets']: + facets[facet_type][term['key']] = term['doc_count'] if settings.DEBUG: print(pprint(results)) From 
27915aced5bed72c66b66ee516038572f6a77a3e Mon Sep 17 00:00:00 2001 From: Riccardo Magliocchetti Date: Tue, 13 Mar 2018 15:34:18 +0100 Subject: [PATCH 8/9] search: reinstate the relationship mapping in the index --- readthedocs/search/indexes.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/readthedocs/search/indexes.py b/readthedocs/search/indexes.py index 0184b086292..2a0b725c0fe 100644 --- a/readthedocs/search/indexes.py +++ b/readthedocs/search/indexes.py @@ -266,8 +266,6 @@ def get_mapping(self): self._type: { # Disable _all field to reduce index size. '_all': {'enabled': False}, - # Associate a page with a project. - # '_parent': {'type': self._parent}, 'properties': { 'id': {'type': 'keyword'}, 'sha': {'type': 'keyword'}, @@ -282,6 +280,13 @@ def get_mapping(self): 'content': {'type': 'text', 'analyzer': 'default_icu'}, # Add a weight field to enhance relevancy scoring. 'weight': {'type': 'float'}, + # Associate a page with a project. + self._parent: { + 'type': 'join', + 'relations': { + self._parent: self._type + } + }, } } } @@ -314,8 +319,6 @@ def get_mapping(self): self._type: { # Disable _all field to reduce index size. '_all': {'enabled': False}, - # Associate a section with a page. - # '_parent': {'type': self._parent}, # Commenting this out until we need it. # 'suggest': { # "type": "completion", @@ -340,6 +343,13 @@ def get_mapping(self): }, # Add a weight field to enhance relevancy scoring. 'weight': {'type': 'float'}, + # Associate a section with a page. 
+ self._parent: { + 'type': 'join', + 'relations': { + self._parent: self._type + } + }, } } } From 0f5c4c4c1737c5e93d4ccb13cb0209bf8e74739d Mon Sep 17 00:00:00 2001 From: Paolo Romolini Date: Tue, 20 Mar 2018 12:31:45 +0100 Subject: [PATCH 9/9] tests: first stab at unit testing search You got to start from somwhere --- .../rtd_tests/mocks/search_mock_responses.py | 160 ++++++++++++++++++ readthedocs/rtd_tests/tests/test_search.py | 150 ++++++++++++++++ 2 files changed, 310 insertions(+) create mode 100644 readthedocs/rtd_tests/mocks/search_mock_responses.py create mode 100644 readthedocs/rtd_tests/tests/test_search.py diff --git a/readthedocs/rtd_tests/mocks/search_mock_responses.py b/readthedocs/rtd_tests/mocks/search_mock_responses.py new file mode 100644 index 00000000000..2ade8103c52 --- /dev/null +++ b/readthedocs/rtd_tests/mocks/search_mock_responses.py @@ -0,0 +1,160 @@ +search_project_response = """ +{ + "took": 17, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": 1, + "max_score": 1.8232156, + "hits": [ + { + "_index": "readthedocs", + "_type": "project", + "_id": "6", + "_score": 1.8232156, + "_source": { + "name": "Pip", + "description": "", + "lang": "en", + "url": "/projects/pip/", + "slug": "pip" + }, + "highlight": { + "name": [ + "Pip" + ] + } + } + ] + }, + "aggregations": { + "language": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "en", + "doc_count": 1 + } + ] + } + } +} +""" + +search_file_response = """ +{ + "took": 27, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": 3, + "max_score": 6.989019, + "hits": [ + { + "_index": "readthedocs", + "_type": "page", + "_id": "AWKuy4jp-H7vMtbTbHP5", + "_score": 6.989019, + "_routing": "prova", + "_source": { + "path": "_docs/cap2", + "project": "prova", + "title": "Capitolo 2", + "version": "latest" 
+ }, + "highlight": { + "headers": [ + "Capitolo 2" + ], + "title": [ + "Capitolo 2" + ], + "content": [ + "Capitolo 2 In questo capitolo, vengono trattate" + ] + } + }, + { + "_index": "readthedocs", + "_type": "page", + "_id": "AWKuy4jp-H7vMtbTbHP4", + "_score": 6.973402, + "_routing": "prova", + "_source": { + "path": "_docs/cap1", + "project": "prova", + "title": "Capitolo 1", + "version": "latest" + }, + "highlight": { + "headers": [ + "Capitolo 1" + ], + "title": [ + "Capitolo 1" + ], + "content": [ + "Capitolo 1 In questo capitolo, le funzioni principali" + ] + } + }, + { + "_index": "readthedocs", + "_type": "page", + "_id": "AWKuy4jp-H7vMtbTbHP3", + "_score": 0.2017303, + "_routing": "prova", + "_source": { + "path": "index", + "project": "prova", + "title": "Titolo del documento", + "version": "latest" + }, + "highlight": { + "content": [ + "Titolo del documento Nel Capitolo 1 Nel Capitolo 2" + ] + } + } + ] + }, + "aggregations": { + "project": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "prova", + "doc_count": 3 + } + ] + }, + "taxonomy": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [] + }, + "version": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "latest", + "doc_count": 3 + } + ] + } + } +} +""" diff --git a/readthedocs/rtd_tests/tests/test_search.py b/readthedocs/rtd_tests/tests/test_search.py new file mode 100644 index 00000000000..01096a098c7 --- /dev/null +++ b/readthedocs/rtd_tests/tests/test_search.py @@ -0,0 +1,150 @@ +# -*- coding: utf-8 -*- +from __future__ import ( + absolute_import, division, print_function, unicode_literals) + +import json + +from django.core.urlresolvers import reverse +from django.test import TestCase, RequestFactory +from mock import patch +from urllib3._collections import HTTPHeaderDict + +from readthedocs.projects.models import Project +from 
class TestSearch(TestCase):
    """
    Unit tests for the search views.

    Elasticsearch is never contacted: ``Urllib3HttpConnection.perform_request``
    is patched to return canned JSON payloads, which also lets each test
    inspect the exact query body that would have been sent to the cluster.
    """

    fixtures = ['eric', 'test_data']

    def setUp(self):
        # Authenticated client plus a known project from the fixtures.
        # Logging in here covers every test; the per-test logins of the
        # original were redundant and have been removed.
        self.client.login(username='eric', password='test')
        self.pip = Project.objects.get(slug='pip')
        self.factory = RequestFactory()

    def perform_request_file_mock(
            self, method, url, params=None, body=None, timeout=None, ignore=()):
        """
        Elastic Search Urllib3HttpConnection mock for file search.

        NOTE(review): ``patch(..., side_effect=...)`` calls this unbound, so
        ``self`` presumably receives the HTTP method rather than an instance —
        harmless here because every argument is ignored; confirm if the mock
        ever needs to inspect its arguments.
        """
        headers = HTTPHeaderDict({
            'content-length': '893',
            'content-type': 'application/json; charset=UTF-8'
        })
        return 200, headers, search_file_response

    def perform_request_project_mock(
            self, method, url, params=None, body=None, timeout=None, ignore=()):
        """
        Elastic Search Urllib3HttpConnection mock for project search.

        Returns (status, headers, raw JSON body) exactly as the real
        transport would.
        """
        headers = HTTPHeaderDict({
            'content-length': '893',
            'content-type': 'application/json; charset=UTF-8'
        })
        return 200, headers, search_project_response

    @patch(
        'elasticsearch.connection.http_urllib3.Urllib3HttpConnection.perform_request',
        side_effect=perform_request_project_mock
    )
    def test_search_project(self, perform_request_mock):
        """
        Search by project: verify the query string sent to Elasticsearch and
        the rendering of the mocked response.
        """
        r = self.client.get(
            reverse('search'),
            {'q': 'pip', 'type': 'project', 'project': None}
        )
        self.assertEqual(r.status_code, 200)
        # Fourth positional argument of perform_request is the request body.
        request_body = perform_request_mock.call_args_list[0][0][3]
        query_dict = json.loads(request_body)
        self.assertIn('query', query_dict)
        self.assertDictEqual({
            'bool': {
                'should': [
                    {'match': {'name': {'query': 'pip', 'boost': 10}}},
                    {'match': {'description': {'query': 'pip'}}}
                ]
            }
        }, query_dict['query'])
        # Original asserted _type and the status code twice; once is enough.
        main_hit = r.context['results']['hits']['hits'][0]
        self.assertEqual(main_hit['_type'], 'project')
        self.assertEqual(main_hit['fields']['name'], 'Pip')
        self.assertEqual(main_hit['fields']['slug'], 'pip')

    @patch(
        'elasticsearch.connection.http_urllib3.Urllib3HttpConnection.perform_request',
        side_effect=perform_request_file_mock
    )
    def test_search_file(self, perform_request_mock):
        """
        Search by file: verify the query string sent to Elasticsearch and
        the rendering of the mocked response.
        """
        r = self.client.get(
            reverse('search'),
            {'q': 'capitolo', 'type': 'file'}
        )
        # Check the status before dereferencing r.context.
        self.assertEqual(r.status_code, 200)
        request_body = perform_request_mock.call_args_list[0][0][3]
        query_dict = json.loads(request_body)
        self.assertIn('query', query_dict)
        self.assertDictEqual({
            'bool': {
                'filter': [{'term': {'version': 'latest'}}],
                'should': [
                    {'match_phrase': {'title': {'query': 'capitolo', 'boost': 10, 'slop': 2}}},
                    {'match_phrase': {'headers': {'query': 'capitolo', 'boost': 5, 'slop': 3}}},
                    {'match_phrase': {'content': {'query': 'capitolo', 'slop': 5}}}
                ]
            }
        }, query_dict['query'])
        main_hit = r.context['results']['hits']['hits'][0]
        self.assertEqual(main_hit['_type'], 'page')
        self.assertEqual(main_hit['fields']['project'], 'prova')
        self.assertEqual(main_hit['fields']['path'], '_docs/cap2')

    @patch(
        'elasticsearch.connection.http_urllib3.Urllib3HttpConnection.perform_request',
        side_effect=perform_request_file_mock
    )
    def test_search_in_project(self, perform_request_mock):
        """
        In-project file search: verify that the query is filtered by both the
        project slug and the version, and that results render.
        """
        r = self.client.get(
            '/projects/pip/search/',
            {'q': 'capitolo'}
        )
        self.assertEqual(r.status_code, 200)
        request_body = perform_request_mock.call_args_list[0][0][3]
        query_dict = json.loads(request_body)
        self.assertDictEqual({
            'bool': {
                'should': [
                    {'match': {'title': {'boost': 10, 'query': 'capitolo'}}},
                    {'match': {'headers': {'boost': 5, 'query': 'capitolo'}}},
                    {'match': {'content': {'query': 'capitolo'}}}
                ],
                'filter': [
                    {'term': {'project': 'pip'}},
                    {'term': {'version': 'latest'}}
                ]
            }
        }, query_dict['query'])
        main_hit = r.context['results']['hits']['hits'][0]
        self.assertEqual(main_hit['_type'], 'page')
        self.assertEqual(main_hit['fields']['project'], 'prova')
        self.assertEqual(main_hit['fields']['path'], '_docs/cap2')