diff --git a/readthedocs/core/management/commands/reindex_elasticsearch.py b/readthedocs/core/management/commands/reindex_elasticsearch.py
index a2bce6df840..24927a02f3e 100644
--- a/readthedocs/core/management/commands/reindex_elasticsearch.py
+++ b/readthedocs/core/management/commands/reindex_elasticsearch.py
@@ -1,12 +1,14 @@
-"""Reindex Elastic Search indexes"""
+# -*- coding: utf-8 -*-
+"""Reindex Elastic Search indexes."""
+
+from __future__ import (
+    absolute_import, division, print_function, unicode_literals)
 
-from __future__ import absolute_import
 import logging
+import socket
 from optparse import make_option
 
-from django.core.management.base import BaseCommand
-from django.core.management.base import CommandError
-from django.conf import settings
+from django.core.management.base import BaseCommand, CommandError
 
 from readthedocs.builds.constants import LATEST
 from readthedocs.builds.models import Version
@@ -23,34 +25,42 @@ class Command(BaseCommand):
                     dest='project',
                     default='',
                     help='Project to index'),
+        make_option('-l',
+                    dest='only_latest',
+                    default=False,
+                    action='store_true',
+                    help='Only index latest'),
     )
 
     def handle(self, *args, **options):
-        """Build/index all versions or a single project's version"""
+        """Build/index all versions or a single project's version."""
         project = options['project']
+        only_latest = options['only_latest']
 
-        queryset = Version.objects.all()
+        queryset = Version.objects.filter(active=True)
 
         if project:
             queryset = queryset.filter(project__slug=project)
             if not queryset.exists():
                 raise CommandError(
-                    'No project with slug: {slug}'.format(slug=project))
-            log.info("Building all versions for %s", project)
-        elif getattr(settings, 'INDEX_ONLY_LATEST', True):
+                    u'No project with slug: {slug}'.format(slug=project))
+            log.info(u'Building all versions for %s', project)
+        if only_latest:
+            log.warning('Indexing only latest')
             queryset = queryset.filter(slug=LATEST)
 
-        for version in queryset:
-            log.info("Reindexing %s", version)
-            try:
-                commit = version.project.vcs_repo(version.slug).commit
-            except:  # pylint: disable=bare-except
-                # An exception can be thrown here in production, but it's not
-                # documented what the exception here is
-                commit = None
-
+        for version_pk, version_slug, project_slug in queryset.values_list(
+                'pk', 'slug', 'project__slug'):
+            log.info(u'Reindexing %s:%s' % (project_slug, version_slug))
             try:
-                update_search(version.pk, commit,
-                              delete_non_commit_files=False)
+                update_search.apply_async(
+                    kwargs=dict(
+                        version_pk=version_pk,
+                        commit='reindex',
+                        delete_non_commit_files=False
+                    ),
+                    priority=0,
+                    queue=socket.gethostname()
+                )
             except Exception:
-                log.exception('Reindex failed for {}'.format(version))
+                log.exception(u'Reindexing failed for %s:%s' % (project_slug, version_slug))
diff --git a/readthedocs/projects/migrations/0022_add-view-data.py b/readthedocs/projects/migrations/0022_add-view-data.py
new file mode 100644
index 00000000000..e9fee53cca7
--- /dev/null
+++ b/readthedocs/projects/migrations/0022_add-view-data.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.9.12 on 2017-12-11 13:05
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('projects', '0021_add-webhook-deprecation-feature'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='domain',
+            name='canonical',
+            field=models.BooleanField(default=False, help_text='This Domain is the primary one where the documentation is served from'),
+        ),
+        migrations.AlterField(
+            model_name='domain',
+            name='count',
+            field=models.IntegerField(default=0, help_text='Number of times this domain has been hit'),
+        ),
+        migrations.AlterField(
+            model_name='project',
+            name='allow_promos',
+            field=models.BooleanField(default=True, help_text='If unchecked, users will still see community ads.', verbose_name='Allow paid advertising'),
+        ),
+        migrations.AlterField(
+            model_name='project',
+            name='comment_moderation',
+            field=models.BooleanField(default=False, verbose_name='Comment Moderation'),
+        ),
+        migrations.AlterField(
+            model_name='project',
+            name='conf_py_file',
+            field=models.CharField(blank=True, default=b'', help_text='Path from project root to conf.py file (ex. docs/conf.py). Leave blank if you want us to find it for you.', max_length=255, verbose_name='Python configuration file'),
+        ),
+        migrations.AlterField(
+            model_name='project',
+            name='has_valid_webhook',
+            field=models.BooleanField(default=False, help_text='This project has been built with a webhook'),
+        ),
+        migrations.AlterField(
+            model_name='project',
+            name='programming_language',
+            field=models.CharField(blank=True, choices=[('words', 'Only Words'), ('py', 'Python'), ('js', 'JavaScript'), ('php', 'PHP'), ('ruby', 'Ruby'), ('perl', 'Perl'), ('java', 'Java'), ('go', 'Go'), ('julia', 'Julia'), ('c', 'C'), ('csharp', 'C#'), ('cpp', 'C++'), ('objc', 'Objective-C'), ('other', 'Other')], default=b'words', help_text='The primary programming language the project is written in.', max_length=20, verbose_name='Programming Language'),
+        ),
+    ]
diff --git a/readthedocs/restapi/urls.py b/readthedocs/restapi/urls.py
index 9d6fbd19229..9fc1a36b84d 100644
--- a/readthedocs/restapi/urls.py
+++ b/readthedocs/restapi/urls.py
@@ -47,7 +47,7 @@
     url(r'index_search/',
         search_views.index_search,
         name='index_search'),
-    url(r'search/$', views.search_views.search, name='api_search'),
+    url(r'^search/$', views.search_views.search, name='api_search'),
     url(r'search/project/$',
        search_views.project_search,
        name='api_project_search'),
diff --git a/readthedocs/restapi/utils.py b/readthedocs/restapi/utils.py
index b80190bc427..7f6d87ad68f 100644
--- a/readthedocs/restapi/utils.py
+++ b/readthedocs/restapi/utils.py
@@ -1,6 +1,9 @@
+# -*- coding: utf-8 -*-
 """Utility functions that are used by both views and celery tasks."""
 
-from __future__ import absolute_import
+from __future__ import (
+    absolute_import, division, print_function, unicode_literals)
+
 import hashlib
 import logging
 
@@ -37,7 +40,7 @@ def sync_versions(project, versions, type):  # pylint: disable=redefined-builtin
                 type=type,
                 machine=False,
             )
-            log.info("(Sync Versions) Updated Version: [%s=%s] ",
+            log.info('(Sync Versions) Updated Version: [%s=%s] ',
                      version['verbose_name'], version['identifier'])
         else:
             # New Version
@@ -49,7 +52,7 @@ def sync_versions(project, versions, type):  # pylint: disable=redefined-builtin
             )
             added.add(created_version.slug)
     if added:
-        log.info("(Sync Versions) Added Versions: [%s] ", ' '.join(added))
+        log.info('(Sync Versions) Added Versions: [%s] ', ' '.join(added))
     return added
 
 
@@ -70,14 +73,14 @@ def delete_versions(project, version_data):
 
     if to_delete_qs.count():
         ret_val = {obj.slug for obj in to_delete_qs}
-        log.info("(Sync Versions) Deleted Versions: [%s]", ' '.join(ret_val))
+        log.info('(Sync Versions) Deleted Versions: [%s]', ' '.join(ret_val))
         to_delete_qs.delete()
         return ret_val
     return set()
 
 
 def index_search_request(version, page_list, commit, project_scale, page_scale,
-                         section=True, delete=True):
+                         section=False, delete=True):
     """
     Update search indexes with build output JSON.
 
@@ -89,7 +92,7 @@ def index_search_request(version, page_list, commit, project_scale, page_scale,
     project = version.project
 
     log_msg = ' '.join([page['path'] for page in page_list])
-    log.info("Updating search index: project=%s pages=[%s]",
+    log.info('Updating search index: project=%s pages=[%s]',
              project.slug, log_msg)
 
     project_obj = ProjectIndex()
@@ -112,7 +115,7 @@ def index_search_request(version, page_list, commit, project_scale, page_scale,
     routes = [project.slug]
     routes.extend([p.parent.slug for p in project.superprojects.all()])
     for page in page_list:
-        log.debug("Indexing page: %s:%s", project.slug, page['path'])
+        log.debug('Indexing page: %s:%s', project.slug, page['path'])
         to_hash = '-'.join([project.slug, version.slug, page['path']])
         page_id = hashlib.md5(to_hash.encode('utf-8')).hexdigest()
         index_list.append({
@@ -142,25 +145,24 @@ def index_search_request(version, page_list, commit, project_scale, page_scale,
                 'weight': page_scale,
             })
         for route in routes:
-            section_obj.bulk_index(section_index_list, parent=page_id,
-                                   routing=route)
+            section_obj.bulk_index(section_index_list, routing=route)
 
     for route in routes:
-        page_obj.bulk_index(index_list, parent=project.slug, routing=route)
+        page_obj.bulk_index(index_list, routing=route)
 
     if delete:
-        log.info("Deleting files not in commit: %s", commit)
+        log.info('Deleting files not in commit: %s', commit)
         # TODO: AK Make sure this works
         delete_query = {
-            "query": {
-                "bool": {
-                    "must": [
-                        {"term": {"project": project.slug, }},
-                        {"term": {"version": version.slug, }},
+            'query': {
+                'bool': {
+                    'must': [
+                        {'term': {'project': project.slug, }},
+                        {'term': {'version': version.slug, }},
                     ],
-                    "must_not": {
-                        "term": {
-                            "commit": commit
+                    'must_not': {
+                        'term': {
+                            'commit': commit
                         }
                     }
                 }
diff --git a/readthedocs/restapi/views/search_views.py b/readthedocs/restapi/views/search_views.py
index abe36174097..1db28af08e2 100644
--- a/readthedocs/restapi/views/search_views.py
+++ b/readthedocs/restapi/views/search_views.py
@@ -32,7 +32,7 @@ def index_search(request):
 
     utils.index_search_request(
         version=version, page_list=data['page_list'], commit=commit,
-        project_scale=project_scale, page_scale=page_scale)
+        project_scale=project_scale, page_scale=page_scale, section=False)
     return Response({'indexed': True})
 
 
diff --git a/readthedocs/search/indexes.py b/readthedocs/search/indexes.py
index 1b2ede6aaa9..378774e3b9e 100644
--- a/readthedocs/search/indexes.py
+++ b/readthedocs/search/indexes.py
@@ -19,7 +19,7 @@
 import datetime
 
 from elasticsearch import Elasticsearch, exceptions
-from elasticsearch.helpers import bulk_index
+from elasticsearch.helpers import bulk
 
 from django.conf import settings
 
@@ -48,8 +48,6 @@ def get_settings(self, settings_override=None):
             'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
             'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
             'refresh_interval': '5s',
-            'store.compress.tv': True,
-            'store.compress.stored': True,
             'analysis': self.get_analysis(),
         }
         if settings_override:
@@ -76,7 +74,7 @@ def get_analysis(self):
         analyzers['default_icu'] = {
             'type': 'custom',
             'tokenizer': 'icu_tokenizer',
-            'filter': ['word_delimiter', 'icu_folding', 'icu_normalizer'],
+            'filter': ['custom_word_delimiter', 'icu_folding', 'icu_normalizer', 'lowercase'],
         }
 
         # Customize the word_delimiter filter to set various options.
@@ -139,7 +137,7 @@ def bulk_index(self, data, index=None, chunk_size=500, parent=None,
             docs.append(doc)
 
         # TODO: This doesn't work with the new ES setup.
-        bulk_index(self.es, docs, chunk_size=chunk_size)
+        bulk(self.es, docs, chunk_size=chunk_size)
 
     def index_document(self, data, index=None, parent=None, routing=None):
         doc = self.extract_document(data)
@@ -220,25 +218,24 @@ def get_mapping(self):
                 # Disable _all field to reduce index size.
                 '_all': {'enabled': False},
                 'properties': {
-                    'id': {'type': 'long'},
-                    'name': {'type': 'string', 'analyzer': 'default_icu'},
-                    'description': {'type': 'string', 'analyzer': 'default_icu'},
-
-                    'slug': {'type': 'string', 'index': 'not_analyzed'},
-                    'lang': {'type': 'string', 'index': 'not_analyzed'},
-                    'tags': {'type': 'string', 'index': 'not_analyzed'},
-                    'privacy': {'type': 'string', 'index': 'not_analyzed'},
+                    'id': {'type': 'keyword'},
+                    'name': {'type': 'text', 'analyzer': 'default_icu'},
+                    'description': {'type': 'text', 'analyzer': 'default_icu'},
+
+                    'slug': {'type': 'keyword'},
+                    'lang': {'type': 'keyword'},
+                    'tags': {'type': 'keyword'},
+                    'privacy': {'type': 'keyword'},
                     'author': {
-                        'type': 'string',
+                        'type': 'text',
                         'analyzer': 'default_icu',
                         'fields': {
                             'raw': {
-                                'type': 'string',
-                                'index': 'not_analyzed',
+                                'type': 'keyword',
                             },
                         },
                     },
-                    'url': {'type': 'string', 'index': 'not_analyzed'},
+                    'url': {'type': 'keyword'},
                     # Add a weight field to enhance relevancy scoring.
                     'weight': {'type': 'float'},
                 }
@@ -273,19 +270,19 @@ def get_mapping(self):
                 # Disable _all field to reduce index size.
                 '_all': {'enabled': False},
                 # Associate a page with a project.
-                '_parent': {'type': self._parent},
+                # '_parent': {'type': self._parent},
                 'properties': {
-                    'id': {'type': 'string', 'index': 'not_analyzed'},
-                    'sha': {'type': 'string', 'index': 'not_analyzed'},
-                    'project': {'type': 'string', 'index': 'not_analyzed'},
-                    'version': {'type': 'string', 'index': 'not_analyzed'},
-                    'path': {'type': 'string', 'index': 'not_analyzed'},
-                    'taxonomy': {'type': 'string', 'index': 'not_analyzed'},
-                    'commit': {'type': 'string', 'index': 'not_analyzed'},
-
-                    'title': {'type': 'string', 'analyzer': 'default_icu'},
-                    'headers': {'type': 'string', 'analyzer': 'default_icu'},
-                    'content': {'type': 'string', 'analyzer': 'default_icu'},
+                    'id': {'type': 'keyword'},
+                    'sha': {'type': 'keyword'},
+                    'project': {'type': 'keyword'},
+                    'version': {'type': 'keyword'},
+                    'path': {'type': 'keyword'},
+                    'taxonomy': {'type': 'keyword'},
+                    'commit': {'type': 'keyword'},
+
+                    'title': {'type': 'text', 'analyzer': 'default_icu'},
+                    'headers': {'type': 'text', 'analyzer': 'default_icu'},
+                    'content': {'type': 'text', 'analyzer': 'default_icu'},
                     # Add a weight field to enhance relevancy scoring.
                     'weight': {'type': 'float'},
                 }
@@ -321,7 +318,7 @@ def get_mapping(self):
                 # Disable _all field to reduce index size.
                 '_all': {'enabled': False},
                 # Associate a section with a page.
-                '_parent': {'type': self._parent},
+                # '_parent': {'type': self._parent},
                 # Commenting this out until we need it.
                 # 'suggest': {
                 #     "type": "completion",
@@ -330,18 +327,18 @@ def get_mapping(self):
                 #     "payloads": True,
                 # },
                 'properties': {
-                    'id': {'type': 'string', 'index': 'not_analyzed'},
-                    'project': {'type': 'string', 'index': 'not_analyzed'},
-                    'version': {'type': 'string', 'index': 'not_analyzed'},
-                    'path': {'type': 'string', 'index': 'not_analyzed'},
-                    'page_id': {'type': 'string', 'index': 'not_analyzed'},
-                    'commit': {'type': 'string', 'index': 'not_analyzed'},
-                    'title': {'type': 'string', 'analyzer': 'default_icu'},
-                    'content': {'type': 'string', 'analyzer': 'default_icu'},
+                    'id': {'type': 'keyword'},
+                    'project': {'type': 'keyword'},
+                    'version': {'type': 'keyword'},
+                    'path': {'type': 'keyword'},
+                    'page_id': {'type': 'keyword'},
+                    'commit': {'type': 'keyword'},
+                    'title': {'type': 'text', 'analyzer': 'default_icu'},
+                    'content': {'type': 'text', 'analyzer': 'default_icu'},
                     'blocks': {
                         'type': 'object',
                         'properties': {
-                            'code': {'type': 'string', 'analyzer': 'default_icu'}
+                            'code': {'type': 'text', 'analyzer': 'default_icu'}
                         }
                     },
                     # Add a weight field to enhance relevancy scoring.
diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py
index 1c33ed11cf0..d462528a259 100644
--- a/readthedocs/settings/base.py
+++ b/readthedocs/settings/base.py
@@ -115,6 +115,7 @@ def INSTALLED_APPS(self):  # noqa
         if ext:
             apps.append('django_countries')
             apps.append('readthedocsext.donate')
+            apps.append('readthedocsext.search')
             apps.append('readthedocsext.embed')
         return apps
 
diff --git a/readthedocs/urls.py b/readthedocs/urls.py
index b8068ae1f52..6b7bdfaeebe 100644
--- a/readthedocs/urls.py
+++ b/readthedocs/urls.py
@@ -87,15 +87,18 @@
 if 'readthedocsext.donate' in settings.INSTALLED_APPS:
     # Include donation URL's
     groups.append([
-        url(r'^sustainability/', include('readthedocsext.donate.urls')),
+        url(r'^sustainability/', include('readthedocsext.donate.urls'))
     ])
-
+if 'readthedocsext.search' in settings.INSTALLED_APPS:
+    for num, _url in enumerate(rtd_urls):
+        if _url and hasattr(_url, 'name') and _url.name == 'search':
+            rtd_urls[num] = \
+                url(r'^search/', 'readthedocsext.search.mainsearch.elastic_search', name='search')
 if 'readthedocsext.embed' in settings.INSTALLED_APPS:
     api_urls.insert(
         0,
         url(r'^api/v1/embed/', include('readthedocsext.embed.urls'))
     )
-
 if not getattr(settings, 'USE_SUBDOMAIN', False) or settings.DEBUG:
     groups.insert(0, docs_urls)
 if getattr(settings, 'ALLOW_ADMIN', True):
diff --git a/requirements/pip.txt b/requirements/pip.txt
index ca40caf724a..9100adbccd0 100644
--- a/requirements/pip.txt
+++ b/requirements/pip.txt
@@ -36,8 +36,7 @@ dnspython==1.15.0
 httplib2==0.7.7
 
 # Search
-elasticsearch==1.5.0
-pyelasticsearch==0.7.1
+elasticsearch==5.5.1
 pyquery==1.2.2
 
 # Utils