Skip to content

Commit fd75aa3

Browse files
authored
Merge pull request #4211 from safwanrahman/search
Upgrade Elasticsearch to version 6.x
2 parents 52fee2c + 0965a94 commit fd75aa3

29 files changed

+436
-167
lines changed

.travis.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ python:
44
- 3.6
55
sudo: false
66
env:
7-
- ES_VERSION=1.3.9 ES_DOWNLOAD_URL=https://download.elastic.co/elasticsearch/elasticsearch/elasticsearch-${ES_VERSION}.tar.gz
7+
- ES_VERSION=6.2.4 ES_DOWNLOAD_URL=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ES_VERSION}.tar.gz
88
matrix:
99
include:
1010
- python: 2.7
@@ -42,3 +42,4 @@ notifications:
4242
branches:
4343
only:
4444
- master
45+
- search_upgrade

readthedocs/projects/admin.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from .forms import FeatureForm
1717
from .models import (Project, ImportedFile, Feature,
18-
ProjectRelationship, EmailHook, WebHook, Domain)
18+
ProjectRelationship, EmailHook, WebHook, Domain, HTMLFile)
1919
from .notifications import ResourceUsageNotification
2020
from .tasks import remove_dir
2121

@@ -206,3 +206,4 @@ def project_count(self, feature):
206206
admin.site.register(Feature, FeatureAdmin)
207207
admin.site.register(EmailHook)
208208
admin.site.register(WebHook)
209+
admin.site.register(HTMLFile)

readthedocs/projects/apps.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@ class ProjectsConfig(AppConfig):
99
def ready(self):
1010
from readthedocs.projects import tasks
1111
from readthedocs.worker import app
12+
1213
app.tasks.register(tasks.SyncRepositoryTask)
1314
app.tasks.register(tasks.UpdateDocsTask)

readthedocs/projects/managers.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
from django.db import models
2+
3+
4+
class HTMLFileManager(models.Manager):

    """Manager whose default queryset only contains ``*.html`` records."""

    def get_queryset(self):
        """Return the parent queryset narrowed to names ending in ``.html``."""
        base_queryset = super(HTMLFileManager, self).get_queryset()
        return base_queryset.filter(name__endswith='.html')
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# -*- coding: utf-8 -*-
2+
# Generated by Django 1.9.13 on 2018-06-18 16:45
3+
from __future__ import unicode_literals
4+
5+
from django.db import migrations, models
6+
7+
8+
class Migration(migrations.Migration):
9+
10+
dependencies = [
11+
('projects', '0025_show-version-warning-existing-projects'),
12+
]
13+
14+
operations = [
15+
migrations.CreateModel(
16+
name='HTMLFile',
17+
fields=[
18+
],
19+
options={
20+
'proxy': True,
21+
},
22+
bases=('projects.importedfile',),
23+
),
24+
migrations.AlterField(
25+
model_name='project',
26+
name='comment_moderation',
27+
field=models.BooleanField(default=False, verbose_name='Comment Moderation'),
28+
),
29+
migrations.AlterField(
30+
model_name='project',
31+
name='documentation_type',
32+
field=models.CharField(choices=[('auto', 'Automatically Choose'), ('sphinx', 'Sphinx Html'), ('mkdocs', 'Mkdocs (Markdown)'), ('sphinx_htmldir', 'Sphinx HtmlDir'), ('sphinx_singlehtml', 'Sphinx Single Page HTML')], default='sphinx', help_text='Type of documentation you are building. <a href="http://www.sphinx-doc.org/en/stable/builders.html#sphinx.builders.html.DirectoryHTMLBuilder">More info</a>.', max_length=20, verbose_name='Documentation type'),
33+
),
34+
migrations.AlterField(
35+
model_name='project',
36+
name='language',
37+
field=models.CharField(choices=[('aa', 'Afar'), ('ab', 'Abkhaz'), ('af', 'Afrikaans'), ('am', 'Amharic'), ('ar', 'Arabic'), ('as', 'Assamese'), ('ay', 'Aymara'), ('az', 'Azerbaijani'), ('ba', 'Bashkir'), ('be', 'Belarusian'), ('bg', 'Bulgarian'), ('bh', 'Bihari'), ('bi', 'Bislama'), ('bn', 'Bengali'), ('bo', 'Tibetan'), ('br', 'Breton'), ('ca', 'Catalan'), ('co', 'Corsican'), ('cs', 'Czech'), ('cy', 'Welsh'), ('da', 'Danish'), ('de', 'German'), ('dz', 'Dzongkha'), ('el', 'Greek'), ('en', 'English'), ('eo', 'Esperanto'), ('es', 'Spanish'), ('et', 'Estonian'), ('eu', 'Basque'), ('fa', 'Iranian'), ('fi', 'Finnish'), ('fj', 'Fijian'), ('fo', 'Faroese'), ('fr', 'French'), ('fy', 'Western Frisian'), ('ga', 'Irish'), ('gd', 'Scottish Gaelic'), ('gl', 'Galician'), ('gn', 'Guarani'), ('gu', 'Gujarati'), ('ha', 'Hausa'), ('hi', 'Hindi'), ('he', 'Hebrew'), ('hr', 'Croatian'), ('hu', 'Hungarian'), ('hy', 'Armenian'), ('ia', 'Interlingua'), ('id', 'Indonesian'), ('ie', 'Interlingue'), ('ik', 'Inupiaq'), ('is', 'Icelandic'), ('it', 'Italian'), ('iu', 'Inuktitut'), ('ja', 'Japanese'), ('jv', 'Javanese'), ('ka', 'Georgian'), ('kk', 'Kazakh'), ('kl', 'Kalaallisut'), ('km', 'Khmer'), ('kn', 'Kannada'), ('ko', 'Korean'), ('ks', 'Kashmiri'), ('ku', 'Kurdish'), ('ky', 'Kyrgyz'), ('la', 'Latin'), ('ln', 'Lingala'), ('lo', 'Lao'), ('lt', 'Lithuanian'), ('lv', 'Latvian'), ('mg', 'Malagasy'), ('mi', 'Maori'), ('mk', 'Macedonian'), ('ml', 'Malayalam'), ('mn', 'Mongolian'), ('mr', 'Marathi'), ('ms', 'Malay'), ('mt', 'Maltese'), ('my', 'Burmese'), ('na', 'Nauru'), ('ne', 'Nepali'), ('nl', 'Dutch'), ('no', 'Norwegian'), ('oc', 'Occitan'), ('om', 'Oromo'), ('or', 'Oriya'), ('pa', 'Panjabi'), ('pl', 'Polish'), ('ps', 'Pashto'), ('pt', 'Portuguese'), ('qu', 'Quechua'), ('rm', 'Romansh'), ('rn', 'Kirundi'), ('ro', 'Romanian'), ('ru', 'Russian'), ('rw', 'Kinyarwanda'), ('sa', 'Sanskrit'), ('sd', 'Sindhi'), ('sg', 'Sango'), ('si', 'Sinhala'), ('sk', 'Slovak'), ('sl', 'Slovenian'), ('sm', 'Samoan'), 
('sn', 'Shona'), ('so', 'Somali'), ('sq', 'Albanian'), ('sr', 'Serbian'), ('ss', 'Swati'), ('st', 'Southern Sotho'), ('su', 'Sudanese'), ('sv', 'Swedish'), ('sw', 'Swahili'), ('ta', 'Tamil'), ('te', 'Telugu'), ('tg', 'Tajik'), ('th', 'Thai'), ('ti', 'Tigrinya'), ('tk', 'Turkmen'), ('tl', 'Tagalog'), ('tn', 'Tswana'), ('to', 'Tonga'), ('tr', 'Turkish'), ('ts', 'Tsonga'), ('tt', 'Tatar'), ('tw', 'Twi'), ('ug', 'Uyghur'), ('uk', 'Ukrainian'), ('ur', 'Urdu'), ('uz', 'Uzbek'), ('vi', 'Vietnamese'), ('vo', 'Volapuk'), ('wo', 'Wolof'), ('xh', 'Xhosa'), ('yi', 'Yiddish'), ('yo', 'Yoruba'), ('za', 'Zhuang'), ('zh', 'Chinese'), ('zu', 'Zulu'), ('nb_NO', 'Norwegian Bokmal'), ('pt_BR', 'Brazilian Portuguese'), ('es_MX', 'Mexican Spanish'), ('uk_UA', 'Ukrainian'), ('zh_CN', 'Simplified Chinese'), ('zh_TW', 'Traditional Chinese')], default='en', help_text="The language the project documentation is rendered in. Note: this affects your project's URL.", max_length=20, verbose_name='Language'),
38+
),
39+
migrations.AlterField(
40+
model_name='project',
41+
name='privacy_level',
42+
field=models.CharField(choices=[('public', 'Public'), ('protected', 'Protected'), ('private', 'Private')], default='public', help_text='Level of privacy that you want on the repository. Protected means public but not in listings.', max_length=20, verbose_name='Privacy Level'),
43+
),
44+
migrations.AlterField(
45+
model_name='project',
46+
name='python_interpreter',
47+
field=models.CharField(choices=[('python', 'CPython 2.x'), ('python3', 'CPython 3.x')], default='python', help_text='The Python interpreter used to create the virtual environment.', max_length=20, verbose_name='Python Interpreter'),
48+
),
49+
migrations.AlterField(
50+
model_name='project',
51+
name='version_privacy_level',
52+
field=models.CharField(choices=[('public', 'Public'), ('protected', 'Protected'), ('private', 'Private')], default='public', help_text='Default level of privacy you want on built versions of documentation.', max_length=20, verbose_name='Version Privacy Level'),
53+
),
54+
]

readthedocs/projects/models.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,14 @@
77
import fnmatch
88
import logging
99
import os
10-
from builtins import object # pylint: disable=redefined-builtin
1110

11+
from builtins import object # pylint: disable=redefined-builtin
1212
from django.conf import settings
1313
from django.contrib.auth.models import User
1414
from django.core.urlresolvers import NoReverseMatch, reverse
1515
from django.db import models
1616
from django.utils.encoding import python_2_unicode_compatible
17+
from django.utils.functional import cached_property
1718
from django.utils.translation import ugettext_lazy as _
1819
from future.backports.urllib.parse import urlparse # noqa
1920
from guardian.shortcuts import assign
@@ -24,6 +25,7 @@
2425
from readthedocs.core.utils import broadcast, slugify
2526
from readthedocs.projects import constants
2627
from readthedocs.projects.exceptions import ProjectConfigurationError
28+
from readthedocs.projects.managers import HTMLFileManager
2729
from readthedocs.projects.querysets import (
2830
ChildRelatedProjectQuerySet, FeatureQuerySet, ProjectQuerySet,
2931
RelatedProjectQuerySet)
@@ -32,6 +34,7 @@
3234
from readthedocs.projects.version_handling import (
3335
determine_stable_version, version_windows)
3436
from readthedocs.restapi.client import api
37+
from readthedocs.search.parse_json import process_file
3538
from readthedocs.vcs_support.backends import backend_cls
3639
from readthedocs.vcs_support.utils import Lock, NonBlockingLock
3740

@@ -910,6 +913,40 @@ def __str__(self):
910913
return '%s: %s' % (self.name, self.project)
911914

912915

916+
class HTMLFile(ImportedFile):

    """
    Proxy model over ``ImportedFile`` for HTML files.

    Its default manager is ``HTMLFileManager``; the model is used for
    indexing HTML files into search.
    """

    class Meta(object):
        proxy = True

    objects = HTMLFileManager()

    @cached_property
    def json_file_path(self):
        """
        Path of the ``.fjson`` file that corresponds to this file.

        ``self.path`` with its extension swapped for ``.fjson``, joined onto
        the project's production media path of type ``json`` for this
        version's slug. Computed once per instance (``cached_property``).
        """
        stem, _extension = os.path.splitext(self.path)
        json_root = self.project.get_production_media_path(
            type_='json',
            version_slug=self.version.slug,
            include_file=False,
        )
        return os.path.join(json_root, stem + '.fjson')

    def get_processed_json(self):
        """Return the result of ``process_file`` for ``json_file_path``."""
        return process_file(self.json_file_path)

    @cached_property
    def processed_json(self):
        """Cached result of ``get_processed_json`` for this instance."""
        return self.get_processed_json()
948+
949+
913950
class Notification(models.Model):
914951
project = models.ForeignKey(Project,
915952
related_name='%(class)s_notifications')

readthedocs/projects/signals.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22

33
from __future__ import absolute_import
44
import django.dispatch
5+
from django.db.models.signals import pre_save
56
from django.dispatch import receiver
67

78
from readthedocs.oauth.utils import attach_webhook
8-
9+
from .models import HTMLFile
910

1011
before_vcs = django.dispatch.Signal(providing_args=["version"])
1112
after_vcs = django.dispatch.Signal(providing_args=["version"])

readthedocs/projects/tasks.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from __future__ import absolute_import
99

1010
import datetime
11+
import fnmatch
1112
import hashlib
1213
import json
1314
import logging
@@ -29,7 +30,7 @@
2930

3031
from .constants import LOG_TEMPLATE
3132
from .exceptions import RepositoryError
32-
from .models import ImportedFile, Project, Domain
33+
from .models import ImportedFile, Project, Domain, HTMLFile
3334
from .signals import before_vcs, after_vcs, before_build, after_build, files_changed
3435
from readthedocs.builds.constants import (LATEST,
3536
BUILD_STATE_CLONING,
@@ -943,18 +944,24 @@ def _manage_imported_files(version, path, commit):
943944
changed_files = set()
944945
for root, __, filenames in os.walk(path):
945946
for filename in filenames:
947+
if fnmatch.fnmatch(filename, '*.html'):
948+
model_class = HTMLFile
949+
else:
950+
model_class = ImportedFile
951+
946952
dirpath = os.path.join(root.replace(path, '').lstrip('/'),
947953
filename.lstrip('/'))
948954
full_path = os.path.join(root, filename)
949955
md5 = hashlib.md5(open(full_path, 'rb').read()).hexdigest()
950956
try:
951-
obj, __ = ImportedFile.objects.get_or_create(
957+
# pylint: disable=unpacking-non-sequence
958+
obj, __ = model_class.objects.get_or_create(
952959
project=version.project,
953960
version=version,
954961
path=dirpath,
955962
name=filename,
956963
)
957-
except ImportedFile.MultipleObjectsReturned:
964+
except model_class.MultipleObjectsReturned:
958965
log.warning('Error creating ImportedFile')
959966
continue
960967
if obj.md5 != md5:
@@ -963,6 +970,12 @@ def _manage_imported_files(version, path, commit):
963970
if obj.commit != commit:
964971
obj.commit = commit
965972
obj.save()
973+
974+
# First delete HTMLFile objects from previous commits of this version
975+
HTMLFile.objects.filter(project=version.project,
976+
version=version
977+
).exclude(commit=commit).delete()
978+
966979
# Delete ImportedFiles from previous versions
967980
ImportedFile.objects.filter(project=version.project,
968981
version=version
@@ -1173,7 +1186,6 @@ def sync_callback(_, version_pk, commit, *args, **kwargs):
11731186
The first argument is the result from previous tasks, which we discard.
11741187
"""
11751188
fileify(version_pk, commit=commit)
1176-
update_search(version_pk, commit=commit)
11771189

11781190

11791191
@app.task()

readthedocs/projects/utils.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,18 +32,21 @@ def version_from_slug(slug, version):
3232
return v
3333

3434

35-
def find_file(basename, pattern, path):
    """
    Recursively find the first matching file under ``path``.

    A file matches when its name satisfies the glob ``pattern`` and its name
    with the last extension removed equals ``basename``. The walk stops at
    the first match, in ``os.walk`` order.

    :param basename: Basename of a file to match (name without extension)
    :param pattern: ``fnmatch``-style pattern the file name must match
    :param path: the directory to search for the file
    :returns: path of the first matching file, or ``None`` if nothing matches
    """
    for root, __, filenames in os.walk(path):
        for filename in filenames:
            # Cheap string comparison first; only glob-match the candidates.
            if os.path.splitext(filename)[0] != basename:
                continue
            if fnmatch.fnmatch(filename, pattern):
                return os.path.join(root, filename)
    # Make the "not found" result explicit for callers.
    return None
4750

4851

4952
def run(*commands):

readthedocs/search/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
SEARCH_EXCLUDED_FILE = ['search.html', 'genindex.html', 'py-modindex.html']

readthedocs/search/documents.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
from django.conf import settings
2+
from django_elasticsearch_dsl import DocType, Index, fields
3+
4+
from readthedocs.projects.models import Project, HTMLFile
5+
from .conf import SEARCH_EXCLUDED_FILE
6+
7+
from readthedocs.search.faceted_search import ProjectSearch, FileSearch
8+
9+
project_conf = settings.ES_INDEXES['project']
10+
project_index = Index(project_conf['name'])
11+
project_index.settings(**project_conf['settings'])
12+
13+
page_conf = settings.ES_INDEXES['page']
14+
page_index = Index(page_conf['name'])
15+
page_index.settings(**page_conf['settings'])
16+
17+
18+
@project_index.doc_type
class ProjectDocument(DocType):

    """Search document for ``Project``, registered on ``project_index``."""

    class Meta(object):
        model = Project
        fields = ('name', 'slug', 'description')

    url = fields.TextField(attr='get_absolute_url')
    users = fields.NestedField(properties={
        'username': fields.TextField(),
        'id': fields.IntegerField(),
    })
    language = fields.KeywordField()

    @classmethod
    def faceted_search(cls, query, language=None, using=None, index=None):
        """
        Build a ``ProjectSearch`` for ``query``.

        :param query: the search query handed to ``ProjectSearch``
        :param language: when truthy, added as a ``language`` filter
        :param using: connection override; defaults to the doc type's own
        :param index: index override; defaults to the doc type's own
        :returns: a ``ProjectSearch`` instance
        """
        doc_type = cls._doc_type
        search_kwargs = dict(
            using=using or doc_type.using,
            index=index or doc_type.index,
            doc_types=[cls],
            model=doc_type.model,
            query=query,
        )

        # The ``filters`` key is only supplied when a language was requested.
        if language:
            search_kwargs['filters'] = {'language': language}

        return ProjectSearch(**search_kwargs)
46+
47+
48+
@page_index.doc_type
class PageDocument(DocType):

    """Search document for ``HTMLFile``, registered on ``page_index``."""

    class Meta(object):
        model = HTMLFile
        fields = ('commit',)

    project = fields.KeywordField(attr='project.slug')
    version = fields.KeywordField(attr='version.slug')

    # These fields are read from the instance's ``processed_json`` attribute.
    title = fields.TextField(attr='processed_json.title')
    headers = fields.TextField(attr='processed_json.headers')
    content = fields.TextField(attr='processed_json.content')
    path = fields.TextField(attr='processed_json.path')

    @classmethod
    def faceted_search(cls, query, projects_list=None, versions_list=None,
                       using=None, index=None):
        """
        Build a ``FileSearch`` for ``query``.

        :param query: the search query handed to ``FileSearch``
        :param projects_list: when truthy, used as the ``project`` filter
        :param versions_list: when truthy, used as the ``version`` filter
        :param using: connection override; defaults to the doc type's own
        :param index: index override; defaults to the doc type's own
        :returns: a ``FileSearch`` instance
        """
        kwargs = {
            'using': using or cls._doc_type.using,
            'index': index or cls._doc_type.index,
            'doc_types': [cls],
            'model': cls._doc_type.model,
            'query': query
        }
        filters = {}

        if projects_list:
            filters['project'] = projects_list
        if versions_list:
            filters['version'] = versions_list

        # ``filters`` is always passed, possibly as an empty dict.
        kwargs['filters'] = filters

        return FileSearch(**kwargs)

    def get_queryset(self):
        """Override the default queryset to restrict which files get indexed."""
        queryset = super(PageDocument, self).get_queryset()

        # Do not index files that belong to non-Sphinx projects,
        # and do not index the files listed in SEARCH_EXCLUDED_FILE.
        queryset = (queryset.filter(project__documentation_type='sphinx')
                            .exclude(name__in=SEARCH_EXCLUDED_FILE))
        return queryset

    def update(self, thing, refresh=None, action='index', **kwargs):
        """
        Override ``update`` so only files in ``get_queryset()`` are indexed.

        :param thing: a model instance or a collection, as accepted by the
            parent ``update``
        :param refresh: forwarded unchanged to the parent ``update``
        :param action: forwarded unchanged to the parent ``update``
        :returns: ``None`` when a single ``HTMLFile`` is filtered out,
            otherwise whatever the parent ``update`` returns
        """
        # Objects that are not in the provided queryset should not be indexed.
        # TODO: remove this override when the issue has been fixed
        # See the link below for more information
        # https://github.com/sabricot/django-elasticsearch-dsl/issues/111
        #
        # The membership check is skipped for deletes: a delete may be
        # dispatched after the row is gone from the database, in which case
        # the lookup would always fail and the document would never be
        # removed from the index.
        if isinstance(thing, HTMLFile) and action != 'delete':
            # It's a single model instance.
            if not self.get_queryset().filter(pk=thing.pk).exists():
                return None

        # Forward the caller's ``refresh`` and ``action`` instead of
        # hard-coding ``None`` / ``'index'``.
        return super(PageDocument, self).update(
            thing=thing, refresh=refresh, action=action, **kwargs)

0 commit comments

Comments
 (0)