Upgrade Elasticsearch to version 6.x #4211

Merged: 16 commits, merged on Jun 19, 2018
Changes from 12 commits
3 changes: 2 additions & 1 deletion .travis.yml
@@ -4,7 +4,7 @@ python:
- 3.6
sudo: false
env:
- ES_VERSION=1.3.9 ES_DOWNLOAD_URL=https://download.elastic.co/elasticsearch/elasticsearch/elasticsearch-${ES_VERSION}.tar.gz
- ES_VERSION=6.2.4 ES_DOWNLOAD_URL=https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-${ES_VERSION}.tar.gz
matrix:
include:
- python: 2.7
@@ -42,3 +42,4 @@ notifications:
branches:
only:
- master
- search_upgrade
Member:
Interesting :)

3 changes: 2 additions & 1 deletion readthedocs/projects/admin.py
@@ -15,7 +15,7 @@

from .forms import FeatureForm
from .models import (Project, ImportedFile, Feature,
ProjectRelationship, EmailHook, WebHook, Domain)
ProjectRelationship, EmailHook, WebHook, Domain, HTMLFile)
from .notifications import ResourceUsageNotification
from .tasks import remove_dir

@@ -206,3 +206,4 @@ def project_count(self, feature):
admin.site.register(Feature, FeatureAdmin)
admin.site.register(EmailHook)
admin.site.register(WebHook)
admin.site.register(HTMLFile)
1 change: 1 addition & 0 deletions readthedocs/projects/apps.py
@@ -9,5 +9,6 @@ class ProjectsConfig(AppConfig):
def ready(self):
from readthedocs.projects import tasks
from readthedocs.worker import app

app.tasks.register(tasks.SyncRepositoryTask)
app.tasks.register(tasks.UpdateDocsTask)
7 changes: 7 additions & 0 deletions readthedocs/projects/managers.py
@@ -0,0 +1,7 @@
from django.db import models


class HTMLFileManager(models.Manager):

def get_queryset(self):
return super(HTMLFileManager, self).get_queryset().filter(name__endswith='.html')
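A plain-Python sketch of the narrowing this manager performs: every `HTMLFile` query starts from `ImportedFile` rows and keeps only names ending in `.html`. The file names below are made up for illustration.

```python
def only_html(names):
    # Mirror of HTMLFileManager's name__endswith='.html' filter,
    # applied to plain strings instead of a Django queryset.
    return [name for name in names if name.endswith('.html')]

files = ['index.html', 'search.html', 'logo.png', 'objects.inv']
html_files = only_html(files)
```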
37 changes: 36 additions & 1 deletion readthedocs/projects/models.py
@@ -7,13 +7,14 @@
import fnmatch
import logging
import os
from builtins import object # pylint: disable=redefined-builtin

from builtins import object # pylint: disable=redefined-builtin
from django.conf import settings
from django.contrib.auth.models import User
from django.core.urlresolvers import NoReverseMatch, reverse
from django.db import models
from django.utils.encoding import python_2_unicode_compatible
from django.utils.functional import cached_property
from django.utils.translation import ugettext_lazy as _
from future.backports.urllib.parse import urlparse # noqa
from guardian.shortcuts import assign
@@ -24,14 +25,17 @@
from readthedocs.core.utils import broadcast, slugify
from readthedocs.projects import constants
from readthedocs.projects.exceptions import ProjectConfigurationError
from readthedocs.projects.managers import HTMLFileManager
from readthedocs.projects.querysets import (
ChildRelatedProjectQuerySet, FeatureQuerySet, ProjectQuerySet,
RelatedProjectQuerySet)
from readthedocs.projects.templatetags.projects_tags import sort_version_aware
from readthedocs.projects.utils import find_file
from readthedocs.projects.validators import validate_domain_name, validate_repository_url
from readthedocs.projects.version_handling import (
determine_stable_version, version_windows)
from readthedocs.restapi.client import api
from readthedocs.search.parse_json import process_file
from readthedocs.vcs_support.backends import backend_cls
from readthedocs.vcs_support.utils import Lock, NonBlockingLock

@@ -910,6 +914,37 @@ def __str__(self):
return '%s: %s' % (self.name, self.project)


class HTMLFile(ImportedFile):

"""
Imported HTML file Proxy model.

This tracks only the HTML files for indexing to search.
"""

class Meta(object):
proxy = True

objects = HTMLFileManager()

@cached_property
def json_file_path(self):
basename = os.path.splitext(self.path)[0]
file_path = basename + '.fjson'

full_json_path = self.project.get_production_media_path(type_='json',
version_slug=self.version.slug,
include_file=False)

file_path = os.path.join(full_json_path, file_path)
return file_path

@cached_property
Member:
How long is this cached for? Will we really see value in caching it vs. the tradeoff of keeping a bunch of JSON in memory?

Member Author (@safwanrahman, Jun 14, 2018):
As per the documentation, the cached result persists as long as the instance does. Since it is called multiple times for the same instance while indexing (e.g. path, content, headers), I think the cache helps speed up indexing considerably.

def processed_json(self):
file_path = self.json_file_path
return process_file(file_path)


class Notification(models.Model):
project = models.ForeignKey(Project,
related_name='%(class)s_notifications')
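The path arithmetic in `json_file_path` can be sketched without a Django model: swap the `.html` extension for `.fjson` and root the result under the project's JSON media directory. Here `media_root` stands in for `project.get_production_media_path(...)`, and the concrete paths are hypothetical.

```python
import os

def json_file_path(html_path, media_root):
    # Same two steps as HTMLFile.json_file_path: drop the extension,
    # then join the .fjson name onto the JSON media directory.
    basename = os.path.splitext(html_path)[0]
    return os.path.join(media_root, basename + '.fjson')

path = json_file_path('guides/install.html', '/media/json/pip/latest')
```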
3 changes: 2 additions & 1 deletion readthedocs/projects/signals.py
@@ -2,10 +2,11 @@

from __future__ import absolute_import
import django.dispatch
from django.db.models.signals import pre_save
from django.dispatch import receiver

from readthedocs.oauth.utils import attach_webhook

from .models import HTMLFile

before_vcs = django.dispatch.Signal(providing_args=["version"])
after_vcs = django.dispatch.Signal(providing_args=["version"])
19 changes: 15 additions & 4 deletions readthedocs/projects/tasks.py
@@ -8,6 +8,7 @@
from __future__ import absolute_import

import datetime
import fnmatch
import hashlib
import json
import logging
@@ -29,7 +30,7 @@

from .constants import LOG_TEMPLATE
from .exceptions import RepositoryError
from .models import ImportedFile, Project, Domain
from .models import ImportedFile, Project, Domain, HTMLFile
from .signals import before_vcs, after_vcs, before_build, after_build, files_changed
from readthedocs.builds.constants import (LATEST,
BUILD_STATE_CLONING,
@@ -943,18 +944,23 @@ def _manage_imported_files(version, path, commit):
changed_files = set()
for root, __, filenames in os.walk(path):
for filename in filenames:
if fnmatch.fnmatch(filename, '*.html'):
model_class = HTMLFile
else:
model_class = ImportedFile
Member:
I don't believe this is needed, since it's all the same model in the database.

Member Author:
The problem is actually with the signal manager. I have opened django-es/django-elasticsearch-dsl#111 about this.

Until it has been fixed, we need to have a proxy model for this purpose, I believe.


dirpath = os.path.join(root.replace(path, '').lstrip('/'),
filename.lstrip('/'))
full_path = os.path.join(root, filename)
md5 = hashlib.md5(open(full_path, 'rb').read()).hexdigest()
try:
obj, __ = ImportedFile.objects.get_or_create(
obj, __ = model_class.objects.get_or_create(
project=version.project,
version=version,
path=dirpath,
name=filename,
)
except ImportedFile.MultipleObjectsReturned:
except model_class.MultipleObjectsReturned:
log.warning('Error creating ImportedFile')
continue
if obj.md5 != md5:
@@ -963,6 +969,12 @@ def _manage_imported_files(version, path, commit):
if obj.commit != commit:
obj.commit = commit
obj.save()

# Delete the HTMLFile first from previous versions
HTMLFile.objects.filter(project=version.project,
version=version
).exclude(commit=commit).delete()

# Delete ImportedFiles from previous versions
ImportedFile.objects.filter(project=version.project,
version=version
@@ -1173,7 +1185,6 @@ def sync_callback(_, version_pk, commit, *args, **kwargs):
The first argument is the result from previous tasks, which we discard.
"""
fileify(version_pk, commit=commit)
update_search(version_pk, commit=commit)


@app.task()
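The per-file routing added to `_manage_imported_files` can be distilled standalone: `*.html` files go through the `HTMLFile` proxy (so the search-indexing signals fire) while everything else stays an `ImportedFile`, and each file is fingerprinted with an MD5 of its contents. The helper below is a hypothetical condensation, not code from the PR.

```python
import fnmatch
import hashlib

def classify_and_hash(filename, content):
    # Route *.html to the HTMLFile proxy model, everything else to
    # ImportedFile, and compute the md5 used for change detection.
    if fnmatch.fnmatch(filename, '*.html'):
        model_name = 'HTMLFile'
    else:
        model_name = 'ImportedFile'
    return model_name, hashlib.md5(content).hexdigest()
```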
22 changes: 13 additions & 9 deletions readthedocs/projects/utils.py
@@ -32,18 +32,22 @@ def version_from_slug(slug, version):
return v


def find_file(filename):
def find_file(basename, pattern, path):
"""
Recursively find matching file from the current working path.
Recursively find matching file.

:param file: Filename to match
:returns: A list of matching filenames.
:param basename: Basename of a file to match
:param pattern: Pattern to match
:param path: the directory to search for the file
:returns: path of matching file
"""
matches = []
for root, __, filenames in os.walk('.'):
for match in fnmatch.filter(filenames, filename):
matches.append(os.path.join(root, match))
return matches

for root, _, files in os.walk(path):
for filename in files:
file_basename = os.path.splitext(filename)[0]

if fnmatch.fnmatch(filename, pattern) and file_basename == basename:
return os.path.join(root, filename)
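The rewritten `find_file` is plain stdlib code, so a self-contained check of its behavior fits in a throwaway temp directory; the duplicate definition below exists only so the sketch runs on its own.

```python
import fnmatch
import os
import tempfile

def find_file(basename, pattern, path):
    # Identical logic to readthedocs.projects.utils.find_file above.
    for root, _, files in os.walk(path):
        for filename in files:
            file_basename = os.path.splitext(filename)[0]
            if fnmatch.fnmatch(filename, pattern) and file_basename == basename:
                return os.path.join(root, filename)

with tempfile.TemporaryDirectory() as tmp:
    for name in ('index.fjson', 'index.html', 'search.fjson'):
        open(os.path.join(tmp, name), 'w').close()
    found = find_file('index', '*.fjson', tmp)
```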


def run(*commands):
1 change: 1 addition & 0 deletions readthedocs/search/conf.py
@@ -0,0 +1 @@
SEARCH_EXCLUDED_FILE = ['search.html', 'genindex.html', 'py-modindex.html']
114 changes: 114 additions & 0 deletions readthedocs/search/documents.py
@@ -0,0 +1,114 @@
from django.db import models
from django_elasticsearch_dsl import DocType, Index, fields

from readthedocs.projects.models import Project, HTMLFile
from .conf import SEARCH_EXCLUDED_FILE

from readthedocs.search.faceted_search import ProjectSearch, FileSearch

project_index = Index('project')

project_index.settings(
number_of_shards=1,
number_of_replicas=0
)


@project_index.doc_type
class ProjectDocument(DocType):

class Meta(object):
model = Project
fields = ('name', 'slug', 'description')

url = fields.TextField(attr='get_absolute_url')
users = fields.NestedField(properties={
'username': fields.TextField(),
'id': fields.IntegerField(),
})
language = fields.KeywordField()

@classmethod
def faceted_search(cls, query, language=None, using=None, index=None):
kwargs = {
'using': using or cls._doc_type.using,
'index': index or cls._doc_type.index,
'doc_types': [cls],
'model': cls._doc_type.model,
'query': query
}
Member:
Is this logic required? It seems a bit heavy/complex.

Member Author:
I think we can keep this logic to stay aligned with the search method. Maybe it's not needed now, but it will be useful to keep it aligned.


if language:
kwargs['filters'] = {'language': language}

return ProjectSearch(**kwargs)


page_index = Index('page')

page_index.settings(
number_of_shards=1,
number_of_replicas=0
)


@page_index.doc_type
class PageDocument(DocType):

class Meta(object):
model = HTMLFile
fields = ('commit',)

project = fields.KeywordField(attr='project.slug')
version = fields.KeywordField(attr='version.slug')

title = fields.TextField(attr='processed_json.title')
headers = fields.TextField(attr='processed_json.headers')
content = fields.TextField(attr='processed_json.content')
path = fields.TextField(attr='processed_json.path')

@classmethod
def faceted_search(cls, query, projects_list=None, versions_list=None, using=None, index=None):
kwargs = {
'using': using or cls._doc_type.using,
'index': index or cls._doc_type.index,
'doc_types': [cls],
'model': cls._doc_type.model,
'query': query
}
filters = {}

if projects_list:
filters['project'] = projects_list
if versions_list:
filters['version'] = versions_list

kwargs['filters'] = filters

return FileSearch(**kwargs)

def get_queryset(self):
"""Overwrite the default queryset to filter out certain files from indexing"""
queryset = super(PageDocument, self).get_queryset()

# Do not index files that belong to non-Sphinx projects
# Also do not index certain excluded files
queryset = (queryset.filter(project__documentation_type='sphinx')
.exclude(name__in=SEARCH_EXCLUDED_FILE))
return queryset

def update(self, thing, **kwargs):
Member:
When does this update actually get called? I believe it's a signal attached to the saving of the ImportedFile objects?

Member Author:
It's called every time the object is created, updated, or deleted, and also whenever we run the management command. The call comes from the registry.

"""Overwrite in order to index only certain files"""

# Objects that do not exist in the provided queryset should not be indexed
# TODO: remove this overwrite when the issue has been fixed
# See below link for more information
# https://github.com/sabricot/django-elasticsearch-dsl/issues/111
if isinstance(thing, HTMLFile):
# It's a model instance.
queryset = self.get_queryset()
obj = queryset.filter(pk=thing.pk)
if not obj.exists():
return None

return super(PageDocument, self).update(thing=thing, **kwargs)
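The filter-building in `PageDocument.faceted_search` is simple enough to isolate. This hypothetical helper shows exactly which keys end up in the `filters` kwarg handed to `FileSearch`: only facets that were actually requested become filters.

```python
def build_filters(projects_list=None, versions_list=None):
    # Same conditional dict-building as PageDocument.faceted_search:
    # omitted facets never appear as filter keys.
    filters = {}
    if projects_list:
        filters['project'] = projects_list
    if versions_list:
        filters['version'] = versions_list
    return filters
```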
30 changes: 30 additions & 0 deletions readthedocs/search/faceted_search.py
@@ -0,0 +1,30 @@
from elasticsearch_dsl import FacetedSearch, TermsFacet


class RTDFacetedSearch(FacetedSearch):
"""Overwrite the initialization in order to meet our needs"""

# TODO: Remove the overwrite when elastic/elasticsearch-dsl-py#916 is fixed
# See more: https://github.com/elastic/elasticsearch-dsl-py/issues/916

def __init__(self, using, index, doc_types, model, **kwargs):
self.using = using
self.index = index
self.doc_types = doc_types
self._model = model
super(RTDFacetedSearch, self).__init__(**kwargs)


class ProjectSearch(RTDFacetedSearch):
fields = ['name^5', 'description']
facets = {
'language': TermsFacet(field='language')
}


class FileSearch(RTDFacetedSearch):
fields = ['title^10', 'headers^5', 'content']
facets = {
'project': TermsFacet(field='project'),
'version': TermsFacet(field='version')
}
Member:
Love how simple this is.

Member Author:
Yes, me too! The thing is made so simple by using OOP concepts!

3 changes: 1 addition & 2 deletions readthedocs/search/indexes.py
@@ -19,7 +19,6 @@
import datetime

from elasticsearch import Elasticsearch, exceptions
from elasticsearch.helpers import bulk_index

from django.conf import settings

@@ -143,7 +142,7 @@ def bulk_index(self, data, index=None, chunk_size=500, parent=None,
docs.append(doc)

# TODO: This doesn't work with the new ES setup.
bulk_index(self.es, docs, chunk_size=chunk_size)
# bulk_index(self.es, docs, chunk_size=chunk_size)

def index_document(self, data, index=None, parent=None, routing=None):
doc = self.extract_document(data)
Member:
Guessing this entire file and other related code should be deleted?

Member Author:
Yes, it needs to be deleted. I will delete it once I implement the file searching functionality!
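If this file were kept alive instead of deleted, the removed `bulk_index` helper would need replacing: newer elasticsearch-py exposes `elasticsearch.helpers.bulk`, which consumes an iterable of action dicts. A sketch of shaping documents into those actions follows; the index and doc-type names are placeholders, and actually sending the actions requires a live cluster.

```python
def as_bulk_actions(docs, index, doc_type):
    # Shape plain document dicts into the action format that
    # elasticsearch.helpers.bulk(es, actions, chunk_size=...) expects.
    for doc in docs:
        yield {
            '_index': index,
            '_type': doc_type,
            '_source': doc,
        }

actions = list(as_bulk_actions([{'title': 'Install'}], 'page', 'doc'))
```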
