Merge pull request #5785 from stsewd/index-with-real-path-name

stsewd · web-flow · commit a908684874e7 · 2019-06-18T11:43:18.000-05:00
Index path with original path name
diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py
@@ -1231,19 +1231,19 @@ def get_processed_json(self):
         Both lead to `foo/index.html`
         https://github.com/rtfd/readthedocs.org/issues/5368
         """
-        paths = []
+        fjson_paths = []
         basename = os.path.splitext(self.path)[0]
-        paths.append(basename + '.fjson')
+        fjson_paths.append(basename + '.fjson')
         if basename.endswith('/index'):
             new_basename = re.sub(r'\/index$', '', basename)
-            paths.append(new_basename + '.fjson')
+            fjson_paths.append(new_basename + '.fjson')
 
         full_json_path = self.project.get_production_media_path(
             type_='json', version_slug=self.version.slug, include_file=False
         )
         try:
-            for path in paths:
-                file_path = os.path.join(full_json_path, path)
+            for fjson_path in fjson_paths:
+                file_path = os.path.join(full_json_path, fjson_path)
                 if os.path.exists(file_path):
                     return process_file(file_path)
         except Exception:
diff --git a/readthedocs/search/api.py b/readthedocs/search/api.py
@@ -1,14 +1,14 @@
 import logging
 from pprint import pformat
 
-from rest_framework import generics
-from rest_framework import serializers
+from rest_framework import generics, serializers
 from rest_framework.exceptions import ValidationError
 from rest_framework.pagination import PageNumberPagination
 
 from readthedocs.search.faceted_search import PageSearch
 from readthedocs.search.utils import get_project_list_or_404
 
+
 log = logging.getLogger(__name__)
 
 
diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py
@@ -116,6 +116,7 @@ class PageDocument(RTDDocTypeMixin, DocType):
     project = fields.KeywordField(attr='project.slug')
     version = fields.KeywordField(attr='version.slug')
     path = fields.KeywordField(attr='processed_json.path')
+    full_path = fields.KeywordField(attr='path')
 
     # Searchable content
     title = fields.TextField(attr='processed_json.title')
@@ -153,7 +154,7 @@ def faceted_search(
 
     def get_queryset(self):
         """Overwrite default queryset to filter certain files to index."""
-        queryset = super(PageDocument, self).get_queryset()
+        queryset = super().get_queryset()
 
         # Do not index files that belong to non sphinx project
         # Also do not index certain files
diff --git a/readthedocs/search/parse_json.py b/readthedocs/search/parse_json.py
@@ -59,38 +59,41 @@ def generate_sections_from_pyquery(body):
         }
 
 
-def process_file(filename):
-    """Read a file from disk and parse it into a structured dict."""
+def process_file(fjson_filename):
+    """Read the fjson file from disk and parse it into a structured dict."""
     try:
-        with codecs.open(filename, encoding='utf-8', mode='r') as f:
+        with codecs.open(fjson_filename, encoding='utf-8', mode='r') as f:
             file_contents = f.read()
     except IOError:
-        log.info('Unable to read file: %s', filename)
-        return None
+        log.info('Unable to read file: %s', fjson_filename)
+        raise
     data = json.loads(file_contents)
     sections = []
+    path = ''
     title = ''
     body_content = ''
+
     if 'current_page_name' in data:
         path = data['current_page_name']
     else:
-        log.info('Unable to index file due to no name %s', filename)
-        return None
-    if 'body' in data and data['body']:
+        log.info('Unable to index file due to no name %s', fjson_filename)
+
+    if data.get('body'):
         body = PyQuery(data['body'])
         body_content = body.text().replace('¶', '')
         sections.extend(generate_sections_from_pyquery(body))
     else:
-        log.info('Unable to index content for: %s', filename)
+        log.info('Unable to index content for: %s', fjson_filename)
+
     if 'title' in data:
         title = data['title']
         if title.startswith('<'):
             title = PyQuery(data['title']).text()
     else:
-        log.info('Unable to index title for: %s', filename)
+        log.info('Unable to index title for: %s', fjson_filename)
 
     return {
-        'headers': process_headers(data, filename),
+        'headers': process_headers(data, fjson_filename),
         'content': body_content,
         'path': path,
         'title': title,
diff --git a/readthedocs/templates/search/elastic_search.html b/readthedocs/templates/search/elastic_search.html
@@ -210,7 +210,7 @@ <h3>
 
                       {% elif 'page' in result.meta.index %}
 
-                        <a href="{% doc_url result.project|get_project result.version result.path %}?highlight={{ query }}">
+                        <a href="{% doc_url result.project|get_project result.version result.full_path %}?highlight={{ query }}">
                           {{ result.project }} - {{ result.title }}
                         </a>
                         {% for fragment in result.meta.highlight.content %}