
Search indexing with storage #5854


Merged
11 commits merged on Aug 8, 2019
23 changes: 20 additions & 3 deletions readthedocs/builds/storage.py
@@ -60,10 +60,10 @@ def delete_directory(self, path):
for folder_name in folders:
if folder_name:
# Recursively delete the subdirectory
self.delete_directory(safe_join(path, folder_name))
self.delete_directory(self.join(path, folder_name))
for filename in files:
if filename:
self.delete(safe_join(path, filename))
self.delete(self.join(path, filename))

def copy_directory(self, source, destination):
"""
@@ -75,14 +75,31 @@ def copy_directory(self, source, destination):
log.debug('Copying source directory %s to media storage at %s', source, destination)
source = Path(source)
for filepath in source.iterdir():
sub_destination = safe_join(destination, filepath.name)
sub_destination = self.join(destination, filepath.name)
if filepath.is_dir():
# Recursively copy the subdirectory
self.copy_directory(filepath, sub_destination)
elif filepath.is_file():
with filepath.open('rb') as fd:
self.save(sub_destination, fd)

def join(self, directory, filepath):
return safe_join(directory, filepath)

def walk(self, top):
if top in ('', '/'):
raise SuspiciousFileOperation('Iterating all storage cannot be right')

log.debug('Walking %s in media storage', top)
folders, files = self.listdir(self._dirpath(top))

yield top, folders, files

for folder_name in folders:
if folder_name:
# Recursively walk the subdirectory
yield from self.walk(self.join(top, folder_name))


class BuildMediaFileSystemStorage(BuildMediaStorageMixin, FileSystemStorage):

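For context on the storage.py changes above, here is a minimal usage sketch of the new join() and walk() helpers. The 'html/myproject/latest' prefix is invented; real prefixes come from Project.get_storage_path(), and the filesystem backend assumes a configured MEDIA_ROOT.

```python
# Minimal sketch, not part of the diff; the prefix below is a made-up example.
from readthedocs.builds.storage import BuildMediaFileSystemStorage

storage = BuildMediaFileSystemStorage()

# walk() mirrors os.walk(): it yields (top, folders, files) for the given
# prefix and recurses into each subdirectory via join(); walking '' or '/'
# raises SuspiciousFileOperation.
for top, folders, files in storage.walk('html/myproject/latest'):
    for filename in files:
        print(storage.join(top, filename))
```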
46 changes: 28 additions & 18 deletions readthedocs/projects/models.py
@@ -1231,26 +1231,36 @@ def get_processed_json(self):
Both lead to `foo/index.html`
https://github.com/rtfd/readthedocs.org/issues/5368
"""
fjson_paths = []
basename = os.path.splitext(self.path)[0]
fjson_paths.append(basename + '.fjson')
if basename.endswith('/index'):
new_basename = re.sub(r'\/index$', '', basename)
fjson_paths.append(new_basename + '.fjson')

full_json_path = self.project.get_production_media_path(
type_='json', version_slug=self.version.slug, include_file=False
)
try:
for fjson_path in fjson_paths:
file_path = os.path.join(full_json_path, fjson_path)
if os.path.exists(file_path):
return process_file(file_path)
except Exception:
file_path = None

if settings.RTD_BUILD_MEDIA_STORAGE:
storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

fjson_paths = []
basename = os.path.splitext(self.path)[0]
fjson_paths.append(basename + '.fjson')
if basename.endswith('/index'):
new_basename = re.sub(r'\/index$', '', basename)
fjson_paths.append(new_basename + '.fjson')

storage_path = self.project.get_storage_path(
type_='json', version_slug=self.version.slug, include_file=False
)
try:
for fjson_path in fjson_paths:
Member:

I believe we should be able to get away from this soon. We should be starting to store the proper path of a file after readthedocs/readthedocs-sphinx-ext#62 is merged.

file_path = storage.join(storage_path, fjson_path)
if storage.exists(file_path):
return process_file(file_path)
except Exception:
log.warning(
'Unhandled exception during search processing file: %s',
file_path,
)
else:
log.warning(
'Skipping HTMLFile processing because of no storage backend'
)

return {
'headers': [],
'content': '',
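As a side note on the get_processed_json() hunk above, the candidate-path logic can be read in isolation. A rough standalone sketch, with an invented page path:

```python
import os
import re


def candidate_fjson_paths(html_path):
    """Return the .fjson paths that may hold the JSON output for a page."""
    basename = os.path.splitext(html_path)[0]
    paths = [basename + '.fjson']
    # 'foo/index.html' can be produced from either 'foo.rst' or 'foo/index.rst',
    # so also try the directory-style name with '/index' stripped.
    if basename.endswith('/index'):
        paths.append(re.sub(r'\/index$', '', basename) + '.fjson')
    return paths


# e.g. candidate_fjson_paths('foo/index.html') == ['foo/index.fjson', 'foo.fjson']
```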
59 changes: 39 additions & 20 deletions readthedocs/projects/tasks.py
@@ -1268,7 +1268,6 @@ def fileify(version_pk, commit, build):
)
return

path = project.rtd_build_path(version.slug)
log.info(
LOG_TEMPLATE,
{
@@ -1278,40 +1277,47 @@
}
)
try:
_manage_imported_files(version, path, commit, build)
_manage_imported_files(version, commit, build)
except Exception:
log.exception('Failed during ImportedFile creation')

try:
_update_intersphinx_data(version, path, commit, build)
_update_intersphinx_data(version, commit, build)
except Exception:
log.exception('Failed during SphinxDomain creation')


def _update_intersphinx_data(version, path, commit, build):
def _update_intersphinx_data(version, commit, build):
"""
Update intersphinx data for this version.

:param version: Version instance
:param path: Path to search
:param commit: Commit that updated path
:param build: Build id
"""

object_file = os.path.join(path, 'objects.inv')
if not os.path.exists(object_file):
log.debug('No objects.inv, skipping intersphinx indexing.')
if not settings.RTD_BUILD_MEDIA_STORAGE:
return

full_json_path = version.project.get_production_media_path(
storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

html_storage_path = version.project.get_storage_path(
type_='html', version_slug=version.slug, include_file=False
)
json_storage_path = version.project.get_storage_path(
type_='json', version_slug=version.slug, include_file=False
)
type_file = os.path.join(full_json_path, 'readthedocs-sphinx-domain-names.json')

object_file = storage.join(html_storage_path, 'objects.inv')
if not storage.exists(object_file):
log.debug('No objects.inv, skipping intersphinx indexing.')
return

type_file = storage.join(json_storage_path, 'readthedocs-sphinx-domain-names.json')
types = {}
titles = {}
if os.path.exists(type_file):
if storage.exists(type_file):
try:
data = json.load(open(type_file))
data = json.load(storage.open(type_file))
types = data['types']
titles = data['titles']
except Exception:
@@ -1331,7 +1337,13 @@ def warn(self, msg):
log.warning('Sphinx MockApp: %s', msg)

# Re-create all objects from the new build of the version
invdata = intersphinx.fetch_inventory(MockApp(), '', object_file)
object_file_url = storage.url(object_file)
if object_file_url.startswith('/'):
# Filesystem backed storage simply prepends MEDIA_URL to the path to get the URL
# This can cause an issue if MEDIA_URL is not fully qualified
object_file_url = 'http://' + settings.PRODUCTION_DOMAIN + object_file_url

invdata = intersphinx.fetch_inventory(MockApp(), '', object_file_url)
for key, value in sorted(invdata.items() or {}):
domain, _type = key.split(':')
for name, einfo in sorted(value.items()):
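For readers unfamiliar with Sphinx inventories, the loop above expects roughly the following shape from fetch_inventory(); every project name and URL here is invented for illustration.

```python
# Illustrative only: keys are 'domain:type'; each entry maps an object name
# to a (project, version, URL, display name) tuple.
invdata = {
    'py:function': {
        'requests.get': ('Requests', '2.0', 'api.html#requests.get', '-'),
    },
    'std:label': {
        'quickstart': ('Requests', '2.0', 'user/quickstart.html#quickstart', 'Quickstart'),
    },
}

for key, value in sorted(invdata.items()):
    domain, _type = key.split(':')
    for name, einfo in sorted(value.items()):
        print(domain, _type, name, einfo[2])
```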
@@ -1419,29 +1431,36 @@ def clean_build(version_pk):
return True


def _manage_imported_files(version, path, commit, build):
def _manage_imported_files(version, commit, build):
"""
Update imported files for version.

:param version: Version instance
:param path: Path to search
:param commit: Commit that updated path
:param build: Build id
"""

if not settings.RTD_BUILD_MEDIA_STORAGE:
return
Member:

We should likely log something here, so we don't get confused if indexing isn't working.
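A minimal sketch of what that logging might look like, mirroring the message used in models.py; the exact wording is an assumption and it would slot into the early return of _manage_imported_files:

```python
# Hypothetical early-return logging (wording assumed, not part of the diff)
if not settings.RTD_BUILD_MEDIA_STORAGE:
    log.warning('Skipping ImportedFile creation because of no storage backend')
    return
```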


storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

changed_files = set()

# Re-create all objects from the new build of the version
for root, __, filenames in os.walk(path):
storage_path = version.project.get_storage_path(
type_='html', version_slug=version.slug, include_file=False
)
for root, __, filenames in storage.walk(storage_path):
for filename in filenames:
if filename.endswith('.html'):
model_class = HTMLFile
else:
Member:

I'd like to add the elif project.cdn_enabled here with an else: continue. 👍

Contributor Author:

What does project.cdn_enabled do exactly and what would it do differently for search?

Member:

It means we store all ImportedFiles, not just HTMLFiles, so we can purge them from the CDN properly when they change.
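A rough sketch of the suggested branching, assuming cdn_enabled is a boolean flag on the project; this is illustrative only and replaces the loop body shown in the diff, not part of this PR:

```python
# Hypothetical loop body with the suggested elif/else (cdn_enabled assumed)
if filename.endswith('.html'):
    model_class = HTMLFile
elif version.project.cdn_enabled:
    # Record every file so changed files can be purged from the CDN
    model_class = ImportedFile
else:
    # Neither indexed for search nor tracked for CDN purging
    continue
```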

model_class = ImportedFile

full_path = os.path.join(root, filename)
relpath = os.path.relpath(full_path, path)
relpath = storage.join(root, filename)
try:
md5 = hashlib.md5(open(full_path, 'rb').read()).hexdigest()
md5 = hashlib.md5(storage.open(relpath, 'rb').read()).hexdigest()
except Exception:
log.exception(
'Error while generating md5 for %s:%s:%s. Don\'t stop.',
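Taken together, the storage-based scanning in _manage_imported_files boils down to the following pattern. This is a sketch assuming a backend that uses BuildMediaStorageMixin (which provides walk() and join()), with an invented prefix.

```python
import hashlib

from django.conf import settings
from django.core.files.storage import get_storage_class


def checksum_stored_files(prefix):
    """Yield (path, md5) for every file under ``prefix`` in build media storage."""
    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
    for root, __, filenames in storage.walk(prefix):
        for filename in filenames:
            path = storage.join(root, filename)
            with storage.open(path, 'rb') as fd:
                yield path, hashlib.md5(fd.read()).hexdigest()


# e.g. for path, md5 in checksum_stored_files('html/myproject/latest'): ...
```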
11 changes: 5 additions & 6 deletions readthedocs/rtd_tests/tests/test_search_json_parsing.py
@@ -2,21 +2,20 @@
import os

from django.test import TestCase
from django.test.utils import override_settings

from readthedocs.search.parse_json import process_file


base_dir = os.path.dirname(os.path.dirname(__file__))


class TestHacks(TestCase):

@override_settings(MEDIA_ROOT=base_dir)
def test_h2_parsing(self):
data = process_file(
os.path.join(
base_dir,
'files/api.fjson',
),
)
data = process_file('files/api.fjson')

self.assertEqual(data['sections'][1]['id'], 'a-basic-api-client-using-slumber')
# Only capture h2's after the first section
for obj in data['sections'][1:]:
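The test change above works because Django's default FileSystemStorage resolves relative names under MEDIA_ROOT, so overriding MEDIA_ROOT to the tests directory lets process_file('files/api.fjson') find the fixture. A quick sketch with an invented location:

```python
from django.core.files.storage import FileSystemStorage

# With MEDIA_ROOT (or an explicit location) pointing at the tests directory,
# a relative name like 'files/api.fjson' resolves inside that directory.
storage = FileSystemStorage(location='/path/to/rtd_tests')  # invented path
print(storage.path('files/api.fjson'))  # -> '/path/to/rtd_tests/files/api.fjson'
```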
22 changes: 14 additions & 8 deletions readthedocs/search/parse_json.py
@@ -1,9 +1,11 @@
"""Functions related to converting content into dict/JSON structures."""

import codecs
import json
import logging

from django.conf import settings
from django.core.files.storage import get_storage_class

from pyquery import PyQuery


@@ -59,13 +61,17 @@ def generate_sections_from_pyquery(body):
}


def process_file(fjson_filename):
def process_file(fjson_storage_path):
"""Read the fjson file from disk and parse it into a structured dict."""
storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

log.debug('Processing JSON file for indexing: %s', fjson_storage_path)

try:
with codecs.open(fjson_filename, encoding='utf-8', mode='r') as f:
with storage.open(fjson_storage_path, mode='r') as f:
file_contents = f.read()
except IOError:
log.info('Unable to read file: %s', fjson_filename)
log.info('Unable to read file: %s', fjson_storage_path)
raise
data = json.loads(file_contents)
sections = []
@@ -76,24 +82,24 @@ def process_file(fjson_filename):
if 'current_page_name' in data:
path = data['current_page_name']
else:
log.info('Unable to index file due to no name %s', fjson_filename)
log.info('Unable to index file due to no name %s', fjson_storage_path)

if data.get('body'):
body = PyQuery(data['body'])
body_content = body.text().replace('¶', '')
sections.extend(generate_sections_from_pyquery(body))
else:
log.info('Unable to index content for: %s', fjson_filename)
log.info('Unable to index content for: %s', fjson_storage_path)

if 'title' in data:
title = data['title']
if title.startswith('<'):
title = PyQuery(data['title']).text()
else:
log.info('Unable to index title for: %s', fjson_filename)
log.info('Unable to index title for: %s', fjson_storage_path)

return {
'headers': process_headers(data, fjson_filename),
'headers': process_headers(data, fjson_storage_path),
'content': body_content,
'path': path,
'title': title,
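Finally, a hedged example of driving the reworked process_file(). It assumes RTD_BUILD_MEDIA_STORAGE points at a configured backend and that the .fjson path below (invented) exists in storage; only the keys visible in the return dict above are used.

```python
from readthedocs.search.parse_json import process_file

# Hypothetical storage path for a built page's JSON output
page = process_file('json/myproject/latest/install.fjson')
print(page['title'])
print(page['path'])
print(len(page['content']))
```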