
Search indexing with storage #5854


Merged: 11 commits, merged Aug 8, 2019
23 changes: 20 additions & 3 deletions readthedocs/builds/storage.py
@@ -64,10 +64,10 @@ def delete_directory(self, path):
         for folder_name in folders:
             if folder_name:
                 # Recursively delete the subdirectory
-                self.delete_directory(safe_join(path, folder_name))
+                self.delete_directory(self.join(path, folder_name))
         for filename in files:
             if filename:
-                self.delete(safe_join(path, filename))
+                self.delete(self.join(path, filename))

     def copy_directory(self, source, destination):
         """
@@ -79,14 +79,31 @@ def copy_directory(self, source, destination):
         log.debug('Copying source directory %s to media storage at %s', source, destination)
         source = Path(source)
         for filepath in source.iterdir():
-            sub_destination = safe_join(destination, filepath.name)
+            sub_destination = self.join(destination, filepath.name)
             if filepath.is_dir():
                 # Recursively copy the subdirectory
                 self.copy_directory(filepath, sub_destination)
             elif filepath.is_file():
                 with filepath.open('rb') as fd:
                     self.save(sub_destination, fd)

+    def join(self, directory, filepath):
+        return safe_join(directory, filepath)
+
+    def walk(self, top):
+        if top in ('', '/'):
+            raise SuspiciousFileOperation('Iterating all storage cannot be right')
+
+        log.debug('Walking %s in media storage', top)
+        folders, files = self.listdir(self._dirpath(top))
+
+        yield top, folders, files
+
+        for folder_name in folders:
+            if folder_name:
+                # Recursively walk the subdirectory
+                yield from self.walk(self.join(top, folder_name))


class BuildMediaFileSystemStorage(BuildMediaStorageMixin, FileSystemStorage):

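A minimal usage sketch (not part of the diff) of the new walk() method: it yields (top, folders, files) tuples like os.walk(), but through the storage API, so the same traversal code works against local disk or a remote backend. The temp directory and file names here are illustrative:

    import tempfile

    from django.core.files.base import ContentFile

    from readthedocs.builds.storage import BuildMediaFileSystemStorage

    # Save two files, one of them inside a subdirectory
    storage = BuildMediaFileSystemStorage(location=tempfile.mkdtemp())
    storage.save('files/index.html', ContentFile(b'<html></html>'))
    storage.save('files/guides/install.html', ContentFile(b'<html></html>'))

    # Walk the tree through the storage API, like os.walk() over a directory
    for top, folders, files in storage.walk('files'):
        print(top, folders, files)
    # files ['guides'] ['index.html']
    # files/guides [] ['install.html']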
46 changes: 28 additions & 18 deletions readthedocs/projects/models.py
@@ -1215,26 +1215,36 @@ def get_processed_json(self):
         Both lead to `foo/index.html`
         https://github.com/rtfd/readthedocs.org/issues/5368
         """
-        fjson_paths = []
-        basename = os.path.splitext(self.path)[0]
-        fjson_paths.append(basename + '.fjson')
-        if basename.endswith('/index'):
-            new_basename = re.sub(r'\/index$', '', basename)
-            fjson_paths.append(new_basename + '.fjson')
-
-        full_json_path = self.project.get_production_media_path(
-            type_='json', version_slug=self.version.slug, include_file=False
-        )
-        try:
-            for fjson_path in fjson_paths:
-                file_path = os.path.join(full_json_path, fjson_path)
-                if os.path.exists(file_path):
-                    return process_file(file_path)
-        except Exception:
-            file_path = None

+        if settings.RTD_BUILD_MEDIA_STORAGE:
+            storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
+
+            fjson_paths = []
+            basename = os.path.splitext(self.path)[0]
+            fjson_paths.append(basename + '.fjson')
+            if basename.endswith('/index'):
+                new_basename = re.sub(r'\/index$', '', basename)
+                fjson_paths.append(new_basename + '.fjson')
+
+            storage_path = self.project.get_storage_path(
+                type_='json', version_slug=self.version.slug, include_file=False
+            )
+            try:
+                for fjson_path in fjson_paths:
[Review comment (Member)]: I believe we should be able to get away from this soon. We should be starting to store the proper path of a file after readthedocs/readthedocs-sphinx-ext#62 is merged.

+                    file_path = storage.join(storage_path, fjson_path)
+                    if storage.exists(file_path):
+                        return process_file(file_path)
+            except Exception:
+                log.warning(
+                    'Unhandled exception during search processing file: %s',
+                    file_path,
+                )
+        else:
+            log.warning(
+                'Skipping HTMLFile processing because of no storage backend'
+            )

         return {
             'headers': [],
             'content': '',
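For reference, the candidate-path logic above (unchanged by this PR apart from where the files are read from) derives two possible .fjson names for a page served at foo/index.html, per the docstring's note. A standalone sketch of that derivation:

    import os
    import re

    path = 'foo/index.html'
    basename = os.path.splitext(path)[0]   # 'foo/index'
    fjson_paths = [basename + '.fjson']    # Sphinx JSON builder output
    if basename.endswith('/index'):
        # foo.rst and foo/index.rst both build to foo/index.html
        fjson_paths.append(re.sub(r'\/index$', '', basename) + '.fjson')

    print(fjson_paths)  # ['foo/index.fjson', 'foo.fjson']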
65 changes: 44 additions & 21 deletions readthedocs/projects/tasks.py
@@ -1268,7 +1268,6 @@ def fileify(version_pk, commit, build):
         )
         return

-    path = project.rtd_build_path(version.slug)
     log.info(
         LOG_TEMPLATE,
         {
@@ -1278,40 +1277,47 @@ def fileify(version_pk, commit, build):
         }
     )
     try:
-        _manage_imported_files(version, path, commit, build)
+        _manage_imported_files(version, commit, build)
     except Exception:
         log.exception('Failed during ImportedFile creation')

     try:
-        _update_intersphinx_data(version, path, commit, build)
+        _update_intersphinx_data(version, commit, build)
     except Exception:
         log.exception('Failed during SphinxDomain creation')


-def _update_intersphinx_data(version, path, commit, build):
+def _update_intersphinx_data(version, commit, build):
"""
Update intersphinx data for this version.

:param version: Version instance
:param path: Path to search
:param commit: Commit that updated path
:param build: Build id
"""

object_file = os.path.join(path, 'objects.inv')
if not os.path.exists(object_file):
log.debug('No objects.inv, skipping intersphinx indexing.')
if not settings.RTD_BUILD_MEDIA_STORAGE:
return

-    full_json_path = version.project.get_production_media_path(
-        type_='json', version_slug=version.slug, include_file=False
-    )
-    type_file = os.path.join(full_json_path, 'readthedocs-sphinx-domain-names.json')
+    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
+
+    html_storage_path = version.project.get_storage_path(
+        type_='html', version_slug=version.slug, include_file=False
+    )
+    json_storage_path = version.project.get_storage_path(
+        type_='json', version_slug=version.slug, include_file=False
+    )
+
+    object_file = storage.join(html_storage_path, 'objects.inv')
+    if not storage.exists(object_file):
+        log.debug('No objects.inv, skipping intersphinx indexing.')
+        return
+
+    type_file = storage.join(json_storage_path, 'readthedocs-sphinx-domain-names.json')
     types = {}
     titles = {}
-    if os.path.exists(type_file):
+    if storage.exists(type_file):
         try:
-            data = json.load(open(type_file))
+            data = json.load(storage.open(type_file))
             types = data['types']
             titles = data['titles']
         except Exception:
@@ -1331,7 +1337,13 @@ def warn(self, msg):
             log.warning('Sphinx MockApp: %s', msg)

     # Re-create all objects from the new build of the version
-    invdata = intersphinx.fetch_inventory(MockApp(), '', object_file)
+    object_file_url = storage.url(object_file)
+    if object_file_url.startswith('/'):
+        # Filesystem backed storage simply prepends MEDIA_URL to the path to get the URL
+        # This can cause an issue if MEDIA_URL is not fully qualified
+        object_file_url = 'http://' + settings.PRODUCTION_DOMAIN + object_file_url
+
+    invdata = intersphinx.fetch_inventory(MockApp(), '', object_file_url)
     for key, value in sorted(invdata.items() or {}):
         domain, _type = key.split(':')
         for name, einfo in sorted(value.items()):
@@ -1424,29 +1436,40 @@ def clean_build(version_pk):
     return True


-def _manage_imported_files(version, path, commit, build):
+def _manage_imported_files(version, commit, build):
     """
     Update imported files for version.

     :param version: Version instance
-    :param path: Path to search
     :param commit: Commit that updated path
     :param build: Build id
     """
+    if not settings.RTD_BUILD_MEDIA_STORAGE:
+        return
[Review comment (Member)]: We should likely log something here, so we don't get confused if indexing isn't working.


+    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

     changed_files = set()

     # Re-create all objects from the new build of the version
-    for root, __, filenames in os.walk(path):
+    storage_path = version.project.get_storage_path(
+        type_='html', version_slug=version.slug, include_file=False
+    )
+    for root, __, filenames in storage.walk(storage_path):
         for filename in filenames:
             if filename.endswith('.html'):
                 model_class = HTMLFile
-            else:
+            elif version.project.cdn_enabled:
+                # We need to track all files for CDN enabled projects so the files can be purged
                 model_class = ImportedFile
+            else:
+                # For projects not behind a CDN, we don't care about non-HTML
+                continue

-            full_path = os.path.join(root, filename)
-            relpath = os.path.relpath(full_path, path)
+            relpath = storage.join(root, filename)
             try:
-                md5 = hashlib.md5(open(full_path, 'rb').read()).hexdigest()
+                md5 = hashlib.md5(storage.open(relpath, 'rb').read()).hexdigest()
             except Exception:
                 log.exception(
                     'Error while generating md5 for %s:%s:%s. Don\'t stop.',
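A small sketch of the URL-qualification step in _update_intersphinx_data above: FileSystemStorage builds URLs by prepending MEDIA_URL, which is typically a bare path like /media/, and intersphinx.fetch_inventory needs something it can actually retrieve, so a scheme and host are prepended. The domain value here is an illustrative stand-in for settings.PRODUCTION_DOMAIN:

    PRODUCTION_DOMAIN = 'readthedocs.org'  # illustrative stand-in for the setting

    def qualify_object_file_url(object_file_url):
        # Filesystem-backed storage returns MEDIA_URL-relative paths ('/media/...');
        # cloud storage backends already return absolute URLs and pass through.
        if object_file_url.startswith('/'):
            object_file_url = 'http://' + PRODUCTION_DOMAIN + object_file_url
        return object_file_url

    print(qualify_object_file_url('/media/html/pip/latest/objects.inv'))
    # http://readthedocs.org/media/html/pip/latest/objects.inv
    print(qualify_object_file_url('https://build-media.example.com/objects.inv'))
    # https://build-media.example.com/objects.inv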
52 changes: 52 additions & 0 deletions readthedocs/rtd_tests/tests/test_build_storage.py
@@ -0,0 +1,52 @@
import os
import shutil
import tempfile

from django.test import TestCase

from readthedocs.builds.storage import BuildMediaFileSystemStorage


files_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'files')


class TestBuildMediaStorage(TestCase):
    def setUp(self):
        self.test_media_dir = tempfile.mkdtemp()
        self.storage = BuildMediaFileSystemStorage(location=self.test_media_dir)

    def tearDown(self):
        shutil.rmtree(self.test_media_dir, ignore_errors=True)

    def test_copy_directory(self):
        self.assertFalse(self.storage.exists('files/test.html'))

        self.storage.copy_directory(files_dir, 'files')
        self.assertTrue(self.storage.exists('files/test.html'))
        self.assertTrue(self.storage.exists('files/conf.py'))
        self.assertTrue(self.storage.exists('files/api.fjson'))

    def test_delete_directory(self):
        self.storage.copy_directory(files_dir, 'files')
        dirs, files = self.storage.listdir('files')
        self.assertEqual(dirs, [])
        self.assertEqual(files, ['api.fjson', 'conf.py', 'test.html'])

        self.storage.delete_directory('files/')
        dirs, files = self.storage.listdir('files')
        self.assertEqual(dirs, [])
        self.assertEqual(files, [])

    def test_walk(self):
        self.storage.copy_directory(files_dir, 'files')
        self.storage.copy_directory(files_dir, 'files/subfiles')

        output = list(self.storage.walk('files'))
        self.assertEqual(len(output), 2)
        self.assertEqual(
            output,
            [
                ('files', ['subfiles'], ['api.fjson', 'conf.py', 'test.html']),
                ('files/subfiles', [], ['api.fjson', 'conf.py', 'test.html']),
            ],
        )
47 changes: 37 additions & 10 deletions readthedocs/rtd_tests/tests/test_imported_file.py
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
 import os

+from django.conf import settings
+from django.core.files.storage import get_storage_class
 from django.test import TestCase

 from readthedocs.projects.models import ImportedFile, Project
@@ -13,24 +15,45 @@
 class ImportedFileTests(TestCase):
     fixtures = ['eric', 'test_data']

+    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()
+
     def setUp(self):
         self.project = Project.objects.get(slug='pip')
         self.version = self.project.versions.first()

+        self.test_dir = os.path.join(base_dir, 'files')
+        self._copy_storage_dir()
+
+    def _copy_storage_dir(self):
+        self.storage.copy_directory(
+            self.test_dir,
+            self.project.get_storage_path(
+                type_='html',
+                version_slug=self.version.slug,
+                include_file=False,
+            ),
+        )

     def test_properly_created(self):
-        test_dir = os.path.join(base_dir, 'files')
+        # Only one file in the directory is HTML
         self.assertEqual(ImportedFile.objects.count(), 0)
-        _manage_imported_files(self.version, test_dir, 'commit01', 1)
-        self.assertEqual(ImportedFile.objects.count(), 3)
-        _manage_imported_files(self.version, test_dir, 'commit01', 2)
+        _manage_imported_files(self.version, 'commit01', 1)
+        self.assertEqual(ImportedFile.objects.count(), 1)
+        _manage_imported_files(self.version, 'commit01', 2)
         self.assertEqual(ImportedFile.objects.count(), 1)
+
+        self.project.cdn_enabled = True
+        self.project.save()
+
+        # CDN enabled projects => save all files
+        _manage_imported_files(self.version, 'commit01', 3)
+        self.assertEqual(ImportedFile.objects.count(), 3)

     def test_update_commit(self):
-        test_dir = os.path.join(base_dir, 'files')
         self.assertEqual(ImportedFile.objects.count(), 0)
-        _manage_imported_files(self.version, test_dir, 'commit01', 1)
+        _manage_imported_files(self.version, 'commit01', 1)
         self.assertEqual(ImportedFile.objects.first().commit, 'commit01')
-        _manage_imported_files(self.version, test_dir, 'commit02', 2)
+        _manage_imported_files(self.version, 'commit02', 2)
         self.assertEqual(ImportedFile.objects.first().commit, 'commit02')

     def test_update_content(self):
@@ -40,13 +63,17 @@ def test_update_content(self):
         with open(os.path.join(test_dir, 'test.html'), 'w+') as f:
             f.write('Woo')

-        _manage_imported_files(self.version, test_dir, 'commit01', 1)
+        self._copy_storage_dir()
+
+        _manage_imported_files(self.version, 'commit01', 1)
         self.assertEqual(ImportedFile.objects.get(name='test.html').md5, 'c7532f22a052d716f7b2310fb52ad981')

         with open(os.path.join(test_dir, 'test.html'), 'w+') as f:
             f.write('Something Else')

-        _manage_imported_files(self.version, test_dir, 'commit02', 2)
+        self._copy_storage_dir()
+
+        _manage_imported_files(self.version, 'commit02', 2)
         self.assertNotEqual(ImportedFile.objects.get(name='test.html').md5, 'c7532f22a052d716f7b2310fb52ad981')

-        self.assertEqual(ImportedFile.objects.count(), 3)
+        self.assertEqual(ImportedFile.objects.count(), 1)
11 changes: 5 additions & 6 deletions readthedocs/rtd_tests/tests/test_search_json_parsing.py
@@ -2,21 +2,20 @@
 import os

 from django.test import TestCase
+from django.test.utils import override_settings

 from readthedocs.search.parse_json import process_file


 base_dir = os.path.dirname(os.path.dirname(__file__))


 class TestHacks(TestCase):

+    @override_settings(MEDIA_ROOT=base_dir)
     def test_h2_parsing(self):
-        data = process_file(
-            os.path.join(
-                base_dir,
-                'files/api.fjson',
-            ),
-        )
+        data = process_file('files/api.fjson')

         self.assertEqual(data['sections'][1]['id'], 'a-basic-api-client-using-slumber')
         # Only capture h2's after the first section
         for obj in data['sections'][1:]: