diff --git a/readthedocs/builds/syncers.py b/readthedocs/builds/syncers.py index 6b7b1d337bf..7291bade16f 100644 --- a/readthedocs/builds/syncers.py +++ b/readthedocs/builds/syncers.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - """ Classes to copy files between build and web servers. @@ -13,12 +11,15 @@ import shutil from django.conf import settings +from django.core.exceptions import SuspiciousFileOperation +from django.core.files.storage import get_storage_class from readthedocs.core.utils import safe_makedirs from readthedocs.core.utils.extend import SettingsOverrideObject log = logging.getLogger(__name__) +storage = get_storage_class()() class BaseSyncer: @@ -42,6 +43,11 @@ def copy(cls, path, target, is_file=False, **kwargs): return if os.path.exists(target): os.remove(target) + + # Create containing directory if it doesn't exist + directory = os.path.dirname(target) + safe_makedirs(directory) + shutil.copy2(path, target) else: if os.path.exists(target): @@ -143,6 +149,10 @@ def copy(cls, path, target, host, is_file=False, **kwargs): # pylint: disable=a log.info('Remote Pull %s to %s', path, target) if not is_file and not os.path.exists(target): safe_makedirs(target) + if is_file: + # Create containing directory if it doesn't exist + directory = os.path.dirname(target) + safe_makedirs(directory) # Add a slash when copying directories sync_cmd = "rsync -e 'ssh -T' -av --delete {user}@{host}:{path} {target}".format( host=host, @@ -159,6 +169,59 @@ def copy(cls, path, target, host, is_file=False, **kwargs): # pylint: disable=a ) +class SelectiveStorageRemotePuller(RemotePuller): + + """ + Like RemotePuller but certain files are copied via Django's storage system. + + If a file with extensions specified by ``extensions`` is copied, it will be copied to storage + and the original is removed. + + See: https://docs.djangoproject.com/en/1.11/ref/settings/#std:setting-DEFAULT_FILE_STORAGE + """ + + extensions = ('.pdf', '.epub', '.zip') + + @classmethod + def get_storage_path(cls, path): + """ + Gets the path to the file within the storage engine. + + For example, if the path was $MEDIA_ROOT/pdfs/latest.pdf + the storage_path is 'pdfs/latest.pdf' + + :raises: SuspiciousFileOperation if the path isn't under settings.MEDIA_ROOT + """ + path = os.path.normpath(path) + if not path.startswith(settings.MEDIA_ROOT): + raise SuspiciousFileOperation + + path = path.replace(settings.MEDIA_ROOT, '').lstrip('/') + return path + + @classmethod + def copy(cls, path, target, host, is_file=False, **kwargs): # pylint: disable=arguments-differ + RemotePuller.copy(path, target, host, is_file, **kwargs) + + if getattr(storage, 'write_build_media', False): + # This is a sanity check for the case where + # storage is backed by the local filesystem + # In that case, removing the original target file locally + # would remove the file from storage as well + + if is_file and os.path.exists(target) and \ + any([target.lower().endswith(ext) for ext in cls.extensions]): + log.info('Selective Copy %s to media storage', target) + + storage_path = cls.get_storage_path(target) + + if storage.exists(storage_path): + storage.delete(storage_path) + + with open(target, 'rb') as fd: + storage.save(storage_path, fd) + + class Syncer(SettingsOverrideObject): _default_class = LocalSyncer _override_setting = 'FILE_SYNCER' diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index 9b8e32d2ece..c94aadd9d4e 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - """Project models.""" import fnmatch @@ -10,6 +8,7 @@ from django.conf import settings from django.contrib.auth.models import User +from django.core.files.storage import get_storage_class from django.db import models from django.urls import NoReverseMatch, reverse from django.utils.functional import cached_property @@ -44,6 +43,7 @@ log = logging.getLogger(__name__) +storage = get_storage_class()() class ProjectRelationship(models.Model): @@ -510,6 +510,24 @@ def get_subproject_urls(self): return [(proj.child.slug, proj.child.get_docs_url()) for proj in self.subprojects.all()] + def get_storage_path(self, type_, version_slug=LATEST): + """ + Get a path to a build artifact for use with Django's storage system. + + :param type_: Media content type, ie - 'pdf', 'htmlzip' + :param version_slug: Project version slug for lookup + :return: the path to an item in storage + (can be used with ``storage.url`` to get the URL) + """ + extension = type_.replace('htmlzip', 'zip') + return '{}/{}/{}/{}.{}'.format( + type_, + self.slug, + version_slug, + self.slug, + extension, + ) + def get_production_media_path(self, type_, version_slug, include_file=True): """ Used to see if these files exist so we can offer them for download. @@ -728,30 +746,33 @@ def has_aliases(self): def has_pdf(self, version_slug=LATEST): if not self.enable_pdf_build: return False - return os.path.exists( - self.get_production_media_path( - type_='pdf', - version_slug=version_slug, - ) + path = self.get_production_media_path( + type_='pdf', version_slug=version_slug + ) + storage_path = self.get_storage_path( + type_='pdf', version_slug=version_slug ) + return os.path.exists(path) or storage.exists(storage_path) def has_epub(self, version_slug=LATEST): if not self.enable_epub_build: return False - return os.path.exists( - self.get_production_media_path( - type_='epub', - version_slug=version_slug, - ) + path = self.get_production_media_path( + type_='epub', version_slug=version_slug ) + storage_path = self.get_storage_path( + type_='epub', version_slug=version_slug + ) + return os.path.exists(path) or storage.exists(storage_path) def has_htmlzip(self, version_slug=LATEST): - return os.path.exists( - self.get_production_media_path( - type_='htmlzip', - version_slug=version_slug, - ) + path = self.get_production_media_path( + type_='htmlzip', version_slug=version_slug + ) + storage_path = self.get_storage_path( + type_='htmlzip', version_slug=version_slug ) + return os.path.exists(path) or storage.exists(storage_path) @property def sponsored(self): diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index 80be5d8342f..e9bd048a18c 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - """ Tasks related to projects. @@ -20,6 +18,7 @@ import requests from celery.exceptions import SoftTimeLimitExceeded from django.conf import settings +from django.core.files.storage import get_storage_class from django.db.models import Q from django.urls import reverse from django.utils import timezone @@ -80,6 +79,7 @@ log = logging.getLogger(__name__) +storage = get_storage_class()() class SyncRepositoryMixin: @@ -737,6 +737,28 @@ def update_app_instances( 'Updating version failed, skipping file sync: version=%s', self.version, ) + hostname = socket.gethostname() + + delete_unsynced_media = True + + if getattr(storage, 'write_build_media', False): + # Handle the case where we want to upload some built assets to our storage + move_files.delay( + self.version.pk, + hostname, + self.config.doctype, + html=False, + search=False, + localmedia=localmedia, + pdf=pdf, + epub=epub, + delete_unsynced_media=delete_unsynced_media, + ) + # Set variables so they don't get synced in the next broadcast step + localmedia = False + pdf = False + epub = False + delete_unsynced_media = False # Broadcast finalization steps to web application instances broadcast( @@ -748,17 +770,17 @@ def update_app_instances( self.config.doctype, ], kwargs=dict( - hostname=socket.gethostname(), + hostname=hostname, html=html, localmedia=localmedia, search=search, pdf=pdf, epub=epub, + delete_unsynced_media=delete_unsynced_media, ), callback=sync_callback.s( version_pk=self.version.pk, commit=self.build['commit'], - search=search, ), ) @@ -830,7 +852,8 @@ def build_docs_html(self): type='app', task=move_files, args=[ - self.version.pk, socket.gethostname(), self.config.doctype + self.version.pk, + socket.gethostname(), self.config.doctype ], kwargs=dict(html=True), ) @@ -902,15 +925,9 @@ def is_type_sphinx(self): # Web tasks @app.task(queue='web') def sync_files( - project_pk, - version_pk, - doctype, - hostname=None, - html=False, - localmedia=False, - search=False, - pdf=False, - epub=False, + project_pk, version_pk, doctype, hostname=None, html=False, + localmedia=False, search=False, pdf=False, epub=False, + delete_unsynced_media=False, ): """ Sync build artifacts to application instances. @@ -922,20 +939,6 @@ def sync_files( version = Version.objects.get_object_or_log(pk=version_pk) if not version: return - if not pdf: - remove_dirs([ - version.project.get_production_media_path( - type_='pdf', - version_slug=version.slug, - ), - ]) - if not epub: - remove_dirs([ - version.project.get_production_media_path( - type_='epub', - version_slug=version.slug, - ), - ]) # Sync files to the web servers move_files( @@ -947,6 +950,7 @@ def sync_files( search=search, pdf=pdf, epub=epub, + delete_unsynced_media=delete_unsynced_media, ) # Symlink project @@ -966,6 +970,7 @@ def move_files( search=False, pdf=False, epub=False, + delete_unsynced_media=False, ): """ Task to move built documentation to web servers. @@ -982,10 +987,78 @@ def move_files( :type pdf: bool :param epub: Sync ePub files :type epub: bool + :param delete_unsynced_media: Whether to try and delete files. + :type delete_unsynced_media: bool """ version = Version.objects.get_object_or_log(pk=version_pk) if not version: return + + # This is False if we have already synced media files to blob storage + # We set `epub=False` for example so data doesn't get re-uploaded on each web, + # so we need this to protect against deleting in those cases + if delete_unsynced_media: + + if not pdf: + + remove_dirs([ + version.project.get_production_media_path( + type_='pdf', + version_slug=version.slug, + include_file=False, + ), + ]) + + if getattr(storage, 'write_build_media', False): + # Remove PDF from remote storage if it exists + storage_path = version.project.get_storage_path( + type_='pdf', + version_slug=version.slug, + ) + if storage.exists(storage_path): + log.info('Removing %s from media storage', storage_path) + storage.delete(storage_path) + + if not epub: + + remove_dirs([ + version.project.get_production_media_path( + type_='epub', + version_slug=version.slug, + include_file=False, + ), + ]) + + if getattr(storage, 'write_build_media', False): + # Remove ePub from remote storage if it exists + storage_path = version.project.get_storage_path( + type_='epub', + version_slug=version.slug, + ) + if storage.exists(storage_path): + log.info('Removing %s from media storage', storage_path) + storage.delete(storage_path) + + if not localmedia: + + remove_dirs([ + version.project.get_production_media_path( + type_='htmlzip', + version_slug=version.slug, + include_file=False, + ), + ]) + + if getattr(storage, 'write_build_media', False): + # Remove ePub from remote storage if it exists + storage_path = version.project.get_storage_path( + type_='htmlzip', + version_slug=version.slug, + ) + if storage.exists(storage_path): + log.info('Removing %s from media storage', storage_path) + storage.delete(storage_path) + log.debug( LOG_TEMPLATE.format( project=version.project.slug, @@ -1015,40 +1088,48 @@ def move_files( Syncer.copy(from_path, to_path, host=hostname) if localmedia: - from_path = version.project.artifact_path( - version=version.slug, - type_='sphinx_localmedia', + from_path = os.path.join( + version.project.artifact_path( + version=version.slug, + type_='sphinx_localmedia', + ), + '{}.zip'.format(version.project.slug), ) to_path = version.project.get_production_media_path( type_='htmlzip', version_slug=version.slug, - include_file=False, + include_file=True, ) - Syncer.copy(from_path, to_path, host=hostname) - + Syncer.copy(from_path, to_path, host=hostname, is_file=True) # Always move PDF's because the return code lies. if pdf: - from_path = version.project.artifact_path( - version=version.slug, - type_='sphinx_pdf', + from_path = os.path.join( + version.project.artifact_path( + version=version.slug, + type_='sphinx_pdf', + ), + '{}.pdf'.format(version.project.slug), ) to_path = version.project.get_production_media_path( type_='pdf', version_slug=version.slug, - include_file=False, + include_file=True, ) - Syncer.copy(from_path, to_path, host=hostname) + Syncer.copy(from_path, to_path, host=hostname, is_file=True) if epub: - from_path = version.project.artifact_path( - version=version.slug, - type_='sphinx_epub', + from_path = os.path.join( + version.project.artifact_path( + version=version.slug, + type_='sphinx_epub', + ), + '{}.epub'.format(version.project.slug), ) to_path = version.project.get_production_media_path( type_='epub', version_slug=version.slug, - include_file=False, + include_file=True, ) - Syncer.copy(from_path, to_path, host=hostname) + Syncer.copy(from_path, to_path, host=hostname, is_file=True) @app.task(queue='web') diff --git a/readthedocs/projects/views/public.py b/readthedocs/projects/views/public.py index 7c95090e336..2976cecb693 100644 --- a/readthedocs/projects/views/public.py +++ b/readthedocs/projects/views/public.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - """Public project views.""" import json @@ -14,6 +12,7 @@ from django.contrib import messages from django.contrib.auth.models import User from django.core.cache import cache +from django.core.files.storage import get_storage_class from django.http import HttpResponse, HttpResponseRedirect from django.shortcuts import get_object_or_404, render from django.urls import reverse @@ -33,6 +32,7 @@ log = logging.getLogger(__name__) search_log = logging.getLogger(__name__ + '.search') mimetypes.add_type('application/epub+zip', '.epub') +storage = get_storage_class()() class ProjectIndex(ListView): @@ -164,7 +164,7 @@ def project_badge(request, project_slug): def project_downloads(request, project_slug): - """A detail view for a project with various dataz.""" + """A detail view for a project with various downloads.""" project = get_object_or_404( Project.objects.protected(request.user), slug=project_slug, @@ -206,14 +206,20 @@ def project_download_media(request, project_slug, type_, version_slug): ) privacy_level = getattr(settings, 'DEFAULT_PRIVACY_LEVEL', 'public') if privacy_level == 'public' or settings.DEBUG: - path = os.path.join( + storage_path = version.project.get_storage_path( + type_=type_, version_slug=version_slug + ) + if storage.exists(storage_path): + return HttpResponseRedirect(storage.url(storage_path)) + + media_path = os.path.join( settings.MEDIA_URL, type_, project_slug, version_slug, - '{}.{}'.format(project_slug, type_.replace('htmlzip', 'zip')), + '%s.%s' % (project_slug, type_.replace('htmlzip', 'zip')), ) - return HttpResponseRedirect(path) + return HttpResponseRedirect(media_path) # Get relative media path path = ( diff --git a/readthedocs/rtd_tests/tests/test_project.py b/readthedocs/rtd_tests/tests/test_project.py index 4a56bb9395e..178ec745cd5 100644 --- a/readthedocs/rtd_tests/tests/test_project.py +++ b/readthedocs/rtd_tests/tests/test_project.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- import datetime import json @@ -121,6 +120,20 @@ def test_multiple_conf_files(self, find_method): ) as cm: self.pip.conf_file() + def test_get_storage_path(self): + self.assertEqual( + self.pip.get_storage_path('pdf', LATEST), + 'pdf/pip/latest/pip.pdf', + ) + self.assertEqual( + self.pip.get_storage_path('epub', LATEST), + 'epub/pip/latest/pip.epub', + ) + self.assertEqual( + self.pip.get_storage_path('htmlzip', LATEST), + 'htmlzip/pip/latest/pip.zip', + ) + class TestProjectTranslations(ProjectMixin, TestCase): @@ -239,10 +252,8 @@ def test_user_can_not_add_other_user_project_as_translation(self): self.assertIsNone(project_b.main_language_project) def test_previous_users_can_list_and_delete_translations_not_owner(self): - """ - Test to make sure that previous users can list and delete - projects where they aren't owners. - """ + """Test to make sure that previous users can list and delete projects + where they aren't owners.""" user_a = User.objects.get(username='eric') project_a = get( Project, users=[user_a], diff --git a/readthedocs/rtd_tests/tests/test_project_views.py b/readthedocs/rtd_tests/tests/test_project_views.py index f19c5b664fb..f561f78acd6 100644 --- a/readthedocs/rtd_tests/tests/test_project_views.py +++ b/readthedocs/rtd_tests/tests/test_project_views.py @@ -1,8 +1,7 @@ -# -*- coding: utf-8 -*- from datetime import timedelta -from mock import patch +from allauth.account.models import EmailAddress from django.contrib.auth.models import User from django.contrib.messages import constants as message_const from django.http.response import HttpResponseRedirect @@ -11,8 +10,9 @@ from django.utils import timezone from django.views.generic.base import ContextMixin from django_dynamic_fixture import get, new -from allauth.account.models import EmailAddress +from mock import patch +from readthedocs.builds.constants import LATEST from readthedocs.builds.models import Build, Version from readthedocs.oauth.models import RemoteRepository from readthedocs.projects import tasks @@ -367,6 +367,16 @@ def test_import_demo_imported_duplicate(self): ) +class TestPublicViews(MockBuildTestCase): + def setUp(self): + self.pip = get(Project, slug='pip') + + def test_project_download_media(self): + url = reverse('project_download_media', args=[self.pip.slug, 'pdf', LATEST]) + response = self.client.get(url) + self.assertEqual(response.status_code, 302) + + class TestPrivateViews(MockBuildTestCase): def setUp(self): self.user = new(User, username='eric') diff --git a/readthedocs/rtd_tests/tests/test_projects_tasks.py b/readthedocs/rtd_tests/tests/test_projects_tasks.py index 03beaf87106..c91f9b1534c 100644 --- a/readthedocs/rtd_tests/tests/test_projects_tasks.py +++ b/readthedocs/rtd_tests/tests/test_projects_tasks.py @@ -18,8 +18,9 @@ def setUp(self): @patch('readthedocs.builds.syncers.Syncer.copy') @patch('readthedocs.projects.tasks.shutil.rmtree') def test_sync_files_clean_old_artifacts(self, rmtree, copy): - sync_files(self.project.pk, self.version.pk, 'sphinx', html=True) - pdf, epub = rmtree.call_args_list + sync_files(self.project.pk, self.version.pk, 'sphinx', + html=True, delete_unsynced_media=True) + pdf, epub, htmlzip = rmtree.call_args_list # pdf and epub are cleaned args, _ = pdf @@ -28,7 +29,7 @@ def test_sync_files_clean_old_artifacts(self, rmtree, copy): self.assertIn('epub', args[0]) # Artifacts are copied to the rtd-builds directory - copy.assert_called_once() + assert rmtree.call_count == 3 args, _ = copy.call_args self.assertIn('artifacts', args[0]) self.assertIn('sphinx', args[0]) @@ -38,16 +39,18 @@ def test_sync_files_clean_old_artifacts(self, rmtree, copy): @patch('readthedocs.projects.tasks.shutil.rmtree') def test_sync_files_pdf(self, rmtree, copy): sync_files( - self.project.pk, self.version.pk, 'sphinx', pdf=True + self.project.pk, self.version.pk, 'sphinx', pdf=True, delete_unsynced_media=True ) + epub, htmlzip = rmtree.call_args_list # epub is cleaned - rmtree.assert_called_once() - args, _ = rmtree.call_args + args, _ = epub self.assertIn('epub', args[0]) + args, _ = htmlzip + self.assertIn('htmlzip', args[0]) # Artifacts are copied to the media directory - copy.assert_called_once() + assert rmtree.call_count == 2 args, _ = copy.call_args self.assertIn('artifacts', args[0]) self.assertIn('sphinx_pdf', args[0]) @@ -57,16 +60,19 @@ def test_sync_files_pdf(self, rmtree, copy): @patch('readthedocs.projects.tasks.shutil.rmtree') def test_sync_files_epub(self, rmtree, copy): sync_files( - self.project.pk, self.version.pk, 'sphinx', epub=True + self.project.pk, self.version.pk, 'sphinx', epub=True, delete_unsynced_media=True ) - # pdf is cleaned - rmtree.assert_called_once() - args, _ = rmtree.call_args + pdf, htmlzip = rmtree.call_args_list + + # epub is cleaned + args, _ = pdf self.assertIn('pdf', args[0]) + args, _ = htmlzip + self.assertIn('htmlzip', args[0]) # Artifacts are copied to the media directory - copy.assert_called_once() + assert rmtree.call_count == 2 args, _ = copy.call_args self.assertIn('artifacts', args[0]) self.assertIn('sphinx_epub', args[0]) @@ -76,7 +82,7 @@ def test_sync_files_epub(self, rmtree, copy): @patch('readthedocs.projects.tasks.shutil.rmtree') def test_sync_files_localmedia(self, rmtree, copy): sync_files( - self.project.pk, self.version.pk, 'sphinx', localmedia=True + self.project.pk, self.version.pk, 'sphinx', localmedia=True, delete_unsynced_media=True ) pdf, epub = rmtree.call_args_list @@ -87,7 +93,7 @@ def test_sync_files_localmedia(self, rmtree, copy): self.assertIn('epub', args[0]) # Artifacts are copied to the media directory - copy.assert_called_once() + assert rmtree.call_count == 2 args, _ = copy.call_args self.assertIn('artifacts', args[0]) self.assertIn('sphinx_localmedia', args[0]) @@ -97,15 +103,17 @@ def test_sync_files_localmedia(self, rmtree, copy): @patch('readthedocs.projects.tasks.shutil.rmtree') def test_sync_files_search(self, rmtree, copy): sync_files( - self.project.pk, self.version.pk, 'sphinx', search=True + self.project.pk, self.version.pk, 'sphinx', search=True, delete_unsynced_media=True ) - pdf, epub = rmtree.call_args_list + pdf, epub, htmlzip = rmtree.call_args_list # pdf and epub are cleaned args, _ = pdf self.assertIn('pdf', args[0]) args, _ = epub self.assertIn('epub', args[0]) + args, _ = htmlzip + self.assertIn('htmlzip', args[0]) # Artifacts are copied to the media directory copy.assert_called_once()