Skip to content

Store ePubs and PDFs in media storage #4947

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Feb 26, 2019
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions readthedocs/builds/syncers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,15 @@
import shutil

from django.conf import settings
from django.core.exceptions import SuspiciousFileOperation
from django.core.files.storage import get_storage_class

from readthedocs.core.utils import safe_makedirs
from readthedocs.core.utils.extend import SettingsOverrideObject


log = logging.getLogger(__name__)
storage = get_storage_class()()


class BaseSyncer:
Expand All @@ -42,6 +45,11 @@ def copy(cls, path, target, is_file=False, **kwargs):
return
if os.path.exists(target):
os.remove(target)

# Create containing directory if it doesn't exist
directory = os.path.dirname(target)
safe_makedirs(directory)

shutil.copy2(path, target)
else:
if os.path.exists(target):
Expand Down Expand Up @@ -143,6 +151,10 @@ def copy(cls, path, target, host, is_file=False, **kwargs): # pylint: disable=a
log.info('Remote Pull %s to %s', path, target)
if not is_file and not os.path.exists(target):
safe_makedirs(target)
if is_file:
# Create containing directory if it doesn't exist
directory = os.path.dirname(target)
safe_makedirs(directory)
# Add a slash when copying directories
sync_cmd = "rsync -e 'ssh -T' -av --delete {user}@{host}:{path} {target}".format(
host=host,
Expand All @@ -159,6 +171,59 @@ def copy(cls, path, target, host, is_file=False, **kwargs): # pylint: disable=a
)


class SelectiveStorageRemotePuller(RemotePuller):

"""
Exactly like RemotePuller except that certain files are copied via Django's storage system

If a file with extensions specified by ``extensions`` is copied, it will be copied to storage
and the original is removed.

See: https://docs.djangoproject.com/en/1.11/ref/settings/#std:setting-DEFAULT_FILE_STORAGE
"""

extensions = ('.pdf', '.epub', '.zip')

@classmethod
def get_storage_path(cls, path):
"""
Gets the path to the file within the storage engine

For example, if the path was $MEDIA_ROOT/pdfs/latest.pdf
the storage_path is 'pdfs/latest.pdf'

:raises: SuspiciousFileOperation if the path isn't under settings.MEDIA_ROOT
"""
path = os.path.normpath(path)
if not path.startswith(settings.MEDIA_ROOT):
raise SuspiciousFileOperation

path = path.replace(settings.MEDIA_ROOT, '').lstrip('/')
return path

@classmethod
def copy(cls, path, target, host, is_file=False, **kwargs): # pylint: disable=arguments-differ
RemotePuller.copy(path, target, host, is_file, **kwargs)

if getattr(storage, 'write_build_media', False):
# This is a sanity check for the case where
# storage is backed by the local filesystem
# In that case, removing the original target file locally
# would remove the file from storage as well

if is_file and os.path.exists(target) and \
any([target.lower().endswith(ext) for ext in cls.extensions]):
log.info("Selective Copy %s to media storage", target)

storage_path = cls.get_storage_path(target)

if storage.exists(storage_path):
storage.delete(storage_path)

with open(target, 'rb') as fd:
storage.save(storage_path, fd)


class Syncer(SettingsOverrideObject):
_default_class = LocalSyncer
_override_setting = 'FILE_SYNCER'
47 changes: 29 additions & 18 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from django.conf import settings
from django.contrib.auth.models import User
from django.core.files.storage import get_storage_class
from django.db import models
from django.urls import NoReverseMatch, reverse
from django.utils.functional import cached_property
Expand Down Expand Up @@ -44,6 +45,7 @@


log = logging.getLogger(__name__)
storage = get_storage_class()()


class ProjectRelationship(models.Model):
Expand Down Expand Up @@ -510,6 +512,24 @@ def get_subproject_urls(self):
return [(proj.child.slug, proj.child.get_docs_url())
for proj in self.subprojects.all()]

def get_storage_path(self, type_, version_slug=LATEST):
"""
Get a path to a build artifact for use with Django's storage system

:param type_: Media content type, ie - 'pdf', 'htmlzip'
:param version_slug: Project version slug for lookup
:return: the path to an item in storage
(can be used with ``storage.url`` to get the URL)
"""
extension = type_.replace('htmlzip', 'zip')
return '{}/{}/{}/{}.{}'.format(
type_,
self.slug,
version_slug,
self.slug,
extension,
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we strictly talking filesystem path here, or URL path, or both? If filesystem, this should use os.path.join()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a storage check, not a filesystem check.


def get_production_media_path(self, type_, version_slug, include_file=True):
"""
Used to see if these files exist so we can offer them for download.
Expand Down Expand Up @@ -728,30 +748,21 @@ def has_aliases(self):
def has_pdf(self, version_slug=LATEST):
if not self.enable_pdf_build:
return False
return os.path.exists(
self.get_production_media_path(
type_='pdf',
version_slug=version_slug,
)
)
path = self.get_production_media_path(type_='pdf', version_slug=version_slug)
storage_path = self.get_storage_path(type_='pdf', version_slug=version_slug)
return os.path.exists(path) or storage.exists(storage_path)

def has_epub(self, version_slug=LATEST):
if not self.enable_epub_build:
return False
return os.path.exists(
self.get_production_media_path(
type_='epub',
version_slug=version_slug,
)
)
path = self.get_production_media_path(type_='epub', version_slug=version_slug)
storage_path = self.get_storage_path(type_='epub', version_slug=version_slug)
return os.path.exists(path) or storage.exists(storage_path)

def has_htmlzip(self, version_slug=LATEST):
return os.path.exists(
self.get_production_media_path(
type_='htmlzip',
version_slug=version_slug,
)
)
path = self.get_production_media_path(type_='htmlzip', version_slug=version_slug)
storage_path = self.get_storage_path(type_='htmlzip', version_slug=version_slug)
return os.path.exists(path) or storage.exists(storage_path)

@property
def sponsored(self):
Expand Down
116 changes: 84 additions & 32 deletions readthedocs/projects/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import requests
from celery.exceptions import SoftTimeLimitExceeded
from django.conf import settings
from django.core.files.storage import get_storage_class
from django.db.models import Q
from django.urls import reverse
from django.utils import timezone
Expand Down Expand Up @@ -79,6 +80,7 @@


log = logging.getLogger(__name__)
storage = get_storage_class()()


class SyncRepositoryMixin:
Expand Down Expand Up @@ -703,6 +705,7 @@ def update_app_instances(
search=False,
pdf=False,
epub=False,
delete=True,
):
"""
Update application instances with build artifacts.
Expand All @@ -722,6 +725,25 @@ def update_app_instances(
'Updating version failed, skipping file sync: version=%s',
self.version,
)
hostname = socket.gethostname()

if getattr(storage, 'write_build_media', False):
# Handle the case where we want to upload some built assets to our storage
move_files.delay(
self.version.pk,
hostname,
self.config.doctype,
html=False,
search=False,
localmedia=localmedia,
pdf=pdf,
epub=epub,
)
# Set variables so they don't get synced in the next broadcast step
localmedia = False
pdf = False
epub = False
delete = False

# Broadcast finalization steps to web application instances
broadcast(
Expand All @@ -733,17 +755,17 @@ def update_app_instances(
self.config.doctype,
],
kwargs=dict(
hostname=socket.gethostname(),
hostname=hostname,
html=html,
localmedia=localmedia,
search=search,
pdf=pdf,
epub=epub,
delete=delete,
),
callback=sync_callback.s(
version_pk=self.version.pk,
commit=self.build['commit'],
search=search,
),
)

Expand Down Expand Up @@ -896,6 +918,7 @@ def sync_files(
search=False,
pdf=False,
epub=False,
delete=False,
):
"""
Sync build artifacts to application instances.
Expand All @@ -907,20 +930,41 @@ def sync_files(
version = Version.objects.get_object_or_log(pk=version_pk)
if not version:
return
if not pdf:
remove_dirs([
version.project.get_production_media_path(
type_='pdf',
version_slug=version.slug,
),
])
if not epub:
remove_dirs([
version.project.get_production_media_path(
type_='epub',
version_slug=version.slug,
),
])
if delete:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there's an issue around this. While I was testing the removal feature, this flag was always False. I'm actually not 100% sure why we need this flag at all. Wouldn't we always want to remove old PDFs/ePubs?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should only be true when being called w/ the broadcast. I guess we could delete the old ones before replacing them with the new ones, but that seems like wasted effort, but arguably could be done.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess we could delete the old ones before replacing them with the new ones, but that seems like wasted effort, but arguably could be done.

For storage, you have to do this. There isn't an "overwrite" option.

Copy link
Contributor Author

@davidfischer davidfischer Feb 14, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should only be true when being called w/ the broadcast.

Is the implication here that this won't happen when run locally? I wasn't able to successfully test deleting artifacts when PDF/Epubs are disabled locally.

Edit: I tested it, but it didn't work correctly.

if not pdf:
remove_dirs([
version.project.get_production_media_path(
type_='pdf',
version_slug=version.slug,
),
])

if getattr(storage, 'write_build_media', False):
# Remove PDF from remote storage if it exists
storage_path = version.project.get_storage_path(
type_='pdf',
version_slug=version.slug,
)
if storage.exists(storage_path):
log.info('Removing %s from media storage', storage_path)
storage.delete(storage_path)
if not epub:
remove_dirs([
version.project.get_production_media_path(
type_='epub',
version_slug=version.slug,
),
])

if getattr(storage, 'write_build_media', False):
# Remove ePub from remote storage if it exists
storage_path = version.project.get_storage_path(
type_='epub',
version_slug=version.slug,
)
if storage.exists(storage_path):
log.info('Removing %s from media storage', storage_path)
storage.delete(storage_path)

# Sync files to the web servers
move_files(
Expand Down Expand Up @@ -1000,40 +1044,48 @@ def move_files(
Syncer.copy(from_path, to_path, host=hostname)

if localmedia:
from_path = version.project.artifact_path(
version=version.slug,
type_='sphinx_localmedia',
from_path = os.path.join(
version.project.artifact_path(
version=version.slug,
type_='sphinx_localmedia',
),
'{}.zip'.format(version.project.slug),
)
to_path = version.project.get_production_media_path(
type_='htmlzip',
version_slug=version.slug,
include_file=False,
include_file=True,
)
Syncer.copy(from_path, to_path, host=hostname)

Syncer.copy(from_path, to_path, host=hostname, is_file=True)
# Always move PDF's because the return code lies.
if pdf:
from_path = version.project.artifact_path(
version=version.slug,
type_='sphinx_pdf',
from_path = os.path.join(
version.project.artifact_path(
version=version.slug,
type_='sphinx_pdf',
),
'{}.pdf'.format(version.project.slug),
)
to_path = version.project.get_production_media_path(
type_='pdf',
version_slug=version.slug,
include_file=False,
include_file=True,
)
Syncer.copy(from_path, to_path, host=hostname)
Syncer.copy(from_path, to_path, host=hostname, is_file=True)
if epub:
from_path = version.project.artifact_path(
version=version.slug,
type_='sphinx_epub',
from_path = os.path.join(
version.project.artifact_path(
version=version.slug,
type_='sphinx_epub',
),
'{}.epub'.format(version.project.slug),
)
to_path = version.project.get_production_media_path(
type_='epub',
version_slug=version.slug,
include_file=False,
include_file=True,
)
Syncer.copy(from_path, to_path, host=hostname)
Syncer.copy(from_path, to_path, host=hostname, is_file=True)


@app.task(queue='web')
Expand Down
Loading