Skip to content

Pull/Push cached environment using storage #6763

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 12, 2020
6 changes: 6 additions & 0 deletions readthedocs/builds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import regex
from django.conf import settings
from django.core.files.storage import get_storage_class
from django.db import models
from django.db.models import F
from django.urls import reverse
Expand Down Expand Up @@ -451,6 +452,11 @@ def get_storage_paths(self):

return paths

def get_storage_environment_cache_path(self):
    """Return the storage path of this version's cached-environment tarball."""
    environment_storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)()
    # Layout is <project_slug>/<version_slug>.tar, one cache per version.
    tarball_name = f'{self.slug}.tar'
    return environment_storage.join(self.project.slug, tarball_name)

def clean_build_path(self):
"""
Clean build path for project version.
Expand Down
6 changes: 6 additions & 0 deletions readthedocs/core/utils/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import os

from django.conf import settings
from django.core.files.storage import get_storage_class
from django.shortcuts import get_object_or_404

from readthedocs.core.utils import broadcast
Expand All @@ -24,3 +26,7 @@ def wipe_version_via_slugs(version_slug, project_slug):
]
for del_dir in del_dirs:
broadcast(type='build', task=remove_dirs, args=[(del_dir,)])

# Delete the cache environment from storage
storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)()
storage.delete(version.get_storage_environment_cache_path())
5 changes: 5 additions & 0 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1515,6 +1515,7 @@ def add_features(sender, **kwargs):
SKIP_SYNC_TAGS = 'skip_sync_tags'
SKIP_SYNC_BRANCHES = 'skip_sync_branches'
SKIP_SYNC = 'skip_sync'
CACHED_ENVIRONMENT = 'cached_environment'

FEATURES = (
(USE_SPHINX_LATEST, _('Use latest version of Sphinx')),
Expand Down Expand Up @@ -1585,6 +1586,10 @@ def add_features(sender, **kwargs):
SKIP_SYNC,
_('Skip symlinking and file syncing to webs'),
),
(
CACHED_ENVIRONMENT,
_('Cache the environment (virtualenv, conda, pip cache, repository) in storage'),
),
)

projects = models.ManyToManyField(
Expand Down
104 changes: 102 additions & 2 deletions readthedocs/projects/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import os
import shutil
import socket
import tarfile
import tempfile
from collections import Counter, defaultdict

import requests
Expand Down Expand Up @@ -87,6 +89,94 @@
log = logging.getLogger(__name__)


class CachedEnvironmentMixin:

    """Mixin that pulls/pushes the cached build environment from/to storage."""

    def pull_cached_environment(self):
        """
        Download and unpack this version's cached environment, if one exists.

        No-op unless the project has the ``CACHED_ENVIRONMENT`` feature flag.
        The tarball is extracted under the project's ``doc_path``.
        """
        if not self.project.has_feature(feature_id=Feature.CACHED_ENVIRONMENT):
            return

        storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)()
        filename = self.version.get_storage_environment_cache_path()

        msg = 'Checking for cached environment'
        log.debug(
            LOG_TEMPLATE,
            {
                'project': self.project.slug,
                'version': self.version.slug,
                'msg': msg,
            }
        )
        if storage.exists(filename):
            msg = 'Pulling down cached environment from storage'
            log.info(
                LOG_TEMPLATE,
                {
                    'project': self.project.slug,
                    'version': self.version.slug,
                    'msg': msg,
                }
            )
            # BUGFIX: tempfile.mkstemp() returns a (fd, path) tuple; the
            # original assigned the tuple to tmp_filename, which made the
            # subsequent open()/tarfile.open() calls raise TypeError.
            fd, tmp_filename = tempfile.mkstemp(suffix='.tar')
            try:
                # Stream in chunks instead of reading the whole remote file
                # into memory, and make sure both descriptors get closed.
                with storage.open(filename, mode='rb') as remote_fd:
                    with os.fdopen(fd, mode='wb') as local_fd:
                        shutil.copyfileobj(remote_fd, local_fd)

                with tarfile.open(tmp_filename) as tar:
                    # NOTE(review): extractall() trusts member paths; the
                    # archive is produced by push_cached_environment, but
                    # confirm the storage bucket isn't writable by
                    # untrusted parties before relying on that.
                    tar.extractall(self.version.project.doc_path)
            finally:
                # Cleanup the temporary file even if extraction fails
                if os.path.exists(tmp_filename):
                    os.remove(tmp_filename)

    def push_cached_environment(self):
        """
        Pack this version's environment directories and upload to storage.

        No-op unless the project has the ``CACHED_ENVIRONMENT`` feature flag.
        Only directories that currently exist are added to the archive.
        """
        if not self.project.has_feature(feature_id=Feature.CACHED_ENVIRONMENT):
            return

        project_path = self.project.doc_path
        paths = [
            os.path.join(project_path, 'checkouts', self.version.slug),
            os.path.join(project_path, 'envs', self.version.slug),
            os.path.join(project_path, 'conda', self.version.slug),
            os.path.join(project_path, '.cache'),
        ]

        # BUGFIX: tempfile.mkstemp() returns (fd, path) — unpack it and
        # close the fd, since tarfile reopens the path by name.
        fd, tmp_filename = tempfile.mkstemp(suffix='.tar')
        os.close(fd)
        try:
            # open just with 'w', to not compress and waste CPU cycles
            with tarfile.open(tmp_filename, 'w') as tar:
                for path in paths:
                    if os.path.exists(path):
                        # arcname becomes '<parent-dir-name>/<version-slug>'
                        # (e.g. 'envs/latest'); NOTE(review): for '.cache'
                        # the parent is the doc_path itself — confirm this
                        # round-trips correctly with pull_cached_environment.
                        tar.add(
                            path,
                            arcname=os.path.join(
                                os.path.basename(os.path.dirname(path)),
                                self.version.slug,
                            )
                        )

            storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)()
            with open(tmp_filename, 'rb') as local_fd:
                msg = 'Pushing up cached environment to storage'
                log.info(
                    LOG_TEMPLATE,
                    {
                        'project': self.project.slug,
                        'version': self.version.slug,
                        'msg': msg,
                    }
                )
                storage.save(
                    self.version.get_storage_environment_cache_path(),
                    local_fd,
                )
        finally:
            # Cleanup the temporary file even if archiving/upload fails
            if os.path.exists(tmp_filename):
                os.remove(tmp_filename)


class SyncRepositoryMixin:

"""Mixin that handles the VCS sync/update."""
Expand Down Expand Up @@ -230,7 +320,7 @@ def sync_repository_task(version_pk):
clean_build(version_pk)


class SyncRepositoryTaskStep(SyncRepositoryMixin):
class SyncRepositoryTaskStep(SyncRepositoryMixin, CachedEnvironmentMixin):

"""
Entry point to synchronize the VCS documentation.
Expand Down Expand Up @@ -271,6 +361,12 @@ def run(self, version_pk): # pylint: disable=arguments-differ
with environment:
before_vcs.send(sender=self.version, environment=environment)
with self.project.repo_nonblockinglock(version=self.version):
# When syncing we are only pulling the cached environment
# (without pushing it after it's updated). We only clone the
# repository in this step, and pushing it back will delete
# all the other cached things (Python packages, Sphinx,
# virtualenv, etc)
self.pull_cached_environment()
self.sync_repo(environment)
return True
except RepositoryError:
Expand Down Expand Up @@ -329,7 +425,7 @@ def update_docs_task(self, version_pk, *args, **kwargs):
clean_build(version_pk)


class UpdateDocsTaskStep(SyncRepositoryMixin):
class UpdateDocsTaskStep(SyncRepositoryMixin, CachedEnvironmentMixin):

"""
The main entry point for updating documentation.
Expand Down Expand Up @@ -492,6 +588,7 @@ def run_setup(self, record=True):
raise ProjectBuildsSkippedError
try:
with self.project.repo_nonblockinglock(version=self.version):
self.pull_cached_environment()
self.setup_vcs(environment)
except vcs_support_utils.LockTimeout as e:
self.task.retry(exc=e, throw=False)
Expand Down Expand Up @@ -646,6 +743,9 @@ def run_build(self, record):
# Send Webhook notification for build success.
self.send_notifications(self.version.pk, self.build['id'], email=False)

# Push cached environment on success for next build
self.push_cached_environment()

if self.commit:
send_external_build_status(
version_type=self.version.type,
Expand Down
1 change: 1 addition & 0 deletions readthedocs/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ def USE_PROMOS(self): # noqa
# Django Storage subclass used to write build artifacts to cloud or local storage
# https://docs.readthedocs.io/page/development/settings.html#rtd-build-media-storage
RTD_BUILD_MEDIA_STORAGE = 'readthedocs.builds.storage.BuildMediaFileSystemStorage'
RTD_BUILD_ENVIRONMENT_STORAGE = 'readthedocs.builds.storage.BuildMediaFileSystemStorage'

TEMPLATES = [
{
Expand Down
3 changes: 3 additions & 0 deletions readthedocs/settings/docker_compose.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ def DATABASES(self): # noqa
RTD_BUILD_MEDIA_STORAGE = 'readthedocs.storage.azure_storage.AzureBuildMediaStorage'
AZURE_STATIC_STORAGE_HOSTNAME = PRODUCTION_DOMAIN

# Storage backend for build cached environments
RTD_BUILD_ENVIRONMENT_STORAGE = 'readthedocs.storage.azure_storage.AzureBuildEnvironmentStorage'

# Storage for static files (those collected with `collectstatic`)
STATICFILES_STORAGE = 'readthedocs.storage.azure_storage.AzureStaticStorage'

Expand Down
5 changes: 5 additions & 0 deletions readthedocs/storage/azure_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ class AzureBuildStorage(AzureStorage):
azure_container = getattr(settings, 'AZURE_BUILD_STORAGE_CONTAINER', None) or 'builds'


class AzureBuildEnvironmentStorage(BuildMediaStorageMixin, AzureStorage):

    """Azure blob storage backend for cached build-environment tarballs."""

    # Container name comes from AZURE_BUILD_ENVIRONMENT_STORAGE_CONTAINER when
    # set (and non-empty); otherwise falls back to the 'envs' container.
    azure_container = getattr(settings, 'AZURE_BUILD_ENVIRONMENT_STORAGE_CONTAINER', None) or 'envs'


class AzureStaticStorage(OverrideHostnameMixin, ManifestFilesMixin, AzureStorage):

"""
Expand Down