diff --git a/readthedocs/builds/models.py b/readthedocs/builds/models.py index f62632a41ab..d7aab0fade1 100644 --- a/readthedocs/builds/models.py +++ b/readthedocs/builds/models.py @@ -8,6 +8,7 @@ import regex from django.conf import settings +from django.core.files.storage import get_storage_class from django.db import models from django.db.models import F from django.urls import reverse @@ -451,6 +452,11 @@ def get_storage_paths(self): return paths + def get_storage_environment_cache_path(self): + """Return the path of the cached environment tar file.""" + storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)() + return storage.join(self.project.slug, f'{self.slug}.tar') + def clean_build_path(self): """ Clean build path for project version. diff --git a/readthedocs/core/utils/general.py b/readthedocs/core/utils/general.py index 63cc0c2dc66..f1b5e1c0c5b 100644 --- a/readthedocs/core/utils/general.py +++ b/readthedocs/core/utils/general.py @@ -2,6 +2,8 @@ import os +from django.conf import settings +from django.core.files.storage import get_storage_class from django.shortcuts import get_object_or_404 from readthedocs.core.utils import broadcast @@ -24,3 +26,7 @@ def wipe_version_via_slugs(version_slug, project_slug): ] for del_dir in del_dirs: broadcast(type='build', task=remove_dirs, args=[(del_dir,)]) + + # Delete the cache environment from storage + storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)() + storage.delete(version.get_storage_environment_cache_path()) diff --git a/readthedocs/doc_builder/python_environments.py b/readthedocs/doc_builder/python_environments.py index 7eeddcd1100..d2750a16b24 100644 --- a/readthedocs/doc_builder/python_environments.py +++ b/readthedocs/doc_builder/python_environments.py @@ -129,12 +129,15 @@ def _pip_cache_cmd_argument(self): The decision is made considering if the directories are going to be cleaned after the build (``RTD_CLEAN_AFTER_BUILD=True`` or project has - the ``CLEAN_AFTER_BUILD`` feature enabled). In this case, there is no - need to cache anything. + the ``CLEAN_AFTER_BUILD`` feature enabled) and project has not the + feature ``CACHED_ENVIRONMENT``. In this case, there is no need to cache + anything. """ if ( - settings.RTD_CLEAN_AFTER_BUILD or - self.project.has_feature(Feature.CLEAN_AFTER_BUILD) + ( + settings.RTD_CLEAN_AFTER_BUILD or + self.project.has_feature(Feature.CLEAN_AFTER_BUILD) + ) and not self.project.has_feature(Feature.CACHED_ENVIRONMENT) ): return [ '--no-cache-dir', diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index 8a157432c78..6984266b7ea 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -1515,6 +1515,7 @@ def add_features(sender, **kwargs): SKIP_SYNC_TAGS = 'skip_sync_tags' SKIP_SYNC_BRANCHES = 'skip_sync_branches' SKIP_SYNC = 'skip_sync' + CACHED_ENVIRONMENT = 'cached_environment' FEATURES = ( (USE_SPHINX_LATEST, _('Use latest version of Sphinx')), @@ -1585,6 +1586,10 @@ def add_features(sender, **kwargs): SKIP_SYNC, _('Skip symlinking and file syncing to webs'), ), + ( + CACHED_ENVIRONMENT, + _('Cache the environment (virtualenv, conda, pip cache, repository) in storage'), + ), ) projects = models.ManyToManyField( diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index 3a51e1d20cb..645523b3efb 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -12,6 +12,8 @@ import os import shutil import socket +import tarfile +import tempfile from collections import Counter, defaultdict import requests @@ -87,6 +89,97 @@ log = logging.getLogger(__name__) +class CachedEnvironmentMixin: + + """Mixin that pull/push cached environment to storage.""" + + def pull_cached_environment(self): + if not self.project.has_feature(feature_id=Feature.CACHED_ENVIRONMENT): + return + + storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)() + filename = self.version.get_storage_environment_cache_path() + + msg = 'Checking for cached environment' + log.debug( + LOG_TEMPLATE, + { + 'project': self.project.slug, + 'version': self.version.slug, + 'msg': msg, + } + ) + if storage.exists(filename): + msg = 'Pulling down cached environment from storage' + log.info( + LOG_TEMPLATE, + { + 'project': self.project.slug, + 'version': self.version.slug, + 'msg': msg, + } + ) + _, tmp_filename = tempfile.mkstemp(suffix='.tar') + remote_fd = storage.open(filename, mode='rb') + with open(tmp_filename, mode='wb') as local_fd: + local_fd.write(remote_fd.read()) + + with tarfile.open(tmp_filename) as tar: + tar.extractall(self.version.project.doc_path) + + # Cleanup the temporary file + if os.path.exists(tmp_filename): + os.remove(tmp_filename) + + def push_cached_environment(self): + if not self.project.has_feature(feature_id=Feature.CACHED_ENVIRONMENT): + return + + project_path = self.project.doc_path + directories = [ + 'checkouts', + 'envs', + 'conda', + ] + + _, tmp_filename = tempfile.mkstemp(suffix='.tar') + # open just with 'w', to not compress and waste CPU cycles + with tarfile.open(tmp_filename, 'w') as tar: + for directory in directories: + path = os.path.join( + project_path, + directory, + self.version.slug, + ) + arcname = os.path.join(directory, self.version.slug) + if os.path.exists(path): + tar.add(path, arcname=arcname) + + # Special handling for .cache directory because it's per-project + path = os.path.join(project_path, '.cache') + tar.add(path, arcname='.cache') + + storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)() + with open(tmp_filename, 'rb') as fd: + msg = 'Pushing up cached environment to storage' + log.info( + LOG_TEMPLATE, + { + 'project': self.project.slug, + 'version': self.version.slug, + 'msg': msg, + } + ) + storage.save( + self.version.get_storage_environment_cache_path(), + fd, + ) + + # Cleanup the temporary file + if os.path.exists(tmp_filename): + os.remove(tmp_filename) + + class SyncRepositoryMixin: """Mixin that handles the VCS sync/update.""" @@ -230,7 +323,7 @@ def sync_repository_task(version_pk): clean_build(version_pk) -class SyncRepositoryTaskStep(SyncRepositoryMixin): +class SyncRepositoryTaskStep(SyncRepositoryMixin, CachedEnvironmentMixin): """ Entry point to synchronize the VCS documentation. @@ -271,6 +364,12 @@ def run(self, version_pk): # pylint: disable=arguments-differ with environment: before_vcs.send(sender=self.version, environment=environment) with self.project.repo_nonblockinglock(version=self.version): + # When syncing we are only pulling the cached environment + # (without pushing it after it's updated). We only clone the + # repository in this step, and pushing it back will delete + # all the other cached things (Python packages, Sphinx, + # virtualenv, etc) + self.pull_cached_environment() self.sync_repo(environment) return True except RepositoryError: @@ -329,7 +428,7 @@ def update_docs_task(self, version_pk, *args, **kwargs): clean_build(version_pk) -class UpdateDocsTaskStep(SyncRepositoryMixin): +class UpdateDocsTaskStep(SyncRepositoryMixin, CachedEnvironmentMixin): """ The main entry point for updating documentation. @@ -492,6 +591,7 @@ def run_setup(self, record=True): raise ProjectBuildsSkippedError try: with self.project.repo_nonblockinglock(version=self.version): + self.pull_cached_environment() self.setup_vcs(environment) except vcs_support_utils.LockTimeout as e: self.task.retry(exc=e, throw=False) @@ -646,6 +746,9 @@ def run_build(self, record): # Send Webhook notification for build success. self.send_notifications(self.version.pk, self.build['id'], email=False) + # Push cached environment on success for next build + self.push_cached_environment() + if self.commit: send_external_build_status( version_type=self.version.type, diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index b517e0b5a04..e83d18c74ab 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -259,6 +259,7 @@ def USE_PROMOS(self): # noqa # Django Storage subclass used to write build artifacts to cloud or local storage # https://docs.readthedocs.io/page/development/settings.html#rtd-build-media-storage RTD_BUILD_MEDIA_STORAGE = 'readthedocs.builds.storage.BuildMediaFileSystemStorage' + RTD_BUILD_ENVIRONMENT_STORAGE = 'readthedocs.builds.storage.BuildMediaFileSystemStorage' TEMPLATES = [ { diff --git a/readthedocs/settings/docker_compose.py b/readthedocs/settings/docker_compose.py index f09e32705c5..7d566aabf18 100644 --- a/readthedocs/settings/docker_compose.py +++ b/readthedocs/settings/docker_compose.py @@ -117,6 +117,9 @@ def DATABASES(self): # noqa RTD_BUILD_MEDIA_STORAGE = 'readthedocs.storage.azure_storage.AzureBuildMediaStorage' AZURE_STATIC_STORAGE_HOSTNAME = PRODUCTION_DOMAIN + # Storage backend for build cached environments + RTD_BUILD_ENVIRONMENT_STORAGE = 'readthedocs.storage.azure_storage.AzureBuildEnvironmentStorage' + # Storage for static files (those collected with `collectstatic`) STATICFILES_STORAGE = 'readthedocs.storage.azure_storage.AzureStaticStorage' diff --git a/readthedocs/storage/azure_storage.py b/readthedocs/storage/azure_storage.py index be239974679..baa6940ed83 100644 --- a/readthedocs/storage/azure_storage.py +++ b/readthedocs/storage/azure_storage.py @@ -28,6 +28,11 @@ class AzureBuildStorage(AzureStorage): azure_container = getattr(settings, 'AZURE_BUILD_STORAGE_CONTAINER', None) or 'builds' +class AzureBuildEnvironmentStorage(BuildMediaStorageMixin, AzureStorage): + + azure_container = getattr(settings, 'AZURE_BUILD_ENVIRONMENT_STORAGE_CONTAINER', None) or 'envs' + + class AzureStaticStorage(OverrideHostnameMixin, ManifestFilesMixin, AzureStorage): """