From 9ec4258f47c3e085c7fab5f10d9c89ae3838875a Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Mon, 9 Mar 2020 16:18:57 -0700 Subject: [PATCH 01/10] Pull/Push cached environment using storage Before starting clonnig the repository we check of a cached environment in the storage. If there is one, we pull it and extract it under `project.doc_path` (using the same structure that we have been using) and use those cached files. Once the build has finished and it was successful, we compress all the environment files (pip cache, checkout, conda and virtualenv environments) and push it to the storage. The cached is cleaned up when the version is wiped. This feature is under a Feature flag (CACHED_ENVIRONMENT), so we can select specific projects to start testing out this cached environment. It can help on projects with big/huge repositories or that install a lot of dependencies. --- readthedocs/builds/models.py | 6 ++ readthedocs/core/utils/general.py | 6 ++ readthedocs/projects/models.py | 5 ++ readthedocs/projects/tasks.py | 92 +++++++++++++++++++++++++- readthedocs/settings/docker_compose.py | 2 + readthedocs/storage/azure_storage.py | 5 ++ 6 files changed, 114 insertions(+), 2 deletions(-) diff --git a/readthedocs/builds/models.py b/readthedocs/builds/models.py index f62632a41ab..d7aab0fade1 100644 --- a/readthedocs/builds/models.py +++ b/readthedocs/builds/models.py @@ -8,6 +8,7 @@ import regex from django.conf import settings +from django.core.files.storage import get_storage_class from django.db import models from django.db.models import F from django.urls import reverse @@ -451,6 +452,11 @@ def get_storage_paths(self): return paths + def get_storage_environment_cache_path(self): + """Return the path of the cached environment tar file.""" + storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)() + return storage.join(self.project.slug, f'{self.slug}.tar') + def clean_build_path(self): """ Clean build path for project version. diff --git a/readthedocs/core/utils/general.py b/readthedocs/core/utils/general.py index 63cc0c2dc66..f1b5e1c0c5b 100644 --- a/readthedocs/core/utils/general.py +++ b/readthedocs/core/utils/general.py @@ -2,6 +2,8 @@ import os +from django.conf import settings +from django.core.files.storage import get_storage_class from django.shortcuts import get_object_or_404 from readthedocs.core.utils import broadcast @@ -24,3 +26,7 @@ def wipe_version_via_slugs(version_slug, project_slug): ] for del_dir in del_dirs: broadcast(type='build', task=remove_dirs, args=[(del_dir,)]) + + # Delete the cache environment from storage + storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)() + storage.delete(version.get_storage_environment_cache_path()) diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index 8a157432c78..6984266b7ea 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -1515,6 +1515,7 @@ def add_features(sender, **kwargs): SKIP_SYNC_TAGS = 'skip_sync_tags' SKIP_SYNC_BRANCHES = 'skip_sync_branches' SKIP_SYNC = 'skip_sync' + CACHED_ENVIRONMENT = 'cached_environment' FEATURES = ( (USE_SPHINX_LATEST, _('Use latest version of Sphinx')), @@ -1585,6 +1586,10 @@ def add_features(sender, **kwargs): SKIP_SYNC, _('Skip symlinking and file syncing to webs'), ), + ( + CACHED_ENVIRONMENT, + _('Cache the environment (virtualenv, conda, pip cache, repository) in storage'), + ), ) projects = models.ManyToManyField( diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index 3a51e1d20cb..f14b95daaca 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -12,6 +12,8 @@ import os import shutil import socket +import tarfile +import tempfile from collections import Counter, defaultdict import requests @@ -87,6 +89,87 @@ log = logging.getLogger(__name__) +class CachedEnvironmentMixin: + + """Mixin that pull/push cached environment to storage.""" + + def pull_cached_environment(self): + if not self.project.has_feature(feature_id=Feature.CACHED_ENVIRONMENT): + return + + storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)() + filename = self.version.get_storage_environment_cache_path() + + msg = 'Checking for cached environment' + log.debug( + LOG_TEMPLATE, + { + 'project': self.project.slug, + 'version': self.version.slug, + 'msg': msg, + } + ) + if storage.exists(filename): + msg = 'Pulling down cached environment from storage' + log.info( + LOG_TEMPLATE, + { + 'project': self.project.slug, + 'version': self.version.slug, + 'msg': msg, + } + ) + tmp_filename = tempfile.mktemp(suffix='.tar') + remote_fd = storage.open(filename, mode='rb') + with open(tmp_filename, mode='wb') as local_fd: + local_fd.write(remote_fd.read()) + + tar = tarfile.TarFile(tmp_filename) + tar.extractall(self.version.project.doc_path) + + + def push_cached_environment(self): + if not self.project.has_feature(feature_id=Feature.CACHED_ENVIRONMENT): + return + + project_path = self.project.doc_path + paths = [ + os.path.join(project_path, 'checkouts', self.version.slug), + os.path.join(project_path, 'envs', self.version.slug), + os.path.join(project_path, 'conda', self.version.slug), + os.path.join(project_path, '.cache'), + ] + + tmp_filename = tempfile.mktemp(suffix='.tar') + # open just with 'w', to not compress and waste CPU cycles + with tarfile.open(tmp_filename, 'w') as tar: + for path in paths: + if os.path.exists(path): + tar.add( + path, + arcname=os.path.join( + os.path.basename(os.path.dirname(path)), + self.version.slug, + ) + ) + + storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)() + with open(tmp_filename, 'rb') as fd: + msg = 'Pushing up cached environment to storage', + log.info( + LOG_TEMPLATE, + { + 'project': self.project.slug, + 'version': self.version.slug, + 'msg': msg, + } + ) + storage.save( + self.version.get_storage_environment_cache_path(), + fd, + ) + + class SyncRepositoryMixin: """Mixin that handles the VCS sync/update.""" @@ -230,7 +313,7 @@ def sync_repository_task(version_pk): clean_build(version_pk) -class SyncRepositoryTaskStep(SyncRepositoryMixin): +class SyncRepositoryTaskStep(SyncRepositoryMixin, CachedEnvironmentMixin): """ Entry point to synchronize the VCS documentation. @@ -271,6 +354,7 @@ def run(self, version_pk): # pylint: disable=arguments-differ with environment: before_vcs.send(sender=self.version, environment=environment) with self.project.repo_nonblockinglock(version=self.version): + self.pull_cached_environment() self.sync_repo(environment) return True except RepositoryError: @@ -329,7 +413,7 @@ def update_docs_task(self, version_pk, *args, **kwargs): clean_build(version_pk) -class UpdateDocsTaskStep(SyncRepositoryMixin): +class UpdateDocsTaskStep(SyncRepositoryMixin, CachedEnvironmentMixin): """ The main entry point for updating documentation. @@ -492,6 +576,7 @@ def run_setup(self, record=True): raise ProjectBuildsSkippedError try: with self.project.repo_nonblockinglock(version=self.version): + self.pull_cached_environment() self.setup_vcs(environment) except vcs_support_utils.LockTimeout as e: self.task.retry(exc=e, throw=False) @@ -646,6 +731,9 @@ def run_build(self, record): # Send Webhook notification for build success. self.send_notifications(self.version.pk, self.build['id'], email=False) + # Push cached environment on success for next build + self.push_cached_environment() + if self.commit: send_external_build_status( version_type=self.version.type, diff --git a/readthedocs/settings/docker_compose.py b/readthedocs/settings/docker_compose.py index f09e32705c5..a967deb0ab2 100644 --- a/readthedocs/settings/docker_compose.py +++ b/readthedocs/settings/docker_compose.py @@ -117,6 +117,8 @@ def DATABASES(self): # noqa RTD_BUILD_MEDIA_STORAGE = 'readthedocs.storage.azure_storage.AzureBuildMediaStorage' AZURE_STATIC_STORAGE_HOSTNAME = PRODUCTION_DOMAIN + RTD_BUILD_ENVIRONMENT_STORAGE = 'readthedocs.storage.azure_storage.AzureBuildEnvironmentStorage' + # Storage for static files (those collected with `collectstatic`) STATICFILES_STORAGE = 'readthedocs.storage.azure_storage.AzureStaticStorage' diff --git a/readthedocs/storage/azure_storage.py b/readthedocs/storage/azure_storage.py index be239974679..baa6940ed83 100644 --- a/readthedocs/storage/azure_storage.py +++ b/readthedocs/storage/azure_storage.py @@ -28,6 +28,11 @@ class AzureBuildStorage(AzureStorage): azure_container = getattr(settings, 'AZURE_BUILD_STORAGE_CONTAINER', None) or 'builds' +class AzureBuildEnvironmentStorage(BuildMediaStorageMixin, AzureStorage): + + azure_container = getattr(settings, 'AZURE_BUILD_ENVIRONMENT_STORAGE_CONTAINER', None) or 'envs' + + class AzureStaticStorage(OverrideHostnameMixin, ManifestFilesMixin, AzureStorage): """ From 7e5b86a5c057ab3bf0809fe1f0e22aeba0154b1f Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Tue, 10 Mar 2020 09:46:39 -0700 Subject: [PATCH 02/10] Use tempfile.mkstemp since tempfile.mktemp is deprecated --- readthedocs/projects/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index f14b95daaca..0c9a3d646d0 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -119,7 +119,7 @@ def pull_cached_environment(self): 'msg': msg, } ) - tmp_filename = tempfile.mktemp(suffix='.tar') + tmp_filename = tempfile.mkstemp(suffix='.tar') remote_fd = storage.open(filename, mode='rb') with open(tmp_filename, mode='wb') as local_fd: local_fd.write(remote_fd.read()) @@ -140,7 +140,7 @@ def push_cached_environment(self): os.path.join(project_path, '.cache'), ] - tmp_filename = tempfile.mktemp(suffix='.tar') + tmp_filename = tempfile.mkstemp(suffix='.tar') # open just with 'w', to not compress and waste CPU cycles with tarfile.open(tmp_filename, 'w') as tar: for path in paths: From 4e47c28fb9ef68ab4561cfadfcafe3cffc8c372d Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Tue, 10 Mar 2020 10:07:20 -0700 Subject: [PATCH 03/10] Cleanup temporary file once it's used --- readthedocs/projects/tasks.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index 0c9a3d646d0..70a88c3d6ee 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -127,6 +127,9 @@ def pull_cached_environment(self): tar = tarfile.TarFile(tmp_filename) tar.extractall(self.version.project.doc_path) + # Cleanup the temporary file + if os.path.exists(tmp_filename): + os.remove(tmp_filename) def push_cached_environment(self): if not self.project.has_feature(feature_id=Feature.CACHED_ENVIRONMENT): @@ -169,6 +172,10 @@ def push_cached_environment(self): fd, ) + # Cleanup the temporary file + if os.path.exists(tmp_filename): + os.remove(tmp_filename) + class SyncRepositoryMixin: From dc2e7e8373b498889ee359da4656798b8308a0ab Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Tue, 10 Mar 2020 10:09:20 -0700 Subject: [PATCH 04/10] Use tarfile.open as recommended by docs --- readthedocs/projects/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index 70a88c3d6ee..7668c4228c6 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -124,8 +124,8 @@ def pull_cached_environment(self): with open(tmp_filename, mode='wb') as local_fd: local_fd.write(remote_fd.read()) - tar = tarfile.TarFile(tmp_filename) - tar.extractall(self.version.project.doc_path) + with tarfile.open(tmp_filename) as tar: + tar.extractall(self.version.project.doc_path) # Cleanup the temporary file if os.path.exists(tmp_filename): From 8c9c473a5153d3ea7f7ba83f1948dcd623ae33c4 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Tue, 10 Mar 2020 19:15:53 -0700 Subject: [PATCH 05/10] Update settings --- readthedocs/settings/base.py | 1 + readthedocs/settings/docker_compose.py | 1 + 2 files changed, 2 insertions(+) diff --git a/readthedocs/settings/base.py b/readthedocs/settings/base.py index b517e0b5a04..e83d18c74ab 100644 --- a/readthedocs/settings/base.py +++ b/readthedocs/settings/base.py @@ -259,6 +259,7 @@ def USE_PROMOS(self): # noqa # Django Storage subclass used to write build artifacts to cloud or local storage # https://docs.readthedocs.io/page/development/settings.html#rtd-build-media-storage RTD_BUILD_MEDIA_STORAGE = 'readthedocs.builds.storage.BuildMediaFileSystemStorage' + RTD_BUILD_ENVIRONMENT_STORAGE = 'readthedocs.builds.storage.BuildMediaFileSystemStorage' TEMPLATES = [ { diff --git a/readthedocs/settings/docker_compose.py b/readthedocs/settings/docker_compose.py index a967deb0ab2..7d566aabf18 100644 --- a/readthedocs/settings/docker_compose.py +++ b/readthedocs/settings/docker_compose.py @@ -117,6 +117,7 @@ def DATABASES(self): # noqa RTD_BUILD_MEDIA_STORAGE = 'readthedocs.storage.azure_storage.AzureBuildMediaStorage' AZURE_STATIC_STORAGE_HOSTNAME = PRODUCTION_DOMAIN + # Storage backend for build cached environments RTD_BUILD_ENVIRONMENT_STORAGE = 'readthedocs.storage.azure_storage.AzureBuildEnvironmentStorage' # Storage for static files (those collected with `collectstatic`) From 1737e4a4255cfcbc2545922c8cbd068c7745adb1 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Tue, 10 Mar 2020 19:19:07 -0700 Subject: [PATCH 06/10] Fix linting --- readthedocs/projects/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index 7668c4228c6..f8dc3a06e9d 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -122,7 +122,7 @@ def pull_cached_environment(self): tmp_filename = tempfile.mkstemp(suffix='.tar') remote_fd = storage.open(filename, mode='rb') with open(tmp_filename, mode='wb') as local_fd: - local_fd.write(remote_fd.read()) + local_fd.write(remote_fd.read()) with tarfile.open(tmp_filename) as tar: tar.extractall(self.version.project.doc_path) @@ -158,7 +158,7 @@ def push_cached_environment(self): storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)() with open(tmp_filename, 'rb') as fd: - msg = 'Pushing up cached environment to storage', + msg = 'Pushing up cached environment to storage' log.info( LOG_TEMPLATE, { From c9f5ef70c29c06a1a28ada8517934ce7ff3356de Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 12 Mar 2020 09:25:42 -0700 Subject: [PATCH 07/10] Add comment explaining with not push on sync repository task --- readthedocs/projects/tasks.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index f8dc3a06e9d..a62312de1a4 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -361,6 +361,11 @@ def run(self, version_pk): # pylint: disable=arguments-differ with environment: before_vcs.send(sender=self.version, environment=environment) with self.project.repo_nonblockinglock(version=self.version): + # When syncing we are only pulling the cached environment + # (without pushing it after it's updated). We only clone the + # repository in this step, and pushing it back will delete + # all the other cached things (Python packages, Sphinx, + # virtualenv, etc) self.pull_cached_environment() self.sync_repo(environment) return True From af4d44185082146a1dc3c2e386ee7334215bca61 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 12 Mar 2020 12:06:10 -0700 Subject: [PATCH 08/10] Use --cache-dir on pip if CACHED_ENVIRONMENT is enabled --- readthedocs/doc_builder/python_environments.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/readthedocs/doc_builder/python_environments.py b/readthedocs/doc_builder/python_environments.py index 7eeddcd1100..d2750a16b24 100644 --- a/readthedocs/doc_builder/python_environments.py +++ b/readthedocs/doc_builder/python_environments.py @@ -129,12 +129,15 @@ def _pip_cache_cmd_argument(self): The decision is made considering if the directories are going to be cleaned after the build (``RTD_CLEAN_AFTER_BUILD=True`` or project has - the ``CLEAN_AFTER_BUILD`` feature enabled). In this case, there is no - need to cache anything. + the ``CLEAN_AFTER_BUILD`` feature enabled) and project has not the + feature ``CACHED_ENVIRONMENT``. In this case, there is no need to cache + anything. """ if ( - settings.RTD_CLEAN_AFTER_BUILD or - self.project.has_feature(Feature.CLEAN_AFTER_BUILD) + ( + settings.RTD_CLEAN_AFTER_BUILD or + self.project.has_feature(Feature.CLEAN_AFTER_BUILD) + ) and not self.project.has_feature(Feature.CACHED_ENVIRONMENT) ): return [ '--no-cache-dir', From 861d90be2fb32c4e8fc5966a8b9d95194e51f1b5 Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 12 Mar 2020 12:14:16 -0700 Subject: [PATCH 09/10] Ignore OS-level handle --- readthedocs/projects/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index a62312de1a4..bc6accc12f3 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -119,7 +119,7 @@ def pull_cached_environment(self): 'msg': msg, } ) - tmp_filename = tempfile.mkstemp(suffix='.tar') + _, tmp_filename = tempfile.mkstemp(suffix='.tar') remote_fd = storage.open(filename, mode='rb') with open(tmp_filename, mode='wb') as local_fd: local_fd.write(remote_fd.read()) @@ -143,7 +143,7 @@ def push_cached_environment(self): os.path.join(project_path, '.cache'), ] - tmp_filename = tempfile.mkstemp(suffix='.tar') + _, tmp_filename = tempfile.mkstemp(suffix='.tar') # open just with 'w', to not compress and waste CPU cycles with tarfile.open(tmp_filename, 'w') as tar: for path in paths: From bd560ea11dcf5fd0d5478c68ec129411d397f25e Mon Sep 17 00:00:00 2001 From: Manuel Kaufmann Date: Thu, 12 Mar 2020 12:57:01 -0700 Subject: [PATCH 10/10] Save .cache path properly + refactor how cached directories are managed. --- readthedocs/projects/tasks.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/readthedocs/projects/tasks.py b/readthedocs/projects/tasks.py index bc6accc12f3..645523b3efb 100644 --- a/readthedocs/projects/tasks.py +++ b/readthedocs/projects/tasks.py @@ -136,25 +136,28 @@ def push_cached_environment(self): return project_path = self.project.doc_path - paths = [ - os.path.join(project_path, 'checkouts', self.version.slug), - os.path.join(project_path, 'envs', self.version.slug), - os.path.join(project_path, 'conda', self.version.slug), - os.path.join(project_path, '.cache'), + directories = [ + 'checkouts', + 'envs', + 'conda', ] _, tmp_filename = tempfile.mkstemp(suffix='.tar') # open just with 'w', to not compress and waste CPU cycles with tarfile.open(tmp_filename, 'w') as tar: - for path in paths: + for directory in directories: + path = os.path.join( + project_path, + directory, + self.version.slug, + ) + arcname = os.path.join(directory, self.version.slug) if os.path.exists(path): - tar.add( - path, - arcname=os.path.join( - os.path.basename(os.path.dirname(path)), - self.version.slug, - ) - ) + tar.add(path, arcname=arcname) + + # Special handling for .cache directory because it's per-project + path = os.path.join(project_path, '.cache') + tar.add(path, arcname='.cache') storage = get_storage_class(settings.RTD_BUILD_ENVIRONMENT_STORAGE)() with open(tmp_filename, 'rb') as fd: