From 3f2bd1f010507ffee9f4750d0ba05b31bea8e8e7 Mon Sep 17 00:00:00 2001 From: Benjamin Bach Date: Wed, 14 Jun 2023 16:46:23 +0200 Subject: [PATCH] Adds a feature flag and some initial ability to handle multiple PDFs --- readthedocs/builds/constants.py | 13 ++++++++++ readthedocs/builds/storage.py | 4 +-- readthedocs/doc_builder/backends/sphinx.py | 3 ++- readthedocs/projects/models.py | 2 ++ readthedocs/projects/tasks/builds.py | 25 +++++++++++++++---- .../rtd_tests/tests/test_build_storage.py | 19 ++++++++++++++ readthedocs/storage/rclone.py | 19 ++++++++++++-- readthedocs/telemetry/collectors.py | 4 +-- 8 files changed, 77 insertions(+), 12 deletions(-) diff --git a/readthedocs/builds/constants.py b/readthedocs/builds/constants.py index df1bd6bf7f3..fee73c5c2c8 100644 --- a/readthedocs/builds/constants.py +++ b/readthedocs/builds/constants.py @@ -177,3 +177,16 @@ "epub", "pdf", ) + +# Should replace ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT +# Currently hidden behind a feature flag +ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF = ( + "htmlzip", + "epub", +) + +# For certain media types, we only want to copy out specific file extensions. +# This is treated case-insensitive. +ARTIFACTS_WITH_RESTRICTED_EXTENSIONS = { + "pdf": ["pdf"], +} diff --git a/readthedocs/builds/storage.py b/readthedocs/builds/storage.py index da57c40d244..a8dbe6f389b 100644 --- a/readthedocs/builds/storage.py +++ b/readthedocs/builds/storage.py @@ -132,13 +132,13 @@ def _check_suspicious_path(self, path): def _rclone(self): raise NotImplementedError - def rclone_sync_directory(self, source, destination): + def rclone_sync_directory(self, source, destination, **sync_kwargs): """Sync a directory recursively to storage using rclone sync.""" if destination in ("", "/"): raise SuspiciousFileOperation("Syncing all storage cannot be right") self._check_suspicious_path(source) - return self._rclone.sync(source, destination) + return self._rclone.sync(source, destination, **sync_kwargs) def join(self, directory, filepath): return safe_join(directory, filepath) diff --git a/readthedocs/doc_builder/backends/sphinx.py b/readthedocs/doc_builder/backends/sphinx.py index afa1c253421..26173f119f2 100644 --- a/readthedocs/doc_builder/backends/sphinx.py +++ b/readthedocs/doc_builder/backends/sphinx.py @@ -504,6 +504,7 @@ class PdfBuilder(BaseSphinx): pdf_file_name = None def build(self): + """Runs Sphinx to convert to LaTeX, uses latexmk to build PDFs.""" self.run( *self.get_sphinx_cmd(), "-T", @@ -516,7 +517,7 @@ def build(self): f"language={self.project.language}", # Sphinx's source directory (SOURCEDIR). # We are executing this command at the location of the `conf.py` file (CWD). - # TODO: ideally we should execute it from where the repository was clonned, + # TODO: ideally we should execute it from where the repository was cloned, # but that could lead unexpected behavior to some users: # https://github.com/readthedocs/readthedocs.org/pull/9888#issuecomment-1384649346 ".", diff --git a/readthedocs/projects/models.py b/readthedocs/projects/models.py index 377b1e24034..f1d0fc9a0eb 100644 --- a/readthedocs/projects/models.py +++ b/readthedocs/projects/models.py @@ -1941,6 +1941,7 @@ def add_features(sender, **kwargs): DONT_CREATE_INDEX = "dont_create_index" HOSTING_INTEGRATIONS = "hosting_integrations" NO_CONFIG_FILE_DEPRECATED = "no_config_file" + ENABLE_MULTIPLE_PDFS = "mutliple_pdfs" FEATURES = ( ( @@ -2083,6 +2084,7 @@ def add_features(sender, **kwargs): NO_CONFIG_FILE_DEPRECATED, _("Build: Building without a configuration file is deprecated."), ), + (ENABLE_MULTIPLE_PDFS, _("Build: Enable multiple PDF support during builds.")), ) FEATURES = sorted(FEATURES, key=lambda l: l[1]) diff --git a/readthedocs/projects/tasks/builds.py b/readthedocs/projects/tasks/builds.py index d8328aeaa84..fb5b1b72176 100644 --- a/readthedocs/projects/tasks/builds.py +++ b/readthedocs/projects/tasks/builds.py @@ -21,6 +21,8 @@ from readthedocs.builds.constants import ( ARTIFACT_TYPES, ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT, + ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF, + ARTIFACTS_WITH_RESTRICTED_EXTENSIONS, BUILD_FINAL_STATES, BUILD_STATE_BUILDING, BUILD_STATE_CLONING, @@ -63,7 +65,7 @@ RepositoryError, SyncRepositoryLocked, ) -from ..models import APIProject, WebHookEvent +from ..models import APIProject, Feature, WebHookEvent from ..signals import before_vcs from .mixins import SyncRepositoryMixin from .search import fileify @@ -539,11 +541,11 @@ def get_valid_artifact_types(self): It performs the following checks on each output format type path: - it exists - it is a directory - - does not contains more than 1 files (only PDF, HTMLZip, ePUB) + - does not contains more than 1 files (HTMLZip, ePUB) TODO: remove the limitation of only 1 file. Add support for multiple PDF files in the output directory and - grab them by using glob syntaxt between other files that could be garbage. + grab them by using glob syntax between other files that could be garbage. """ valid_artifacts = [] for artifact_type in ARTIFACT_TYPES: @@ -570,7 +572,13 @@ def get_valid_artifact_types(self): # Check if there are multiple files on artifact directories. # These output format does not support multiple files yet. # In case multiple files are found, the upload for this format is not performed. - if artifact_type in ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT: + if self.data.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS): + single_file_artifacts = ( + ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF + ) + else: + single_file_artifacts = ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT + if artifact_type in single_file_artifacts: artifact_format_files = len(os.listdir(artifact_directory)) if artifact_format_files > 1: log.error( @@ -876,7 +884,14 @@ def store_build_artifacts(self): version_type=self.data.version.type, ) try: - build_media_storage.rclone_sync_directory(from_path, to_path) + rclone_kwargs = {} + if media_type in ARTIFACTS_WITH_RESTRICTED_EXTENSIONS: + rclone_kwargs[ + "filter_extensions" + ] = ARTIFACTS_WITH_RESTRICTED_EXTENSIONS[media_type] + build_media_storage.rclone_sync_directory( + from_path, to_path, **rclone_kwargs + ) except Exception as exc: # NOTE: the exceptions reported so far are: # - botocore.exceptions:HTTPClientError diff --git a/readthedocs/rtd_tests/tests/test_build_storage.py b/readthedocs/rtd_tests/tests/test_build_storage.py index 78e138e7b7e..33cb979d2a6 100644 --- a/readthedocs/rtd_tests/tests/test_build_storage.py +++ b/readthedocs/rtd_tests/tests/test_build_storage.py @@ -121,6 +121,25 @@ def test_copy_directory_source_outside_docroot(self): with pytest.raises(SuspiciousFileOperation, match="outside the docroot"): self.storage.copy_directory(tmp_dir, "files") + def test_sync_directory_with_filter(self): + """Test that we can as the rclone_sync_directory method to only include a specific extension.""" + tmp_files_dir = os.path.join(tempfile.mkdtemp(), "files") + # Copy files_dir (with all test files) into tmp_files_dir + shutil.copytree(files_dir, tmp_files_dir, symlinks=True) + storage_dir = "files" + + tree = [ + ("api", ["index.html"]), + "api.fjson", + "conf.py", + "test.html", + ] + with override_settings(DOCROOT=tmp_files_dir): + self.storage.rclone_sync_directory( + tmp_files_dir, storage_dir, filter_extensions=["html"] + ) + self.assertFileTree(storage_dir, [("api", ["index.html"]), "test.html"]) + def test_delete_directory(self): with override_settings(DOCROOT=files_dir): self.storage.copy_directory(files_dir, "files") diff --git a/readthedocs/storage/rclone.py b/readthedocs/storage/rclone.py index ab88f3fe64d..0676ed9229f 100644 --- a/readthedocs/storage/rclone.py +++ b/readthedocs/storage/rclone.py @@ -112,7 +112,7 @@ def execute(self, subcommand, args, options=None): ) return result - def sync(self, source, destination): + def sync(self, source, destination, filter_extensions=None): """ Run the `rclone sync` command. @@ -120,8 +120,23 @@ def sync(self, source, destination): :params source: Local path to the source directory. :params destination: Remote path to the destination directory. + :params filter_extensions: Only copy files in listed file extensions """ - return self.execute("sync", args=[source, self.get_target(destination)]) + options = [] + # See: + # https://rclone.org/filtering/ + if filter_extensions: + options += ["--ignore-case", "--include"] + # Python escape rule: {{ = { + # What we want to have is for instance '*.{pdf}' + filter_pattern = "*.{{{extensions}}}".format( + extensions=",".join(filter_extensions) + ) + options.append(filter_pattern) + + return self.execute( + "sync", args=[source, self.get_target(destination)], options=options + ) class RCloneLocal(BaseRClone): diff --git a/readthedocs/telemetry/collectors.py b/readthedocs/telemetry/collectors.py index af1202d0cdd..ffd21af044d 100644 --- a/readthedocs/telemetry/collectors.py +++ b/readthedocs/telemetry/collectors.py @@ -17,7 +17,7 @@ class BuildDataCollector: """ Build data collector. - Collect data from a runnig build. + Collect data from a running build. """ def __init__(self, environment): @@ -58,7 +58,7 @@ def run(self, *args, **kwargs): def collect(self): """ - Collect all relevant data from the runnig build. + Collect all relevant data from the running build. Data that can be extracted from the database (project/organization) isn't collected here.