Skip to content

Build: Allow multiple PDFs (WIP) #10437

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions readthedocs/builds/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,16 @@
"epub",
"pdf",
)

# Should replace ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT
# Currently hidden behind a feature flag
ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF = (
"htmlzip",
"epub",
)

# For certain media types, we only want to copy out specific file extensions.
# This is treated case-insensitive.
ARTIFACTS_WITH_RESTRICTED_EXTENSIONS = {
"pdf": ["pdf"],
}
4 changes: 2 additions & 2 deletions readthedocs/builds/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,13 +132,13 @@ def _check_suspicious_path(self, path):
def _rclone(self):
raise NotImplementedError

def rclone_sync_directory(self, source, destination):
def rclone_sync_directory(self, source, destination, **sync_kwargs):
"""Sync a directory recursively to storage using rclone sync."""
if destination in ("", "/"):
raise SuspiciousFileOperation("Syncing all storage cannot be right")

self._check_suspicious_path(source)
return self._rclone.sync(source, destination)
return self._rclone.sync(source, destination, **sync_kwargs)

def join(self, directory, filepath):
return safe_join(directory, filepath)
Expand Down
3 changes: 2 additions & 1 deletion readthedocs/doc_builder/backends/sphinx.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,7 @@ class PdfBuilder(BaseSphinx):
pdf_file_name = None

def build(self):
"""Runs Sphinx to convert to LaTeX, uses latexmk to build PDFs."""
self.run(
*self.get_sphinx_cmd(),
"-T",
Expand All @@ -516,7 +517,7 @@ def build(self):
f"language={self.project.language}",
# Sphinx's source directory (SOURCEDIR).
# We are executing this command at the location of the `conf.py` file (CWD).
# TODO: ideally we should execute it from where the repository was clonned,
# TODO: ideally we should execute it from where the repository was cloned,
# but that could lead unexpected behavior to some users:
# https://github.com/readthedocs/readthedocs.org/pull/9888#issuecomment-1384649346
".",
Expand Down
2 changes: 2 additions & 0 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1941,6 +1941,7 @@ def add_features(sender, **kwargs):
DONT_CREATE_INDEX = "dont_create_index"
HOSTING_INTEGRATIONS = "hosting_integrations"
NO_CONFIG_FILE_DEPRECATED = "no_config_file"
ENABLE_MULTIPLE_PDFS = "mutliple_pdfs"

FEATURES = (
(
Expand Down Expand Up @@ -2083,6 +2084,7 @@ def add_features(sender, **kwargs):
NO_CONFIG_FILE_DEPRECATED,
_("Build: Building without a configuration file is deprecated."),
),
(ENABLE_MULTIPLE_PDFS, _("Build: Enable multiple PDF support during builds.")),
)

FEATURES = sorted(FEATURES, key=lambda l: l[1])
Expand Down
25 changes: 20 additions & 5 deletions readthedocs/projects/tasks/builds.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
from readthedocs.builds.constants import (
ARTIFACT_TYPES,
ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT,
ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF,
ARTIFACTS_WITH_RESTRICTED_EXTENSIONS,
BUILD_FINAL_STATES,
BUILD_STATE_BUILDING,
BUILD_STATE_CLONING,
Expand Down Expand Up @@ -63,7 +65,7 @@
RepositoryError,
SyncRepositoryLocked,
)
from ..models import APIProject, WebHookEvent
from ..models import APIProject, Feature, WebHookEvent
from ..signals import before_vcs
from .mixins import SyncRepositoryMixin
from .search import fileify
Expand Down Expand Up @@ -539,11 +541,11 @@ def get_valid_artifact_types(self):
It performs the following checks on each output format type path:
- it exists
- it is a directory
- does not contains more than 1 files (only PDF, HTMLZip, ePUB)
- does not contains more than 1 files (HTMLZip, ePUB)

TODO: remove the limitation of only 1 file.
Add support for multiple PDF files in the output directory and
grab them by using glob syntaxt between other files that could be garbage.
grab them by using glob syntax between other files that could be garbage.
"""
valid_artifacts = []
for artifact_type in ARTIFACT_TYPES:
Expand All @@ -570,7 +572,13 @@ def get_valid_artifact_types(self):
# Check if there are multiple files on artifact directories.
# These output format does not support multiple files yet.
# In case multiple files are found, the upload for this format is not performed.
if artifact_type in ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT:
if self.data.project.has_feature(Feature.ENABLE_MULTIPLE_PDFS):
single_file_artifacts = (
ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT_NO_PDF
)
else:
single_file_artifacts = ARTIFACT_TYPES_WITHOUT_MULTIPLE_FILES_SUPPORT
if artifact_type in single_file_artifacts:
artifact_format_files = len(os.listdir(artifact_directory))
if artifact_format_files > 1:
log.error(
Expand Down Expand Up @@ -876,7 +884,14 @@ def store_build_artifacts(self):
version_type=self.data.version.type,
)
try:
build_media_storage.rclone_sync_directory(from_path, to_path)
rclone_kwargs = {}
if media_type in ARTIFACTS_WITH_RESTRICTED_EXTENSIONS:
rclone_kwargs[
"filter_extensions"
] = ARTIFACTS_WITH_RESTRICTED_EXTENSIONS[media_type]
build_media_storage.rclone_sync_directory(
from_path, to_path, **rclone_kwargs
)
except Exception as exc:
# NOTE: the exceptions reported so far are:
# - botocore.exceptions:HTTPClientError
Expand Down
19 changes: 19 additions & 0 deletions readthedocs/rtd_tests/tests/test_build_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,25 @@ def test_copy_directory_source_outside_docroot(self):
with pytest.raises(SuspiciousFileOperation, match="outside the docroot"):
self.storage.copy_directory(tmp_dir, "files")

def test_sync_directory_with_filter(self):
"""Test that we can as the rclone_sync_directory method to only include a specific extension."""
tmp_files_dir = os.path.join(tempfile.mkdtemp(), "files")
# Copy files_dir (with all test files) into tmp_files_dir
shutil.copytree(files_dir, tmp_files_dir, symlinks=True)
storage_dir = "files"

tree = [
("api", ["index.html"]),
"api.fjson",
"conf.py",
"test.html",
]
with override_settings(DOCROOT=tmp_files_dir):
self.storage.rclone_sync_directory(
tmp_files_dir, storage_dir, filter_extensions=["html"]
)
self.assertFileTree(storage_dir, [("api", ["index.html"]), "test.html"])

def test_delete_directory(self):
with override_settings(DOCROOT=files_dir):
self.storage.copy_directory(files_dir, "files")
Expand Down
19 changes: 17 additions & 2 deletions readthedocs/storage/rclone.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,31 @@ def execute(self, subcommand, args, options=None):
)
return result

def sync(self, source, destination):
def sync(self, source, destination, filter_extensions=None):
"""
Run the `rclone sync` command.

See https://rclone.org/commands/rclone_sync/.

:params source: Local path to the source directory.
:params destination: Remote path to the destination directory.
:params filter_extensions: Only copy files in listed file extensions
"""
return self.execute("sync", args=[source, self.get_target(destination)])
options = []
# See:
# https://rclone.org/filtering/
if filter_extensions:
options += ["--ignore-case", "--include"]
# Python escape rule: {{ = {
# What we want to have is for instance '*.{pdf}'
filter_pattern = "*.{{{extensions}}}".format(
extensions=",".join(filter_extensions)
)
options.append(filter_pattern)

return self.execute(
"sync", args=[source, self.get_target(destination)], options=options
)


class RCloneLocal(BaseRClone):
Expand Down
4 changes: 2 additions & 2 deletions readthedocs/telemetry/collectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class BuildDataCollector:
"""
Build data collector.

Collect data from a runnig build.
Collect data from a running build.
"""

def __init__(self, environment):
Expand Down Expand Up @@ -58,7 +58,7 @@ def run(self, *args, **kwargs):

def collect(self):
"""
Collect all relevant data from the runnig build.
Collect all relevant data from the running build.

Data that can be extracted from the database (project/organization)
isn't collected here.
Expand Down