Skip to content

Commit c48e5eb

Browse files
authored
Build: use rclone for sync (#9842)
- Put this new feature under a feature flag.
- Works out of the box with our current settings, no rclone configuration file required.
- Uses the local filesystem when running tests, uses MinIO during dev.
- We need to install rclone in our builders for this to work.
- I'm using the checks implemented in #9890, which needs to be merged first.
- If we want even faster upload times for Sphinx, we can merge readthedocs/readthedocs-sphinx-ext#119, since right now we are re-uploading all files.

To test this, you need to re-build your Docker containers.

Closes #9448
1 parent 634e02f commit c48e5eb

File tree

8 files changed

+302
-5
lines changed

8 files changed

+302
-5
lines changed

.circleci/config.yml

+2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ jobs:
1818
- checkout
1919
- run: git submodule sync
2020
- run: git submodule update --init
21+
- run: sudo apt update
22+
- run: sudo apt install -y rclone
2123
- run: pip install --user 'tox<5'
2224
- run: tox -e py310
2325
- codecov/upload

dockerfiles/Dockerfile

+2-1
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ RUN apt-get -y install \
3030
netcat \
3131
telnet \
3232
lsb-release \
33-
npm
33+
npm \
34+
rclone
3435

3536
# Gets the MinIO mc client used to add buckets upon initialization
3637
# If this client should have issues running inside this image, it is also

readthedocs/builds/storage.py

+18
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from functools import cached_property
12
from pathlib import Path
23

34
import structlog
@@ -7,6 +8,7 @@
78
from storages.utils import get_available_overwrite_name, safe_join
89

910
from readthedocs.core.utils.filesystem import safe_open
11+
from readthedocs.storage.rclone import RCloneLocal
1012

1113
log = structlog.get_logger(__name__)
1214

@@ -172,6 +174,18 @@ def sync_directory(self, source, destination):
172174
log.debug('Deleting file from media storage.', filepath=filepath)
173175
self.delete(filepath)
174176

177+
@cached_property
178+
def _rclone(self):
179+
raise NotImplementedError
180+
181+
def rclone_sync_directory(self, source, destination):
    """
    Sync a directory recursively to storage using rclone sync.

    :param source: Local directory to sync from. It is validated with
     ``_check_suspicious_path`` before rclone is invoked.
    :param destination: Path inside the storage to sync to.
    :raises SuspiciousFileOperation: If ``destination`` is empty or ``/``,
     since that would target the whole storage.
    :returns: Whatever ``self._rclone.sync`` returns.
    """
    targets_whole_storage = destination == "" or destination == "/"
    if targets_whole_storage:
        raise SuspiciousFileOperation("Syncing all storage cannot be right")

    # Validate the source first; rclone only runs when this does not raise.
    self._check_suspicious_path(source)
    rclone = self._rclone
    return rclone.sync(source, destination)
188+
175189
def join(self, directory, filepath):
176190
return safe_join(directory, filepath)
177191

@@ -206,6 +220,10 @@ def __init__(self, **kwargs):
206220

207221
super().__init__(location)
208222

223+
@cached_property
def _rclone(self):
    """Return an ``RCloneLocal`` wrapper rooted at this storage's ``location``."""
    root = self.location
    return RCloneLocal(location=root)
226+
209227
def get_available_name(self, name, max_length=None):
210228
"""
211229
A hack to overwrite by default with the FileSystemStorage.

readthedocs/projects/models.py

+5
Original file line numberDiff line numberDiff line change
@@ -1849,6 +1849,7 @@ def add_features(sender, **kwargs):
18491849
USE_SPHINX_BUILDERS = "use_sphinx_builders"
18501850
CANCEL_OLD_BUILDS = "cancel_old_builds"
18511851
DONT_CREATE_INDEX = "dont_create_index"
1852+
USE_RCLONE = "use_rclone"
18521853

18531854
FEATURES = (
18541855
(ALLOW_DEPRECATED_WEBHOOKS, _('Allow deprecated webhook views')),
@@ -2005,6 +2006,10 @@ def add_features(sender, **kwargs):
20052006
DONT_CREATE_INDEX,
20062007
_('Do not create index.md or README.rst if the project does not have one.'),
20072008
),
2009+
(
2010+
USE_RCLONE,
2011+
_("Use rclone for syncing files to the media storage."),
2012+
),
20082013
)
20092014

20102015
projects = models.ManyToManyField(

readthedocs/projects/tasks/builds.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -839,7 +839,10 @@ def store_build_artifacts(self):
839839
version_type=self.data.version.type,
840840
)
841841
try:
842-
build_media_storage.sync_directory(from_path, to_path)
842+
if self.data.project.has_feature(Feature.USE_RCLONE):
843+
build_media_storage.rclone_sync_directory(from_path, to_path)
844+
else:
845+
build_media_storage.sync_directory(from_path, to_path)
843846
except Exception:
844847
# Ideally this should just be an IOError
845848
# but some storage backends unfortunately throw other errors

readthedocs/rtd_tests/tests/test_build_storage.py

+52
Original file line numberDiff line numberDiff line change
@@ -157,3 +157,55 @@ def test_walk(self):
157157
self.assertEqual(top, 'files/api')
158158
self.assertCountEqual(dirs, [])
159159
self.assertCountEqual(files, ['index.html'])
160+
161+
def test_rclone_sync(self):
    """
    Sync a directory three times, removing source entries in between.

    After each sync the storage tree must match the source tree, so files
    and directories deleted locally must also disappear from the storage.
    """
    tmp_root = Path(tempfile.mkdtemp())
    # Remove the scratch directory even when an assertion fails.
    self.addCleanup(shutil.rmtree, tmp_root, ignore_errors=True)

    tmp_files_dir = tmp_root / "files"
    shutil.copytree(files_dir, tmp_files_dir, symlinks=True)
    storage_dir = "files"

    def sync_and_check(expected_tree):
        # The source has to live inside DOCROOT to pass the path checks.
        with override_settings(DOCROOT=tmp_files_dir):
            self.storage.rclone_sync_directory(tmp_files_dir, storage_dir)
        self.assertFileTree(storage_dir, expected_tree)

    # Initial sync: everything is uploaded.
    sync_and_check(
        [
            ("api", ["index.html"]),
            "api.fjson",
            "conf.py",
            "test.html",
        ]
    )

    # A file removed from the source is removed from the storage.
    (tmp_files_dir / "api.fjson").unlink()
    sync_and_check(
        [
            ("api", ["index.html"]),
            "conf.py",
            "test.html",
        ]
    )

    # A directory removed from the source is removed from the storage.
    shutil.rmtree(tmp_files_dir / "api")
    sync_and_check(
        [
            "conf.py",
            "test.html",
        ]
    )
194+
195+
def test_rclone_sync_source_symlink(self):
    """Syncing from a source that is a symbolic link must be rejected."""
    tmp_dir = Path(tempfile.mkdtemp())
    symlink_parent = Path(tempfile.mkdtemp())
    # Remove both scratch directories even when the assertion fails.
    for scratch_dir in (tmp_dir, symlink_parent):
        self.addCleanup(shutil.rmtree, scratch_dir, ignore_errors=True)

    tmp_symlink_dir = symlink_parent / "files"
    tmp_symlink_dir.symlink_to(tmp_dir)

    with override_settings(DOCROOT=tmp_dir):
        with pytest.raises(SuspiciousFileOperation, match="symbolic link"):
            self.storage.rclone_sync_directory(tmp_symlink_dir, "files")
203+
204+
def test_rclone_sync_source_outside_docroot(self):
    """Syncing from a source located outside DOCROOT must be rejected."""
    tmp_dir = Path(tempfile.mkdtemp())
    docroot_parent = Path(tempfile.mkdtemp())
    # Remove both scratch directories even when the assertion fails.
    for scratch_dir in (tmp_dir, docroot_parent):
        self.addCleanup(shutil.rmtree, scratch_dir, ignore_errors=True)

    tmp_docroot = docroot_parent / "docroot"
    tmp_docroot.mkdir()

    with override_settings(DOCROOT=tmp_docroot):
        with pytest.raises(SuspiciousFileOperation, match="outside the docroot"):
            self.storage.rclone_sync_directory(tmp_dir, "files")

readthedocs/storage/rclone.py

+192
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
"""
2+
Wrapper around the rclone command.
3+
4+
See https://rclone.org/docs.
5+
"""
6+
7+
import os
8+
import subprocess
9+
10+
import structlog
11+
from django.utils._os import safe_join as safe_join_fs
12+
from storages.utils import safe_join
13+
14+
log = structlog.get_logger(__name__)
15+
16+
17+
class BaseRClone:
18+
19+
"""
20+
RClone base class.
21+
22+
This class allows you to interact with an rclone remote without
23+
a configuration file, the remote declaration and its options
24+
are passed in the command itself.
25+
26+
This base class allows you to use the local file system as remote.
27+
28+
:param remote_type: You can see the full list of supported providers at
29+
https://rclone.org/#providers.
30+
:param rclone_bin: Binary name or path to the rclone binary.
31+
Defaults to ``rclone``.
32+
:param default_options: Options passed to the rclone command.
33+
:parm env_vars: Environment variables used when executing the rclone command.
34+
Useful to pass secrets to the ``rclone` command, since all arguments and
35+
options will be logged.
36+
"""
37+
38+
remote_type = None
39+
rclone_bin = "rclone"
40+
default_options = [
41+
# Number of file transfers to run in parallel.
42+
# Default value is 4.
43+
"--transfers=8",
44+
# Skip based on checksum (if available) & size, not mod-time & size.
45+
"--checksum",
46+
"--verbose",
47+
]
48+
env_vars = {}
49+
50+
def _get_target_path(self, path):
51+
"""
52+
Get the final target path for the remote.
53+
54+
.. note::
55+
56+
This doesn't include the remote type,
57+
this is just the destination path.
58+
"""
59+
raise NotImplementedError
60+
61+
def get_target(self, path):
62+
"""
63+
Get the proper target using the current remote type.
64+
65+
We start the remote with `:` to create it on the fly,
66+
instead of having to create a configuration file.
67+
See https://rclone.org/docs/#backend-path-to-dir.
68+
69+
:param path: Path to the remote target.
70+
"""
71+
path = self._get_target_path(path)
72+
return f":{self.remote_type}:{path}"
73+
74+
def execute(self, subcommand, args, options=None):
75+
"""
76+
Execute an rclone subcommand.
77+
78+
:param subcommand: Name of the subcommand.
79+
:param list args: List of positional arguments passed the to command.
80+
:param list options: List of options passed to the command.
81+
"""
82+
options = options or []
83+
command = [
84+
self.rclone_bin,
85+
subcommand,
86+
*self.default_options,
87+
*options,
88+
"--",
89+
*args,
90+
]
91+
env = os.environ.copy()
92+
env.update(self.env_vars)
93+
log.info("Executing rclone command.", command=command)
94+
log.debug("Executing rclone commmad.", env=env)
95+
result = subprocess.run(
96+
command,
97+
capture_output=True,
98+
env=env,
99+
check=True,
100+
)
101+
log.debug(
102+
"rclone execution finished.",
103+
stdout=result.stdout.decode(),
104+
stderr=result.stderr.decode(),
105+
exit_code=result.returncode,
106+
)
107+
return result
108+
109+
def sync(self, source, destination):
110+
"""
111+
Run the `rclone sync` command.
112+
113+
See https://rclone.org/commands/rclone_sync/.
114+
115+
:params source: Local path to the source directory.
116+
:params destination: Remote path to the destination directory.
117+
"""
118+
return self.execute("sync", args=[source, self.get_target(destination)])
119+
120+
121+
class RCloneLocal(BaseRClone):
122+
123+
"""
124+
RClone remote implementation for the local file system.
125+
126+
Used for local testing only.
127+
128+
See https://rclone.org/local/.
129+
130+
:param location: Root directory where the files will be stored.
131+
"""
132+
133+
remote_type = "local"
134+
135+
def __init__(self, location):
136+
self.location = location
137+
138+
def _get_target_path(self, path):
139+
return safe_join_fs(self.location, path)
140+
141+
142+
class RCloneS3Remote(BaseRClone):
143+
144+
"""
145+
RClone remote implementation for S3.
146+
147+
All secrets will be passed as environ variables to the rclone command.
148+
149+
See https://rclone.org/s3/.
150+
151+
:params bucket_name: Name of the S3 bucket.
152+
:params access_key_id: AWS access key id.
153+
:params secret_acces_key: AWS secret access key.
154+
:params region: AWS region.
155+
:params provider: S3 provider, defaults to ``AWS``.
156+
Useful to use Minio during development.
157+
See https://rclone.org/s3/#s3-provider.
158+
:param acl: Canned ACL used when creating buckets and storing or copying objects.
159+
See https://rclone.org/s3/#s3-acl.
160+
:param endpoint: Custom S3 endpoint, useful for development.
161+
"""
162+
163+
remote_type = "s3"
164+
165+
def __init__(
166+
self,
167+
bucket_name,
168+
access_key_id,
169+
secret_acces_key,
170+
region,
171+
provider="AWS",
172+
acl=None,
173+
endpoint=None,
174+
):
175+
# rclone S3 options passed as env vars.
176+
# https://rclone.org/s3/#standard-options.
177+
self.env_vars = {
178+
"RCLONE_S3_PROVIDER": provider,
179+
"RCLONE_S3_ACCESS_KEY_ID": access_key_id,
180+
"RCLONE_S3_SECRET_ACCESS_KEY": secret_acces_key,
181+
"RCLONE_S3_REGION": region,
182+
"RCLONE_S3_LOCATION_CONSTRAINT": region,
183+
}
184+
if acl:
185+
self.env_vars["RCLONE_S3_ACL"] = acl
186+
if endpoint:
187+
self.env_vars["RCLONE_S3_ENDPOINT"] = endpoint
188+
self.bucket_name = bucket_name
189+
190+
def _get_target_path(self, path):
191+
"""Overridden to prepend the bucket name to the path."""
192+
return safe_join(self.bucket_name, path)

0 commit comments

Comments
 (0)