Skip to content

Commit a5d74dc

Browse files
committed
Change the machine_cache folder to always be /var/meadowrun/machine_cache. Then it is truly per-machine rather than per-run_job_local configuration, and it will be the same across container and non-container jobs. Also document this feature
1 parent 1c8ba66 commit a5d74dc

File tree

6 files changed

+68
-14
lines changed

6 files changed

+68
-14
lines changed

build_scripts/build_image_shared.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
import asyncssh
77

8+
from meadowrun.run_job_local import MACHINE_CACHE_FOLDER
9+
810
if TYPE_CHECKING:
911
from meadowrun.run_job_core import CloudProviderType
1012
from meadowrun.ssh import (
@@ -134,4 +136,7 @@ async def upload_and_configure_meadowrun(
134136
"sudo systemctl enable --now meadowrun-check-spot-eviction.timer",
135137
)
136138

139+
# TODO move this into the base image at some point
140+
await run_and_print(connection, f"mkdir -p {MACHINE_CACHE_FOLDER}")
141+
137142
return image_name

docs/how_to/.pages

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,6 @@ nav:
66
- private_git_repo_aws.md
77
- private_git_repo_azure.md
88
- ssh_to_instance.md
9+
- machine_cache.md
910
- kubernetes.md
10-
- private_container_kubernetes.md
11+
- private_container_kubernetes.md

docs/how_to/machine_cache.md

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
# Cache data on the underlying machine
2+
3+
Your Meadowrun code will usually run in a container, which means that it will not be
4+
able to write files to the underlying file system. Meadowrun provides
5+
`meadowrun.MACHINE_CACHE_FOLDER`, which points to `/var/meadowrun/machine_cache` when
6+
running on EC2 or Azure VMs. You can write data to this folder and it will be visible to
7+
any jobs that happen to run on the same node.
8+
9+
The general pattern for using this folder should be something like:
10+
11+
```python
12+
import os.path
13+
import filelock
14+
import meadowrun
15+
16+
17+
def a_slow_computation():
18+
return "some sample data"
19+
20+
21+
def get_cached_data():
22+
cached_data_filename = os.path.join(meadowrun.MACHINE_CACHE_FOLDER, "myfile")
23+
24+
with filelock.FileLock(f"{cached_data_filename}.lock"):
25+
if not os.path.exists(cached_data_filename):
26+
data = a_slow_computation()
27+
with open(cached_data_filename, "w") as cached_data_file:
28+
cached_data_file.write(data)
29+
30+
return data
31+
else:
32+
with open(cached_data_filename, "r") as cached_data_file:
33+
return cached_data_file.read()
34+
```
35+
36+
`filelock` is a [library](https://py-filelock.readthedocs.io/en/latest/) that makes
37+
sure only one process at a time is writing to the specified file. You're welcome to use
38+
whatever locking mechanism you like, but you should never assume that your job is the
39+
only process running on the machine.
40+
41+
You should also never assume that any data you wrote will be available for a subsequent
42+
job. Meadowrun does not provide a way to guarantee that two jobs will run on the same
43+
machine.

src/meadowrun/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def new__del__(self: Any, _warn: Any = warnings.warn) -> None:
6969
run_map_as_completed,
7070
)
7171
from meadowrun.run_job_core import Resources, SshHost, TaskException
72-
from meadowrun.run_job_local import Host, LocalHost
72+
from meadowrun.run_job_local import Host, LocalHost, MACHINE_CACHE_FOLDER
7373
from meadowrun.version import __version__
7474

7575
__all__ = [
@@ -96,6 +96,7 @@ def new__del__(self: Any, _warn: Any = warnings.warn) -> None:
9696
"Secret",
9797
"Host",
9898
"LocalHost",
99+
"MACHINE_CACHE_FOLDER",
99100
"Resources",
100101
"SshHost",
101102
"Kubernetes",

src/meadowrun/run_job_local.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@
8787
_MEADOWRUN_CONTEXT_VARIABLES = "MEADOWRUN_CONTEXT_VARIABLES"
8888
_MEADOWRUN_RESULT_FILE = "MEADOWRUN_RESULT_FILE"
8989
_MEADOWRUN_RESULT_PICKLE_PROTOCOL = "MEADOWRUN_RESULT_PICKLE_PROTOCOL"
90+
MACHINE_CACHE_FOLDER = "/var/meadowrun/machine_cache"
9091

9192

9293
def _string_pairs_to_dict(pairs: Iterable[StringPair]) -> Dict[str, str]:
@@ -458,7 +459,6 @@ async def _launch_container_job(
458459
log_file_name: str,
459460
job: Job,
460461
io_folder: str,
461-
machine_cache_folder: str,
462462
sidecar_container_images: List[str],
463463
) -> Tuple[str, Coroutine[Any, Any, ProcessState]]:
464464
"""
@@ -474,7 +474,7 @@ async def _launch_container_job(
474474
continuation.
475475
"""
476476

477-
binds: List[Tuple[str, str]] = [(machine_cache_folder, "/meadowrun/machine_cache")]
477+
binds: List[Tuple[str, str]] = [(MACHINE_CACHE_FOLDER, MACHINE_CACHE_FOLDER)]
478478

479479
if cwd_path is not None:
480480
unique_code_paths = list(dict.fromkeys(itertools.chain([cwd_path], code_paths)))
@@ -736,7 +736,7 @@ def _get_default_working_folder() -> str:
736736

737737
def _set_up_working_folder(
738738
working_folder: Optional[str],
739-
) -> Tuple[str, str, str, str, str, str]:
739+
) -> Tuple[str, str, str, str, str]:
740740
"""
741741
Sets the working_folder to a default if it's not set, creates the necessary
742742
subfolders, gets a machine-wide lock on the working folder, then returns io_folder,
@@ -760,23 +760,19 @@ def _set_up_working_folder(
760760
local_copies_folder = os.path.join(working_folder, "local_copies")
761761
# misc folder for e.g. storing environment export files sent from local machine
762762
misc_folder = os.path.join(working_folder, "misc")
763-
# machine_cache folder for providing a machine-wide folder on the machine
764-
machine_cache_folder = os.path.join(working_folder, "machine_cache")
765763

766764
os.makedirs(io_folder, exist_ok=True)
767765
os.makedirs(job_logs_folder, exist_ok=True)
768766
os.makedirs(git_repos_folder, exist_ok=True)
769767
os.makedirs(local_copies_folder, exist_ok=True)
770768
os.makedirs(misc_folder, exist_ok=True)
771-
os.makedirs(machine_cache_folder, exist_ok=True)
772769

773770
return (
774771
io_folder,
775772
job_logs_folder,
776773
git_repos_folder,
777774
local_copies_folder,
778775
misc_folder,
779-
machine_cache_folder,
780776
)
781777

782778

@@ -952,7 +948,6 @@ async def run_local(
952948
git_repos_folder,
953949
local_copies_folder,
954950
misc_folder,
955-
machine_cache_folder,
956951
) = _set_up_working_folder(working_folder)
957952

958953
# unpickle credentials if necessary
@@ -1142,7 +1137,6 @@ async def run_local(
11421137
log_file_name,
11431138
job,
11441139
io_folder,
1145-
machine_cache_folder,
11461140
sidecar_containers,
11471141
)
11481142
# due to the way protobuf works, this is equivalent to None

tests/basics.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,9 +293,19 @@ def remote_function() -> str:
293293
async def test_pip_file_in_git_repo_with_data_file(self) -> None:
294294
"""This test is doing double-duty, also checking for the machine_cache folder"""
295295

296-
def remote_function() -> Tuple[bool, str]:
296+
# TODO upgrade the Meadowrun referenced in requirements.txt and move this inside
297+
# remote_function
298+
machine_cache_folder = meadowrun.MACHINE_CACHE_FOLDER
299+
300+
def remote_function() -> str:
301+
# make sure the machine cache folder is writable
302+
with open(
303+
os.path.join(machine_cache_folder, "foo"), "w", encoding="utf-8"
304+
) as f:
305+
f.write("test")
306+
297307
with open("example_package/test.txt", encoding="utf-8") as f:
298-
return os.path.isdir("/meadowrun/machine_cache"), f.read()
308+
return f.read()
299309

300310
results = await run_function(
301311
remote_function,
@@ -306,7 +316,7 @@ def remote_function() -> Tuple[bool, str]:
306316
interpreter=PipRequirementsFile("requirements.txt", "3.9"),
307317
),
308318
)
309-
assert results == (True, "Hello world!")
319+
assert results == "Hello world!"
310320

311321
@pytest.mark.skipif("sys.version_info < (3, 8)")
312322
@pytest.mark.asyncio

0 commit comments

Comments
 (0)