Skip to content

Commit dfa3f0b

Browse files
committed
feature: add utility function to capture local snapshot (#1524)
* local snapshot * Update pip list command * Remove function calls * Address comments * Address comments
1 parent a166b3c commit dfa3f0b

File tree

1 file changed

+320
-0
lines changed

1 file changed

+320
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
import boto3
2+
import docker
3+
import logging
4+
import subprocess
5+
import sys
6+
from typing import Optional
7+
import yaml
8+
9+
logger = logging.getLogger(__name__)

# Scratch files used to assemble the docker build context. They must live in
# /tmp because /tmp is also passed to docker as the build context directory,
# so the templates' ADD instructions can find them.
REQUIREMENT_TXT_PATH = "/tmp/requirements.txt"
ENVIRONMENT_YML_PATH = "/tmp/environment.yml"
DOCKERFILE_PATH = "/tmp/Dockerfile"

# Dockerfile template for conda-managed environments: installs Miniconda on
# top of the base image, then recreates the captured environment from the
# exported environment.yml. Placeholders: {base_image_name}, {env_name}.
CONDA_DOCKERFILE_TEMPLATE = """
FROM {base_image_name}
ADD environment.yml .

# Install prerequisites for conda
RUN apt-get update && \
apt-get install -y wget bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 && \
apt-get clean

# Download and install conda
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
rm Miniconda3-latest-Linux-x86_64.sh

# Initialize Conda
ENV PATH=/opt/conda/bin:$PATH
RUN conda update -n base -c defaults conda && \
conda config --add channels conda-forge

# Create a conda environment from the environment.yml file
RUN conda env create -f environment.yml -n {env_name}

# Activate the conda environment
RUN conda run -n {env_name}
"""

# Dockerfile template for pip-managed environments: creates a venv in the
# base python image and installs the frozen requirements into it.
# Placeholders: {base_image_name}, {env_name}.
PIP_DOCKERFILE_TEMPLATE = """
FROM {base_image_name}
ADD requirements.txt .

# Create a virtual environment
RUN python -m venv {env_name}

# Activate the virtual environment
RUN . {env_name}/bin/activate

RUN pip install --no-cache-dir -r requirements.txt
"""
53+
54+
55+
def capture_local_environment(
    image_name: str = "sm-local-capture",
    env_name: str = "saved_local_env",
    package_manager: str = "pip",
    deploy_to_ecr: bool = False,
    base_image_name: Optional[str] = None,
    job_conda_env: Optional[str] = None,
    additional_dependencies: Optional[str] = None,
    ecr_repo_name: Optional[str] = None,
    boto_session: Optional[boto3.Session] = None,
):
    """
    Capture all dependency packages installed in the local environment and build a docker image.

    When using this utility method, the docker daemon must be active in the environment.
    Please note that this is an experimental feature. This utility function is not able to
    detect the package compatibility between platforms. It is also not able to detect
    dependency conflicts between the local environment and the additional dependencies.

    Args:
        image_name (str): The name of the docker image, defaults to "sm-local-capture".
        env_name (str): The name of the virtual environment to be activated in the image,
            defaults to "saved_local_env".
        package_manager (str): The package manager, must be one of "conda" or "pip".
        deploy_to_ecr (bool): Whether to deploy the docker image to AWS ECR, defaults to
            False. If set to True, the AWS credentials must be configured in the environment.
        base_image_name (Optional[str]): If provided will be used as the base image, else the
            utility will evaluate from local environment in following manner:
            1. If package manager is conda, it will use ubuntu:latest.
            2. If package manager is pip, it is resolved to base python image with the same
               python version as the environment running the local code.
        job_conda_env (Optional[str]): If set, the dependencies will be captured from this
            specific conda Env, otherwise the dependencies will be the installed packages in
            the current active environment. This parameter is only valid when the package
            manager is conda.
        additional_dependencies (Optional[str]): Either the path to a dependencies file (conda
            environment.yml OR pip requirements.txt file). Regardless of this setting utility
            will automatically generate the dependencies file corresponding to the current
            active environment's snapshot. In addition to this, additional dependencies is
            configurable.
        ecr_repo_name (Optional[str]): The AWS ECR repo to push the docker image. If not
            specified, it will use image_name as the ECR repo name. This parameter is only
            valid when deploy_to_ecr is True.
        boto_session (Optional[boto3.Session]): The boto3 session with AWS account info. If
            not provided, a new boto session will be created.

    Raises:
        ValueError: The package manager is not "conda" or "pip", or the additional
            dependencies file has an unsupported extension.
        docker.errors.DockerException: Error while fetching server API version:
            The docker engine is not running in your environment.
        docker.errors.BuildError: The docker failed to build the image. The most likely
            reason is: 1) Some packages are not supported in the base image. 2) There are
            dependency conflicts between your local environment and additional dependencies.
        botocore.exceptions.ClientError: AWS credentials are not configured.
    """
    # Fail fast on an unsupported package manager instead of discovering it
    # only after falling through both branches.
    if package_manager not in ("conda", "pip"):
        raise ValueError(
            "The provided package manager is not supported. "
            "Use conda or pip as the package manager."
        )

    if package_manager == "conda":
        # Snapshot the requested (or currently active) conda environment.
        # --no-builds omits platform-specific build strings so the export is
        # portable into the image's architecture.
        # NOTE(review): env/file names are interpolated into a shell command;
        # callers are trusted (local utility), but shlex.quote would harden this.
        if job_conda_env:
            subprocess.run(
                f"conda env export -n {job_conda_env} > {ENVIRONMENT_YML_PATH} --no-builds",
                shell=True,
                check=True,
            )
        else:
            subprocess.run(
                f"conda env export > {ENVIRONMENT_YML_PATH} --no-builds", shell=True, check=True
            )

        if additional_dependencies:
            if not additional_dependencies.endswith((".yml", ".txt")):
                raise ValueError(
                    "When package manager is conda, additional dependencies "
                    "file must be a yml file or a txt file."
                )
            # Merge the extra dependencies into the snapshot in place.
            if additional_dependencies.endswith(".yml"):
                _merge_environment_ymls(
                    env_name,
                    ENVIRONMENT_YML_PATH,
                    additional_dependencies,
                    ENVIRONMENT_YML_PATH,
                )
            else:
                _merge_environment_yml_with_requirement_txt(
                    env_name,
                    ENVIRONMENT_YML_PATH,
                    additional_dependencies,
                    ENVIRONMENT_YML_PATH,
                )

        if not base_image_name:
            base_image_name = "ubuntu:latest"
        dockerfile_contents = CONDA_DOCKERFILE_TEMPLATE.format(
            base_image_name=base_image_name,
            env_name=env_name,
        )
    else:  # package_manager == "pip"
        # `pip list --format=freeze` reflects the interpreter running this code.
        subprocess.run(f"pip list --format=freeze > {REQUIREMENT_TXT_PATH}", shell=True, check=True)

        if additional_dependencies:
            if not additional_dependencies.endswith(".txt"):
                raise ValueError(
                    "When package manager is pip, additional dependencies file must be a txt file."
                )
            with open(additional_dependencies, "r") as f:
                additional_requirements = f.read()
            with open(REQUIREMENT_TXT_PATH, "a") as f:
                # Leading newline guards against a snapshot file that does not
                # end with one, which would fuse two requirement lines together.
                f.write("\n" + additional_requirements)
            logger.info("Merged requirements file saved to %s", REQUIREMENT_TXT_PATH)

        if not base_image_name:
            # Match the local interpreter version so frozen packages resolve.
            version = sys.version_info
            base_image_name = f"python:{version.major}.{version.minor}.{version.micro}"
        dockerfile_contents = PIP_DOCKERFILE_TEMPLATE.format(
            base_image_name=base_image_name,
            env_name=env_name,
        )

    # Write the Dockerfile into the build context (/tmp).
    with open(DOCKERFILE_PATH, "w") as f:
        f.write(dockerfile_contents)

    client = docker.from_env()
    # NOTE(review): dockerfile is an absolute path while the context is /tmp;
    # the daemon resolves it relative to the context root — confirm this holds
    # on older docker engine versions.
    image, logs = client.images.build(
        path="/tmp",
        dockerfile=DOCKERFILE_PATH,
        rm=True,
        tag=image_name,
    )
    for log in logs:
        logger.info(log.get("stream", "").strip())
    logger.info("Docker image %s built successfully", image_name)

    if deploy_to_ecr:
        if boto_session is None:
            boto_session = boto3.Session()
        _push_image_to_ecr(image_name, ecr_repo_name, boto_session)
192+
def _merge_environment_ymls(env_name: str, env_file1: str, env_file2: str, output_file: str):
    """
    Merge two environment.yml files and save to a new environment.yml file.

    Args:
        env_name (str): The name of the virtual environment to be activated in the image.
        env_file1 (str): The path of the first environment.yml file.
        env_file2 (str): The path of the second environment.yml file.
        output_file (str): The path of the output environment.yml file.
    """
    # Load the YAML files.
    with open(env_file1, "r") as f:
        env1 = yaml.safe_load(f)
    with open(env_file2, "r") as f:
        env2 = yaml.safe_load(f)

    # Combine dependencies and channels from both files.
    dependencies = []
    pip_dependencies = []
    # BUGFIX: a dict used as an ordered set — a plain set() made the merged
    # channel order vary run-to-run (string hashing is seeded), producing
    # non-deterministic output files. dict preserves first-seen order.
    channels = {}

    for env in (env1, env2):
        # `or []` tolerates a missing/empty section instead of iterating None.
        for dep in env.get("dependencies") or []:
            if isinstance(dep, str):
                # Conda package spec, e.g. 'python=3.7'.
                dependencies.append(dep)
            elif isinstance(dep, dict):
                # Nested pip section, e.g. {'pip': ['requests>=2.22.0']}.
                pip_dependencies.extend(dep.get("pip", []))
        for channel in env.get("channels") or []:
            channels[channel] = None

    if pip_dependencies:
        dependencies.append({"pip": pip_dependencies})

    # Create the merged environment file.
    merged_env = {"name": env_name, "channels": list(channels), "dependencies": dependencies}

    with open(output_file, "w") as f:
        yaml.dump(merged_env, f, sort_keys=False)

    logger.info("Merged environment file saved to '%s'", output_file)
236+
237+
238+
def _merge_environment_yml_with_requirement_txt(
    env_name: str, env_file: str, req_txt: str, output_file: str
):
    """
    Merge an environment.yml file with a requirements.txt file and save to a new environment.yml file.

    Args:
        env_name (str): The name of the virtual environment to be activated in the image.
        env_file (str): The path of the environment.yml file.
        req_txt (str): The path of the requirements.txt file.
        output_file (str): The path of the output environment.yml file.
    """
    # Load the files.
    with open(env_file, "r") as f:
        env = yaml.safe_load(f)
    with open(req_txt, "r") as f:
        requirements = f.read().splitlines()

    # Combine pip dependencies from both files.
    dependencies = []
    pip_dependencies = []

    # Split the env's dependencies into plain conda specs and the nested pip
    # section; `or []` tolerates a missing/empty section.
    for dep in env.get("dependencies") or []:
        if isinstance(dep, str):
            # Conda package spec, e.g. 'python=3.7'.
            dependencies.append(dep)
        elif isinstance(dep, dict):
            # Nested pip section, e.g. {'pip': ['requests>=2.22.0']}.
            pip_dependencies.extend(dep.get("pip", []))

    # Append requirements.txt entries, skipping blanks and comment lines.
    for req in requirements:
        if req and not req.startswith("#"):
            pip_dependencies.append(req)

    if pip_dependencies:
        dependencies.append({"pip": pip_dependencies})

    # BUGFIX: env["channels"] raised KeyError when the export had no channels
    # section; default to an empty list instead.
    merged_env = {
        "name": env_name,
        "channels": env.get("channels", []),
        "dependencies": dependencies,
    }

    with open(output_file, "w") as f:
        yaml.dump(merged_env, f, sort_keys=False)

    logger.info("Merged environment file saved to '%s'", output_file)
282+
283+
284+
def _push_image_to_ecr(image_name: str, ecr_repo_name: str, boto_session: Optional[boto3.Session]):
    """
    Push the docker image to AWS ECR.

    Args:
        image_name (str): The name of the docker image.
        ecr_repo_name (str): The AWS ECR repo to push the docker image. Falls back to
            image_name when not provided.
        boto_session (Optional[boto3.Session]): Session carrying the target account and
            region. A default session is created when None.

    Raises:
        subprocess.CalledProcessError: docker login/tag/push (or the aws CLI) failed.
        botocore.exceptions.ClientError: AWS credentials are not configured.
    """
    # Guard the Optional parameter instead of dereferencing None.
    if boto_session is None:
        boto_session = boto3.Session()
    region = boto_session.region_name
    aws_account_id = boto_session.client("sts", region_name=region).get_caller_identity()["Account"]
    # BUGFIX: derive the ECR client from the provided session (and region) so
    # the repository is created under the same credentials/region we log in
    # with; boto3.client("ecr") silently used the default session instead.
    ecr_client = boto_session.client("ecr", region_name=region)

    # Authenticate the local docker daemon with the account registry.
    # NOTE(review): relies on the AWS CLI being installed for get-login-password.
    registry_url = f"{aws_account_id}.dkr.ecr.{region}.amazonaws.com"
    docker_login_cmd = (
        f"aws ecr get-login-password --region {region} "
        f"| docker login --username AWS --password-stdin {registry_url}"
    )
    subprocess.run(docker_login_cmd, shell=True, check=True)

    # Create a new ECR repository (if it doesn't already exist).
    ecr_repo_name = ecr_repo_name or image_name
    try:
        ecr_client.create_repository(repositoryName=ecr_repo_name)
    except ecr_client.exceptions.RepositoryAlreadyExistsException:
        pass

    # Tag the local docker image with the ECR URI and push it.
    ecr_image_uri = f"{registry_url}/{ecr_repo_name}:latest"
    subprocess.run(f"docker tag {image_name}:latest {ecr_image_uri}", shell=True, check=True)
    subprocess.run(f"docker push {ecr_image_uri}", shell=True, check=True)

    logger.info("Image %s pushed to %s", image_name, ecr_image_uri)

0 commit comments

Comments
 (0)