|
| 1 | +import boto3 |
| 2 | +import docker |
| 3 | +import logging |
| 4 | +import subprocess |
| 5 | +import sys |
| 6 | +from typing import Optional |
| 7 | +import yaml |
| 8 | + |
logger = logging.getLogger(__name__)

# Staging locations for the generated dependency snapshot files and the
# rendered Dockerfile; /tmp also serves as the docker build context.
REQUIREMENT_TXT_PATH = "/tmp/requirements.txt"
ENVIRONMENT_YML_PATH = "/tmp/environment.yml"
DOCKERFILE_PATH = "/tmp/Dockerfile"

# Dockerfile used when dependencies were captured with conda. Miniconda is
# installed into the base image and an environment is created from the
# captured environment.yml snapshot.
CONDA_DOCKERFILE_TEMPLATE = """
FROM {base_image_name}
ADD environment.yml .

# Install prerequisites for conda
RUN apt-get update && \
    apt-get install -y wget bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 && \
    apt-get clean

# Download and install conda
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda && \
    rm Miniconda3-latest-Linux-x86_64.sh

# Initialize Conda
ENV PATH=/opt/conda/bin:$PATH
RUN conda update -n base -c defaults conda && \
    conda config --add channels conda-forge

# Create a conda environment from the environment.yml file
RUN conda env create -f environment.yml -n {env_name}

# "Activate" the environment for all later layers and at runtime by putting
# it first on PATH (a bare `RUN conda run -n {env_name}` has no lasting effect
# because each RUN executes in a fresh shell).
ENV CONDA_DEFAULT_ENV={env_name}
ENV PATH=/opt/conda/envs/{env_name}/bin:$PATH
"""

# Dockerfile used when dependencies were captured with pip. A virtual
# environment is created and populated from the captured requirements.txt.
PIP_DOCKERFILE_TEMPLATE = """
FROM {base_image_name}
ADD requirements.txt .

# Create a virtual environment
RUN python -m venv {env_name}

# `RUN . {env_name}/bin/activate` would not persist across layers, so invoke
# the venv's own pip directly and expose the venv on PATH for runtime.
RUN {env_name}/bin/pip install --no-cache-dir -r requirements.txt
ENV PATH=/{env_name}/bin:$PATH
"""
| 53 | + |
| 54 | + |
def capture_local_environment(
    image_name: str = "sm-local-capture",
    env_name: str = "saved_local_env",
    package_manager: str = "pip",
    deploy_to_ecr: bool = False,
    base_image_name: Optional[str] = None,
    job_conda_env: Optional[str] = None,
    additional_dependencies: Optional[str] = None,
    ecr_repo_name: Optional[str] = None,
    boto_session: Optional[boto3.Session] = None,
):
    """
    Capture all dependency packages installed in the local environment and build a docker image.

    When using this utility method, the docker daemon must be active in the environment.
    Please note that this is an experimental feature. This utility function is not able to detect
    the package compatibility between platforms. It is also not able to detect dependency
    conflicts between the local environment and the additional dependencies.

    Args:
        image_name (str): The name of the docker image.
        env_name (str): The name of the virtual environment to be activated in the image,
            defaults to "saved_local_env".
        package_manager (str): The package manager, must be one of "conda" or "pip".
        deploy_to_ecr (bool): Whether to deploy the docker image to AWS ECR, defaults to False.
            If set to True, the AWS credentials must be configured in the environment.
        base_image_name (Optional[str]): If provided will be used as the base image, else the
            utility will evaluate from local environment in following manner:
            1. If package manager is conda, it will use ubuntu:latest.
            2. If package manager is pip, it is resolved to base python image with the same
               python version as the environment running the local code.
        job_conda_env (Optional[str]): If set, the dependencies will be captured from this
            specific conda env, otherwise the dependencies will be the installed packages in
            the current active environment. This parameter is only valid when the package
            manager is conda.
        additional_dependencies (Optional[str]): Either the path to a dependencies file (conda
            environment.yml OR pip requirements.txt file). Regardless of this setting the
            utility will automatically generate the dependencies file corresponding to the
            current active environment's snapshot, and the additional dependencies are merged
            into it.
        ecr_repo_name (Optional[str]): The AWS ECR repo to push the docker image. If not
            specified, it will use image_name as the ECR repo name. This parameter is only
            valid when deploy_to_ecr is True.
        boto_session (Optional[boto3.Session]): The boto3 session with AWS account info. If not
            provided, a new boto session will be created.

    Raises:
        ValueError: The package manager or the additional dependencies file type is not
            supported.
        docker.errors.DockerException: Error while fetching server API version:
            The docker engine is not running in your environment.
        docker.errors.BuildError: The docker failed to build the image. The most likely reason
            is: 1) Some packages are not supported in the base image. 2) There are dependency
            conflicts between your local environment and additional dependencies.
        botocore.exceptions.ClientError: AWS credentials are not configured.
    """

    if package_manager == "conda":
        # Snapshot the requested conda env (or the active one) into a portable
        # environment.yml; --no-builds strips platform-specific build strings.
        if job_conda_env:
            subprocess.run(
                f"conda env export -n {job_conda_env} > {ENVIRONMENT_YML_PATH} --no-builds",
                shell=True,
                check=True,
            )
        else:
            subprocess.run(
                f"conda env export > {ENVIRONMENT_YML_PATH} --no-builds", shell=True, check=True
            )

        if additional_dependencies:
            if not additional_dependencies.endswith((".yml", ".txt")):
                raise ValueError(
                    "When package manager is conda, additional dependencies file must be a yml file or a txt file."
                )
            if additional_dependencies.endswith(".yml"):
                _merge_environment_ymls(
                    env_name,
                    ENVIRONMENT_YML_PATH,
                    additional_dependencies,
                    ENVIRONMENT_YML_PATH,
                )
            else:
                _merge_environment_yml_with_requirement_txt(
                    env_name,
                    ENVIRONMENT_YML_PATH,
                    additional_dependencies,
                    ENVIRONMENT_YML_PATH,
                )

        if not base_image_name:
            base_image_name = "ubuntu:latest"
        dockerfile_contents = CONDA_DOCKERFILE_TEMPLATE.format(
            base_image_name=base_image_name,
            env_name=env_name,
        )
    elif package_manager == "pip":
        # `pip list --format=freeze` snapshots the active interpreter's packages.
        subprocess.run(f"pip list --format=freeze > {REQUIREMENT_TXT_PATH}", shell=True, check=True)

        if additional_dependencies:
            if not additional_dependencies.endswith(".txt"):
                raise ValueError(
                    "When package manager is pip, additional dependencies file must be a txt file."
                )
            with open(additional_dependencies, "r") as f:
                additional_requirements = f.read()
            # The freeze output ends with a newline, so a plain append is safe.
            with open(REQUIREMENT_TXT_PATH, "a") as f:
                f.write(additional_requirements)
            logger.info("Merged requirements file saved to %s", REQUIREMENT_TXT_PATH)

        if not base_image_name:
            # Default to the python image matching the interpreter running this code.
            version = sys.version_info
            base_image_name = f"python:{version.major}.{version.minor}.{version.micro}"
        dockerfile_contents = PIP_DOCKERFILE_TEMPLATE.format(
            base_image_name=base_image_name,
            env_name=env_name,
        )

    else:
        raise ValueError(
            "The provided package manager is not supported. Use conda or pip as the package manager."
        )

    # Write the rendered Dockerfile into the build context (/tmp, alongside the
    # generated dependency files referenced by the Dockerfile's ADD lines).
    with open(DOCKERFILE_PATH, "w") as f:
        f.write(dockerfile_contents)

    client = docker.from_env()
    # docker-py expects `dockerfile` as a path relative to the build context,
    # not an absolute path.
    _image, logs = client.images.build(
        path="/tmp",
        dockerfile="Dockerfile",
        rm=True,
        tag=image_name,
    )
    for log in logs:
        logger.info(log.get("stream", "").strip())
    logger.info("Docker image %s built successfully", image_name)

    if deploy_to_ecr:
        if boto_session is None:
            boto_session = boto3.Session()
        _push_image_to_ecr(image_name, ecr_repo_name, boto_session)
| 190 | + |
| 191 | + |
def _merge_environment_ymls(env_name: str, env_file1: str, env_file2: str, output_file: str):
    """
    Merge two environment.yml files and save to a new environment.yml file.

    Conda dependencies from both files are concatenated (env_file1 first), pip
    dependencies from both files are collected into a single trailing
    ``{"pip": [...]}`` entry, and channels are de-duplicated.

    Args:
        env_name (str): The name of the virtual environment to be activated in the image.
        env_file1 (str): The path of the first environment.yml file.
        env_file2 (str): The path of the second environment.yml file.
        output_file (str): The path of the output environment.yml file.
    """

    # Load the YAML files
    with open(env_file1, "r") as f:
        env1 = yaml.safe_load(f)
    with open(env_file2, "r") as f:
        env2 = yaml.safe_load(f)

    # Combine dependencies and channels from both files
    dependencies = []
    pip_dependencies = []
    channels = set()

    for env in (env1, env2):
        for dep in env.get("dependencies", []):
            if isinstance(dep, str):
                # Conda package spec, e.g. 'python=3.7'
                dependencies.append(dep)
            elif isinstance(dep, dict):
                # Pip package list, e.g. {'pip': ['requests>=2.22.0']}
                pip_dependencies.extend(dep.get("pip", []))
        channels.update(env.get("channels", []))

    if pip_dependencies:
        dependencies.append({"pip": pip_dependencies})

    # Sort the channels so the merged file is deterministic (set iteration
    # order is arbitrary between runs).
    merged_env = {"name": env_name, "channels": sorted(channels), "dependencies": dependencies}

    with open(output_file, "w") as f:
        yaml.dump(merged_env, f, sort_keys=False)

    logger.info("Merged environment file saved to '%s'", output_file)
| 236 | + |
| 237 | + |
def _merge_environment_yml_with_requirement_txt(
    env_name: str, env_file: str, req_txt: str, output_file: str
):
    """
    Merge an environment.yml file with a requirements.txt file and save to a new environment.yml file.

    Conda dependencies are kept as-is; pip dependencies from the environment.yml
    and all non-comment lines of the requirements.txt are collected into a single
    trailing ``{"pip": [...]}`` entry.

    Args:
        env_name (str): The name of the virtual environment to be activated in the image.
        env_file (str): The path of the environment.yml file.
        req_txt (str): The path of the requirements.txt file.
        output_file (str): The path of the output environment.yml file.
    """
    # Load the files
    with open(env_file, "r") as f:
        env = yaml.safe_load(f)
    with open(req_txt, "r") as f:
        requirements = f.read().splitlines()

    # Combine pip dependencies from both files
    dependencies = []
    pip_dependencies = []

    for dep in env.get("dependencies", []):
        if isinstance(dep, str):
            # Conda package spec, e.g. 'python=3.7'
            dependencies.append(dep)
        elif isinstance(dep, dict):
            # Pip package list, e.g. {'pip': ['requests>=2.22.0']}
            pip_dependencies.extend(dep.get("pip", []))

    # Skip blank lines and comment lines from the requirements file.
    for req in requirements:
        if req and not req.startswith("#"):
            pip_dependencies.append(req)

    if pip_dependencies:
        dependencies.append({"pip": pip_dependencies})

    # Use .get so an environment.yml without a channels section (possible for
    # minimal exports) does not raise KeyError.
    merged_env = {
        "name": env_name,
        "channels": env.get("channels", []),
        "dependencies": dependencies,
    }

    with open(output_file, "w") as f:
        yaml.dump(merged_env, f, sort_keys=False)

    logger.info("Merged environment file saved to '%s'", output_file)
| 282 | + |
| 283 | + |
def _push_image_to_ecr(
    image_name: str, ecr_repo_name: Optional[str], boto_session: Optional[boto3.Session]
):
    """
    Push the docker image to AWS ECR.

    Args:
        image_name (str): The name of the docker image.
        ecr_repo_name (Optional[str]): The AWS ECR repo to push the docker image; falls back
            to image_name when not provided.
        boto_session (Optional[boto3.Session]): The boto3 session whose credentials and region
            are used for the STS and ECR calls.
    """
    region = boto_session.region_name
    aws_account_id = boto_session.client("sts", region_name=region).get_caller_identity()["Account"]
    # Create the ECR client from the caller-supplied session so its credentials
    # and region are honored (a bare boto3.client would use the default session).
    ecr_client = boto_session.client("ecr", region_name=region)

    # Authenticate Docker with ECR
    registry_url = f"{aws_account_id}.dkr.ecr.{region}.amazonaws.com"
    docker_login_cmd = (
        f"aws ecr get-login-password --region {region} "
        f"| docker login --username AWS --password-stdin {aws_account_id}.dkr.ecr.{region}.amazonaws.com"
    )
    subprocess.run(docker_login_cmd, shell=True, check=True)

    # Create a new ECR repository (if it doesn't already exist)
    ecr_repo_name = ecr_repo_name or image_name
    try:
        ecr_client.create_repository(repositoryName=ecr_repo_name)
    except ecr_client.exceptions.RepositoryAlreadyExistsException:
        pass

    # Tag the local Docker image
    ecr_image_uri = f"{registry_url}/{ecr_repo_name}:latest"
    docker_tag_cmd = f"docker tag {image_name}:latest {ecr_image_uri}"
    subprocess.run(docker_tag_cmd, shell=True, check=True)

    # Push the Docker image to ECR
    docker_push_cmd = f"docker push {ecr_image_uri}"
    subprocess.run(docker_push_cmd, shell=True, check=True)

    logger.info("Image %s pushed to %s", image_name, ecr_image_uri)
0 commit comments