Skip to content

fix: multi model integration test to create ECR repo with unique names to allow independent parallel executions #1172

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 16, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 30 additions & 8 deletions tests/integ/test_multidatamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@

import base64
import os
import requests

import botocore
import docker
import numpy
import pytest
Expand All @@ -30,7 +32,6 @@
from tests.integ.retry import retries
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name

ALGORITHM_NAME = "sagemaker-multimodel-integ-test"
ROLE = "SageMakerRole"
PRETRAINED_MODEL_PATH_1 = "customer_a/dummy_model.tar.gz"
PRETRAINED_MODEL_PATH_2 = "customer_b/dummy_model.tar.gz"
Expand All @@ -47,27 +48,36 @@ def container_image(sagemaker_session):
"sts", region_name=region, endpoint_url=utils.sts_regional_endpoint(region)
)
account_id = sts_client.get_caller_identity()["Account"]
algorithm_name = "sagemaker-multimodel-integ-test-{}".format(sagemaker_timestamp())
ecr_image = "{account}.dkr.ecr.{region}.amazonaws.com/{algorithm_name}:latest".format(
account=account_id, region=region, algorithm_name=ALGORITHM_NAME
account=account_id, region=region, algorithm_name=algorithm_name
)

# Build and tag docker image locally
docker_client = docker.from_env()
image, build_log = docker_client.images.build(
path=os.path.join(DATA_DIR, "multimodel", "container"), tag=ALGORITHM_NAME, rm=True
path=os.path.join(DATA_DIR, "multimodel", "container"), tag=algorithm_name, rm=True
)
image.tag(ecr_image, tag="latest")

# Create AWS ECR and push the local docker image to it
_create_repository(ecr_client, ALGORITHM_NAME)
_create_repository(ecr_client, algorithm_name)
username, password = _ecr_login(ecr_client)
docker_client.images.push(ecr_image, auth_config={"username": username, "password": password})
# Retry docker image push
for _ in retries(3, "Upload docker image to ECR repo", seconds_to_sleep=10):
try:
docker_client.images.push(
ecr_image, auth_config={"username": username, "password": password}
)
break
except requests.exceptions.ConnectionError:
# This can happen when we try to create multiple repositories in parallel, so we retry
pass

yield ecr_image

# Delete repository after the multi model integration tests complete
repo = ecr_client.describe_repositories(repositoryNames=[ALGORITHM_NAME])
if "repositories" in repo:
ecr_client.delete_repository(repositoryName=ALGORITHM_NAME, force=True)
_delete_repository(ecr_client, algorithm_name)


def _create_repository(ecr_client, repository_name):
Expand All @@ -87,6 +97,18 @@ def _create_repository(ecr_client, repository_name):
raise


def _delete_repository(ecr_client, repository_name):
    """
    Deletes an Amazon ECR repository if it exists. After the integration test
    completes we remove the repository that was created during setup.

    Args:
        ecr_client: boto3 ECR client used to issue the delete call.
        repository_name (str): Name of the repository to delete.

    The repository is force-deleted, so any images it still contains are
    removed with it. A repository that is already gone is not an error.
    """
    try:
        # delete_repository itself reports a missing repository, so no separate
        # describe_repositories lookup is needed (and no check-then-act race).
        ecr_client.delete_repository(repositoryName=repository_name, force=True)
    except ecr_client.exceptions.RepositoryNotFoundException:
        # botocore builds modelled exception classes per client at runtime;
        # they are exposed on ``client.exceptions`` and are not attributes of
        # the ``botocore.errorfactory`` module. ECR signals a missing
        # repository with RepositoryNotFoundException.
        pass


def _ecr_login(ecr_client):
""" Get a login credentials for an ecr client.
"""
Expand Down