Skip to content

Commit 99a51ba

Browse files
qidewenwhenDewen Qi
and
Dewen Qi
committed
change: Simplify exp plus integ test configuration (aws#694)
Co-authored-by: Dewen Qi <[email protected]>
1 parent e1efdb5 commit 99a51ba

File tree

15 files changed

+648
-340
lines changed

15 files changed

+648
-340
lines changed

.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@ env/
3030
.vscode/
3131
**/tmp
3232
.python-version
33-
**/_repack_model.py
3433
**/_repack_script_launcher.sh
3534
tests/data/experiment/docker/boto
3635
tests/data/experiment/docker/sagemaker-dev.tar.gz
3736
tests/data/**/_repack_model.py
37+
tests/data/experiment/resources/sagemaker-beta-1.0.tar.gz

src/sagemaker/experiments/_api_types.py

+16
Original file line numberDiff line numberDiff line change
@@ -224,3 +224,19 @@ class TrialComponentSearchResult(_base_types.ApiObject):
224224
source_detail = None
225225
tags = None
226226
parents = None
227+
228+
229+
class TrialSummary(_base_types.ApiObject):
    """Condensed description of a single trial.

    Attributes:
        trial_arn (str): ARN that identifies the trial.
        trial_name (str): Name of the trial.
        creation_time (datetime): Timestamp at which the trial was created.
        last_modified_time (datetime): Timestamp of the latest modification to the trial.
    """

    # Every field starts out as None at class level.
    trial_arn, trial_name = None, None
    creation_time, last_modified_time = None, None

src/sagemaker/experiments/experiment.py

+75
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,11 @@
1313
"""Contains the SageMaker Experiment class."""
1414
from __future__ import absolute_import
1515

16+
import time
17+
1618
from sagemaker.apiutils import _base_types
19+
from sagemaker.experiments.trial import _Trial
20+
from sagemaker.experiments.trial_component import _TrialComponent
1721

1822

1923
class _Experiment(_base_types.Record):
@@ -44,6 +48,8 @@ class _Experiment(_base_types.Record):
4448
_boto_update_members = ["experiment_name", "description", "display_name"]
4549
_boto_delete_members = ["experiment_name"]
4650

51+
_MAX_DELETE_ALL_ATTEMPTS = 3
52+
4753
def save(self):
4854
"""Save the state of this Experiment to SageMaker.
4955
@@ -160,3 +166,72 @@ def _load_or_create(
160166
sagemaker_session=sagemaker_session,
161167
)
162168
return experiment
169+
170+
def list_trials(self, created_before=None, created_after=None, sort_by=None, sort_order=None):
    """Return the trials of this experiment that satisfy the given filters.

    The call is forwarded to ``_Trial.list`` with this experiment's name and
    SageMaker session filled in.

    Args:
        created_before (datetime.datetime): Only return trials created before this
            instant (default: None).
        created_after (datetime.datetime): Only return trials created after this
            instant (default: None).
        sort_by (str): Property used to sort the results. One of 'Name', 'CreationTime'
            (default: None).
        sort_order (str): One of 'Ascending', or 'Descending' (default: None).

    Returns:
        collections.Iterator[experiments._api_types.TrialSummary] :
            An iterator over trials matching the criteria.
    """
    # Collect the keywords first so the delegation below stays a one-liner.
    list_kwargs = {
        "experiment_name": self.experiment_name,
        "created_before": created_before,
        "created_after": created_after,
        "sort_by": sort_by,
        "sort_order": sort_order,
        "sagemaker_session": self.sagemaker_session,
    }
    return _Trial.list(**list_kwargs)
194+
195+
def delete_all(self, action):
    """Recursively delete this experiment, its trials and their trial components.

    For each trial of the experiment, every trial component is deleted (with forced
    disassociation), then the trial itself; the experiment is deleted last. If any
    step raises, the whole sequence is started over, for at most
    ``_MAX_DELETE_ALL_ATTEMPTS`` attempts in total.

    Args:
        action (str): Must be the string '--force', which confirms the recursive
            deletion of the experiment and all its trials and trial components.

    Raises:
        ValueError: If ``action`` is not '--force'.
        Exception: If no attempt succeeded; chained from the last error seen.
    """
    if action != "--force":
        raise ValueError(
            "Must confirm with string '--force' in order to delete the experiment and "
            "associated trials, trial components."
        )

    most_recent_error = None
    attempts_made = 0
    while attempts_made != self._MAX_DELETE_ALL_ATTEMPTS:
        attempts_made += 1
        try:
            for summary in self.list_trials():
                trial = _Trial.load(
                    sagemaker_session=self.sagemaker_session,
                    trial_name=summary.trial_name,
                )
                components = trial.list_trial_components()  # pylint: disable=no-member
                for component_summary in components:
                    component = _TrialComponent.load(
                        sagemaker_session=self.sagemaker_session,
                        trial_component_name=component_summary.trial_component_name,
                    )
                    component.delete(force_disassociate=True)
                    # to prevent throttling
                    time.sleep(1.2)
                trial.delete()  # pylint: disable=no-member
                # to prevent throttling
                time.sleep(1.2)
            self.delete()
        except Exception as ex:  # pylint: disable=broad-except
            # Remember the failure and start the whole sequence over.
            most_recent_error = ex
        else:
            return

    raise Exception("Failed to delete, please try again.") from most_recent_error

src/sagemaker/experiments/trial.py

+47
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from __future__ import absolute_import
1515

1616
from sagemaker.apiutils import _base_types
17+
from sagemaker.experiments import _api_types
1718
from sagemaker.experiments.trial_component import _TrialComponent
1819

1920

@@ -117,6 +118,52 @@ def create(
117118
)
118119
return trial
119120

121+
@classmethod
def list(
    cls,
    experiment_name=None,
    trial_component_name=None,
    created_before=None,
    created_after=None,
    sort_by=None,
    sort_order=None,
    sagemaker_session=None,
):
    """Return the trials that satisfy the given filters.

    The request is issued through the parent class ``_list`` helper using the
    ``list_trials`` operation; each entry under ``TrialSummaries`` is converted with
    ``_api_types.TrialSummary.from_boto``.

    Args:
        experiment_name (str): Name of the experiment. If specified, only trials in
            the experiment will be returned (default: None).
        trial_component_name (str): Name of the trial component. If specified, only
            trials with this trial component name will be returned (default: None).
        created_before (datetime.datetime): Only return trials created before this
            instant (default: None).
        created_after (datetime.datetime): Only return trials created after this
            instant (default: None).
        sort_by (str): Property used to sort the results. One of 'Name', 'CreationTime'
            (default: None).
        sort_order (str): One of 'Ascending', or 'Descending' (default: None).
        sagemaker_session (sagemaker.session.Session): Session object which
            manages interactions with Amazon SageMaker APIs and any other
            AWS services needed. If not specified, one is created using the
            default AWS configuration chain.

    Returns:
        collections.Iterator[experiments._api_types.TrialSummary]: An iterator over trials
            matching the specified criteria.
    """
    # Keyword order mirrors the parameter order of this method.
    request_kwargs = {
        "experiment_name": experiment_name,
        "trial_component_name": trial_component_name,
        "created_before": created_before,
        "created_after": created_after,
        "sort_by": sort_by,
        "sort_order": sort_order,
        "sagemaker_session": sagemaker_session,
    }
    summary_factory = _api_types.TrialSummary.from_boto
    return super(_Trial, cls)._list(
        "list_trials", summary_factory, "TrialSummaries", **request_kwargs
    )
166+
120167
def add_trial_component(self, trial_component):
121168
"""Add the specified trial component to this trial.
122169

tests/__init__.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818
# TODO-experiment-plus: Remove this line, which loads the internal boto models.
1919
# The corresponding model jsons were generated from the coral model package and should
2020
# be updated regularly.
21-
normal_json = "file://./tests/data/experiment/sagemaker-2017-07-24.normal.json"
21+
normal_json = "file://./tests/data/experiment/resources/sagemaker-2017-07-24.normal.json"
2222
os.system(f"aws configure add-model --service-model {normal_json} --service-name sagemaker")
2323

24-
metrics_model_json = "file://./tests/data/experiment/sagemaker-metrics-2022-09-30.normal.json"
24+
metrics_model_json = (
25+
"file://./tests/data/experiment/resources/sagemaker-metrics-2022-09-30.normal.json"
26+
)
2527
os.system(
2628
f"aws configure add-model --service-model {metrics_model_json} --service-name sagemaker-metrics"
2729
)

tests/data/experiment/docker/Dockerfile

-50
This file was deleted.
+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
2+
# Download the AWS CLI v2 installer bundle, unpack it and run its installer.
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
3+
unzip awscliv2.zip
4+
./aws/install
5+
6+
# TODO: remove the boto model files once the Run API is released.
7+
# Register the service model JSON files from resources/ with the AWS CLI.
aws configure add-model --service-model file://resources/sagemaker-metrics-2022-09-30.normal.json --service-name sagemaker-metrics
8+
aws configure add-model --service-model file://resources/sagemaker-2017-07-24.normal.json --service-name sagemaker
9+
10+
# Install the SDK tarball from resources/, then run the job script.
pip install resources/sagemaker-beta-1.0.tar.gz
11+
python train_job_script_for_run_clz.py

tests/integ/sagemaker/experiments/conftest.py

+7-95
Original file line numberDiff line numberDiff line change
@@ -12,28 +12,20 @@
1212
# language governing permissions and limitations under the License.
1313
from __future__ import absolute_import
1414

15-
import base64
1615
import glob
1716
import logging
1817
import os
1918
import shutil
20-
import subprocess
21-
import sys
2219
import tempfile
2320
import time
2421
import uuid
2522

2623
import boto3
2724
import pytest
2825

29-
import docker
30-
31-
from tests.integ import lock
32-
from tests.integ.utils import create_repository
3326
from tests.integ import DATA_DIR
3427

3528
from sagemaker.experiments import trial_component, trial, experiment
36-
from sagemaker.s3 import S3Uploader
3729
from sagemaker.utils import retry_with_backoff
3830
from tests.integ.sagemaker.experiments.helpers import name, names
3931

@@ -137,96 +129,16 @@ def tempdir():
137129
shutil.rmtree(temp_dir)
138130

139131

140-
@pytest.fixture(scope="module")
141-
def bucket(sagemaker_session):
142-
return sagemaker_session.default_bucket()
143-
144-
145-
@pytest.fixture(scope="module")
146-
def training_input_s3_uri(sagemaker_session, tempdir, bucket):
147-
filepath = os.path.join(tempdir, name())
148-
with open(filepath, "w") as w:
149-
w.write("Hello World!")
150-
s3_uri = f"s3://{bucket}/experiments/training-input/{name()}"
151-
return S3Uploader.upload(
152-
local_path=filepath, desired_s3_uri=s3_uri, sagemaker_session=sagemaker_session
153-
)
154-
155-
156-
@pytest.fixture(scope="module")
157-
def training_output_s3_uri(bucket):
158-
return f"s3://{bucket}/experiments/training-output/"
159-
160-
161-
# TODO we should remove the boto model file once the Run API changes release
162-
BOTO_MODEL_LOCAL_PATH = os.path.join(DATA_DIR, "experiment", "sagemaker-2017-07-24.normal.json")
163-
METRICS_MODEL_LOCAL_PATH = os.path.join(
164-
DATA_DIR, "experiment", "sagemaker-metrics-2022-09-30.normal.json"
165-
)
166-
IMAGE_REPO_NAME = "sagemaker-experiments-test"
167-
IMAGE_VERSION = "1.0.92" # We should bump it up if need to update the docker image
168-
SM_SDK_TAR_NAME_IN_IMAGE = "sagemaker-dev.tar.gz"
169-
SM_BOTO_MODEL_PATH_IN_IMAGE = "boto/sagemaker-2017-07-24.normal.json"
170-
SM_METRICS_MODEL_PATH_IN_IMAGE = "boto/sagemaker-metrics-2022-09-30.normal.json"
132+
_EXP_PLUS_SDK_TAR = "sagemaker-beta-1.0.tar.gz"
171133

172134

173135
@pytest.fixture(scope="module")
174-
def docker_image(sagemaker_session):
175-
# requires docker to be running
176-
docker_client = docker.from_env()
177-
ecr_client = sagemaker_session.boto_session.client("ecr")
178-
179-
token = ecr_client.get_authorization_token()
180-
username, password = (
181-
base64.b64decode(token["authorizationData"][0]["authorizationToken"]).decode().split(":")
182-
)
183-
registry = token["authorizationData"][0]["proxyEndpoint"]
184-
repository_name = IMAGE_REPO_NAME
185-
tag = "{}/{}:{}".format(registry, repository_name, IMAGE_VERSION)[8:]
186-
docker_dir = os.path.join(DATA_DIR, "experiment", "docker")
187-
188-
with lock.lock():
189-
# initialize the docker image repository
190-
create_repository(ecr_client, repository_name)
191-
192-
# pull existing image for layer cache
193-
try:
194-
docker_client.images.pull(tag, auth_config={"username": username, "password": password})
195-
print("Docker image with tag {} already exists.".format(tag))
196-
return tag
197-
except docker.errors.NotFound:
198-
print("Docker image with tag {} does not exist. Will create one.".format(tag))
199-
200-
# copy boto model under docker dir
201-
os.makedirs(os.path.join(docker_dir, "boto"), exist_ok=True)
202-
shutil.copy(
203-
BOTO_MODEL_LOCAL_PATH,
204-
os.path.join(docker_dir, SM_BOTO_MODEL_PATH_IN_IMAGE),
205-
)
206-
shutil.copy(
207-
METRICS_MODEL_LOCAL_PATH,
208-
os.path.join(docker_dir, SM_METRICS_MODEL_PATH_IN_IMAGE),
209-
)
210-
211-
# generate sdk tar file from package and put it under docker dir
212-
subprocess.check_call([sys.executable, "setup.py", "sdist"])
213-
sdist_path = max(glob.glob("dist/sagemaker-*"), key=os.path.getctime)
214-
shutil.copy(sdist_path, os.path.join(docker_dir, SM_SDK_TAR_NAME_IN_IMAGE))
215-
216-
docker_client.images.build(
217-
path=docker_dir,
218-
dockerfile="Dockerfile",
219-
tag=tag,
220-
cache_from=[tag],
221-
buildargs={
222-
"library": SM_SDK_TAR_NAME_IN_IMAGE,
223-
"botomodel": SM_BOTO_MODEL_PATH_IN_IMAGE,
224-
"script": "scripts/train_job_script_for_run_clz.py",
225-
"metricsmodel": SM_METRICS_MODEL_PATH_IN_IMAGE,
226-
},
227-
)
228-
docker_client.images.push(tag, auth_config={"username": username, "password": password})
229-
return tag
136+
def job_resource_dir():
    """Build the SDK sdist and stage it in the experiment resources directory.

    Runs ``setup.py sdist`` from the current working directory, then copies the most
    recently created ``dist/sagemaker-*`` archive into
    ``<DATA_DIR>/experiment/resources`` under the name ``_EXP_PLUS_SDK_TAR``.

    Returns:
        str: Path of the resources directory that now contains the SDK tarball.

    Raises:
        subprocess.CalledProcessError: If building the source distribution fails.
    """
    # Imported locally so this function does not depend on the module's import block.
    import subprocess
    import sys

    resource_dir = os.path.join(DATA_DIR, "experiment/resources")

    # check_call fails loudly on a broken build instead of letting a stale archive in
    # dist/ be picked up; sys.executable pins the build to the interpreter running the
    # tests rather than whatever "python" resolves to on PATH.
    subprocess.check_call([sys.executable, "setup.py", "sdist"])

    sdist_path = max(glob.glob("dist/sagemaker-*"), key=os.path.getctime)
    shutil.copy(sdist_path, os.path.join(resource_dir, _EXP_PLUS_SDK_TAR))
    return resource_dir
230142

231143

232144
def _delete_associations(arn, sagemaker_session):

0 commit comments

Comments
 (0)