
Commit 43b63e6

Merge branch 'master' into retry-assoc-tests
2 parents b5969ce + 67fa816 commit 43b63e6

11 files changed, +212 -42 lines

src/sagemaker/image_uri_config/clarify.json (+2 -1)

@@ -25,7 +25,8 @@
       "us-east-1": "205585389593",
       "us-east-2": "211330385671",
       "us-west-1": "740489534195",
-      "us-west-2": "306415355426"
+      "us-west-2": "306415355426",
+      "us-gov-west-1": "598674086554"
     },
     "repository": "sagemaker-clarify-processing"
   }

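With this registry entry in place, the Clarify processing image should resolve in GovCloud the same way it does in commercial regions. A minimal sketch, assuming "1.0" is the version key used elsewhere in this config file:

from sagemaker import image_uris

# Resolve the Clarify processing image in the newly added GovCloud region.
uri = image_uris.retrieve(framework="clarify", region="us-gov-west-1", version="1.0")
print(uri)  # expected to reference account 598674086554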
src/sagemaker/image_uri_config/ray-pytorch.json (+20)

@@ -21,6 +21,26 @@
       },
       "repository": "sagemaker-rl-ray-container",
       "tag_prefix": "ray-0.8.5-torch"
+    },
+    "1.6.0": {
+      "py_versions": ["py36"],
+      "registries": {
+        "ap-northeast-1": "462105765813",
+        "ap-northeast-2": "462105765813",
+        "ap-south-1": "462105765813",
+        "ap-southeast-1": "462105765813",
+        "ap-southeast-2": "462105765813",
+        "ca-central-1": "462105765813",
+        "eu-central-1": "462105765813",
+        "eu-west-1": "462105765813",
+        "eu-west-2": "462105765813",
+        "us-east-1": "462105765813",
+        "us-east-2": "462105765813",
+        "us-west-1": "462105765813",
+        "us-west-2": "462105765813"
+      },
+      "repository": "sagemaker-rl-ray-container",
+      "tag_prefix": "ray-1.6.0-torch"
     }
   }
 }

src/sagemaker/image_uri_config/ray-tensorflow.json (+20)

@@ -165,6 +165,26 @@
       },
       "repository": "sagemaker-rl-ray-container",
       "tag_prefix": "ray-0.8.5-tf"
+    },
+    "1.6.0": {
+      "py_versions": ["py37"],
+      "registries": {
+        "ap-northeast-1": "462105765813",
+        "ap-northeast-2": "462105765813",
+        "ap-south-1": "462105765813",
+        "ap-southeast-1": "462105765813",
+        "ap-southeast-2": "462105765813",
+        "ca-central-1": "462105765813",
+        "eu-central-1": "462105765813",
+        "eu-west-1": "462105765813",
+        "eu-west-2": "462105765813",
+        "us-east-1": "462105765813",
+        "us-east-2": "462105765813",
+        "us-west-1": "462105765813",
+        "us-west-2": "462105765813"
+      },
+      "repository": "sagemaker-rl-ray-container",
+      "tag_prefix": "ray-1.6.0-tf"
     }
   }
 }

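Both Ray 1.6.0 entries point at the same account in every listed region. A rough sketch of how such an entry is typically resolved, assuming these config files are consumed through image_uris.retrieve with the config-file name as the framework key:

from sagemaker import image_uris

# Resolve the Ray 1.6.0 TensorFlow RL image for a CPU instance
# (region and instance type are placeholders).
uri = image_uris.retrieve(
    framework="ray-tensorflow",
    region="us-west-2",
    version="1.6.0",
    py_version="py37",
    instance_type="ml.c5.2xlarge",
)
print(uri)  # e.g. .../sagemaker-rl-ray-container:ray-1.6.0-tf-py37-cpu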
src/sagemaker/rl/estimator.py (+2 -1)

@@ -45,6 +45,7 @@
         "0.6": {"tensorflow": "1.12"},
         "0.8.2": {"tensorflow": "2.1"},
         "0.8.5": {"tensorflow": "2.1", "pytorch": "1.5"},
+        "1.6.0": {"tensorflow": "2.5.0", "pytorch": "1.8.1"},
     },
 }

@@ -69,7 +70,7 @@ class RLEstimator(Framework):

     COACH_LATEST_VERSION_TF = "0.11.1"
     COACH_LATEST_VERSION_MXNET = "0.11.0"
-    RAY_LATEST_VERSION = "0.8.5"
+    RAY_LATEST_VERSION = "1.6.0"

     def __init__(
         self,

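With the version map and RAY_LATEST_VERSION updated, an estimator can target the new toolkit release. A minimal sketch, where the role ARN, entry point, and instance settings are placeholders:

from sagemaker.rl import RLEstimator, RLToolkit, RLFramework

# Train with the newly supported Ray 1.6.0 / PyTorch 1.8.1 RL container.
estimator = RLEstimator(
    entry_point="train_ray.py",
    toolkit=RLToolkit.RAY,
    toolkit_version="1.6.0",
    framework=RLFramework.PYTORCH,
    role="arn:aws:iam::123456789012:role/SageMakerRole",
    instance_type="ml.c5.2xlarge",
    instance_count=1,
)
# estimator.fit() would then pull the ray-1.6.0-torch image from the new registry entries.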
src/sagemaker/xgboost/model.py (+5 -2)

@@ -145,13 +145,16 @@ def prepare_container_def(self, instance_type=None, accelerator_type=None):
            )

        deploy_key_prefix = model_code_key_prefix(self.key_prefix, self.name, deploy_image)
-        self._upload_code(deploy_key_prefix)
+        self._upload_code(key_prefix=deploy_key_prefix, repack=self.enable_network_isolation())
        deploy_env = dict(self.env)
        deploy_env.update(self._framework_env_vars())

        if self.model_server_workers:
            deploy_env[MODEL_SERVER_WORKERS_PARAM_NAME.upper()] = str(self.model_server_workers)
-        return sagemaker.container_def(deploy_image, self.model_data, deploy_env)
+        model_data = (
+            self.repacked_model_data if self.enable_network_isolation() else self.model_data
+        )
+        return sagemaker.container_def(deploy_image, model_data, deploy_env)

    def serving_image_uri(self, region_name, instance_type):
        """Create a URI for the serving image.

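The net effect is that a network-isolated XGBoostModel now repacks the inference code into the model archive and serves that archive instead of the original model_data. An illustrative sketch, where the bucket, role, entry point, and framework version are placeholders:

from sagemaker.xgboost import XGBoostModel

model = XGBoostModel(
    model_data="s3://my-bucket/xgboost/model.tar.gz",
    role="arn:aws:iam::123456789012:role/SageMakerRole",
    entry_point="inference.py",
    framework_version="1.3-1",
    enable_network_isolation=True,
)
container_def = model.prepare_container_def(instance_type="ml.m5.xlarge")
# With isolation enabled, container_def["ModelDataUrl"] is the repacked
# model.tar.gz (code bundled inside); otherwise it is the original model_data.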
tests/data/ray_cartpole/train_ray.py (+2 -2)

@@ -5,10 +5,10 @@
 from ray.tune.logger import pretty_print

 # Based on https://github.com/ray-project/ray/blob/master/doc/source/rllib-training.rst#python-api
-ray.init(log_to_driver=False, webui_host="127.0.0.1")
+ray.init(log_to_driver=False)
 config = ppo.DEFAULT_CONFIG.copy()
 config["num_gpus"] = int(os.environ.get("SM_NUM_GPUS", 0))
-checkpoint_dir = os.environ.get("SM_MODEL_DIR", "/Users/nadzeya/gym")
+checkpoint_dir = os.environ.get("SM_MODEL_DIR", "/tmp")
 config["num_workers"] = 1
 agent = ppo.PPOTrainer(config=config, env="CartPole-v0")


tests/data/xgboost_abalone/abalone.py (new file, +50)

@@ -0,0 +1,50 @@
+import argparse
+import os
+
+from sagemaker_xgboost_container.data_utils import get_dmatrix
+
+import xgboost as xgb
+
+model_filename = "xgboost-model"
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    # Sagemaker specific arguments. Defaults are set in the environment variables.
+    parser.add_argument(
+        "--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    )
+    parser.add_argument(
+        "--train",
+        type=str,
+        default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/abalone"),
+    )
+
+    args, _ = parser.parse_known_args()
+
+    dtrain = get_dmatrix(args.train, "libsvm")
+
+    params = {
+        "max_depth": 5,
+        "eta": 0.2,
+        "gamma": 4,
+        "min_child_weight": 6,
+        "subsample": 0.7,
+        "verbosity": 2,
+        "objective": "reg:squarederror",
+        "tree_method": "auto",
+        "predictor": "auto",
+    }
+
+    booster = xgb.train(params=params, dtrain=dtrain, num_boost_round=50)
+    booster.save_model(args.model_dir + "/" + model_filename)
+
+
+def model_fn(model_dir):
+    """Deserialize and return fitted model.
+
+    Note that this should have the same name as the serialized model in the _xgb_train method
+    """
+    booster = xgb.Booster()
+    booster.load_model(os.path.join(model_dir, model_filename))
+    return booster

tests/integ/test_workflow.py (+47 -35)

@@ -18,6 +18,7 @@
 import subprocess
 import time
 import uuid
+import logging

 from contextlib import contextmanager
 import pytest
@@ -75,6 +76,7 @@
 from sagemaker.feature_store.feature_group import FeatureGroup, FeatureDefinition, FeatureTypeEnum
 from tests.integ import DATA_DIR
 from tests.integ.kms_utils import get_or_create_kms_key
+from tests.integ.retry import retries


 def ordered(obj):
@@ -1850,47 +1852,57 @@ def test_training_job_with_debugger_and_profiler(
         sagemaker_session=sagemaker_session,
     )

-    try:
-        response = pipeline.create(role)
-        create_arn = response["PipelineArn"]
-
-        execution = pipeline.start()
-        response = execution.describe()
-        assert response["PipelineArn"] == create_arn
-
+    for _ in retries(
+        max_retry_count=5,
+        exception_message_prefix="Waiting for a successful execution of pipeline",
+        seconds_to_sleep=10,
+    ):
         try:
-            execution.wait(delay=10, max_attempts=60)
-        except WaiterError:
-            pass
-        execution_steps = execution.list_steps()
+            response = pipeline.create(role)
+            create_arn = response["PipelineArn"]

-        assert len(execution_steps) == 1
-        assert execution_steps[0].get("FailureReason", "") == ""
-        assert execution_steps[0]["StepName"] == "pytorch-train"
-        assert execution_steps[0]["StepStatus"] == "Succeeded"
+            execution = pipeline.start()
+            response = execution.describe()
+            assert response["PipelineArn"] == create_arn

-        training_job_arn = execution_steps[0]["Metadata"]["TrainingJob"]["Arn"]
-        job_description = sagemaker_session.sagemaker_client.describe_training_job(
-            TrainingJobName=training_job_arn.split("/")[1]
-        )
+            try:
+                execution.wait(delay=10, max_attempts=60)
+            except WaiterError:
+                pass
+            execution_steps = execution.list_steps()

-        for index, rule in enumerate(rules):
-            config = job_description["DebugRuleConfigurations"][index]
-            assert config["RuleConfigurationName"] == rule.name
-            assert config["RuleEvaluatorImage"] == rule.image_uri
-            assert config["VolumeSizeInGB"] == 0
-            assert (
-                config["RuleParameters"]["rule_to_invoke"] == rule.rule_parameters["rule_to_invoke"]
+            assert len(execution_steps) == 1
+            failure_reason = execution_steps[0].get("FailureReason", "")
+            if failure_reason != "":
+                logging.error(f"Pipeline execution failed with error: {failure_reason}.Retrying..")
+                continue
+            assert execution_steps[0]["StepName"] == "pytorch-train"
+            assert execution_steps[0]["StepStatus"] == "Succeeded"
+
+            training_job_arn = execution_steps[0]["Metadata"]["TrainingJob"]["Arn"]
+            job_description = sagemaker_session.sagemaker_client.describe_training_job(
+                TrainingJobName=training_job_arn.split("/")[1]
             )
-        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()

-        assert job_description["ProfilingStatus"] == "Enabled"
-        assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
-    finally:
-        try:
-            pipeline.delete()
-        except Exception:
-            pass
+            for index, rule in enumerate(rules):
+                config = job_description["DebugRuleConfigurations"][index]
+                assert config["RuleConfigurationName"] == rule.name
+                assert config["RuleEvaluatorImage"] == rule.image_uri
+                assert config["VolumeSizeInGB"] == 0
+                assert (
+                    config["RuleParameters"]["rule_to_invoke"]
+                    == rule.rule_parameters["rule_to_invoke"]
+                )
+            assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
+
+            assert job_description["ProfilingStatus"] == "Enabled"
+            assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
+            break
+        finally:
+            try:
+                pipeline.delete()
+            except Exception:
+                pass


 def test_two_processing_job_depends_on(

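The retries helper imported from tests.integ.retry is not shown in this diff. A rough sketch of the generator shape the rewritten test relies on (names taken from the call site, implementation assumed): yield once per attempt, sleep between attempts, and raise if the loop is never exited with break.

import time


def retries(max_retry_count, exception_message_prefix, seconds_to_sleep=2):
    """Yield attempt numbers; raise once all attempts are used without a break."""
    for attempt in range(1, max_retry_count + 1):
        yield attempt
        time.sleep(seconds_to_sleep)
    raise Exception(
        "{} exceeded {} retry attempts".format(exception_message_prefix, max_retry_count)
    )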
tests/integ/test_xgboost.py (+34)

@@ -14,6 +14,8 @@

 import os
 import pytest
+from sagemaker.utils import unique_name_from_base
+from sagemaker.xgboost import XGBoost
 from sagemaker.xgboost.processing import XGBoostProcessor
 from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout
@@ -48,3 +50,35 @@ def test_framework_processing_job_with_deps(
            inputs=[],
            wait=True,
        )
+
+
+def test_training_with_network_isolation(
+    sagemaker_session,
+    xgboost_latest_version,
+    xgboost_latest_py_version,
+    cpu_instance_type,
+):
+    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
+        base_job_name = "test-network-isolation-xgboost"
+
+        xgboost = XGBoost(
+            entry_point=os.path.join(DATA_DIR, "xgboost_abalone", "abalone.py"),
+            role=ROLE,
+            instance_type=cpu_instance_type,
+            instance_count=1,
+            framework_version=xgboost_latest_version,
+            py_version=xgboost_latest_py_version,
+            base_job_name=base_job_name,
+            sagemaker_session=sagemaker_session,
+            enable_network_isolation=True,
+        )
+
+        train_input = xgboost.sagemaker_session.upload_data(
+            path=os.path.join(DATA_DIR, "xgboost_abalone", "abalone"),
+            key_prefix="integ-test-data/xgboost_abalone/abalone",
+        )
+        job_name = unique_name_from_base(base_job_name)
+        xgboost.fit(inputs={"train": train_input}, job_name=job_name)
+        assert sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=job_name)[
+            "EnableNetworkIsolation"
+        ]

tests/unit/sagemaker/image_uris/test_rl.py (+9 -1)

@@ -86,7 +86,15 @@ def test_ray_tf(ray_tensorflow_version):


 def _expected_ray_tf_uri(ray_tf_version, processor):
-    if Version(ray_tf_version) > Version("0.6.5"):
+    if Version(ray_tf_version) > Version("1.0.0"):
+        return expected_uris.framework_uri(
+            "sagemaker-rl-ray-container",
+            _version_for_tag("ray", ray_tf_version, "tf", True),
+            RL_ACCOUNT,
+            py_version="py37",
+            processor=processor,
+        )
+    elif Version(ray_tf_version) > Version("0.6.5"):
         return expected_uris.framework_uri(
             "sagemaker-rl-ray-container",
             _version_for_tag("ray", ray_tf_version, "tf", True),

tests/unit/test_xgboost.py (+21)

@@ -22,6 +22,7 @@
 from packaging.version import Version


+from sagemaker.fw_utils import UploadedCode
 from sagemaker.xgboost import XGBoost, XGBoostModel, XGBoostPredictor


@@ -180,6 +181,26 @@ def test_create_model(sagemaker_session, xgboost_framework_version):
     assert model_values["Image"] == default_image_uri


+@patch("sagemaker.model.FrameworkModel._upload_code")
+def test_create_model_with_network_isolation(upload, sagemaker_session, xgboost_framework_version):
+    source_dir = "s3://mybucket/source"
+    repacked_model_data = "s3://mybucket/prefix/model.tar.gz"
+
+    xgboost_model = XGBoostModel(
+        model_data=source_dir,
+        role=ROLE,
+        sagemaker_session=sagemaker_session,
+        entry_point=SCRIPT_PATH,
+        framework_version=xgboost_framework_version,
+        enable_network_isolation=True,
+    )
+    xgboost_model.uploaded_code = UploadedCode(s3_prefix=repacked_model_data, script_name="script")
+    xgboost_model.repacked_model_data = repacked_model_data
+    model_values = xgboost_model.prepare_container_def(CPU)
+    assert model_values["Environment"]["SAGEMAKER_SUBMIT_DIRECTORY"] == "/opt/ml/model/code"
+    assert model_values["ModelDataUrl"] == repacked_model_data
+
+
 @patch("sagemaker.estimator.name_from_base")
 def test_create_model_from_estimator(name_from_base, sagemaker_session, xgboost_framework_version):
     container_log_level = '"logging.INFO"'
