
Commit 43b63e6

Merge branch 'master' into retry-assoc-tests
2 parents b5969ce + 67fa816 commit 43b63e6

11 files changed, +212 -42 lines

src/sagemaker/image_uri_config/clarify.json (+2 -1)

@@ -25,7 +25,8 @@
       "us-east-1": "205585389593",
       "us-east-2": "211330385671",
       "us-west-1": "740489534195",
-      "us-west-2": "306415355426"
+      "us-west-2": "306415355426",
+      "us-gov-west-1": "598674086554"
     },
     "repository": "sagemaker-clarify-processing"
   }

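With this registry entry in place, the Clarify processing image should resolve in GovCloud the same way it does in commercial regions. A minimal sketch, assuming "1.0" is the version key used elsewhere in this config file:

from sagemaker import image_uris

# Resolve the Clarify processing image in the newly added GovCloud region.
uri = image_uris.retrieve(framework="clarify", region="us-gov-west-1", version="1.0")
print(uri)  # expected to reference account 598674086554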
src/sagemaker/image_uri_config/ray-pytorch.json (+20)

@@ -21,6 +21,26 @@
       },
       "repository": "sagemaker-rl-ray-container",
       "tag_prefix": "ray-0.8.5-torch"
+    },
+    "1.6.0": {
+      "py_versions": ["py36"],
+      "registries": {
+        "ap-northeast-1": "462105765813",
+        "ap-northeast-2": "462105765813",
+        "ap-south-1": "462105765813",
+        "ap-southeast-1": "462105765813",
+        "ap-southeast-2": "462105765813",
+        "ca-central-1": "462105765813",
+        "eu-central-1": "462105765813",
+        "eu-west-1": "462105765813",
+        "eu-west-2": "462105765813",
+        "us-east-1": "462105765813",
+        "us-east-2": "462105765813",
+        "us-west-1": "462105765813",
+        "us-west-2": "462105765813"
+      },
+      "repository": "sagemaker-rl-ray-container",
+      "tag_prefix": "ray-1.6.0-torch"
     }
   }
 }

src/sagemaker/image_uri_config/ray-tensorflow.json (+20)

@@ -165,6 +165,26 @@
       },
       "repository": "sagemaker-rl-ray-container",
       "tag_prefix": "ray-0.8.5-tf"
+    },
+    "1.6.0": {
+      "py_versions": ["py37"],
+      "registries": {
+        "ap-northeast-1": "462105765813",
+        "ap-northeast-2": "462105765813",
+        "ap-south-1": "462105765813",
+        "ap-southeast-1": "462105765813",
+        "ap-southeast-2": "462105765813",
+        "ca-central-1": "462105765813",
+        "eu-central-1": "462105765813",
+        "eu-west-1": "462105765813",
+        "eu-west-2": "462105765813",
+        "us-east-1": "462105765813",
+        "us-east-2": "462105765813",
+        "us-west-1": "462105765813",
+        "us-west-2": "462105765813"
+      },
+      "repository": "sagemaker-rl-ray-container",
+      "tag_prefix": "ray-1.6.0-tf"
     }
   }
 }

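Both Ray 1.6.0 entries point at the same account in every listed region. A rough sketch of how such an entry is typically resolved, assuming these config files are consumed through image_uris.retrieve with the config-file name as the framework key:

from sagemaker import image_uris

# Resolve the Ray 1.6.0 TensorFlow RL image for a CPU instance
# (region and instance type are placeholders).
uri = image_uris.retrieve(
    framework="ray-tensorflow",
    region="us-west-2",
    version="1.6.0",
    py_version="py37",
    instance_type="ml.c5.2xlarge",
)
print(uri)  # e.g. .../sagemaker-rl-ray-container:ray-1.6.0-tf-py37-cpu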
src/sagemaker/rl/estimator.py (+2 -1)

@@ -45,6 +45,7 @@
         "0.6": {"tensorflow": "1.12"},
         "0.8.2": {"tensorflow": "2.1"},
         "0.8.5": {"tensorflow": "2.1", "pytorch": "1.5"},
+        "1.6.0": {"tensorflow": "2.5.0", "pytorch": "1.8.1"},
     },
 }

@@ -69,7 +70,7 @@ class RLEstimator(Framework):

     COACH_LATEST_VERSION_TF = "0.11.1"
     COACH_LATEST_VERSION_MXNET = "0.11.0"
-    RAY_LATEST_VERSION = "0.8.5"
+    RAY_LATEST_VERSION = "1.6.0"

     def __init__(
         self,

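With the version map and RAY_LATEST_VERSION updated, an estimator can target the new toolkit release. A minimal sketch, where the role ARN, entry point, and instance settings are placeholders:

from sagemaker.rl import RLEstimator, RLToolkit, RLFramework

# Train with the newly supported Ray 1.6.0 / PyTorch 1.8.1 RL container.
estimator = RLEstimator(
    entry_point="train_ray.py",
    toolkit=RLToolkit.RAY,
    toolkit_version="1.6.0",
    framework=RLFramework.PYTORCH,
    role="arn:aws:iam::123456789012:role/SageMakerRole",
    instance_type="ml.c5.2xlarge",
    instance_count=1,
)
# estimator.fit() would then pull the ray-1.6.0-torch image from the new registry entries.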
src/sagemaker/xgboost/model.py (+5 -2)

@@ -145,13 +145,16 @@ def prepare_container_def(self, instance_type=None, accelerator_type=None):
            )

        deploy_key_prefix = model_code_key_prefix(self.key_prefix, self.name, deploy_image)
-        self._upload_code(deploy_key_prefix)
+        self._upload_code(key_prefix=deploy_key_prefix, repack=self.enable_network_isolation())
        deploy_env = dict(self.env)
        deploy_env.update(self._framework_env_vars())

        if self.model_server_workers:
            deploy_env[MODEL_SERVER_WORKERS_PARAM_NAME.upper()] = str(self.model_server_workers)
-        return sagemaker.container_def(deploy_image, self.model_data, deploy_env)
+        model_data = (
+            self.repacked_model_data if self.enable_network_isolation() else self.model_data
+        )
+        return sagemaker.container_def(deploy_image, model_data, deploy_env)

    def serving_image_uri(self, region_name, instance_type):
        """Create a URI for the serving image.

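The net effect is that a network-isolated XGBoostModel now repacks the inference code into the model archive and serves that archive instead of the original model_data. An illustrative sketch, where the bucket, role, entry point, and framework version are placeholders:

from sagemaker.xgboost import XGBoostModel

model = XGBoostModel(
    model_data="s3://my-bucket/xgboost/model.tar.gz",
    role="arn:aws:iam::123456789012:role/SageMakerRole",
    entry_point="inference.py",
    framework_version="1.3-1",
    enable_network_isolation=True,
)
container_def = model.prepare_container_def(instance_type="ml.m5.xlarge")
# With isolation enabled, container_def["ModelDataUrl"] is the repacked
# model.tar.gz (code bundled inside); otherwise it is the original model_data.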
tests/data/ray_cartpole/train_ray.py (+2 -2)

@@ -5,10 +5,10 @@
 from ray.tune.logger import pretty_print

 # Based on https://github.com/ray-project/ray/blob/master/doc/source/rllib-training.rst#python-api
-ray.init(log_to_driver=False, webui_host="127.0.0.1")
+ray.init(log_to_driver=False)
 config = ppo.DEFAULT_CONFIG.copy()
 config["num_gpus"] = int(os.environ.get("SM_NUM_GPUS", 0))
-checkpoint_dir = os.environ.get("SM_MODEL_DIR", "/Users/nadzeya/gym")
+checkpoint_dir = os.environ.get("SM_MODEL_DIR", "/tmp")
 config["num_workers"] = 1
 agent = ppo.PPOTrainer(config=config, env="CartPole-v0")


tests/data/xgboost_abalone/abalone.py (new file, +50)

@@ -0,0 +1,50 @@
+import argparse
+import os
+
+from sagemaker_xgboost_container.data_utils import get_dmatrix
+
+import xgboost as xgb
+
+model_filename = "xgboost-model"
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    # Sagemaker specific arguments. Defaults are set in the environment variables.
+    parser.add_argument(
+        "--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model")
+    )
+    parser.add_argument(
+        "--train",
+        type=str,
+        default=os.environ.get("SM_CHANNEL_TRAIN", "/opt/ml/input/data/abalone"),
+    )
+
+    args, _ = parser.parse_known_args()
+
+    dtrain = get_dmatrix(args.train, "libsvm")
+
+    params = {
+        "max_depth": 5,
+        "eta": 0.2,
+        "gamma": 4,
+        "min_child_weight": 6,
+        "subsample": 0.7,
+        "verbosity": 2,
+        "objective": "reg:squarederror",
+        "tree_method": "auto",
+        "predictor": "auto",
+    }
+
+    booster = xgb.train(params=params, dtrain=dtrain, num_boost_round=50)
+    booster.save_model(args.model_dir + "/" + model_filename)
+
+
+def model_fn(model_dir):
+    """Deserialize and return fitted model.
+
+    Note that this should have the same name as the serialized model in the _xgb_train method
+    """
+    booster = xgb.Booster()
+    booster.load_model(os.path.join(model_dir, model_filename))
+    return booster

tests/integ/test_workflow.py (+47 -35)

@@ -18,6 +18,7 @@
 import subprocess
 import time
 import uuid
+import logging

 from contextlib import contextmanager
 import pytest
@@ -75,6 +76,7 @@
 from sagemaker.feature_store.feature_group import FeatureGroup, FeatureDefinition, FeatureTypeEnum
 from tests.integ import DATA_DIR
 from tests.integ.kms_utils import get_or_create_kms_key
+from tests.integ.retry import retries


 def ordered(obj):
@@ -1850,47 +1852,57 @@ def test_training_job_with_debugger_and_profiler(
         sagemaker_session=sagemaker_session,
     )

-    try:
-        response = pipeline.create(role)
-        create_arn = response["PipelineArn"]
-
-        execution = pipeline.start()
-        response = execution.describe()
-        assert response["PipelineArn"] == create_arn
-
+    for _ in retries(
+        max_retry_count=5,
+        exception_message_prefix="Waiting for a successful execution of pipeline",
+        seconds_to_sleep=10,
+    ):
         try:
-            execution.wait(delay=10, max_attempts=60)
-        except WaiterError:
-            pass
-        execution_steps = execution.list_steps()
+            response = pipeline.create(role)
+            create_arn = response["PipelineArn"]

-        assert len(execution_steps) == 1
-        assert execution_steps[0].get("FailureReason", "") == ""
-        assert execution_steps[0]["StepName"] == "pytorch-train"
-        assert execution_steps[0]["StepStatus"] == "Succeeded"
+            execution = pipeline.start()
+            response = execution.describe()
+            assert response["PipelineArn"] == create_arn

-        training_job_arn = execution_steps[0]["Metadata"]["TrainingJob"]["Arn"]
-        job_description = sagemaker_session.sagemaker_client.describe_training_job(
-            TrainingJobName=training_job_arn.split("/")[1]
-        )
+            try:
+                execution.wait(delay=10, max_attempts=60)
+            except WaiterError:
+                pass
+            execution_steps = execution.list_steps()

-        for index, rule in enumerate(rules):
-            config = job_description["DebugRuleConfigurations"][index]
-            assert config["RuleConfigurationName"] == rule.name
-            assert config["RuleEvaluatorImage"] == rule.image_uri
-            assert config["VolumeSizeInGB"] == 0
-            assert (
-                config["RuleParameters"]["rule_to_invoke"] == rule.rule_parameters["rule_to_invoke"]
+            assert len(execution_steps) == 1
+            failure_reason = execution_steps[0].get("FailureReason", "")
+            if failure_reason != "":
+                logging.error(f"Pipeline execution failed with error: {failure_reason}.Retrying..")
+                continue
+            assert execution_steps[0]["StepName"] == "pytorch-train"
+            assert execution_steps[0]["StepStatus"] == "Succeeded"
+
+            training_job_arn = execution_steps[0]["Metadata"]["TrainingJob"]["Arn"]
+            job_description = sagemaker_session.sagemaker_client.describe_training_job(
+                TrainingJobName=training_job_arn.split("/")[1]
             )
-        assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()

-        assert job_description["ProfilingStatus"] == "Enabled"
-        assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
-    finally:
-        try:
-            pipeline.delete()
-        except Exception:
-            pass
+            for index, rule in enumerate(rules):
+                config = job_description["DebugRuleConfigurations"][index]
+                assert config["RuleConfigurationName"] == rule.name
+                assert config["RuleEvaluatorImage"] == rule.image_uri
+                assert config["VolumeSizeInGB"] == 0
+                assert (
+                    config["RuleParameters"]["rule_to_invoke"]
+                    == rule.rule_parameters["rule_to_invoke"]
+                )
+            assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
+
+            assert job_description["ProfilingStatus"] == "Enabled"
+            assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
+            break
+        finally:
+            try:
+                pipeline.delete()
+            except Exception:
+                pass


 def test_two_processing_job_depends_on(

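The retries helper imported from tests.integ.retry is not shown in this diff. A rough sketch of the generator shape the rewritten test relies on (names taken from the call site, implementation assumed): yield once per attempt, sleep between attempts, and raise if the loop is never exited with break.

import time


def retries(max_retry_count, exception_message_prefix, seconds_to_sleep=2):
    """Yield attempt numbers; raise once all attempts are used without a break."""
    for attempt in range(1, max_retry_count + 1):
        yield attempt
        time.sleep(seconds_to_sleep)
    raise Exception(
        "{} exceeded {} retry attempts".format(exception_message_prefix, max_retry_count)
    )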
tests/integ/test_xgboost.py (+34)

@@ -14,6 +14,8 @@

 import os
 import pytest
+from sagemaker.utils import unique_name_from_base
+from sagemaker.xgboost import XGBoost
 from sagemaker.xgboost.processing import XGBoostProcessor
 from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout
@@ -48,3 +50,35 @@ def test_framework_processing_job_with_deps(
            inputs=[],
            wait=True,
        )
+
+
+def test_training_with_network_isolation(
+    sagemaker_session,
+    xgboost_latest_version,
+    xgboost_latest_py_version,
+    cpu_instance_type,
+):
+    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
+        base_job_name = "test-network-isolation-xgboost"
+
+        xgboost = XGBoost(
+            entry_point=os.path.join(DATA_DIR, "xgboost_abalone", "abalone.py"),
+            role=ROLE,
+            instance_type=cpu_instance_type,
+            instance_count=1,
+            framework_version=xgboost_latest_version,
+            py_version=xgboost_latest_py_version,
+            base_job_name=base_job_name,
+            sagemaker_session=sagemaker_session,
+            enable_network_isolation=True,
+        )
+
+        train_input = xgboost.sagemaker_session.upload_data(
+            path=os.path.join(DATA_DIR, "xgboost_abalone", "abalone"),
+            key_prefix="integ-test-data/xgboost_abalone/abalone",
+        )
+        job_name = unique_name_from_base(base_job_name)
+        xgboost.fit(inputs={"train": train_input}, job_name=job_name)
+        assert sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=job_name)[
+            "EnableNetworkIsolation"
+        ]

tests/unit/sagemaker/image_uris/test_rl.py (+9 -1)

@@ -86,7 +86,15 @@ def test_ray_tf(ray_tensorflow_version):


 def _expected_ray_tf_uri(ray_tf_version, processor):
-    if Version(ray_tf_version) > Version("0.6.5"):
+    if Version(ray_tf_version) > Version("1.0.0"):
+        return expected_uris.framework_uri(
+            "sagemaker-rl-ray-container",
+            _version_for_tag("ray", ray_tf_version, "tf", True),
+            RL_ACCOUNT,
+            py_version="py37",
+            processor=processor,
+        )
+    elif Version(ray_tf_version) > Version("0.6.5"):
         return expected_uris.framework_uri(
             "sagemaker-rl-ray-container",
             _version_for_tag("ray", ray_tf_version, "tf", True),

tests/unit/test_xgboost.py (+21)

@@ -22,6 +22,7 @@
 from packaging.version import Version


+from sagemaker.fw_utils import UploadedCode
 from sagemaker.xgboost import XGBoost, XGBoostModel, XGBoostPredictor


@@ -180,6 +181,26 @@ def test_create_model(sagemaker_session, xgboost_framework_version):
     assert model_values["Image"] == default_image_uri


+@patch("sagemaker.model.FrameworkModel._upload_code")
+def test_create_model_with_network_isolation(upload, sagemaker_session, xgboost_framework_version):
+    source_dir = "s3://mybucket/source"
+    repacked_model_data = "s3://mybucket/prefix/model.tar.gz"
+
+    xgboost_model = XGBoostModel(
+        model_data=source_dir,
+        role=ROLE,
+        sagemaker_session=sagemaker_session,
+        entry_point=SCRIPT_PATH,
+        framework_version=xgboost_framework_version,
+        enable_network_isolation=True,
+    )
+    xgboost_model.uploaded_code = UploadedCode(s3_prefix=repacked_model_data, script_name="script")
+    xgboost_model.repacked_model_data = repacked_model_data
+    model_values = xgboost_model.prepare_container_def(CPU)
+    assert model_values["Environment"]["SAGEMAKER_SUBMIT_DIRECTORY"] == "/opt/ml/model/code"
+    assert model_values["ModelDataUrl"] == repacked_model_data
+
+
 @patch("sagemaker.estimator.name_from_base")
 def test_create_model_from_estimator(name_from_base, sagemaker_session, xgboost_framework_version):
     container_log_level = '"logging.INFO"'
