Set AWS_DEFAULT_REGION environment variable (aws#889)

navaj0 · Zhankuil · Namrata Madan · commit c6d1c1a1c573 · 2023-04-18T13:18:20.000-07:00
Co-authored-by: Zhankui Lu <zhankuil@amazon.com>
diff --git a/src/sagemaker/remote_function/client.py b/src/sagemaker/remote_function/client.py
@@ -222,7 +222,7 @@ def _submit_worker(executor):
     def has_work_to_do():
         return (
             len(executor._pending_request_queue) > 0
-            and len(executor._running_jobs) < executor.max_parallel_job
+            and len(executor._running_jobs) < executor.max_parallel_jobs
         )
 
     try:
@@ -315,7 +315,7 @@ def __init__(
         job_conda_env: str = None,
         job_name_prefix: str = None,
         keep_alive_period_in_seconds: int = 0,
-        max_parallel_job: int = 1,
+        max_parallel_jobs: int = 1,
         max_retry_attempts: int = 1,
         max_runtime_in_seconds: int = 24 * 60 * 60,
         role: str = None,
@@ -346,7 +346,7 @@ def __init__(
             job_name_prefix (str): Prefix used to identify the underlying sagemaker job.
             keep_alive_period_in_seconds (int): The duration of time in seconds to retain configured
                 resources in a warm pool for subsequent training jobs. Defaults to 0.
-            max_parallel_job (int): Maximal number of jobs that run in parallel. Default to 1.
+            max_parallel_jobs (int): Maximal number of jobs that run in parallel. Default to 1.
             max_retry_attempts (int): Max number of times the job is retried on
                 InternalServerFailure.Defaults to 1.
             max_runtime_in_seconds (int): Timeout in seconds for training.  After this amount of
@@ -370,10 +370,10 @@ def __init__(
             volume_size (int): Size in GB of the storage volume to use for storing input and output
                 data. Defaults to 30.
         """
-        self.max_parallel_job = max_parallel_job
+        self.max_parallel_jobs = max_parallel_jobs
 
-        if self.max_parallel_job <= 0:
-            raise ValueError("max_parallel_job must be greater than 0.")
+        if self.max_parallel_jobs <= 0:
+            raise ValueError("max_parallel_jobs must be greater than 0.")
 
         self.job_settings = _JobSettings(
             dependencies=dependencies,
diff --git a/src/sagemaker/remote_function/job.py b/src/sagemaker/remote_function/job.py
@@ -131,8 +131,12 @@ def __init__(
         )
         self.sagemaker_session = sagemaker_session or Session()
 
-        self.environment_variables = self._get_from_config(
-            environment_variables, config_schema.ENVIRONMENT_VARIABLES
+        self.environment_variables = {"AWS_DEFAULT_REGION": self.sagemaker_session.boto_region_name}
+
+        self.environment_variables.update(
+            self._get_from_config(
+                environment_variables, config_schema.ENVIRONMENT_VARIABLES, default={}
+            )
         )
 
         _image_uri = self._get_from_config(image_uri, config_schema.IMAGE_URI)
diff --git a/tests/integ/sagemaker/remote_function/test_decorator.py b/tests/integ/sagemaker/remote_function/test_decorator.py
@@ -17,8 +17,6 @@
 import pytest
 import os
 import pandas as pd
-import boto3
-from sagemaker import Session
 from sagemaker.experiments.run import Run, load_run
 from tests.integ.sagemaker.experiments.helpers import cleanup_exp_resources
 from sagemaker.experiments.trial_component import _TrialComponent
@@ -273,6 +271,7 @@ def test_with_non_existent_dependencies(
         dependencies=dependencies_path,
         instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
+        keep_alive_period_in_seconds=30,
     )
     def divide(x, y):
         return x / y
@@ -293,6 +292,7 @@ def test_with_incompatible_dependencies(
         dependencies=dependencies_path,
         instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
+        keep_alive_period_in_seconds=30,
     )
     def mul_ten(df: pd.DataFrame):
         return df.mul(10)
@@ -318,16 +318,11 @@ def test_decorator_with_exp_and_run_names_passed_to_remote_function(
         image_uri=dummy_container_without_error,
         instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
+        keep_alive_period_in_seconds=30,
     )
     def train(exp_name, run_name):
-        boto_session = boto3.Session(region_name=os.environ["AWS_REGION"])
-        sagemaker_session = Session(boto_session=boto_session)
 
-        with Run(
-            experiment_name=exp_name,
-            run_name=run_name,
-            sagemaker_session=sagemaker_session,
-        ) as run:
+        with Run(experiment_name=exp_name, run_name=run_name) as run:
             print(f"Experiment name: {run.experiment_name}")
             print(f"Run name: {run.run_name}")
             print(f"Trial component name: {run._trial_component.trial_component_name}")
@@ -380,6 +375,7 @@ def test_decorator_load_run_inside_remote_function(
         image_uri=dummy_container_without_error,
         instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
+        keep_alive_period_in_seconds=30,
     )
     def train():
         with load_run() as run:
@@ -419,14 +415,12 @@ def test_decorator_with_nested_exp_run(
         image_uri=dummy_container_without_error,
         instance_type=cpu_instance_type,
         sagemaker_session=sagemaker_session,
+        keep_alive_period_in_seconds=30,
     )
     def train(exp_name, run_name):
-        boto_session = boto3.Session(region_name=os.environ["AWS_REGION"])
-        sagemaker_session = Session(boto_session=boto_session)
         with Run(
             experiment_name=exp_name,
             run_name=run_name,
-            sagemaker_session=sagemaker_session,
         ) as run:
             print(f"Experiment name: {run.experiment_name}")
             print(f"Run name: {run.run_name}")
diff --git a/tests/integ/sagemaker/remote_function/test_executor.py b/tests/integ/sagemaker/remote_function/test_executor.py
@@ -30,7 +30,7 @@ def cube(x):
         return x * x * x
 
     with RemoteExecutor(
-        max_parallel_job=1,
+        max_parallel_jobs=1,
         role=ROLE,
         image_uri=dummy_container_without_error,
         instance_type=cpu_instance_type,
@@ -59,7 +59,7 @@ def power(a, b):
         return a**b
 
     with RemoteExecutor(
-        max_parallel_job=1,
+        max_parallel_jobs=1,
         role=ROLE,
         image_uri=dummy_container_without_error,
         instance_type=cpu_instance_type,
@@ -98,7 +98,7 @@ def cube(x):
 
     with cleanup_exp_resources(exp_names=[exp_name], sagemaker_session=sagemaker_session):
         with RemoteExecutor(
-            max_parallel_job=1,
+            max_parallel_jobs=1,
             role=ROLE,
             image_uri=dummy_container_without_error,
             instance_type=cpu_instance_type,
@@ -162,7 +162,7 @@ def cube(x):
             sagemaker_session=sagemaker_session,
         ):
             with RemoteExecutor(
-                max_parallel_job=1,
+                max_parallel_jobs=1,
                 role=ROLE,
                 image_uri=dummy_container_without_error,
                 instance_type=cpu_instance_type,
@@ -213,7 +213,7 @@ def square(x):
             sagemaker_session=sagemaker_session,
         ):
             with RemoteExecutor(
-                max_parallel_job=2,
+                max_parallel_jobs=2,
                 role=ROLE,
                 image_uri=dummy_container_without_error,
                 instance_type=cpu_instance_type,
@@ -227,7 +227,7 @@ def square(x):
         assert results[1] == 16
 
         with RemoteExecutor(
-            max_parallel_job=2,
+            max_parallel_jobs=2,
             role=ROLE,
             image_uri=dummy_container_without_error,
             instance_type=cpu_instance_type,
diff --git a/tests/unit/sagemaker/remote_function/test_client.py b/tests/unit/sagemaker/remote_function/test_client.py
@@ -453,14 +453,14 @@ def decorated_function(a, b, c=1, *, d, e, f=3):
 
 def test_executor_invalid_arguments():
     with pytest.raises(ValueError):
-        with RemoteExecutor(max_parallel_job=0, s3_root_uri="s3://bucket/") as e:
+        with RemoteExecutor(max_parallel_jobs=0, s3_root_uri="s3://bucket/") as e:
             e.submit(job_function, 1, 2, c=3, d=4)
 
 
 @patch("sagemaker.remote_function.client._JobSettings")
 def test_executor_submit_after_shutdown(*args):
     with pytest.raises(RuntimeError):
-        with RemoteExecutor(max_parallel_job=1, s3_root_uri="s3://bucket/") as e:
+        with RemoteExecutor(max_parallel_jobs=1, s3_root_uri="s3://bucket/") as e:
             pass
         e.submit(job_function, 1, 2, c=3, d=4)
 
@@ -476,7 +476,7 @@ def test_executor_submit_happy_case(mock_start, mock_job_settings, parallelism):
     mock_job_4 = create_mock_job("job_4", COMPLETED_TRAINING_JOB)
     mock_start.side_effect = [mock_job_1, mock_job_2, mock_job_3, mock_job_4]
 
-    with RemoteExecutor(max_parallel_job=parallelism, s3_root_uri="s3://bucket/") as e:
+    with RemoteExecutor(max_parallel_jobs=parallelism, s3_root_uri="s3://bucket/") as e:
         future_1 = e.submit(job_function, 1, 2, c=3, d=4)
         future_2 = e.submit(job_function, 5, 6, c=7, d=8)
         future_3 = e.submit(job_function, 9, 10, c=11, d=12)
@@ -514,7 +514,7 @@ def test_executor_submit_with_run(mock_start, mock_job_settings, run_obj):
     run_info = _RunInfo(run_obj.experiment_name, run_obj.run_name)
 
     with run_obj:
-        with RemoteExecutor(max_parallel_job=2, s3_root_uri="s3://bucket/") as e:
+        with RemoteExecutor(max_parallel_jobs=2, s3_root_uri="s3://bucket/") as e:
             future_1 = e.submit(job_function, 1, 2, c=3, d=4)
             future_2 = e.submit(job_function, 5, 6, c=7, d=8)
 
@@ -530,7 +530,7 @@ def test_executor_submit_with_run(mock_start, mock_job_settings, run_obj):
     assert future_1.done()
     assert future_2.done()
 
-    with RemoteExecutor(max_parallel_job=2, s3_root_uri="s3://bucket/") as e:
+    with RemoteExecutor(max_parallel_jobs=2, s3_root_uri="s3://bucket/") as e:
         with run_obj:
             future_3 = e.submit(job_function, 9, 10, c=11, d=12)
             future_4 = e.submit(job_function, 13, 14, c=15, d=16)
@@ -556,7 +556,7 @@ def test_executor_submit_enforcing_max_parallel_jobs(mock_start, *args):
     mock_job_2 = create_mock_job("job_2", INPROGRESS_TRAINING_JOB)
     mock_start.side_effect = [mock_job_1, mock_job_2]
 
-    e = RemoteExecutor(max_parallel_job=1, s3_root_uri="s3://bucket/")
+    e = RemoteExecutor(max_parallel_jobs=1, s3_root_uri="s3://bucket/")
     future_1 = e.submit(job_function, 1, 2, c=3, d=4)
     future_2 = e.submit(job_function, 5, 6, c=7, d=8)
 
@@ -588,7 +588,7 @@ def test_executor_fails_to_start_job(mock_start, *args):
 
     mock_start.side_effect = [TypeError(), mock_job]
 
-    with RemoteExecutor(max_parallel_job=1, s3_root_uri="s3://bucket/") as e:
+    with RemoteExecutor(max_parallel_jobs=1, s3_root_uri="s3://bucket/") as e:
         future_1 = e.submit(job_function, 1, 2, c=3, d=4)
         future_2 = e.submit(job_function, 5, 6, c=7, d=8)
 
@@ -606,7 +606,7 @@ def test_executor_submit_and_cancel(mock_start, *args):
     mock_job_2 = create_mock_job("job_2", INPROGRESS_TRAINING_JOB)
     mock_start.side_effect = [mock_job_1, mock_job_2]
 
-    e = RemoteExecutor(max_parallel_job=1, s3_root_uri="s3://bucket/")
+    e = RemoteExecutor(max_parallel_jobs=1, s3_root_uri="s3://bucket/")
 
     # submit first job and stay in progress
     future_1 = e.submit(job_function, 1, 2, c=3, d=4)
@@ -645,7 +645,7 @@ def test_executor_describe_job_throttled_temporarily(mock_start, *args):
     ]
     mock_start.return_value = mock_job
 
-    with RemoteExecutor(max_parallel_job=1, s3_root_uri="s3://bucket/") as e:
+    with RemoteExecutor(max_parallel_jobs=1, s3_root_uri="s3://bucket/") as e:
         # submit first job
         future_1 = e.submit(job_function, 1, 2, c=3, d=4)
         # submit second job
@@ -663,7 +663,7 @@ def test_executor_describe_job_failed_permanently(mock_start, *args):
     mock_job.describe.side_effect = RuntimeError()
     mock_start.return_value = mock_job
 
-    with RemoteExecutor(max_parallel_job=1, s3_root_uri="s3://bucket/") as e:
+    with RemoteExecutor(max_parallel_jobs=1, s3_root_uri="s3://bucket/") as e:
         # submit first job
         future_1 = e.submit(job_function, 1, 2, c=3, d=4)
         # submit second job
@@ -695,7 +695,7 @@ def test_executor_describe_job_failed_permanently(mock_start, *args):
 @patch("sagemaker.remote_function.client._JobSettings")
 def test_executor_submit_invalid_function_args(mock_job_settings, args, kwargs, error_message):
     with pytest.raises(TypeError) as e:
-        with RemoteExecutor(max_parallel_job=1, s3_root_uri="s3://bucket/") as executor:
+        with RemoteExecutor(max_parallel_jobs=1, s3_root_uri="s3://bucket/") as executor:
             executor.submit(job_function, *args, **kwargs)
     assert error_message in str(e.value)
 
@@ -1063,7 +1063,7 @@ def test_executor_map_happy_case(mock_deserialized, mock_start, mock_job_setting
 
     mock_deserialized.side_effect = [1, 16]
 
-    with RemoteExecutor(max_parallel_job=1, s3_root_uri="s3://bucket/") as executor:
+    with RemoteExecutor(max_parallel_jobs=1, s3_root_uri="s3://bucket/") as executor:
         results = executor.map(job_function2, [1, 2], [3, 4])
 
     mock_start.assert_has_calls(
@@ -1095,7 +1095,7 @@ def test_executor_map_with_run(mock_deserialized, mock_start, mock_job_settings,
     run_info = _RunInfo(run_obj.experiment_name, run_obj.run_name)
 
     with run_obj:
-        with RemoteExecutor(max_parallel_job=2, s3_root_uri="s3://bucket/") as executor:
+        with RemoteExecutor(max_parallel_jobs=2, s3_root_uri="s3://bucket/") as executor:
             results_12 = executor.map(job_function2, [1, 2], [3, 4])
 
     mock_start.assert_has_calls(
@@ -1112,7 +1112,7 @@ def test_executor_map_with_run(mock_deserialized, mock_start, mock_job_settings,
 
     mock_deserialized.side_effect = [1, 16]
 
-    with RemoteExecutor(max_parallel_job=2, s3_root_uri="s3://bucket/") as executor:
+    with RemoteExecutor(max_parallel_jobs=2, s3_root_uri="s3://bucket/") as executor:
         with run_obj:
             results_34 = executor.map(job_function2, [1, 2], [3, 4])
 
diff --git a/tests/unit/sagemaker/remote_function/test_job.py b/tests/unit/sagemaker/remote_function/test_job.py
@@ -114,6 +114,21 @@ def job_function(a, b=1, *, c, d=3):
 @patch("sagemaker.remote_function.job.Session", return_value=mock_session())
 @patch("sagemaker.remote_function.job.get_execution_role", return_value=DEFAULT_ROLE_ARN)
 def test_sagemaker_config_job_settings(get_execution_role, session, monkeypatch):
+
+    job_settings = _JobSettings(image_uri="image_uri", instance_type="ml.m5.xlarge")
+    assert job_settings.image_uri == "image_uri"
+    assert job_settings.s3_root_uri == f"s3://{BUCKET}"
+    assert job_settings.role == DEFAULT_ROLE_ARN
+    assert job_settings.environment_variables == {"AWS_DEFAULT_REGION": "us-west-2"}
+    assert job_settings.include_local_workdir is False
+    assert job_settings.instance_type == "ml.m5.xlarge"
+
+
+@patch("sagemaker.remote_function.job.Session", return_value=mock_session())
+@patch("sagemaker.remote_function.job.get_execution_role", return_value=DEFAULT_ROLE_ARN)
+def test_sagemaker_config_job_settings_with_configuration_file(
+    get_execution_role, session, monkeypatch
+):
     monkeypatch.setenv(
         "SAGEMAKER_DEFAULT_CONFIG_OVERRIDE", os.path.join(DATA_DIR, "remote_function")
     )
@@ -125,7 +140,10 @@ def test_sagemaker_config_job_settings(get_execution_role, session, monkeypatch)
     assert job_settings.tags == [("someTagKey", "someTagValue"), ("someTagKey2", "someTagValue2")]
     assert job_settings.vpc_config == {"Subnets": ["subnet-1234"], "SecurityGroupIds": ["sg123"]}
     assert job_settings.dependencies == "path/to/requirements.txt"
-    assert job_settings.environment_variables == {"EnvVarKey": "EnvVarValue"}
+    assert job_settings.environment_variables == {
+        "AWS_DEFAULT_REGION": "us-west-2",
+        "EnvVarKey": "EnvVarValue",
+    }
     assert job_settings.job_conda_env == "my_conda_env"
     assert job_settings.include_local_workdir is False
     assert job_settings.volume_kms_key == "someVolumeKmsKey"
@@ -276,6 +294,7 @@ def test_start(
             InstanceType="ml.m5.large",
             KeepAlivePeriodInSeconds=0,
         ),
+        Environment={"AWS_DEFAULT_REGION": "us-west-2"},
     )
 
 
@@ -292,7 +311,7 @@ def test_start_with_complete_job_settings(
 
     job_settings = _JobSettings(
         dependencies="path/to/dependencies/req.txt",
-        environment_variables={"REGION": "us-west-2"},
+        environment_variables={"AWS_DEFAULT_REGION": "us-east-2"},
         image_uri=IMAGE,
         s3_root_uri=S3_URI,
         s3_kms_key=KMS_KEY_ARN,
@@ -392,7 +411,7 @@ def test_start_with_complete_job_settings(
             KeepAlivePeriodInSeconds=120,
         ),
         VpcConfig=dict(Subnets=["subnet"], SecurityGroupIds=["sg"]),
-        Environment={"REGION": "us-west-2"},
+        Environment={"AWS_DEFAULT_REGION": "us-east-2"},
     )
 
 

Original file line number	Diff line number	Diff line change
`@@ -131,8 +131,12 @@ def __init__(`
`131`	`131`	`)`
`132`	`132`	`self.sagemaker_session = sagemaker_session or Session()`
`133`	`133`
`134`		`- self.environment_variables = self._get_from_config(`
`135`		`- environment_variables, config_schema.ENVIRONMENT_VARIABLES`
	`134`	`+ self.environment_variables = {"AWS_DEFAULT_REGION": self.sagemaker_session.boto_region_name}`
	`135`	`+`
	`136`	`+ self.environment_variables.update(`
	`137`	`+ self._get_from_config(`
	`138`	`+ environment_variables, config_schema.ENVIRONMENT_VARIABLES, default={}`
	`139`	`+ )`
`136`	`140`	`)`
`137`	`141`
`138`	`142`	`_image_uri = self._get_from_config(image_uri, config_schema.IMAGE_URI)`