Skip to content

Commit 869497f

Browse files
committed
Merge remote-tracking branch 'origin/feature/new_fg_utils' into feature/new_fg_utils
2 parents c22540d + e2d6fee commit 869497f

File tree

3 files changed

+246
-43
lines changed

3 files changed

+246
-43
lines changed

src/sagemaker/experiments/run.py

+6
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,10 @@ def __enter__(self):
664664
if self._inside_load_context:
665665
raise RuntimeError(nested_with_err_msg_template.format("load_run"))
666666
self._inside_load_context = True
667+
if not self._inside_init_context:
668+
# Add to run context only if the load_run is called separately
669+
# rather than under a Run init context
670+
_RunContext.add_run_object(self)
667671
else:
668672
if _RunContext.get_current_run():
669673
raise RuntimeError(nested_with_err_msg_template.format("Run"))
@@ -692,6 +696,8 @@ def __exit__(self, exc_type, exc_value, exc_traceback):
692696
if self._in_load:
693697
self._inside_load_context = False
694698
self._in_load = False
699+
if not self._inside_init_context:
700+
_RunContext.drop_current_run()
695701
else:
696702
self._inside_init_context = False
697703
_RunContext.drop_current_run()

tests/integ/sagemaker/experiments/test_run.py

+99-40
Original file line numberDiff line numberDiff line change
@@ -170,11 +170,11 @@ def test_run_name_vs_trial_component_name_edge_cases(sagemaker_session, input_na
170170

171171
def test_run_from_local_and_train_job_and_all_exp_cfg_match(sagemaker_session, dev_sdk_tar):
172172
# Notes:
173-
# 1. The 1st Run TC created locally and its exp config was auto passed to the job
173+
# 1. The 1st Run created locally and its exp config was auto passed to the job
174174
# 2. In training job, the same exp and run names are given in the Run constructor
175-
# which will load the 1st Run TC in training job and log parameters
175+
# which will load the 1st Run in training job and log parameters
176176
# and metrics there
177-
# 3. In a different training job, load the same Run TC and log more parameters there.
177+
# 3. In a different training job, load the same Run and log more parameters there.
178178
exp_name = unique_name_from_base(_EXP_NAME_BASE_IN_SCRIPT)
179179
estimator = _generate_estimator(
180180
sdk_tar=dev_sdk_tar, sagemaker_session=sagemaker_session, exp_name=exp_name
@@ -253,12 +253,12 @@ def test_run_from_local_and_train_job_and_all_exp_cfg_match(sagemaker_session, d
253253

254254
def test_run_from_local_and_train_job_and_exp_cfg_not_match(sagemaker_session, dev_sdk_tar):
255255
# Notes:
256-
# 1. The 1st Run TC created locally and its exp config was auto passed to the job
257-
# 2. In training job, different exp and run names (i.e. 2nd Run TC) are given
258-
# in the Run constructor which will create a Run TC according to the run_name
256+
# 1. The 1st Run created locally and its exp config was auto passed to the job
257+
# 2. In training job, different exp and run names (i.e. 2nd Run) are given
258+
# in the Run constructor which will create a Run according to the run_name
259259
# passed in there and ignore the exp config in the job
260-
# 3. Both metrics and parameters are logged in the Run TC created in job
261-
# 4. In a different training job, load the 2nd Run TC and log more parameters there.
260+
# 3. Both metrics and parameters are logged in the Run created in job
261+
# 4. In a different training job, load the 2nd Run and log more parameters there.
262262
exp_name = unique_name_from_base(_EXP_NAME_BASE_IN_SCRIPT)
263263
exp_name2 = unique_name_from_base(_EXP_NAME_BASE_IN_SCRIPT)
264264
estimator = _generate_estimator(
@@ -328,11 +328,11 @@ def test_run_from_local_and_train_job_and_exp_cfg_not_match(sagemaker_session, d
328328

329329
def test_run_from_train_job_only(sagemaker_session, dev_sdk_tar):
330330
# Notes:
331-
# 1. No Run TC created locally or specified in experiment config
331+
# 1. No Run created locally or specified in experiment config
332332
# 2. In training job, Run is initialized
333-
# which will create a Run TC according to the run_name passed in there
334-
# 3. Both metrics and parameters are logged in the Run TC created in job
335-
# 4. In a different training job, load the same Run TC and log more parameters there.
333+
# which will create a Run according to the run_name passed in there
334+
# 3. Both metrics and parameters are logged in the Run created in job
335+
# 4. In a different training job, load the same Run and log more parameters there.
336336
exp_name = unique_name_from_base(_EXP_NAME_BASE_IN_SCRIPT)
337337
estimator = _generate_estimator(
338338
sdk_tar=dev_sdk_tar,
@@ -370,13 +370,13 @@ def test_run_from_processing_job_and_override_default_exp_config(
370370
sagemaker_session, dev_sdk_tar, run_obj
371371
):
372372
# Notes:
373-
# 1. The 1st Run TC (run) created locally
374-
# 2. Within the 2nd Run TC (run_obj)'s context, invoke processor.run
375-
# but override the default experiment config in context of 2nd Run TC
376-
# with the experiment config of the 1st Run TC
377-
# 3. In the processing job script, load the 1st Run TC via the experiment config
373+
# 1. The 1st Run (run) created locally
374+
# 2. Within the 2nd Run (run_obj)'s context, invoke processor.run
375+
# but override the default experiment config in context of 2nd Run
376+
# with the experiment config of the 1st Run
377+
# 3. In the processing job script, load the 1st Run via the experiment config
378378
# fetched from the job env
379-
# 4. All data are logged in the Run TC either locally or in the processing job
379+
# 4. All data are logged in the Run either locally or in the processing job
380380
exp_name = unique_name_from_base(_EXP_NAME_BASE_IN_SCRIPT)
381381
processor = FrameworkProcessor(
382382
estimator_cls=PyTorch,
@@ -441,14 +441,15 @@ def test_run_from_processing_job_and_override_default_exp_config(
441441

442442

443443
# dev_sdk_tar is required to trigger generating the dev SDK tar
444-
def test_run_from_transform_job(sagemaker_session, dev_sdk_tar, run_obj, xgboost_latest_version):
444+
def test_run_from_transform_job(sagemaker_session, dev_sdk_tar, xgboost_latest_version):
445445
# Notes:
446-
# 1. The 1st Run TC (run) created locally
447-
# 2. In the inference script running in a transform job, load the 1st Run TC
448-
# via explicitly passing the experiment_name and run_name of the 1st Run TC
446+
# 1. The 1st Run (run) created locally
447+
# 2. In the inference script running in a transform job, load the 1st Run
448+
# via explicitly passing the experiment_name and run_name of the 1st Run
449449
# TODO: once we're able to retrieve exp config from the transform job env,
450450
# we should expand this test and add the load_run() without explicitly supplying the names
451-
# 3. All data are logged in the Run TC either locally or in the transform job
451+
# 3. All data are logged in the Run either locally or in the transform job
452+
exp_name = unique_name_from_base(_EXP_NAME_BASE_IN_SCRIPT)
452453
xgb_model_data_s3 = sagemaker_session.upload_data(
453454
path=os.path.join(_TRANSFORM_MATERIALS, "xgb_model.tar.gz"),
454455
key_prefix="integ-test-data/xgboost/model",
@@ -461,8 +462,8 @@ def test_run_from_transform_job(sagemaker_session, dev_sdk_tar, run_obj, xgboost
461462
source_dir=_EXP_DIR,
462463
framework_version=xgboost_latest_version,
463464
env={
464-
"EXPERIMENT_NAME": run_obj.experiment_name,
465-
"RUN_NAME": run_obj.run_name,
465+
"EXPERIMENT_NAME": exp_name,
466+
"RUN_NAME": _RUN_NAME_IN_SCRIPT,
466467
},
467468
)
468469
transformer = xgboost_model.transformer(
@@ -481,25 +482,83 @@ def test_run_from_transform_job(sagemaker_session, dev_sdk_tar, run_obj, xgboost
481482
os.path.join(_TRANSFORM_MATERIALS, "data.csv"), uri, sagemaker_session=sagemaker_session
482483
)
483484

484-
with run_obj:
485-
_local_run_log_behaviors(is_complete_log=False, sagemaker_session=sagemaker_session)
486-
transformer.transform(
487-
data=input_data,
488-
content_type="text/libsvm",
489-
split_type="Line",
490-
wait=True,
491-
job_name=f"transform-job-{name()}",
485+
with cleanup_exp_resources(exp_names=[exp_name], sagemaker_session=sagemaker_session):
486+
with Run(
487+
experiment_name=exp_name,
488+
run_name=_RUN_NAME_IN_SCRIPT,
489+
sagemaker_session=sagemaker_session,
490+
) as run:
491+
_local_run_log_behaviors(is_complete_log=False, sagemaker_session=sagemaker_session)
492+
transformer.transform(
493+
data=input_data,
494+
content_type="text/libsvm",
495+
split_type="Line",
496+
wait=True,
497+
job_name=f"transform-job-{name()}",
498+
)
499+
500+
_check_run_from_local_end_result(
501+
tc=run._trial_component,
502+
sagemaker_session=sagemaker_session,
503+
is_complete_log=False,
492504
)
505+
tc_name = Run._generate_trial_component_name(
506+
experiment_name=run.experiment_name, run_name=run.run_name
507+
)
508+
_check_run_from_job_result(
509+
tc_name=tc_name, sagemaker_session=sagemaker_session, is_init=False
510+
)
511+
493512

494-
_check_run_from_local_end_result(
495-
tc=run_obj._trial_component,
513+
# dev_sdk_tar is required to trigger generating the dev SDK tar
514+
def test_load_run_auto_pass_in_exp_config_to_job(sagemaker_session, dev_sdk_tar):
515+
# Notes:
516+
# 1. On the local side, load the previously created Run and invoke a job within the load context
517+
# 2. In the job script, load the 1st Run via exp config auto-passed to the job env
518+
# 3. All data are logged in the Run either locally or in the processing job
519+
exp_name = unique_name_from_base(_EXP_NAME_BASE_IN_SCRIPT)
520+
processor = FrameworkProcessor(
521+
estimator_cls=PyTorch,
522+
framework_version="1.10",
523+
py_version="py38",
524+
instance_count=1,
525+
instance_type="ml.m5.xlarge",
526+
role=EXECUTION_ROLE,
496527
sagemaker_session=sagemaker_session,
497-
is_complete_log=False,
498528
)
499-
tc_name = Run._generate_trial_component_name(
500-
experiment_name=run_obj.experiment_name, run_name=run_obj.run_name
501-
)
502-
_check_run_from_job_result(tc_name=tc_name, sagemaker_session=sagemaker_session, is_init=False)
529+
530+
with cleanup_exp_resources(exp_names=[exp_name], sagemaker_session=sagemaker_session):
531+
with Run(
532+
experiment_name=exp_name,
533+
run_name=_RUN_NAME_IN_SCRIPT,
534+
sagemaker_session=sagemaker_session,
535+
) as run:
536+
_local_run_log_behaviors(is_complete_log=False, sagemaker_session=sagemaker_session)
537+
538+
with load_run(
539+
experiment_name=run.experiment_name,
540+
run_name=run.run_name,
541+
sagemaker_session=sagemaker_session,
542+
):
543+
processor.run(
544+
code=_PYTHON_PROCESS_SCRIPT,
545+
source_dir=_EXP_DIR,
546+
job_name=f"process-job-{name()}",
547+
wait=True, # wait for the job to finish
548+
logs=False,
549+
)
550+
551+
_check_run_from_local_end_result(
552+
tc=run._trial_component,
553+
sagemaker_session=sagemaker_session,
554+
is_complete_log=False,
555+
)
556+
tc_name = Run._generate_trial_component_name(
557+
experiment_name=run.experiment_name, run_name=run.run_name
558+
)
559+
_check_run_from_job_result(
560+
tc_name=tc_name, sagemaker_session=sagemaker_session, is_init=False
561+
)
503562

504563

505564
def test_list(run_obj, sagemaker_session):

0 commit comments

Comments
 (0)