17 | 17 | import re
18 | 18 | import subprocess
19 | 19 | from datetime import datetime
| 20 | +from pathlib import Path |
20 | 21 |
21 | 22 | import pytest
22 | 23 | from botocore.exceptions import WaiterError
| 24 | +from sagemaker.workflow.utilities import hash_files_or_dirs, hash_object |
23 | 25 |
24 | 26 | from sagemaker import image_uris, get_execution_role, utils
25 | 27 | from sagemaker.dataset_definition import DatasetDefinition, AthenaDatasetDefinition
26 | | -from sagemaker.processing import ProcessingInput, ProcessingOutput |
27 | | -from sagemaker.s3 import S3Uploader |
28 | | -from sagemaker.sklearn import SKLearnProcessor |
| 28 | +from sagemaker.processing import ProcessingInput, ProcessingOutput, FrameworkProcessor |
| 29 | +from sagemaker.s3 import S3Uploader, S3Downloader |
| 30 | +from sagemaker.sklearn import SKLearnProcessor, SKLearn |
| 31 | +from sagemaker.tensorflow import TensorFlow |
29 | 32 | from sagemaker.workflow.parameters import ParameterInteger, ParameterString
30 | 33 | from sagemaker.workflow.pipeline import Pipeline
31 | 34 | from sagemaker.workflow.steps import (
@@ -379,6 +382,203 @@ def test_one_step_framework_processing_pipeline(
379 | 382 | pass
380 | 383 |
381 | 384 |
| 385 | +def test_multi_step_framework_processing_pipeline_same_source_dir( |
| 386 | + pipeline_session, role, pipeline_name |
| 387 | +): |
| 388 | + default_bucket = pipeline_session.default_bucket() |
| 389 | + cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") |
| 390 | + |
| 391 | + SOURCE_DIR = "/pipeline/test_source_dir" |
| 392 | + |
| 393 | + framework_processor_tf = FrameworkProcessor( |
| 394 | + role=role, |
| 395 | + instance_type="ml.m5.xlarge", |
| 396 | + instance_count=1, |
| 397 | + estimator_cls=TensorFlow, |
| 398 | + framework_version="2.9", |
| 399 | + py_version="py39", |
| 400 | + sagemaker_session=pipeline_session, |
| 401 | + ) |
| 402 | + |
| 403 | + framework_processor_sk = FrameworkProcessor( |
| 404 | + framework_version="1.0-1", |
| 405 | + instance_type="ml.m5.xlarge", |
| 406 | + instance_count=1, |
| 407 | + base_job_name="my-job", |
| 408 | + role=role, |
| 409 | + estimator_cls=SKLearn, |
| 410 | + sagemaker_session=pipeline_session, |
| 411 | + ) |
| 412 | + |
| 413 | + step_1 = ProcessingStep( |
| 414 | + name="Step-1", |
| 415 | + step_args=framework_processor_tf.run( |
| 416 | + code="script_1.py", |
| 417 | + source_dir=DATA_DIR + SOURCE_DIR, |
| 418 | + outputs=[ProcessingOutput(output_name="test", source="/opt/ml/processing/test")], |
| 419 | + ), |
| 420 | + cache_config=cache_config, |
| 421 | + ) |
| 422 | + |
| 423 | + step_2 = ProcessingStep( |
| 424 | + name="Step-2", |
| 425 | + step_args=framework_processor_sk.run( |
| 426 | + code="script_2.py", |
| 427 | + source_dir=DATA_DIR + SOURCE_DIR, |
| 428 | + inputs=[ |
| 429 | + ProcessingInput( |
| 430 | + source=step_1.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri, |
| 431 | + destination="/opt/ml/processing/test", |
| 432 | + ), |
| 433 | + ], |
| 434 | + ), |
| 435 | + cache_config=cache_config, |
| 436 | + ) |
| 437 | + |
| 438 | + pipeline = Pipeline( |
| 439 | + name=pipeline_name, steps=[step_1, step_2], sagemaker_session=pipeline_session |
| 440 | + ) |
| 441 | + try: |
| 442 | + pipeline.create(role) |
| 443 | + definition = json.loads(pipeline.definition()) |
| 444 | + |
| 445 | + source_dir_1_s3_uri, entry_point_1 = _verify_code_artifacts_of_framework_processing_step( |
| 446 | + pipeline_session, |
| 447 | + framework_processor_tf, |
| 448 | + default_bucket, |
| 449 | + pipeline_name, |
| 450 | + definition["Steps"][0], |
| 451 | + SOURCE_DIR, |
| 452 | + "script_1.py", |
| 453 | + ) |
| 454 | + source_dir_2_s3_uri, entry_point_2 = _verify_code_artifacts_of_framework_processing_step( |
| 455 | + pipeline_session, |
| 456 | + framework_processor_sk, |
| 457 | + default_bucket, |
| 458 | + pipeline_name, |
| 459 | + definition["Steps"][1], |
| 460 | + SOURCE_DIR, |
| 461 | + "script_2.py", |
| 462 | + ) |
| 463 | + |
| 464 | + # the same local source_dirs should have the same s3 paths |
| 465 | + assert source_dir_1_s3_uri == source_dir_2_s3_uri |
| 466 | + |
| 467 | + # verify different entry_point paths |
| 468 | + assert entry_point_1 != entry_point_2 |
| 469 | + |
| 470 | + execution = pipeline.start(parameters={}) |
| 471 | + try: |
| 472 | + execution.wait(delay=540, max_attempts=3) |
| 473 | + except WaiterError: |
| 474 | + pass |
| 475 | + |
| 476 | + execution_steps = execution.list_steps() |
| 477 | + assert len(execution_steps) == 2 |
| 478 | + for step in execution_steps: |
| 479 | + assert step["StepStatus"] == "Succeeded" |
| 480 | + |
| 481 | + finally: |
| 482 | + try: |
| 483 | + pipeline.delete() |
| 484 | + except Exception: |
| 485 | + pass |
| 486 | + |
| 487 | + |
| 488 | +def test_multi_step_framework_processing_pipeline_different_source_dir( |
| 489 | + pipeline_session, role, pipeline_name |
| 490 | +): |
| 491 | + default_bucket = pipeline_session.default_bucket() |
| 492 | + cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") |
| 493 | + |
| 494 | + SOURCE_DIR_1 = "/pipeline/test_source_dir" |
| 495 | + SOURCE_DIR_2 = "/pipeline/test_source_dir_2" |
| 496 | + |
| 497 | + framework_processor_tf = FrameworkProcessor( |
| 498 | + role=role, |
| 499 | + instance_type="ml.m5.xlarge", |
| 500 | + instance_count=1, |
| 501 | + estimator_cls=TensorFlow, |
| 502 | + framework_version="2.9", |
| 503 | + py_version="py39", |
| 504 | + sagemaker_session=pipeline_session, |
| 505 | + ) |
| 506 | + |
| 507 | + step_1 = ProcessingStep( |
| 508 | + name="Step-1", |
| 509 | + step_args=framework_processor_tf.run( |
| 510 | + code="script_1.py", |
| 511 | + source_dir=DATA_DIR + SOURCE_DIR_1, |
| 512 | + outputs=[ProcessingOutput(output_name="test", source="/opt/ml/processing/test")], |
| 513 | + ), |
| 514 | + cache_config=cache_config, |
| 515 | + ) |
| 516 | + |
| 517 | + step_2 = ProcessingStep( |
| 518 | + name="Step-2", |
| 519 | + step_args=framework_processor_tf.run( |
| 520 | + code="script_2.py", |
| 521 | + source_dir=DATA_DIR + SOURCE_DIR_2, |
| 522 | + inputs=[ |
| 523 | + ProcessingInput( |
| 524 | + source=step_1.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri, |
| 525 | + destination="/opt/ml/processing/test", |
| 526 | + ), |
| 527 | + ], |
| 528 | + ), |
| 529 | + cache_config=cache_config, |
| 530 | + ) |
| 531 | + |
| 532 | + pipeline = Pipeline( |
| 533 | + name=pipeline_name, steps=[step_1, step_2], sagemaker_session=pipeline_session |
| 534 | + ) |
| 535 | + try: |
| 536 | + pipeline.create(role) |
| 537 | + definition = json.loads(pipeline.definition()) |
| 538 | + |
| 539 | + source_dir_1_s3_uri, entry_point_1 = _verify_code_artifacts_of_framework_processing_step( |
| 540 | + pipeline_session, |
| 541 | + framework_processor_tf, |
| 542 | + default_bucket, |
| 543 | + pipeline_name, |
| 544 | + definition["Steps"][0], |
| 545 | + SOURCE_DIR_1, |
| 546 | + "script_1.py", |
| 547 | + ) |
| 548 | + source_dir_2_s3_uri, entry_point_2 = _verify_code_artifacts_of_framework_processing_step( |
| 549 | + pipeline_session, |
| 550 | + framework_processor_tf, |
| 551 | + default_bucket, |
| 552 | + pipeline_name, |
| 553 | + definition["Steps"][1], |
| 554 | + SOURCE_DIR_2, |
| 555 | + "script_2.py", |
| 556 | + ) |
| 557 | + |
| 558 | + # different local source_dirs should have different s3 paths |
| 559 | + assert source_dir_1_s3_uri != source_dir_2_s3_uri |
| 560 | + |
| 561 | + # verify different entry_point paths |
| 562 | + assert entry_point_1 != entry_point_2 |
| 563 | + |
| 564 | + execution = pipeline.start(parameters={}) |
| 565 | + try: |
| 566 | + execution.wait(delay=540, max_attempts=3) |
| 567 | + except WaiterError: |
| 568 | + pass |
| 569 | + |
| 570 | + execution_steps = execution.list_steps() |
| 571 | + assert len(execution_steps) == 2 |
| 572 | + for step in execution_steps: |
| 573 | + assert step["StepStatus"] == "Succeeded" |
| 574 | + |
| 575 | + finally: |
| 576 | + try: |
| 577 | + pipeline.delete() |
| 578 | + except Exception: |
| 579 | + pass |
| 580 | + |
| 581 | + |
382 | 582 | def test_one_step_pyspark_processing_pipeline(
383 | 583 | sagemaker_session,
384 | 584 | role,
@@ -796,3 +996,46 @@ def test_two_processing_job_depends_on(
796 | 996 | pipeline.delete()
797 | 997 | except Exception:
798 | 998 | pass
| 999 | + |
| 1000 | + |
| 1001 | +def _verify_code_artifacts_of_framework_processing_step( |
| 1002 | + pipeline_session, processor, bucket, pipeline_name, step_definition, source_dir, entry_point |
| 1003 | +): |
| 1004 | + |
| 1005 | + source_dir_s3_uri = ( |
| 1006 | + f"s3://{bucket}/{pipeline_name}" f"/code/{hash_files_or_dirs([f'{DATA_DIR}/{source_dir}'])}" |
| 1007 | + ) |
| 1008 | + |
| 1009 | + # verify runproc.sh prefix is different from code artifact prefix |
| 1010 | + runprocs = [] |
| 1011 | + for input_obj in step_definition["Arguments"]["ProcessingInputs"]: |
| 1012 | + if input_obj["InputName"] == "entrypoint": |
| 1013 | + s3_uri = input_obj["S3Input"]["S3Uri"] |
| 1014 | + runprocs.append(s3_uri) |
| 1015 | + |
| 1016 | + assert Path(s3_uri).parent != Path(source_dir_s3_uri) |
| 1017 | + |
| 1018 | + # verify only one entrypoint generated per step |
| 1019 | + assert len(runprocs) == 1 |
| 1020 | + |
| 1021 | + expected_source_dir_tar = ( |
| 1022 | + f"{pipeline_name}" |
| 1023 | + f"/code/{hash_files_or_dirs([DATA_DIR + '/pipeline/test_source_dir'])}/sourcedir.tar.gz" |
| 1024 | + ) |
| 1025 | + |
| 1026 | + step_script = processor._generate_framework_script(entry_point) |
| 1027 | + expected_step_artifact = f"{pipeline_name}/code/{hash_object(step_script)}/runproc.sh" |
| 1028 | + |
| 1029 | + expected_prefix = f"{pipeline_name}/code" |
| 1030 | + s3_code_objects = pipeline_session.list_s3_files(bucket=bucket, key_prefix=expected_prefix) |
| 1031 | + |
| 1032 | + # verify all distinct artifacts were uploaded |
| 1033 | + assert expected_source_dir_tar in s3_code_objects |
| 1034 | + assert expected_step_artifact in s3_code_objects |
| 1035 | + |
| 1036 | + # verify runprocs contain the correct commands |
| 1037 | + step_runproc = S3Downloader.read_file( |
| 1038 | + f"s3://{bucket}/{expected_step_artifact}", pipeline_session |
| 1039 | + ) |
| 1040 | + assert f"python {entry_point}" in step_runproc |
| 1041 | + return source_dir_s3_uri, expected_step_artifact |
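
A minimal sketch of the content-addressed layout these assertions rely on, built from the hash_files_or_dirs, hash_object, and _generate_framework_script calls already used above; the helper names code_prefix and runproc_prefix are illustrative only, not part of this change:

from sagemaker.workflow.utilities import hash_files_or_dirs, hash_object

def code_prefix(bucket, pipeline_name, source_dir):
    # The same local source_dir always hashes to the same value, so two steps
    # that reuse it resolve to the same sourcedir.tar.gz prefix.
    return f"s3://{bucket}/{pipeline_name}/code/{hash_files_or_dirs([source_dir])}"

def runproc_prefix(bucket, pipeline_name, processor, entry_point):
    # The generated framework script differs per entry point, so each step gets
    # its own hash-addressed runproc.sh even when the source_dir is shared.
    script = processor._generate_framework_script(entry_point)
    return f"s3://{bucket}/{pipeline_name}/code/{hash_object(script)}/runproc.sh"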