
Commit 076522e

SeppeHannen (Giuseppe Hannen) authored and Giuseppe Gerardo Hannen committed
fix: hashing problem for framework processors with identical source dirs
* Fixed hashing problem for framework processors with identical source directories
* Fixed hashing problem for framework processors with identical source directories
* Initial changes to tests and start of new tests
* Initial changes to tests
* Changes to tests
* Delete test_hashing.py
* Fixes to test cases
* Removed path change to hashing function
* Potential fix to incorrect hashing problem
* Styling

Co-authored-by: Giuseppe Hannen <[email protected]>
Co-authored-by: Giuseppe Gerardo Hannen <[email protected]>
1 parent b7245bf commit 076522e

File tree

2 files changed: +48 -21 lines


src/sagemaker/workflow/utilities.py

+6 -1
@@ -169,10 +169,15 @@ def get_processing_code_hash(code: str, source_dir: str, dependencies: List[str]
         str: A hash string representing the unique code artifact(s) for the step
     """

-    # If FrameworkProcessor contains source_dir
+    # FrameworkProcessor
     if source_dir:
         source_dir_url = urlparse(source_dir)
         if source_dir_url.scheme == "" or source_dir_url.scheme == "file":
+            # Include code in the hash when possible
+            if code:
+                code_url = urlparse(code)
+                if code_url.scheme == "" or code_url.scheme == "file":
+                    return hash_files_or_dirs([code, source_dir] + dependencies)
             return hash_files_or_dirs([source_dir] + dependencies)
     # Other Processors - Spark, Script, Base, etc.
     if code:
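The fix only applies when source_dir resolves to a local path or a file:// URL: in that case the entry-point script passed as code is folded into the hash, so two framework processing steps that share a source_dir but run different scripts no longer collapse onto the same code/<hash>/ prefix in S3. Below is a minimal, self-contained sketch of the before/after behaviour; it uses the real hash_files_or_dirs helper from sagemaker.workflow.utilities, but the temporary directory and script files are made up for illustration.

# Sketch (assumed layout): why identical source dirs used to collide and how
# hashing the entry point separates them. hash_files_or_dirs is the real
# helper from sagemaker.workflow.utilities; the temp files are stand-ins.
import tempfile
from pathlib import Path

from sagemaker.workflow.utilities import hash_files_or_dirs

workdir = Path(tempfile.mkdtemp())
source_dir = workdir / "shared_source_dir"
source_dir.mkdir()
(source_dir / "common.py").write_text("SHARED = True\n")

script_1 = workdir / "script_1.py"   # hypothetical entry points
script_2 = workdir / "script_2.py"
script_1.write_text("print('step one')\n")
script_2.write_text("print('step two')\n")

# Old behaviour: only source_dir (plus dependencies) was hashed, so two steps
# sharing a source_dir resolved to the same code/<hash>/ prefix.
assert hash_files_or_dirs([str(source_dir)]) == hash_files_or_dirs([str(source_dir)])

# New behaviour: the local entry point (the `code` argument) is hashed too,
# so steps that differ only in their script get distinct prefixes.
hash_1 = hash_files_or_dirs([str(script_1), str(source_dir)])
hash_2 = hash_files_or_dirs([str(script_2), str(source_dir)])
assert hash_1 != hash_2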

tests/integ/sagemaker/workflow/test_processing_steps.py

+42 -20
@@ -437,27 +437,28 @@ def test_multi_step_framework_processing_pipeline_same_source_dir(
     pipeline.create(role)
     definition = json.loads(pipeline.definition())

-    source_dir_1_s3_uri, entry_point_1 = _verify_code_artifacts_of_framework_processing_step(
+    source_dir_1_tar_uri, entry_point_1 = _verify_code_artifacts_of_framework_processing_step(
         pipeline_session,
         framework_processor_tf,
         default_bucket,
         pipeline_name,
         definition["Steps"][0],
-        SOURCE_DIR,
+        DATA_DIR + SOURCE_DIR,
         "script_1.py",
     )
-    source_dir_2_s3_uri, entry_point_2 = _verify_code_artifacts_of_framework_processing_step(
+
+    source_dir_2_tar_uri, entry_point_2 = _verify_code_artifacts_of_framework_processing_step(
         pipeline_session,
         framework_processor_sk,
         default_bucket,
         pipeline_name,
         definition["Steps"][1],
-        SOURCE_DIR,
+        DATA_DIR + SOURCE_DIR,
         "script_2.py",
     )

-    # the same local source_dirs should have the same s3 paths
-    assert source_dir_1_s3_uri == source_dir_2_s3_uri
+    # the tarred source dirs should have a different s3 uri since the entry_point code is different
+    assert source_dir_1_tar_uri != source_dir_2_tar_uri

     # verify different entry_point paths
     assert entry_point_1 != entry_point_2
@@ -528,31 +529,49 @@ def test_multi_step_framework_processing_pipeline_different_source_dir(
     pipeline.create(role)
     definition = json.loads(pipeline.definition())

-    source_dir_1_s3_uri, entry_point_1 = _verify_code_artifacts_of_framework_processing_step(
+    source_dir_1_tar_uri, entry_point_1 = _verify_code_artifacts_of_framework_processing_step(
         pipeline_session,
         framework_processor_tf,
         default_bucket,
         pipeline_name,
         definition["Steps"][0],
-        SOURCE_DIR_1,
+        DATA_DIR + SOURCE_DIR_1,
         "script_1.py",
     )
-    source_dir_2_s3_uri, entry_point_2 = _verify_code_artifacts_of_framework_processing_step(
+
+    source_dir_2_tar_uri, entry_point_2 = _verify_code_artifacts_of_framework_processing_step(
         pipeline_session,
         framework_processor_tf,
         default_bucket,
         pipeline_name,
         definition["Steps"][1],
-        SOURCE_DIR_2,
+        DATA_DIR + SOURCE_DIR_2,
         "script_2.py",
     )

-    # different local source_dirs should have different s3 paths
-    assert source_dir_1_s3_uri != source_dir_2_s3_uri
+    # the tarred source dirs should have a different s3 uri since the source_dirs and entry_point code are different
+    assert source_dir_1_tar_uri != source_dir_2_tar_uri

     # verify different entry_point paths
     assert entry_point_1 != entry_point_2

+    # define another step with the same source_dir and entry_point as the second step
+    source_dir_3_tar_uri, entry_point_3 = _verify_code_artifacts_of_framework_processing_step(
+        pipeline_session,
+        framework_processor_tf,
+        default_bucket,
+        pipeline_name,
+        definition["Steps"][1],
+        DATA_DIR + SOURCE_DIR_2,
+        "script_2.py",
+    )
+
+    # verify the same entry_point paths
+    assert entry_point_2 == entry_point_3
+
+    # the tarred source dirs should now be the same since the source_dirs and entry_point are the same
+    assert source_dir_2_tar_uri == source_dir_3_tar_uri
+
     execution = pipeline.start(parameters={})
     wait_pipeline_execution(execution=execution, delay=540, max_attempts=3)

@@ -975,13 +994,19 @@ def test_two_processing_job_depends_on(
     pass


+# Verifies that the processing step artifacts are created as expected.
+# Requires that source_dir and entry_point are exactly those passed to the processing step.
 def _verify_code_artifacts_of_framework_processing_step(
     pipeline_session, processor, bucket, pipeline_name, step_definition, source_dir, entry_point
 ):

-    source_dir_s3_uri = (
-        f"s3://{bucket}/{pipeline_name}" f"/code/{hash_files_or_dirs([f'{DATA_DIR}/{source_dir}'])}"
-    )
+    files_to_hash = []
+    if entry_point is not None:
+        files_to_hash.append(source_dir)
+        files_to_hash.append(entry_point)
+    file_hash = hash_files_or_dirs(files_to_hash)
+
+    source_dir_s3_uri = f"s3://{bucket}/{pipeline_name}/code/{file_hash}"

     # verify runproc.sh prefix is different from code artifact prefix
     runprocs = []
@@ -995,10 +1020,7 @@ def _verify_code_artifacts_of_framework_processing_step(
     # verify only one entrypoint generated per step
     assert len(runprocs) == 1

-    expected_source_dir_tar = (
-        f"{pipeline_name}"
-        f"/code/{hash_files_or_dirs([DATA_DIR + '/pipeline/test_source_dir'])}/sourcedir.tar.gz"
-    )
+    expected_source_dir_tar = f"{pipeline_name}/code/{file_hash}/sourcedir.tar.gz"

     step_script = processor._generate_framework_script(entry_point)
     expected_step_artifact = f"{pipeline_name}/code/{hash_object(step_script)}/runproc.sh"
@@ -1015,4 +1037,4 @@ def _verify_code_artifacts_of_framework_processing_step(
         f"s3://{bucket}/{expected_step_artifact}", pipeline_session
     )
     assert f"python {entry_point}" in step_runproc
-    return source_dir, expected_step_artifact
+    return expected_source_dir_tar, expected_step_artifact
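For reference, the expected S3 locations the updated helper asserts on can be reproduced outside the test. The sketch below uses hypothetical bucket and pipeline names and an assumed local source directory; only hash_files_or_dirs comes from the SDK, everything else is illustrative.

# Sketch (hypothetical names): rebuild the expected S3 code locations for one
# framework processing step the way the updated test helper does.
from sagemaker.workflow.utilities import hash_files_or_dirs

bucket = "example-bucket"                            # hypothetical bucket
pipeline_name = "example-pipeline"                   # hypothetical pipeline name
source_dir = "tests/data/pipeline/test_source_dir"   # assumed local source dir
entry_point = "script_1.py"                          # entry point passed to the step

file_hash = hash_files_or_dirs([source_dir, entry_point])
source_dir_s3_uri = f"s3://{bucket}/{pipeline_name}/code/{file_hash}"
expected_source_dir_tar = f"{pipeline_name}/code/{file_hash}/sourcedir.tar.gz"

print(source_dir_s3_uri)
print(expected_source_dir_tar)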
