
Commit e9cdf75

Merge pull request aws#22 from athewsey/feat/fw-processor
Add HuggingFaceProcessor and fix local mode
2 parents 4312db9 + 1ed9349 commit e9cdf75

File tree

10 files changed, +467 -85 lines changed


src/sagemaker/huggingface/__init__.py

+1
@@ -14,3 +14,4 @@
 from __future__ import absolute_import

 from sagemaker.huggingface.estimator import HuggingFace  # noqa: F401
+from sagemaker.huggingface.processing import HuggingFaceProcessor  # noqa:F401

src/sagemaker/huggingface/processing.py

+132
@@ -0,0 +1,132 @@
# Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""This module contains code related to HuggingFace Processors which are used for Processing jobs.

These jobs let customers perform data pre-processing, post-processing, feature engineering,
data validation, and model evaluation and interpretation on SageMaker.
"""
from __future__ import absolute_import

from sagemaker.processing import FrameworkProcessor
from sagemaker.huggingface.estimator import HuggingFace


class HuggingFaceProcessor(FrameworkProcessor):
    """Handles Amazon SageMaker processing tasks for jobs using HuggingFace containers."""

    estimator_cls = HuggingFace

    def __init__(
        self,
        role,
        instance_count,
        instance_type,
        transformers_version=None,
        tensorflow_version=None,
        pytorch_version=None,
        py_version="py36",
        image_uri=None,
        command=["python"],
        volume_size_in_gb=30,
        volume_kms_key=None,
        output_kms_key=None,
        code_location=None,
        max_runtime_in_seconds=None,
        base_job_name=None,
        sagemaker_session=None,
        env=None,
        tags=None,
        network_config=None,
    ):
        """This processor executes a Python script in a HuggingFace execution environment.

        Unless ``image_uri`` is specified, the environment is an Amazon-built Docker container
        that executes functions defined in the supplied ``code`` Python script.

        The arguments have the same meaning as in ``FrameworkProcessor``, with the following
        exceptions.

        Args:
            transformers_version (str): Transformers version you want to use for
                executing your model training code. Defaults to ``None``. Required unless
                ``image_uri`` is provided. The current supported version is ``4.4.2``.
            tensorflow_version (str): TensorFlow version you want to use for
                executing your model training code. Defaults to ``None``. Required unless
                ``pytorch_version`` is provided. The current supported version is ``1.6.0``.
            pytorch_version (str): PyTorch version you want to use for
                executing your model training code. Defaults to ``None``. Required unless
                ``tensorflow_version`` is provided. The current supported version is ``2.4.1``.
            py_version (str): Python version you want to use for executing your model training
                code. Defaults to ``None``. Required unless ``image_uri`` is provided. If
                using PyTorch, the current supported version is ``py36``. If using TensorFlow,
                the current supported version is ``py37``.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.processing.FrameworkProcessor`.
        """
        self.pytorch_version = pytorch_version
        self.tensorflow_version = tensorflow_version
        super().__init__(
            self.estimator_cls,
            transformers_version,
            role,
            instance_count,
            instance_type,
            py_version,
            image_uri,
            command,
            volume_size_in_gb,
            volume_kms_key,
            output_kms_key,
            code_location,
            max_runtime_in_seconds,
            base_job_name,
            sagemaker_session,
            env,
            tags,
            network_config,
        )

    def _create_estimator(
        self,
        entry_point="",
        source_dir=None,
        dependencies=None,
        git_config=None,
    ):
        """Override default estimator factory function for HuggingFace's different parameters

        HuggingFace estimators have 3 framework version parameters instead of one: The version for
        Transformers, PyTorch, and TensorFlow.
        """
        return self.estimator_cls(
            transformers_version=self.framework_version,
            tensorflow_version=self.tensorflow_version,
            pytorch_version=self.pytorch_version,
            py_version=self.py_version,
            entry_point=entry_point,
            source_dir=source_dir,
            dependencies=dependencies,
            git_config=git_config,
            code_location=self.code_location,
            enable_network_isolation=False,
            image_uri=self.image_uri,
            role=self.role,
            instance_count=self.instance_count,
            instance_type=self.instance_type,
            sagemaker_session=self.sagemaker_session,
            debugger_hook_config=False,
            disable_profiler=True,
        )
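
For orientation, here is a minimal usage sketch of the class added above. The instance type, framework versions, bucket, and script names are illustrative assumptions, not values taken from this commit:

from sagemaker.huggingface import HuggingFaceProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Framework versions must match an available HuggingFace training container;
# the values below are assumed for illustration.
processor = HuggingFaceProcessor(
    role="SageMakerRole",
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    transformers_version="4.4.2",
    pytorch_version="1.6.0",
    py_version="py36",
)

processor.run(
    code="preprocess.py",            # hypothetical entry point inside source_dir
    source_dir="./processing_code",  # may bundle a requirements.txt with extra dependencies
    inputs=[ProcessingInput(source="s3://my-bucket/raw", destination="/opt/ml/processing/input")],
    outputs=[ProcessingOutput(source="/opt/ml/processing/output")],
)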

src/sagemaker/local/local_session.py

+24 -2
@@ -475,10 +475,30 @@ def invoke_endpoint(


 class LocalSession(Session):
-    """A LocalSession class definition."""
+    """A SageMaker ``Session`` class for Local Mode.

-    def __init__(self, boto_session=None, s3_endpoint_url=None):
+    This class provides alternative Local Mode implementations for the functionality of
+    :class:`~sagemaker.session.Session`.
+    """
+
+    def __init__(self, boto_session=None, s3_endpoint_url=None, disable_local_code=False):
+        """Create a Local SageMaker Session.
+
+        Args:
+            boto_session (boto3.session.Session): The underlying Boto3 session which AWS service
+                calls are delegated to (default: None). If not provided, one is created with
+                default AWS configuration chain.
+            s3_endpoint_url (str): Override the default endpoint URL for Amazon S3, if set
+                (default: None).
+            disable_local_code (bool): Set ``True`` to override the default AWS configuration
+                chain to disable the ``local.local_code`` setting, which may not be supported for
+                some SDK features (default: False).
+        """
         self.s3_endpoint_url = s3_endpoint_url
+        # We use this local variable to avoid disrupting the __init__->_initialize API of the
+        # parent class... But overwriting it after constructor won't do anything, so prefix _ to
+        # discourage external use:
+        self._disable_local_code = disable_local_code

         super(LocalSession, self).__init__(boto_session)

@@ -530,6 +550,8 @@ def _initialize(
            raise e

         self.config = yaml.load(open(sagemaker_config_file, "r"))
+        if self._disable_local_code and "local" in self.config:
+            self.config["local"]["local_code"] = False

     def logs_for_job(self, job_name, wait=False, poll=5, log_type="All"):
         """A no-op method meant to override the sagemaker client.

src/sagemaker/processing.py

+54 -50
@@ -128,7 +128,8 @@ def __init__(

         if self.instance_type in ("local", "local_gpu"):
             if not isinstance(sagemaker_session, LocalSession):
-                sagemaker_session = LocalSession()
+                # Until Local Mode Processing supports local code, we need to disable it:
+                sagemaker_session = LocalSession(disable_local_code=True)

         self.sagemaker_session = sagemaker_session or Session()

@@ -1298,10 +1299,15 @@ def __init__(
         self.framework_version = framework_version
         self.py_version = py_version

-        image_uri, base_job_name = self._pre_init_normalization(
-            instance_type, image_uri, base_job_name, sagemaker_session
-        )
-
+        # 1. To finalize/normalize the image_uri or base_job_name, we need to create an
+        #    estimator_cls instance.
+        # 2. We want to make it easy for children of FrameworkProcessor to override estimator
+        #    creation via a function (to create FrameworkProcessors for Estimators that may have
+        #    different signatures - like HuggingFace or others in future).
+        # 3. Super-class __init__ doesn't (currently) do anything with these params besides
+        #    storing them
+        #
+        # Therefore we'll init the superclass first and then customize the setup after:
         super().__init__(
             role=role,
             image_uri=image_uri,
@@ -1318,6 +1324,7 @@ def __init__(
             tags=tags,
             network_config=network_config,
         )
+
         # This subclass uses the "code" input for actual payload and the ScriptProcessor parent's
         # functionality for uploading just a small entrypoint script to invoke it.
         self._CODE_CONTAINER_INPUT_NAME = "entrypoint"
@@ -1326,38 +1333,45 @@ def __init__(
             code_location[:-1] if (code_location and code_location.endswith("/")) else code_location
         )

-    def _pre_init_normalization(
-        self,
-        instance_type: str,
-        image_uri: Optional[str] = None,
-        base_job_name: Optional[str] = None,
-        sagemaker_session: Optional[str] = None,
-    ) -> Tuple[str, str]:
-        """Normalize job name and container image uri."""
-        # Normalize base_job_name
-        if base_job_name is None:
-            base_job_name = self.estimator_cls._framework_name
+        if image_uri is None or base_job_name is None:
+            # For these default configuration purposes, we don't need the optional args:
+            est = self._create_estimator()
+            if image_uri is None:
+                self.image_uri = est.training_image_uri()
             if base_job_name is None:
-                logger.warning("Framework name is None. Please check with the maintainer.")
-                base_job_name = str(base_job_name)  # Keep mypy happy.
-
-        # Normalize image uri.
-        if image_uri is None:
-            # Estimator used only to probe image uri, so can get away with some dummy values.
-            est = self.estimator_cls(
-                framework_version=self.framework_version,
-                instance_type=instance_type,
-                py_version=self.py_version,
-                image_uri=image_uri,
-                entry_point="",
-                role="",
-                enable_network_isolation=False,
-                instance_count=1,  # SKLearn estimator explicitly disables instance_count>1
-                sagemaker_session=sagemaker_session,
-            )
-            image_uri = est.training_image_uri()
+                self.base_job_name = est.base_job_name or estimator_cls._framework_name
+                if base_job_name is None:
+                    base_job_name = "framework-processor"

-        return image_uri, base_job_name
+    def _create_estimator(
+        self,
+        entry_point="",
+        source_dir=None,
+        dependencies=None,
+        git_config=None,
+    ):
+        """Instantiate the Framework Estimator that backs this Processor"""
+        return self.estimator_cls(
+            framework_version=self.framework_version,
+            py_version=self.py_version,
+            entry_point=entry_point,
+            source_dir=source_dir,
+            dependencies=dependencies,
+            git_config=git_config,
+            code_location=self.code_location,
+            enable_network_isolation=False,  # True -> uploads to input channel. Not what we want!
+            image_uri=self.image_uri,
+            role=self.role,
+            # Estimator instance_count doesn't currently matter to FrameworkProcessor, and the
+            # SKLearn Framework Estimator requires instance_type==1. So here we hard-wire it to 1,
+            # but if it matters in future perhaps we could take self.instance_count here and have
+            # SKLearnProcessor override this function instead:
+            instance_count=1,
+            instance_type=self.instance_type,
+            sagemaker_session=self.sagemaker_session,
+            debugger_hook_config=False,
+            disable_profiler=True,
+        )

     def get_run_args(
         self,
@@ -1555,10 +1569,11 @@ def _pack_and_upload_code(self, code, source_dir, dependencies, git_config, job_

         local_code = get_config_value("local.local_code", self.sagemaker_session.config)
         if self.sagemaker_session.local_mode and local_code:
-            # TODO: Can we be more prescriptive about how to not trigger this error?
-            # How can user or us force a local mode `Estimator` to run with `local_code=False`?
             raise RuntimeError(
-                "Local *code* is not currently supported for SageMaker Processing in Local Mode"
+                "SageMaker Processing Local Mode does not currently support 'local code' mode. "
+                "Please use a LocalSession created with disable_local_code=True, or leave "
+                "sagemaker_session unspecified when creating your Processor to have one set up "
+                "automatically."
             )

         # Upload the bootstrapping code as s3://.../jobname/source/runproc.sh.
@@ -1623,22 +1638,11 @@ def _upload_payload(
         """Upload payload sourcedir.tar.gz to S3."""
         # A new estimator instance is required, because each call to ScriptProcessor.run() can
         # use different codes.
-        estimator = self.estimator_cls(
+        estimator = self._create_estimator(
             entry_point=entry_point,
             source_dir=source_dir,
             dependencies=dependencies,
             git_config=git_config,
-            framework_version=self.framework_version,
-            py_version=self.py_version,
-            code_location=self.code_location,  # Upload to <code_loc>/jobname/output/source.tar.gz
-            enable_network_isolation=False,  # If true, uploads to input channel. Not what we want!
-            image_uri=self.image_uri,  # The image uri is already normalized by this point.
-            role=self.role,
-            instance_type=self.instance_type,
-            instance_count=1,
-            sagemaker_session=self.sagemaker_session,
-            debugger_hook_config=False,
-            disable_profiler=True,
         )

         estimator._prepare_for_training(job_name=job_name)
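
Because estimator creation is now funneled through _create_estimator(), the generic FrameworkProcessor can be used directly with any Framework Estimator class whose constructor follows the standard signature; subclasses such as HuggingFaceProcessor only override the factory when their Estimator takes different parameters. A rough usage sketch, where the framework version, instance type, and file names are assumptions for illustration:

from sagemaker.processing import FrameworkProcessor
from sagemaker.tensorflow.estimator import TensorFlow

processor = FrameworkProcessor(
    estimator_cls=TensorFlow,
    framework_version="2.4.1",   # assumed; any version with a training image works
    py_version="py37",
    role="SageMakerRole",
    instance_count=1,
    instance_type="ml.m5.xlarge",
)

# source_dir is packaged as sourcedir.tar.gz and uploaded by _upload_payload() above,
# then bootstrapped in the container via the generated runproc.sh entrypoint.
processor.run(
    code="evaluate.py",       # hypothetical entry point
    source_dir="./my_code",
)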

tests/integ/test_huggingface.py

+39 -3
@@ -16,11 +16,47 @@

 import pytest

-from sagemaker.huggingface import HuggingFace
+from sagemaker.huggingface import HuggingFace, HuggingFaceProcessor
 from tests import integ
 from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout

+ROLE = "SageMakerRole"
+
+
+@pytest.mark.release
+@pytest.mark.skipif(
+    integ.test_region() in integ.TRAINING_NO_P2_REGIONS,
+    reason="no ml.p2 instances in this region",
+)
+def test_framework_processing_job_with_deps(
+    sagemaker_session,
+    gpu_instance_type,
+    huggingface_training_latest_version,
+    huggingface_pytorch_latest_version,
+):
+    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
+        code_path = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
+        entry_point = "main_script.py"
+
+        processor = HuggingFaceProcessor(
+            transformers_version=huggingface_training_latest_version,
+            pytorch_version=huggingface_pytorch_latest_version,
+            py_version="py36",
+            role=ROLE,
+            instance_count=1,
+            instance_type=gpu_instance_type,
+            sagemaker_session=sagemaker_session,
+            base_job_name="test-huggingface",
+        )
+
+        processor.run(
+            code=entry_point,
+            source_dir=code_path,
+            inputs=[],
+            wait=True,
+        )
+

 @pytest.mark.release
 @pytest.mark.skipif(
@@ -39,7 +75,7 @@ def test_huggingface_training(
     hf = HuggingFace(
         py_version="py36",
         entry_point="examples/text-classification/run_glue.py",
-        role="SageMakerRole",
+        role=ROLE,
         transformers_version=huggingface_training_latest_version,
         pytorch_version=huggingface_pytorch_latest_version,
         instance_count=1,
@@ -86,7 +122,7 @@ def test_huggingface_training_tf(
     hf = HuggingFace(
         py_version="py37",
         entry_point=os.path.join(data_path, "run_tf.py"),
-        role="SageMakerRole",
+        role=ROLE,
         transformers_version=huggingface_training_latest_version,
         tensorflow_version=huggingface_tensorflow_latest_version,
         instance_count=1,