Local mode does not propagate errors raised in the customer script.

nadiaya · nadiaya · commit b15b7c2bd2cc · 2018-04-11T16:56:04.000-07:00
Fix whitespace.
Make Python 3 the default for the PyTorch estimator.
diff --git a/src/sagemaker/pytorch/defaults.py b/src/sagemaker/pytorch/defaults.py
@@ -11,3 +11,4 @@
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
 PYTORCH_VERSION = '0.3'
+PYTHON_VERSION = 'py3'
diff --git a/src/sagemaker/pytorch/estimator.py b/src/sagemaker/pytorch/estimator.py
@@ -12,7 +12,7 @@
 # language governing permissions and limitations under the License.
 from sagemaker.estimator import Framework
 from sagemaker.fw_utils import create_image_uri, framework_name_from_image, framework_version_from_tag
-from sagemaker.pytorch.defaults import PYTORCH_VERSION
+from sagemaker.pytorch.defaults import PYTORCH_VERSION, PYTHON_VERSION
 from sagemaker.pytorch.model import PyTorchModel
 
 
@@ -21,7 +21,7 @@ class PyTorch(Framework):
 
     __framework_name__ = "pytorch"
 
-    def __init__(self, entry_point, source_dir=None, hyperparameters=None, py_version='py2',
+    def __init__(self, entry_point, source_dir=None, hyperparameters=None, py_version=PYTHON_VERSION,
                  framework_version=PYTORCH_VERSION, **kwargs):
         """
         This ``Estimator`` executes an PyTorch script in a managed PyTorch execution environment, within a SageMaker
@@ -46,7 +46,7 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, py_versio
                 The hyperparameters are made accessible as a dict[str, str] to the training code on SageMaker.
                 For convenience, this accepts other types for keys and values, but ``str()`` will be called
                 to convert them before training.
-            py_version (str): Python version you want to use for executing your model training code (default: 'py2').
+            py_version (str): Python version you want to use for executing your model training code (default: 'py3').
                               One of 'py2' or 'py3'.
             framework_version (str): PyTorch version you want to use for executing your model training code.
                 List of supported versions https://github.com/aws/sagemaker-python-sdk#pytorch-sagemaker-estimators
@@ -81,10 +81,10 @@ def create_model(self, model_server_workers=None):
                 See :func:`~sagemaker.pytorch.model.PyTorchModel` for full details.
         """
         return PyTorchModel(self.model_data, self.role, self.entry_point, source_dir=self.source_dir,
-                          enable_cloudwatch_metrics=self.enable_cloudwatch_metrics, name=self._current_job_name,
-                          container_log_level=self.container_log_level, code_location=self.code_location,
-                          py_version=self.py_version, framework_version=self.framework_version,
-                          model_server_workers=model_server_workers, sagemaker_session=self.sagemaker_session)
+                            enable_cloudwatch_metrics=self.enable_cloudwatch_metrics, name=self._current_job_name,
+                            container_log_level=self.container_log_level, code_location=self.code_location,
+                            py_version=self.py_version, framework_version=self.framework_version,
+                            model_server_workers=model_server_workers, sagemaker_session=self.sagemaker_session)
 
     @classmethod
     def _prepare_init_params_from_job_description(cls, job_details):
diff --git a/src/sagemaker/pytorch/model.py b/src/sagemaker/pytorch/model.py
@@ -13,7 +13,7 @@
 import sagemaker
 from sagemaker.fw_utils import create_image_uri
 from sagemaker.model import FrameworkModel, MODEL_SERVER_WORKERS_PARAM_NAME
-from sagemaker.pytorch.defaults import PYTORCH_VERSION
+from sagemaker.pytorch.defaults import PYTORCH_VERSION, PYTHON_VERSION
 from sagemaker.predictor import RealTimePredictor, json_serializer, json_deserializer
 from sagemaker.utils import name_from_image
 
@@ -41,8 +41,9 @@ class PyTorchModel(FrameworkModel):
 
     __framework_name__ = 'pytorch'
 
-    def __init__(self, model_data, role, entry_point, image=None, py_version='py2', framework_version=PYTORCH_VERSION,
-                 predictor_cls=PyTorchPredictor, model_server_workers=None, **kwargs):
+    def __init__(self, model_data, role, entry_point, image=None, py_version=PYTHON_VERSION,
+                 framework_version=PYTORCH_VERSION, predictor_cls=PyTorchPredictor,
+                 model_server_workers=None, **kwargs):
         """Initialize an PyTorchModel.
 
         Args:
@@ -54,7 +55,7 @@ def __init__(self, model_data, role, entry_point, image=None, py_version='py2',
             entry_point (str): Path (absolute or relative) to the Python source file which should be executed
                 as the entry point to model hosting. This should be compatible with either Python 2.7 or Python 3.5.
             image (str): A Docker image URI (default: None). If not specified, a default image for PyTorch will be used.
-            py_version (str): Python version you want to use for executing your model training code (default: 'py2').
+            py_version (str): Python version you want to use for executing your model training code (default: 'py3').
             framework_version (str): PyTorch version you want to use for executing your model training code.
             predictor_cls (callable[str, sagemaker.session.Session]): A function to call to create a predictor
                 with an endpoint name and SageMaker ``Session``. If specified, ``deploy()`` returns the result of
@@ -63,8 +64,7 @@ def __init__(self, model_data, role, entry_point, image=None, py_version='py2',
                 If None, server will use one worker per vCPU.
             **kwargs: Keyword arguments passed to the ``FrameworkModel`` initializer.
         """
-        super(PyTorchModel, self).__init__(model_data, image, role, entry_point, predictor_cls=predictor_cls,
-                                         **kwargs)
+        super(PyTorchModel, self).__init__(model_data, image, role, entry_point, predictor_cls=predictor_cls, **kwargs)
         self.py_version = py_version
         self.framework_version = framework_version
         self.model_server_workers = model_server_workers
diff --git a/tests/integ/test_pytorch_train.py b/tests/integ/test_pytorch_train.py
@@ -68,12 +68,13 @@ def test_async_fit(sagemaker_session, pytorch_full_version, instance_type):
             PyTorch.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
 
 
-def test_failed_training_job(sagemaker_session, pytorch_full_version, instance_type):
+# TODO(nadiaya): Run against local mode once errors are propagated
+def test_failed_training_job(sagemaker_session, pytorch_full_version):
     script_path = os.path.join(MNIST_DIR, 'failure_script.py')
 
     with timeout(minutes=15):
         pytorch = PyTorch(entry_point=script_path, role='SageMakerRole', framework_version=pytorch_full_version,
-                          train_instance_count=1, train_instance_type=instance_type,
+                          train_instance_count=1, train_instance_type='ml.c4.xlarge',
                           sagemaker_session=sagemaker_session)
 
         with pytest.raises(ValueError) as e:
diff --git a/tests/unit/test_pytorch.py b/tests/unit/test_pytorch.py
@@ -52,18 +52,19 @@ def fixture_sagemaker_session():
     return ims
 
 
-def _get_full_cpu_image_uri(version):
-    return IMAGE_URI_FORMAT_STRING.format(REGION, IMAGE_NAME, version, 'cpu', PYTHON_VERSION)
+def _get_full_cpu_image_uri(version, py_version=PYTHON_VERSION):
+    return IMAGE_URI_FORMAT_STRING.format(REGION, IMAGE_NAME, version, 'cpu', py_version)
 
 
-def _get_full_gpu_image_uri(version):
-    return IMAGE_URI_FORMAT_STRING.format(REGION, IMAGE_NAME, version, 'gpu', PYTHON_VERSION)
+def _get_full_gpu_image_uri(version, py_version=PYTHON_VERSION):
+    return IMAGE_URI_FORMAT_STRING.format(REGION, IMAGE_NAME, version, 'gpu', py_version)
 
 
 def _pytorch_estimator(sagemaker_session, framework_version=defaults.PYTORCH_VERSION, train_instance_type=None,
                        enable_cloudwatch_metrics=False, base_job_name=None, **kwargs):
     return PyTorch(entry_point=SCRIPT_PATH,
                    framework_version=framework_version,
+                   py_version=PYTHON_VERSION,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT,
@@ -138,7 +139,7 @@ def test_create_model(sagemaker_session, pytorch_version):
 def test_pytorch(strftime, sagemaker_session, pytorch_version):
     pytorch = PyTorch(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                       train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
-                      framework_version=pytorch_version)
+                      framework_version=pytorch_version, py_version=PYTHON_VERSION)
 
     inputs = 's3://mybucket/train'
 
@@ -184,7 +185,7 @@ def test_train_image_default(sagemaker_session):
     pytorch = PyTorch(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                       train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)
 
-    assert _get_full_cpu_image_uri(defaults.PYTORCH_VERSION) in pytorch.train_image()
+    assert _get_full_cpu_image_uri(defaults.PYTORCH_VERSION, defaults.PYTHON_VERSION) in pytorch.train_image()
 
 
 def test_train_image_cpu_instances(sagemaker_session, pytorch_version):