infra: use fixture for Python version in TF integ tests

laurenyu · laurenyu · commit 44a7ba2771c6 · 2020-06-23T15:12:58.000-07:00
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -58,7 +58,6 @@ def pytest_addoption(parser):
         "--rl-ray-full-version", action="store", default=RLEstimator.RAY_LATEST_VERSION
     )
     parser.addoption("--sklearn-full-version", action="store", default="0.20.0")
-    parser.addoption("--tf-full-version", action="store", default="2.2.0")
     parser.addoption("--ei-tf-full-version", action="store")
     parser.addoption("--xgboost-full-version", action="store", default="1.0-1")
 
@@ -300,32 +299,17 @@ def sklearn_full_version(request):
 
 
 @pytest.fixture(scope="module")
-def tf_full_version(request):
-    return request.config.getoption("--tf-full-version")
+def tf_full_version():
+    return "2.2.0"
 
 
 @pytest.fixture(scope="module")
-def tf_full_py_version(tf_full_version):
-    """fixture to match tf_full_version
-
-    Fixture exists as such, since tf_full_version may be overridden --tf-full-version.
-    Otherwise, this would simply be py37 to match the latest version support.
-
-    TODO: Evaluate use of --tf-full-version with possible eye to remove and simplify code.
-    """
-    version = [int(val) for val in tf_full_version.split(".")]
-    if version < [1, 11]:
-        return "py2"
-    if version < [2, 2]:
-        return "py3"
+def tf_full_py_version():
     return "py37"
 
 
 @pytest.fixture(scope="module")
-def tf_serving_version(tf_full_version):
-    full_version = [int(val) for val in tf_full_version.split(".")]
-    if full_version < [2, 2]:
-        return tf_full_version
+def tf_serving_version():
     return "2.1.0"
 
 
diff --git a/tests/integ/test_horovod.py b/tests/integ/test_horovod.py
@@ -15,10 +15,10 @@
 import json
 import os
 import tarfile
-from six.moves.urllib.parse import urlparse
 
 import boto3
 import pytest
+from six.moves.urllib.parse import urlparse
 
 import sagemaker.utils
 import tests.integ as integ
@@ -28,27 +28,28 @@
 horovod_dir = os.path.join(os.path.dirname(__file__), "..", "data", "horovod")
 
 
-@pytest.fixture(scope="module")
-def gpu_instance_type(request):
-    return "ml.p2.xlarge"
-
-
 @pytest.mark.canary_quick
-def test_hvd_cpu(sagemaker_session, cpu_instance_type, tmpdir):
-    _create_and_fit_estimator(sagemaker_session, cpu_instance_type, tmpdir)
+def test_hvd_cpu(sagemaker_session, tf_full_version, tf_full_py_version, cpu_instance_type, tmpdir):
+    _create_and_fit_estimator(
+        sagemaker_session, tf_full_version, tf_full_py_version, cpu_instance_type, tmpdir
+    )
 
 
 @pytest.mark.canary_quick
 @pytest.mark.skipif(
     integ.test_region() in integ.TRAINING_NO_P2_REGIONS, reason="no ml.p2 instances in this region"
 )
-def test_hvd_gpu(sagemaker_session, gpu_instance_type, tmpdir):
-    _create_and_fit_estimator(sagemaker_session, gpu_instance_type, tmpdir)
+def test_hvd_gpu(sagemaker_session, tf_full_version, tf_full_py_version, tmpdir):
+    _create_and_fit_estimator(
+        sagemaker_session, tf_full_version, tf_full_py_version, "ml.p2.xlarge", tmpdir
+    )
 
 
 @pytest.mark.local_mode
 @pytest.mark.parametrize("instances, processes", [[1, 2], (2, 1), (2, 2)])
-def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdir):
+def test_horovod_local_mode(
+    sagemaker_local_session, tf_full_version, tf_full_py_version, instances, processes, tmpdir
+):
     output_path = "file://%s" % tmpdir
     job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
     estimator = TensorFlow(
@@ -57,9 +58,9 @@ def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdi
         train_instance_count=2,
         train_instance_type="local",
         sagemaker_session=sagemaker_local_session,
-        py_version=integ.PYTHON_VERSION,
         output_path=output_path,
-        framework_version="1.12",
+        framework_version=tf_full_version,
+        py_version=tf_full_py_version,
         distributions={"mpi": {"enabled": True, "processes_per_host": processes}},
     )
 
@@ -96,16 +97,16 @@ def extract_files_from_s3(s3_url, tmpdir, sagemaker_session):
         tar_file.extractall(tmpdir)
 
 
-def _create_and_fit_estimator(sagemaker_session, instance_type, tmpdir):
+def _create_and_fit_estimator(sagemaker_session, tf_version, py_version, instance_type, tmpdir):
     job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
     estimator = TensorFlow(
         entry_point=os.path.join(horovod_dir, "hvd_basic.py"),
         role="SageMakerRole",
         train_instance_count=2,
         train_instance_type=instance_type,
         sagemaker_session=sagemaker_session,
-        py_version=integ.PYTHON_VERSION,
-        framework_version="1.12",
+        py_version=py_version,
+        framework_version=tf_version,
         distributions={"mpi": {"enabled": True}},
     )
 
diff --git a/tests/integ/test_tf.py b/tests/integ/test_tf.py
@@ -22,7 +22,7 @@
 from sagemaker.utils import unique_name_from_base, sagemaker_timestamp
 
 import tests.integ
-from tests.integ import kms_utils, timeout, PYTHON_VERSION
+from tests.integ import kms_utils, timeout
 from tests.integ.retry import retries
 from tests.integ.s3_utils import assert_s3_files_exist
 
@@ -82,7 +82,7 @@ def test_mnist_with_checkpoint_config(
     assert actual_training_checkpoint_config == expected_training_checkpoint_config
 
 
-def test_server_side_encryption(sagemaker_session, tf_serving_version):
+def test_server_side_encryption(sagemaker_session, tf_serving_version, tf_full_py_version):
     with kms_utils.bucket_with_encryption(sagemaker_session, ROLE) as (bucket_with_kms, kms_key):
         output_path = os.path.join(
             bucket_with_kms, "test-server-side-encryption", time.strftime("%y%m%d-%H%M")
@@ -96,7 +96,7 @@ def test_server_side_encryption(sagemaker_session, tf_serving_version):
             train_instance_type="ml.c5.xlarge",
             sagemaker_session=sagemaker_session,
             framework_version=tf_serving_version,
-            py_version=PYTHON_VERSION,
+            py_version=tf_full_py_version,
             code_location=output_path,
             output_path=output_path,
             model_dir="/opt/ml/model",
@@ -147,16 +147,15 @@ def test_mnist_distributed(sagemaker_session, instance_type, tf_full_version, tf
     )
 
 
-def test_mnist_async(sagemaker_session, cpu_instance_type, tf_serving_version):
+def test_mnist_async(sagemaker_session, cpu_instance_type, tf_serving_version, tf_full_py_version):
     estimator = TensorFlow(
         entry_point=SCRIPT,
         role=ROLE,
         train_instance_count=1,
         train_instance_type="ml.c5.4xlarge",
-        py_version=PYTHON_VERSION,
         sagemaker_session=sagemaker_session,
-        # testing py-sdk functionality, no need to run against all TF versions
         framework_version=tf_serving_version,
+        py_version=tf_full_py_version,
         tags=TAGS,
     )
     inputs = estimator.sagemaker_session.upload_data(
@@ -188,15 +187,17 @@ def test_mnist_async(sagemaker_session, cpu_instance_type, tf_serving_version):
         _assert_model_name_match(sagemaker_session.sagemaker_client, endpoint_name, model_name)
 
 
-def test_deploy_with_input_handlers(sagemaker_session, instance_type, tf_serving_version):
+def test_deploy_with_input_handlers(
+    sagemaker_session, instance_type, tf_serving_version, tf_full_py_version
+):
     estimator = TensorFlow(
         entry_point="training.py",
         source_dir=TFS_RESOURCE_PATH,
         role=ROLE,
         train_instance_count=1,
         train_instance_type=instance_type,
         framework_version=tf_serving_version,
-        py_version=PYTHON_VERSION,
+        py_version=tf_full_py_version,
         sagemaker_session=sagemaker_session,
         tags=TAGS,
     )
diff --git a/tests/integ/test_transformer.py b/tests/integ/test_transformer.py
@@ -332,7 +332,7 @@ def test_transform_mxnet_logs(
 
 
 def test_transform_tf_kms_network_isolation(
-    sagemaker_session, cpu_instance_type, tmpdir, tf_serving_version
+    sagemaker_session, cpu_instance_type, tmpdir, tf_serving_version, tf_full_py_version
 ):
     data_path = os.path.join(DATA_DIR, "tensorflow_mnist")
 
@@ -342,7 +342,7 @@ def test_transform_tf_kms_network_isolation(
         train_instance_count=1,
         train_instance_type=cpu_instance_type,
         framework_version=tf_serving_version,
-        py_version=PYTHON_VERSION,
+        py_version=tf_full_py_version,
         sagemaker_session=sagemaker_session,
     )
 
diff --git a/tests/integ/test_tuner.py b/tests/integ/test_tuner.py
@@ -51,8 +51,6 @@
 from tests.integ.record_set import prepare_record_set_from_local_files
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
-DATA_PATH = os.path.join(DATA_DIR, "iris", "data")
-
 
 @pytest.fixture(scope="module")
 def kmeans_train_set(sagemaker_session):
@@ -588,9 +586,7 @@ def test_tuning_mxnet(sagemaker_session, mxnet_full_version, cpu_instance_type):
 
 
 @pytest.mark.canary_quick
-def test_tuning_tf_script_mode(
-    sagemaker_session, cpu_instance_type, tf_full_version, tf_full_py_version
-):
+def test_tuning_tf(sagemaker_session, cpu_instance_type, tf_full_version, tf_full_py_version):
     resource_path = os.path.join(DATA_DIR, "tensorflow_mnist")
     script_path = os.path.join(resource_path, "mnist.py")
 
@@ -622,7 +618,7 @@ def test_tuning_tf_script_mode(
             path=os.path.join(resource_path, "data"), key_prefix="scriptmode/mnist"
         )
 
-        tuning_job_name = unique_name_from_base("tune-tf-script-mode", max_length=32)
+        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
         tuner.fit(inputs, job_name=tuning_job_name)
 
         print("Started hyperparameter tuning job with name: " + tuning_job_name)
@@ -631,13 +627,15 @@ def test_tuning_tf_script_mode(
         tuner.wait()
 
 
-@pytest.mark.skipif(PYTHON_VERSION != "py2", reason="TensorFlow image supports only python 2.")
-def test_tuning_tf_vpc_multi(sagemaker_session, cpu_instance_type):
+def test_tuning_tf_vpc_multi(
+    sagemaker_session, cpu_instance_type, tf_full_version, tf_full_py_version
+):
     """Test Tensorflow multi-instance using the same VpcConfig for training and inference"""
     instance_type = cpu_instance_type
     instance_count = 2
 
-    script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")
+    resource_path = os.path.join(DATA_DIR, "tensorflow_mnist")
+    script_path = os.path.join(resource_path, "mnist.py")
 
     ec2_client = sagemaker_session.boto_session.client("ec2")
     subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(ec2_client)
@@ -646,41 +644,39 @@ def test_tuning_tf_vpc_multi(sagemaker_session, cpu_instance_type):
     estimator = TensorFlow(
         entry_point=script_path,
         role="SageMakerRole",
-        training_steps=1,
-        evaluation_steps=1,
-        hyperparameters={"input_tensor_name": "inputs"},
+        framework_version=tf_full_version,
+        py_version=tf_full_py_version,
         train_instance_count=instance_count,
         train_instance_type=instance_type,
         sagemaker_session=sagemaker_session,
         base_job_name="test-vpc-tf",
         subnets=subnet_ids,
         security_group_ids=[security_group_id],
         encrypt_inter_container_traffic=True,
-        framework_version="1.11",
-        py_version=PYTHON_VERSION,
     )
 
-    inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix="integ-test-data/tf_iris")
-    hyperparameter_ranges = {"learning_rate": ContinuousParameter(0.05, 0.2)}
-
-    objective_metric_name = "loss"
-    metric_definitions = [{"Name": "loss", "Regex": "loss = ([0-9\\.]+)"}]
+    hyperparameter_ranges = {"epochs": IntegerParameter(1, 2)}
+    objective_metric_name = "accuracy"
+    metric_definitions = [{"Name": objective_metric_name, "Regex": "accuracy = ([0-9\\.]+)"}]
 
     tuner = HyperparameterTuner(
         estimator,
         objective_metric_name,
         hyperparameter_ranges,
         metric_definitions,
-        objective_type="Minimize",
         max_jobs=2,
         max_parallel_jobs=2,
     )
 
-    tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
     with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
+        inputs = estimator.sagemaker_session.upload_data(
+            path=os.path.join(resource_path, "data"), key_prefix="scriptmode/mnist"
+        )
+
+        tuning_job_name = unique_name_from_base("tune-tf", max_length=32)
         tuner.fit(inputs, job_name=tuning_job_name)
 
-        print("Started hyperparameter tuning job with name:" + tuning_job_name)
+        print("Started hyperparameter tuning job with name: " + tuning_job_name)
 
         time.sleep(15)
         tuner.wait()