
Commit 3ef74b0

Feature: Support multiple inference.py files and a universal inference.py file, along with a universal requirements.txt file

1 parent 1a265db

File tree

6 files changed: +25 −5 lines


docker/build_artifacts/sagemaker/python_service.py

Lines changed: 1 addition & 1 deletion
@@ -407,4 +407,4 @@ def add_routes(self, application):
 
 app = falcon.API()
 resources = ServiceResources()
-resources.add_routes(app)
+resources.add_routes(app)

(The removed and added lines are textually identical; the change here appears to be only a missing trailing newline at end of file. The same pattern recurs in the final hunk of serve.py and in the test file below.)

docker/build_artifacts/sagemaker/serve.py

Lines changed: 23 additions & 3 deletions
@@ -308,6 +308,14 @@ def _enable_per_process_gpu_memory_fraction(self):
 
         return False
 
+    def _get_number_of_gpu_on_host(self):
+        nvidia_smi_exist = os.path.exists("/usr/bin/nvidia-smi")
+        if nvidia_smi_exist:
+            return len(subprocess.check_output(['nvidia-smi', '-L'])
+                       .decode('utf-8').strip().split('\n'))
+
+        return 0
+
     def _calculate_per_process_gpu_memory_fraction(self):
         return round((1 - self._tfs_gpu_margin) / float(self._tfs_instance_count), 4)
 

@@ -420,8 +428,20 @@ def _start_single_tfs(self, instance_id):
             tfs_gpu_memory_fraction=self._calculate_per_process_gpu_memory_fraction(),
         )
         log.info("tensorflow serving command: {}".format(cmd))
-        p = subprocess.Popen(cmd.split())
-        log.info("started tensorflow serving (pid: %d)", p.pid)
+
+        num_gpus = self._get_number_of_gpu_on_host()
+        if num_gpus > 1:
+            # utilizing multi-gpu
+            worker_env = os.environ.copy()
+            worker_env["CUDA_VISIBLE_DEVICES"] = str(instance_id % num_gpus)
+            p = subprocess.Popen(cmd.split(), env=worker_env)
+            log.info("started tensorflow serving (pid: {}) on GPU {}"
+                     .format(p.pid, instance_id % num_gpus))
+        else:
+            # cpu and single gpu
+            p = subprocess.Popen(cmd.split())
+            log.info("started tensorflow serving (pid: {})".format(p.pid))
+
         return p
 
     def _monitor(self):

@@ -480,4 +500,4 @@ def start(self):
 
 
 if __name__ == "__main__":
-    ServiceManager().start()
+    ServiceManager().start()
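The substantive change here is round-robin GPU assignment: _get_number_of_gpu_on_host counts devices by parsing nvidia-smi -L (one output line per GPU), and _start_single_tfs then pins each TensorFlow Serving worker to a single device by setting CUDA_VISIBLE_DEVICES to instance_id % num_gpus. A minimal standalone sketch of that mapping follows; the worker count and the printed summary are illustrative assumptions, not part of the container code.

import os
import subprocess

def count_gpus():
    # Same approach as _get_number_of_gpu_on_host: `nvidia-smi -L`
    # prints one line per device, e.g. "GPU 0: Tesla T4 (UUID: ...)".
    if not os.path.exists("/usr/bin/nvidia-smi"):
        return 0
    listing = subprocess.check_output(["nvidia-smi", "-L"]).decode("utf-8").strip()
    return len(listing.split("\n"))

num_gpus = count_gpus()
num_workers = 4  # hypothetical TFS instance count, for illustration only

for instance_id in range(num_workers):
    worker_env = os.environ.copy()
    if num_gpus > 1:
        # Round-robin: worker ids wrap around the available devices,
        # so each TFS process sees exactly one GPU.
        worker_env["CUDA_VISIBLE_DEVICES"] = str(instance_id % num_gpus)
    print("worker", instance_id, "->", worker_env.get("CUDA_VISIBLE_DEVICES", "default"))

Because CUDA_VISIBLE_DEVICES is set per child process rather than globally, each TFS worker enumerates only its assigned device as GPU 0, while the parent ServiceManager environment is left untouched.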

test/integration/local/test_pre_post_processing_mme.py

Lines changed: 1 addition & 1 deletion
@@ -143,4 +143,4 @@ def test_unsupported_content_type():
     data = "aW1hZ2UgYnl0ZXM="
     response = requests.post(INVOCATION_URL.format(MODEL_NAME), data=data, headers=headers)
     assert 500 == response.status_code
-    assert "unsupported content type" in response.text
+    assert "unsupported content type" in response.text
