This repository was archived by the owner on May 23, 2024. It is now read-only.
File tree 1 file changed +22
-2
lines changed
docker/build_artifacts/sagemaker 1 file changed +22
-2
lines changed
Original file line number | Diff line number | Diff line change
@@ -309,6 +309,14 @@ def _enable_per_process_gpu_memory_fraction(self):
309
309
310
310
return False
311
311
312
def _get_number_of_gpu_on_host(self):
    """Return the number of GPUs listed by ``nvidia-smi -L`` on this host.

    Returns:
        int: the count of non-blank lines printed by ``nvidia-smi -L``.
        Returns 0 when ``/usr/bin/nvidia-smi`` does not exist, or when the
        command cannot be executed or exits with a non-zero status.
    """
    import logging

    nvidia_smi_path = "/usr/bin/nvidia-smi"
    if not os.path.exists(nvidia_smi_path):
        return 0

    try:
        # Run the exact binary whose existence was just checked rather than
        # whatever "nvidia-smi" happens to resolve to on PATH.
        listing = subprocess.check_output([nvidia_smi_path, "-L"])
    except (subprocess.CalledProcessError, OSError) as err:
        # The binary is present but unusable; report zero GPUs so the caller
        # takes its non-multi-GPU path instead of crashing.
        logging.getLogger(__name__).warning(
            "unable to list GPUs with %s: %s", nvidia_smi_path, err
        )
        return 0

    # Count only non-blank lines: splitting an empty string on a separator
    # yields [''] and would otherwise be counted as one GPU.
    return sum(
        1 for line in listing.decode("utf-8").splitlines() if line.strip()
    )
319
+
312
320
def _calculate_per_process_gpu_memory_fraction(self):
    """Return the GPU memory fraction assigned to each serving instance.

    The fraction left after subtracting ``self._tfs_gpu_margin`` from 1 is
    divided by ``self._tfs_instance_count`` and rounded to four decimal
    places.
    """
    usable_fraction = 1 - self._tfs_gpu_margin
    per_instance = usable_fraction / float(self._tfs_instance_count)
    return round(per_instance, 4)
314
322
@@ -421,8 +429,20 @@ def _start_single_tfs(self, instance_id):
421
429
tfs_gpu_memory_fraction = self ._calculate_per_process_gpu_memory_fraction (),
422
430
)
423
431
log .info ("tensorflow serving command: {}" .format (cmd ))
424
- p = subprocess .Popen (cmd .split ())
425
- log .info ("started tensorflow serving (pid: %d)" , p .pid )
432
+
433
+ num_gpus = self ._get_number_of_gpu_on_host ()
434
+ if num_gpus > 1 :
435
+ # utilizing multi-gpu
436
+ worker_env = os .environ .copy ()
437
+ worker_env ["CUDA_VISIBLE_DEVICES" ] = str (instance_id % num_gpus )
438
+ p = subprocess .Popen (cmd .split (), env = worker_env )
439
+ log .info ("started tensorflow serving (pid: {}) on GPU {}"
440
+ .format (p .pid , instance_id % num_gpus ))
441
+ else :
442
+ # cpu and single gpu
443
+ p = subprocess .Popen (cmd .split ())
444
+ log .info ("started tensorflow serving (pid: {})" .format (p .pid ))
445
+
426
446
return p
427
447
428
448
def _monitor (self ):
You can’t perform that action at this time.
0 commit comments