This repository was archived by the owner on May 23, 2024. It is now read-only.
File tree: 1 file changed (+22, -2 lines changed)
docker/build_artifacts/sagemaker 1 file changed +22
-2
lines changed Original file line number Diff line number Diff line change @@ -308,6 +308,14 @@ def _enable_per_process_gpu_memory_fraction(self):
308
308
309
309
return False
310
310
311
def _get_number_of_gpu_on_host(self):
    """Return how many GPUs ``nvidia-smi -L`` lists on this host.

    Returns:
        int: the number of listed GPUs, or 0 when ``/usr/bin/nvidia-smi``
        does not exist, cannot be executed, or exits with an error.
    """
    import logging  # local import keeps this block self-contained

    nvidia_smi_path = "/usr/bin/nvidia-smi"
    if not os.path.exists(nvidia_smi_path):
        return 0

    try:
        # Run the exact binary whose existence was just checked rather than
        # whatever "nvidia-smi" happens to resolve to on PATH.
        listing = subprocess.check_output([nvidia_smi_path, "-L"]).decode("utf-8")
    except (OSError, subprocess.CalledProcessError) as err:
        # Best effort: a broken nvidia-smi should not prevent serving from
        # starting; report no GPUs so the caller does not pin devices.
        logging.getLogger(__name__).warning(
            "could not query GPUs with %s: %s", nvidia_smi_path, err
        )
        return 0

    # Count only "GPU <n>: ..." entries. Splitting an empty listing on "\n"
    # would yield [""] and be miscounted as one GPU; splitlines() plus the
    # prefix filter avoids that.
    # NOTE(review): presumably this also skips indented per-GPU sub-entries
    # (e.g. MIG devices) -- confirm against the nvidia-smi version in use.
    return sum(1 for line in listing.splitlines() if line.startswith("GPU "))
318
+
311
319
def _calculate_per_process_gpu_memory_fraction(self):
    """Return the GPU memory fraction assigned to each serving instance.

    The fraction left after reserving ``self._tfs_gpu_margin`` is divided
    evenly by ``self._tfs_instance_count`` and rounded to four decimals.

    Returns:
        float: ``(1 - margin) / instance_count`` rounded to 4 places.

    Raises:
        ZeroDivisionError: if ``self._tfs_instance_count`` is 0.
    """
    usable_fraction = 1 - self._tfs_gpu_margin
    return round(usable_fraction / float(self._tfs_instance_count), 4)
313
321
@@ -420,8 +428,20 @@ def _start_single_tfs(self, instance_id):
420
428
tfs_gpu_memory_fraction = self ._calculate_per_process_gpu_memory_fraction (),
421
429
)
422
430
log .info ("tensorflow serving command: {}" .format (cmd ))
423
- p = subprocess .Popen (cmd .split ())
424
- log .info ("started tensorflow serving (pid: %d)" , p .pid )
431
+
432
+ num_gpus = self ._get_number_of_gpu_on_host ()
433
+ if num_gpus > 1 :
434
+ # utilizing multi-gpu
435
+ worker_env = os .environ .copy ()
436
+ worker_env ["CUDA_VISIBLE_DEVICES" ] = str (instance_id % num_gpus )
437
+ p = subprocess .Popen (cmd .split (), env = worker_env )
438
+ log .info ("started tensorflow serving (pid: {}) on GPU {}"
439
+ .format (p .pid , instance_id % num_gpus ))
440
+ else :
441
+ # cpu and single gpu
442
+ p = subprocess .Popen (cmd .split ())
443
+ log .info ("started tensorflow serving (pid: {})" .format (p .pid ))
444
+
425
445
return p
426
446
427
447
def _monitor (self ):
You can’t perform that action at this time.
0 commit comments