@@ -70,9 +70,10 @@ def train(channel_input_dirs, num_gpus, hosts, host_rank, master_addr, master_po
    world_size = len(hosts)
    is_distributed = world_size > 1
    logger.debug("Number of hosts {}. Distributed training - {}".format(world_size, is_distributed))
-   cuda = num_gpus > 0
+   use_cuda = num_gpus > 0
    logger.debug("Number of gpus available - {}".format(num_gpus))
-   kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
+   kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
+   device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment.
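
The device object introduced above is what makes the rest of the change device-agnostic: tensors and modules are moved with .to(device) instead of conditional .cuda() calls. A minimal sketch of the pattern outside this script (the tensor shape is arbitrary, chosen only for illustration):

    import torch

    use_cuda = torch.cuda.device_count() > 0
    device = torch.device("cuda" if use_cuda else "cpu")

    # The same line works on CPU and GPU; .to() returns the tensor unchanged
    # if it already lives on the target device.
    x = torch.randn(4, 1, 28, 28).to(device)
    print(x.device)  # cpu, or cuda:0 when a GPU is present
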
@@ -82,13 +83,13 @@ def train(channel_input_dirs, num_gpus, hosts, host_rank, master_addr, master_po
        os.environ['MASTER_PORT'] = master_port
        dist.init_process_group(backend=backend, rank=host_rank, world_size=world_size)
        logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
-           backend, dist.get_world_size()) + 'Current host rank is {}. Using cuda: {}. Number of gpus: {}'.format(
+           backend, dist.get_world_size()) + 'Current host rank is {}. Is cuda available: {}. Number of gpus: {}'.format(
            dist.get_rank(), torch.cuda.is_available(), num_gpus))

    # set the seed for generating random numbers
    seed = 1
    torch.manual_seed(seed)
-   if cuda:
+   if use_cuda:
        torch.cuda.manual_seed(seed)

    train_sampler, train_loader = _get_train_data_loader(training_dir, is_distributed, **kwargs)
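
For context, init_process_group only needs a rendezvous address, a backend, and each process's rank and world size; the script takes these from SageMaker, but the same call works in a standalone two-process sketch (the address, port, world size, and script name below are made-up values for illustration):

    # launch as:  python tiny_dist.py 0   and   python tiny_dist.py 1
    import os
    import sys
    import torch.distributed as dist

    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    rank, world_size = int(sys.argv[1]), 2

    dist.init_process_group(backend='gloo', rank=rank, world_size=world_size)
    print('rank {} of {} initialized'.format(dist.get_rank(), dist.get_world_size()))
    dist.destroy_process_group()
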
@@ -104,15 +105,15 @@ def train(channel_input_dirs, num_gpus, hosts, host_rank, master_addr, master_po
        100. * len(test_loader.sampler) / len(test_loader.dataset)
    ))

-   model = Net()
-   if is_distributed and cuda:
+   model = Net().to(device)
+   if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        logger.debug("Multi-machine multi-gpu: using DistributedDataParallel.")
-       model = torch.nn.parallel.DistributedDataParallel(model.cuda())
-   elif cuda:
+       model = torch.nn.parallel.DistributedDataParallel(model)
+   elif use_cuda:
        # single-machine multi-gpu case
        logger.debug("Single-machine multi-gpu: using DataParallel().cuda().")
-       model = torch.nn.DataParallel(model.cuda()).cuda()
+       model = torch.nn.DataParallel(model)
    else:
        # single-machine or multi-machine cpu case
        logger.debug("Single-machine/multi-machine cpu: using DataParallel.")
@@ -127,37 +128,34 @@ def train(channel_input_dirs, num_gpus, hosts, host_rank, master_addr, master_po
            train_sampler.set_epoch(epoch)
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
-           if cuda:
-               data, target = data.cuda(async=True), target.cuda(async=True)
-           data, target = torch.autograd.Variable(data), torch.autograd.Variable(target)
+           data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
-           if is_distributed and not cuda:
+           if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % log_interval == 0:
                logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.sampler),
-                   100. * batch_idx / len(train_loader), loss.data[0]))
-       test(model, test_loader, cuda)
+                   100. * batch_idx / len(train_loader), loss.item()))
+       test(model, test_loader, device)
    return model


-def test(model, test_loader, cuda):
+def test(model, test_loader, device):
    model.eval()
    test_loss = 0
    correct = 0
-   for data, target in test_loader:
-       if cuda:
-           data, target = data.cuda(), target.cuda()
-       data, target = torch.autograd.Variable(data, volatile=True), torch.autograd.Variable(target)
-       output = model(data)
-       test_loss += F.nll_loss(output, target, size_average=False).data[0]  # sum up batch loss
-       pred = output.data.max(1, keepdim=True)[1]  # get the index of the max log-probability
-       correct += pred.eq(target.data.view_as(pred)).long().cpu().sum()
+   with torch.no_grad():
+       for data, target in test_loader:
+           data, target = data.to(device), target.to(device)
+           output = model(data)
+           test_loss += F.nll_loss(output, target, size_average=False).item()  # sum up batch loss
+           pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
+           correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    logger.debug('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
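
The torch.no_grad() context replaces the removed volatile=True flag: autograd records nothing inside it, which cuts memory use during evaluation, and .item() replaces the removed .data[0] indexing for pulling a Python number out of a zero-dimensional tensor. A minimal illustration, assuming a model and device as set up earlier:

    import torch

    x = torch.randn(4, 1, 28, 28, device=device)  # dummy batch, shape for illustration only
    with torch.no_grad():
        y = model(x)
    print(y.requires_grad)  # False: no graph was recorded, nothing to backpropagate
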
@@ -167,6 +165,6 @@ def test(model, test_loader, cuda):

def model_fn(model_dir):
    model = torch.nn.DataParallel(Net())
-   with open(os.path.join(model_dir, 'model'), 'rb') as f:
+   with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
        model.load_state_dict(torch.load(f))
    return model
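
model_fn only covers loading, so the renamed 'model.pth' file has to be produced by the training side with a matching key layout. A hedged sketch of the corresponding save step (model_dir and a trained, wrapped model are assumed; this save call does not appear in the hunks above):

    import os
    import torch

    # Saving the wrapped model keeps the 'module.' prefix in the state_dict,
    # which is what DataParallel(Net()).load_state_dict expects in model_fn.
    with open(os.path.join(model_dir, 'model.pth'), 'wb') as f:
        torch.save(model.state_dict(), f)
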