
Commit b39ed6a

Merge pull request aws#6 from aws/pytorch-0.4
Add support for PyTorch 0.4.0. Make 0.4 the default version. Get rid of 0.3.1 support.
2 parents e2334a9 + cb676eb commit b39ed6a

File tree

3 files changed (+26 -28 lines)

src/sagemaker/pytorch/defaults.py (+1 -1)

@@ -10,5 +10,5 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
-PYTORCH_VERSION = '0.3'
+PYTORCH_VERSION = '0.4'
 PYTHON_VERSION = 'py3'
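For context, PYTORCH_VERSION is the framework version the SDK falls back to when an estimator is constructed without one, so new training jobs now default to PyTorch 0.4. A rough sketch of the effect (the estimator arguments below are illustrative placeholders, and the fallback behavior is inferred from this file's role rather than shown in the diff):

    from sagemaker.pytorch import PyTorch

    # framework_version is omitted, so the estimator is expected to fall back
    # to PYTORCH_VERSION from defaults.py -- '0.4' after this commit.
    estimator = PyTorch(entry_point='mnist.py',
                        role='arn:aws:iam::111111111111:role/ExampleRole',  # placeholder role
                        train_instance_count=1,
                        train_instance_type='ml.p2.xlarge')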

tests/conftest.py (+2 -2)

@@ -66,7 +66,7 @@ def mxnet_version(request):
     return request.param


-@pytest.fixture(scope='module', params=["0.3", "0.3.1"])
+@pytest.fixture(scope='module', params=["0.4", "0.4.0"])
 def pytorch_version(request):
     return request.param


@@ -81,6 +81,6 @@ def mxnet_full_version(request):
     return request.param


-@pytest.fixture(scope='module', params=["0.3.1"])
+@pytest.fixture(scope='module', params=["0.4.0"])
 def pytorch_full_version(request):
     return request.param
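Because these fixtures are parametrized at module scope, any test that accepts pytorch_version (or pytorch_full_version) as an argument runs once per value in params, so the suite now exercises both the short "0.4" and the full "0.4.0" version strings. A minimal sketch of how such a fixture is consumed (the test below is illustrative, not part of the repository):

    import pytest


    @pytest.fixture(scope='module', params=["0.4", "0.4.0"])
    def pytorch_version(request):
        # pytest creates the fixture once per entry in params, so every
        # dependent test runs against both version strings
        return request.param


    def test_version_is_0_4(pytorch_version):  # hypothetical test
        major, minor = pytorch_version.split('.')[:2]
        assert (major, minor) == ('0', '4')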

tests/data/pytorch_mnist/mnist.py (+23 -25)

@@ -70,9 +70,10 @@ def train(channel_input_dirs, num_gpus, hosts, host_rank, master_addr, master_po
     world_size = len(hosts)
     is_distributed = world_size > 1
     logger.debug("Number of hosts {}. Distributed training - {}".format(world_size, is_distributed))
-    cuda = num_gpus > 0
+    use_cuda = num_gpus > 0
     logger.debug("Number of gpus available - {}".format(num_gpus))
-    kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
+    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
+    device = torch.device("cuda" if use_cuda else "cpu")

     if is_distributed:
         # Initialize the distributed environment.

@@ -82,13 +83,13 @@ def train(channel_input_dirs, num_gpus, hosts, host_rank, master_addr, master_po
         os.environ['MASTER_PORT'] = master_port
         dist.init_process_group(backend=backend, rank=host_rank, world_size=world_size)
         logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
-            backend, dist.get_world_size()) + 'Current host rank is {}. Using cuda: {}. Number of gpus: {}'.format(
+            backend, dist.get_world_size()) + 'Current host rank is {}. Is cuda available: {}. Number of gpus: {}'.format(
             dist.get_rank(), torch.cuda.is_available(), num_gpus))

     # set the seed for generating random numbers
     seed = 1
     torch.manual_seed(seed)
-    if cuda:
+    if use_cuda:
         torch.cuda.manual_seed(seed)

     train_sampler, train_loader = _get_train_data_loader(training_dir, is_distributed, **kwargs)

@@ -104,15 +105,15 @@ def train(channel_input_dirs, num_gpus, hosts, host_rank, master_addr, master_po
         100. * len(test_loader.sampler) / len(test_loader.dataset)
     ))

-    model = Net()
-    if is_distributed and cuda:
+    model = Net().to(device)
+    if is_distributed and use_cuda:
         # multi-machine multi-gpu case
         logger.debug("Multi-machine multi-gpu: using DistributedDataParallel.")
-        model = torch.nn.parallel.DistributedDataParallel(model.cuda())
-    elif cuda:
+        model = torch.nn.parallel.DistributedDataParallel(model)
+    elif use_cuda:
         # single-machine multi-gpu case
         logger.debug("Single-machine multi-gpu: using DataParallel().cuda().")
-        model = torch.nn.DataParallel(model.cuda()).cuda()
+        model = torch.nn.DataParallel(model)
     else:
         # single-machine or multi-machine cpu case
         logger.debug("Single-machine/multi-machine cpu: using DataParallel.")
@@ -127,37 +128,34 @@ def train(channel_input_dirs, num_gpus, hosts, host_rank, master_addr, master_po
             train_sampler.set_epoch(epoch)
         model.train()
         for batch_idx, (data, target) in enumerate(train_loader, 1):
-            if cuda:
-                data, target = data.cuda(async=True), target.cuda(async=True)
-            data, target = torch.autograd.Variable(data), torch.autograd.Variable(target)
+            data, target = data.to(device), target.to(device)
             optimizer.zero_grad()
             output = model(data)
             loss = F.nll_loss(output, target)
             loss.backward()
-            if is_distributed and not cuda:
+            if is_distributed and not use_cuda:
                 # average gradients manually for multi-machine cpu case only
                 _average_gradients(model)
             optimizer.step()
             if batch_idx % log_interval == 0:
                 logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                     epoch, batch_idx * len(data), len(train_loader.sampler),
-                    100. * batch_idx / len(train_loader), loss.data[0]))
-    test(model, test_loader, cuda)
+                    100. * batch_idx / len(train_loader), loss.item()))
+    test(model, test_loader, device)
     return model


-def test(model, test_loader, cuda):
+def test(model, test_loader, device):
     model.eval()
     test_loss = 0
     correct = 0
-    for data, target in test_loader:
-        if cuda:
-            data, target = data.cuda(), target.cuda()
-        data, target = torch.autograd.Variable(data, volatile=True), torch.autograd.Variable(target)
-        output = model(data)
-        test_loss += F.nll_loss(output, target, size_average=False).data[0]  # sum up batch loss
-        pred = output.data.max(1, keepdim=True)[1]  # get the index of the max log-probability
-        correct += pred.eq(target.data.view_as(pred)).long().cpu().sum()
+    with torch.no_grad():
+        for data, target in test_loader:
+            data, target = data.to(device), target.to(device)
+            output = model(data)
+            test_loss += F.nll_loss(output, target, size_average=False).item()  # sum up batch loss
+            pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
+            correct += pred.eq(target.view_as(pred)).sum().item()

     test_loss /= len(test_loader.dataset)
     logger.debug('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(

@@ -167,6 +165,6 @@ def test(model, test_loader, cuda):

 def model_fn(model_dir):
     model = torch.nn.DataParallel(Net())
-    with open(os.path.join(model_dir, 'model'), 'rb') as f:
+    with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
         model.load_state_dict(torch.load(f))
     return model
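Taken together, the mnist.py edits are the standard 0.3-to-0.4 migration: torch.autograd.Variable wrapping disappears because tensors now carry autograd state themselves, scalar losses are read with .item() instead of .data[0], and evaluation runs under torch.no_grad() instead of volatile=True. A self-contained sketch of those idioms on PyTorch 0.4+, using a throwaway linear model rather than the script's Net:

    import torch
    import torch.nn.functional as F

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = torch.nn.Linear(4, 3).to(device)

    data = torch.randn(8, 4, device=device)
    target = torch.randint(0, 3, (8,), device=device)

    # training-style step: tensors are autograd-aware, no Variable wrapper needed
    loss = F.cross_entropy(model(data), target)
    loss.backward()
    print('loss:', loss.item())              # .item() replaces loss.data[0]

    # evaluation-style step: no_grad() replaces volatile=True
    with torch.no_grad():
        pred = model(data).max(1, keepdim=True)[1]
        correct = pred.eq(target.view_as(pred)).sum().item()
    print('correct out of 8:', correct)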
