@@ -70,9 +70,10 @@ def train(channel_input_dirs, num_gpus, hosts, host_rank, master_addr, master_po
    world_size = len(hosts)
    is_distributed = world_size > 1
    logger.debug("Number of hosts {}. Distributed training - {}".format(world_size, is_distributed))
-   cuda = num_gpus > 0
+   use_cuda = num_gpus > 0
    logger.debug("Number of gpus available - {}".format(num_gpus))
-   kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
+   kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
+   device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment.
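
The device object introduced above is what makes the rest of the change device-agnostic: tensors and modules are moved with .to(device) instead of conditional .cuda() calls. A minimal sketch of the pattern outside this script (the tensor shape is arbitrary, chosen only for illustration):

    import torch

    use_cuda = torch.cuda.device_count() > 0
    device = torch.device("cuda" if use_cuda else "cpu")

    # The same line works on CPU and GPU; .to() returns the tensor unchanged
    # if it already lives on the target device.
    x = torch.randn(4, 1, 28, 28).to(device)
    print(x.device)  # cpu, or cuda:0 when a GPU is present
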
@@ -82,13 +83,13 @@ def train(channel_input_dirs, num_gpus, hosts, host_rank, master_addr, master_po
        os.environ['MASTER_PORT'] = master_port
        dist.init_process_group(backend=backend, rank=host_rank, world_size=world_size)
        logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
-           backend, dist.get_world_size()) + 'Current host rank is {}. Using cuda: {}. Number of gpus: {}'.format(
+           backend, dist.get_world_size()) + 'Current host rank is {}. Is cuda available: {}. Number of gpus: {}'.format(
            dist.get_rank(), torch.cuda.is_available(), num_gpus))

    # set the seed for generating random numbers
    seed = 1
    torch.manual_seed(seed)
-   if cuda:
+   if use_cuda:
        torch.cuda.manual_seed(seed)

    train_sampler, train_loader = _get_train_data_loader(training_dir, is_distributed, **kwargs)
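
For context, init_process_group only needs a rendezvous address, a backend, and each process's rank and world size; the script takes these from SageMaker, but the same call works in a standalone two-process sketch (the address, port, world size, and script name below are made-up values for illustration):

    # launch as:  python tiny_dist.py 0   and   python tiny_dist.py 1
    import os
    import sys
    import torch.distributed as dist

    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    rank, world_size = int(sys.argv[1]), 2

    dist.init_process_group(backend='gloo', rank=rank, world_size=world_size)
    print('rank {} of {} initialized'.format(dist.get_rank(), dist.get_world_size()))
    dist.destroy_process_group()
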
@@ -104,15 +105,15 @@ def train(channel_input_dirs, num_gpus, hosts, host_rank, master_addr, master_po
        100. * len(test_loader.sampler) / len(test_loader.dataset)
    ))

-   model = Net()
-   if is_distributed and cuda:
+   model = Net().to(device)
+   if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        logger.debug("Multi-machine multi-gpu: using DistributedDataParallel.")
-       model = torch.nn.parallel.DistributedDataParallel(model.cuda())
-   elif cuda:
+       model = torch.nn.parallel.DistributedDataParallel(model)
+   elif use_cuda:
        # single-machine multi-gpu case
        logger.debug("Single-machine multi-gpu: using DataParallel().cuda().")
-       model = torch.nn.DataParallel(model.cuda()).cuda()
+       model = torch.nn.DataParallel(model)
    else:
        # single-machine or multi-machine cpu case
        logger.debug("Single-machine/multi-machine cpu: using DataParallel.")
@@ -127,37 +128,34 @@ def train(channel_input_dirs, num_gpus, hosts, host_rank, master_addr, master_po
            train_sampler.set_epoch(epoch)
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
-           if cuda:
-               data, target = data.cuda(async=True), target.cuda(async=True)
-           data, target = torch.autograd.Variable(data), torch.autograd.Variable(target)
+           data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
-           if is_distributed and not cuda:
+           if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % log_interval == 0:
                logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.sampler),
-                   100. * batch_idx / len(train_loader), loss.data[0]))
-       test(model, test_loader, cuda)
+                   100. * batch_idx / len(train_loader), loss.item()))
+       test(model, test_loader, device)
    return model


-def test(model, test_loader, cuda):
+def test(model, test_loader, device):
    model.eval()
    test_loss = 0
    correct = 0
-   for data, target in test_loader:
-       if cuda:
-           data, target = data.cuda(), target.cuda()
-       data, target = torch.autograd.Variable(data, volatile=True), torch.autograd.Variable(target)
-       output = model(data)
-       test_loss += F.nll_loss(output, target, size_average=False).data[0]  # sum up batch loss
-       pred = output.data.max(1, keepdim=True)[1]  # get the index of the max log-probability
-       correct += pred.eq(target.data.view_as(pred)).long().cpu().sum()
+   with torch.no_grad():
+       for data, target in test_loader:
+           data, target = data.to(device), target.to(device)
+           output = model(data)
+           test_loss += F.nll_loss(output, target, size_average=False).item()  # sum up batch loss
+           pred = output.max(1, keepdim=True)[1]  # get the index of the max log-probability
+           correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    logger.debug('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
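
The torch.no_grad() context replaces the removed volatile=True flag: autograd records nothing inside it, which cuts memory use during evaluation, and .item() replaces the removed .data[0] indexing for pulling a Python number out of a zero-dimensional tensor. A minimal illustration, assuming a model and device as set up earlier:

    import torch

    x = torch.randn(4, 1, 28, 28, device=device)  # dummy batch, shape for illustration only
    with torch.no_grad():
        y = model(x)
    print(y.requires_grad)  # False: no graph was recorded, nothing to backpropagate
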
@@ -167,6 +165,6 @@ def test(model, test_loader, cuda):

def model_fn(model_dir):
    model = torch.nn.DataParallel(Net())
-   with open(os.path.join(model_dir, 'model'), 'rb') as f:
+   with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
        model.load_state_dict(torch.load(f))
    return model
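
model_fn only covers loading, so the renamed 'model.pth' file has to be produced by the training side with a matching key layout. A hedged sketch of the corresponding save step (model_dir and a trained, wrapped model are assumed; this save call does not appear in the hunks above):

    import os
    import torch

    # Saving the wrapped model keeps the 'module.' prefix in the state_dict,
    # which is what DataParallel(Net()).load_state_dict expects in model_fn.
    with open(os.path.join(model_dir, 'model.pth'), 'wb') as f:
        torch.save(model.state_dict(), f)
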