Commit 0606937

Merge pull request aws#292 from awslabs/laurenyu-chainer-tuning

Add Chainer hyperparameter tuning notebook

2 parents 6d2c0a7 + ba842e9 commit 0606937

14 files changed: +742 −0 lines
hyperparameter_tuning/chainer_cifar10/chainer_single_machine_cifar10.ipynb

Lines changed: 444 additions & 0 deletions
Large diffs are not rendered by default.
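Because the notebook's diff is collapsed above, the following is only a rough sketch of how a Chainer training script like the one added in this commit is typically wrapped in a SageMaker hyperparameter tuning job with the SageMaker Python SDK. The entry point file name, IAM role, instance type, S3 locations, and the objective metric regex are placeholders and assumptions, not values taken from the notebook.

# Hedged sketch (not the notebook's actual code): launch a SageMaker hyperparameter
# tuning job around the Chainer training script added in this commit. Entry point,
# role, instance type, S3 inputs, and the metric regex are placeholders.
from sagemaker.chainer import Chainer
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter

estimator = Chainer(entry_point='chainer_cifar_vgg_single_machine.py',  # hypothetical file name
                    role='SageMakerRole',                                # placeholder IAM role
                    train_instance_count=1,
                    train_instance_type='ml.p2.xlarge',
                    hyperparameters={'epochs': 50})

tuner = HyperparameterTuner(estimator,
                            objective_metric_name='validation-accuracy',
                            hyperparameter_ranges={
                                'learning-rate': ContinuousParameter(0.01, 0.1),
                                'batch-size': IntegerParameter(64, 256)},
                            metric_definitions=[{'Name': 'validation-accuracy',
                                                 'Regex': 'validation/main/accuracy=(\\S+)'}],  # placeholder regex
                            max_jobs=4,
                            max_parallel_jobs=2)

tuner.fit({'train': 's3://my-bucket/chainer-cifar10/train',  # placeholder S3 locations
           'test': 's3://my-bucket/chainer-cifar10/test'})

The hyperparameter names in the ranges ('learning-rate', 'batch-size') match the argparse arguments defined in the training script below, which is how the tuner's sampled values reach the script.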
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

import boto3
import tarfile
from urllib.parse import urlparse
import os


def retrieve_output_from_s3(s3_url, output_dir):
    """
    Downloads output artifacts from S3 and extracts them into the given directory.

    Args:
        s3_url: S3 URL to the output artifacts
        output_dir: directory to write artifacts to
    """
    o = urlparse(s3_url)
    s3 = boto3.resource('s3')
    output_data_path = os.path.join(output_dir)
    output_file_name = os.path.join(output_data_path, 'output.tar.gz')
    try:
        os.makedirs(output_data_path)
    except FileExistsError:
        pass
    s3.Bucket(o.netloc).download_file(o.path.lstrip('/'), output_file_name)
    tar = tarfile.open(output_file_name)
    tar.extractall(output_data_path)
    tar.close()
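As a quick illustration (the bucket and key below are placeholders, not values from this commit), the helper above can be called from the notebook to pull a training job's output archive down for local inspection:

retrieve_output_from_s3('s3://my-bucket/chainer-cifar10/output/output.tar.gz', 'output/')
# output/ now contains output.tar.gz plus the extracted artifacts (e.g. plots and logs)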
Lines changed: 148 additions & 0 deletions
@@ -0,0 +1,148 @@
# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

from __future__ import print_function, absolute_import

import argparse
import os

import numpy as np

import chainer
import chainer.functions as F
import chainer.links as L
from chainer import training
from chainer import serializers
from chainer.training import extensions

import net

if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    # Retrieve the hyperparameters we set from the client (with some defaults).
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--learning-rate', type=float, default=0.05)

    # Data, model, and output directories. These are required.
    parser.add_argument('--output-dir', type=str, default=os.environ['SM_OUTPUT_DIR'])
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    args, _ = parser.parse_known_args()

    num_gpus = int(os.environ['SM_NUM_GPUS'])

    train_data = np.load(os.path.join(args.train, 'train.npz'))['data']
    train_labels = np.load(os.path.join(args.train, 'train.npz'))['labels']

    test_data = np.load(os.path.join(args.test, 'test.npz'))['data']
    test_labels = np.load(os.path.join(args.test, 'test.npz'))['labels']

    train = chainer.datasets.TupleDataset(train_data, train_labels)
    test = chainer.datasets.TupleDataset(test_data, test_labels)

    print('# Minibatch-size: {}'.format(args.batch_size))
    print('# epoch: {}'.format(args.epochs))
    print('# learning rate: {}'.format(args.learning_rate))

    # Set up a neural network to train.
    # Classifier reports softmax cross entropy loss and accuracy at every
    # iteration, which will be used by the PrintReport extension below.
    model = L.Classifier(net.VGG(10))

    optimizer = chainer.optimizers.MomentumSGD(args.learning_rate)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.WeightDecay(5e-4))

    # Set up a trainer.
    device = 0 if num_gpus > 0 else -1  # -1 indicates CPU, 0 indicates first GPU device.
    if num_gpus > 1:
        devices = range(num_gpus)
        train_iters = [chainer.iterators.MultiprocessIterator(i, args.batch_size, n_processes=4)
                       for i in chainer.datasets.split_dataset_n_random(train, len(devices))]
        test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False, n_processes=num_gpus)
        updater = training.updaters.MultiprocessParallelUpdater(train_iters, optimizer, devices=range(num_gpus))
    else:
        train_iter = chainer.iterators.MultiprocessIterator(train, args.batch_size)
        test_iter = chainer.iterators.MultiprocessIterator(test, args.batch_size, repeat=False)
        updater = training.updaters.StandardUpdater(train_iter, optimizer, device=device)

    stop_trigger = (args.epochs, 'epoch')

    output_data_dir = os.path.join(args.output_dir, 'data')
    trainer = training.Trainer(updater, stop_trigger, out=output_data_dir)

    # Evaluate the model with the test dataset for each epoch.
    trainer.extend(extensions.Evaluator(test_iter, model, device=device))

    # Reduce the learning rate by half every 25 epochs.
    trainer.extend(extensions.ExponentialShift('lr', 0.5), trigger=(25, 'epoch'))

    # Dump a computational graph from 'loss' variable at the first iteration.
    # The "main" refers to the target link of the "main" optimizer.
    trainer.extend(extensions.dump_graph('main/loss'))

    # Write a log of evaluation statistics for each epoch.
    trainer.extend(extensions.LogReport())

    if extensions.PlotReport.available():
        trainer.extend(
            extensions.PlotReport(['main/loss', 'validation/main/loss'],
                                  'epoch', file_name='loss.png'))
        trainer.extend(
            extensions.PlotReport(
                ['main/accuracy', 'validation/main/accuracy'],
                'epoch', file_name='accuracy.png'))

    # Print selected entries of the log to stdout.
    # Here "main" refers to the target link of the "main" optimizer again, and
    # "validation" refers to the default name of the Evaluator extension.
    # Entries other than 'epoch' are reported by the Classifier link, called by
    # either the updater or the evaluator.
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))

    # Run the training.
    trainer.run()

    # Save the model to model_dir. It's loaded below in `model_fn`.
    serializers.save_npz(os.path.join(args.model_dir, 'model.npz'), model)


def model_fn(model_dir):
    """
    This function is called by the Chainer container during hosting when running on SageMaker with
    values populated by the hosting environment.

    This function loads models written during training into `model_dir`.

    Args:
        model_dir (str): path to the directory containing the saved model artifacts

    Returns:
        a loaded Chainer model

    For more on `model_fn`, please visit the sagemaker-python-sdk repository:
    https://github.com/aws/sagemaker-python-sdk

    For more on the Chainer container, please visit the sagemaker-chainer-containers repository:
    https://github.com/aws/sagemaker-chainer-containers
    """
    chainer.config.train = False
    model = L.Classifier(net.VGG(10))
    serializers.load_npz(os.path.join(model_dir, 'model.npz'), model)
    return model.predictor
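A hedged sketch of how the script above could be smoke-tested outside SageMaker: the SageMaker Chainer container normally sets the SM_* environment variables the script reads, so a local run has to provide them by hand. The script file name and local paths below are hypothetical, and train.npz / test.npz must already exist in the channel directories.

# Hypothetical local smoke test; on SageMaker the container sets these variables.
import os
import subprocess

for d in ('/tmp/output', '/tmp/model'):
    os.makedirs(d, exist_ok=True)             # output and model dirs must exist

env = dict(os.environ,
           SM_OUTPUT_DIR='/tmp/output',
           SM_MODEL_DIR='/tmp/model',
           SM_CHANNEL_TRAIN='/tmp/data/train',  # must contain train.npz
           SM_CHANNEL_TEST='/tmp/data/test',    # must contain test.npz
           SM_NUM_GPUS='0')                     # CPU-only run
subprocess.run(['python', 'chainer_cifar_vgg_single_machine.py',  # hypothetical script name
                '--epochs', '1', '--batch-size', '64'],
               env=env, check=True)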
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
import chainer
import chainer.functions as F
import chainer.links as L


class Block(chainer.Chain):

    """A convolution, batch norm, ReLU block.

    A block in a feedforward network that performs a
    convolution followed by batch normalization followed
    by a ReLU activation.

    For the convolution operation, a square filter size is used.

    Args:
        out_channels (int): The number of output channels.
        ksize (int): The size of the filter is ksize x ksize.
        pad (int): The padding to use for the convolution.
    """

    def __init__(self, out_channels, ksize, pad=1):
        super(Block, self).__init__()
        with self.init_scope():
            self.conv = L.Convolution2D(None, out_channels, ksize, pad=pad,
                                        nobias=True)
            self.bn = L.BatchNormalization(out_channels)

    def __call__(self, x):
        h = self.conv(x)
        h = self.bn(h)
        return F.relu(h)


class VGG(chainer.Chain):

    """A VGG-style network for very small images.

    This model is based on the VGG-style model from
    http://torch.ch/blog/2015/07/30/cifar.html
    which is based on the network architecture from the paper:
    https://arxiv.org/pdf/1409.1556v6.pdf

    This model is intended to be used with either RGB or greyscale input
    images that are of size 32x32 pixels, such as those in the CIFAR10
    and CIFAR100 datasets.

    On CIFAR10, it achieves approximately 89% accuracy on the test set with
    no data augmentation.

    On CIFAR100, it achieves approximately 63% accuracy on the test set with
    no data augmentation.

    Args:
        class_labels (int): The number of class labels.
    """

    def __init__(self, class_labels=10):
        super(VGG, self).__init__()
        with self.init_scope():
            self.block1_1 = Block(64, 3)
            self.block1_2 = Block(64, 3)
            self.block2_1 = Block(128, 3)
            self.block2_2 = Block(128, 3)
            self.block3_1 = Block(256, 3)
            self.block3_2 = Block(256, 3)
            self.block3_3 = Block(256, 3)
            self.block4_1 = Block(512, 3)
            self.block4_2 = Block(512, 3)
            self.block4_3 = Block(512, 3)
            self.block5_1 = Block(512, 3)
            self.block5_2 = Block(512, 3)
            self.block5_3 = Block(512, 3)
            self.fc1 = L.Linear(None, 512, nobias=True)
            self.bn_fc1 = L.BatchNormalization(512)
            self.fc2 = L.Linear(None, class_labels, nobias=True)

    def __call__(self, x):
        # 64 channel blocks:
        h = self.block1_1(x)
        h = F.dropout(h, ratio=0.3)
        h = self.block1_2(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 128 channel blocks:
        h = self.block2_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block2_2(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 256 channel blocks:
        h = self.block3_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block3_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block3_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 512 channel blocks:
        h = self.block4_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block4_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block4_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        # 512 channel blocks:
        h = self.block5_1(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block5_2(h)
        h = F.dropout(h, ratio=0.4)
        h = self.block5_3(h)
        h = F.max_pooling_2d(h, ksize=2, stride=2)

        h = F.dropout(h, ratio=0.5)
        h = self.fc1(h)
        h = self.bn_fc1(h)
        h = F.relu(h)
        h = F.dropout(h, ratio=0.5)
        return self.fc2(h)
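A minimal sketch of exercising this model definition on its own, assuming Chainer and NumPy are installed; the batch below is dummy data shaped like CIFAR-10 images (NCHW, 3x32x32), not real data from this commit:

import numpy as np
import chainer

import net

model = net.VGG(class_labels=10)
x = np.zeros((2, 3, 32, 32), dtype=np.float32)  # two dummy 32x32 RGB images
with chainer.using_config('train', False):      # disable dropout for a deterministic pass
    logits = model(x)
print(logits.shape)  # (2, 10): one score per CIFAR-10 class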
